# 트위터 크롤링

In [1]:
from twitter_api import BEARER_TOKEN
from collections import Counter

import nltk
import tweepy
import pandas as pd
from bs4 import BeautifulSoup
import requests
from konlpy.tag import Okt

In [2]:
# Create the tweepy client using the bearer token imported from twitter_api.
client = tweepy.Client(BEARER_TOKEN)

In [13]:
# Crawl: search recent English tweets tagged #endangered, excluding retweets.
query = "#endangered -is:retweet lang:en"
tweets_en = tweepy.Paginator(
    client.search_recent_tweets,
    query=query,
    tweet_fields=["context_annotations", "created_at"],
    max_results=100,
).flatten(limit=1000)

# Write one tweet per line. The newline keeps the last word of one tweet from
# fusing with the first word of the next when the file is tokenized later.
# The paginator is lazy, so requests happen while iterating; the context
# manager closes the file even if one of them raises.
with open("tweet_en.txt", "w", encoding="utf-8") as f:
    for tweet in tweets_en:
        f.write(tweet.text + "\n")

In [7]:
# Search recent Korean tweets tagged #멸종, excluding retweets.
query = "#멸종 -is:retweet lang:ko"
tweets_ko = tweepy.Paginator(
    client.search_recent_tweets,
    query=query,
    tweet_fields=["context_annotations", "created_at"],
    max_results=100,
).flatten(limit=1000)

# Write one tweet per line. The newline keeps adjacent tweets from running
# together when the file is analysed later. The paginator is lazy, so requests
# happen while iterating; the context manager closes the file even if one of
# them raises.
with open("tweet_ko.txt", "w", encoding="utf-8") as f:
    for tweet in tweets_ko:
        f.write(tweet.text + "\n")

In [8]:
# Create the tweepy client from the bearer token.
client = tweepy.Client(BEARER_TOKEN)

# Search recent English tweets tagged #animalwelfare, excluding retweets.
# NOTE: the variable name `tweets_ko` is kept as-is even though this query
# requests English tweets (lang:en).
query = "#animalwelfare -is:retweet lang:en"
tweets_ko = tweepy.Paginator(
    client.search_recent_tweets,
    query=query,
    tweet_fields=["context_annotations", "created_at"],
    max_results=100,
).flatten(limit=1000)

# Write one tweet per line. The newline keeps the last word of one tweet from
# fusing with the first word of the next when the file is tokenized later.
# The paginator is lazy, so requests happen while iterating; the context
# manager closes the file even if one of them raises.
with open("병준.txt", "w", encoding="utf-8") as f:
    for tweet in tweets_ko:
        f.write(tweet.text + "\n")



# 형태소 분석

In [67]:
# 모듈 다운
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [15]:
# Load the saved tweet text into memory as one string.
with open("병준.txt", encoding="utf-8") as f:
    data = f.read()

In [16]:
import string

# Tokenize. The text is lowercased first, so the stop-word comparison below
# is effectively case-insensitive.
data = data.lower()
token = nltk.word_tokenize(data)

# Remove stop words and punctuation. `stopwords` stays a list under its
# original name; membership is tested against a set so each lookup is O(1)
# instead of a scan over the whole list for every token.
stopwords = nltk.corpus.stopwords.words('english') + list(string.punctuation)
stopword_set = set(stopwords)
token = [t for t in token if t.lower() not in stopword_set]

# Part-of-speech tagging: keep only the noun, verb and adjective tags listed
# below. The tagged list has its own name so the loop variable no longer
# rebinds the sequence being iterated.
keep_tags = {"NN", "NNS", "NNPS", "VB", "VBD", "VBG", "VBN", 'VBZ', "JJ", 'JJR', 'JJS'}
tagged_tokens = nltk.tag.pos_tag(token)
word_data = [word for word, word_tag in tagged_tokens if word_tag in keep_tags]

In [17]:
# Count occurrences of each kept word (rebinds word_data to a Counter).
word_data = Counter(word_data)
# Keep the 70 most frequent (word, count) pairs.
make_word_cloud = word_data.most_common(70)

In [18]:
# Put the most-common (word, count) pairs into a DataFrame and save it as CSV.
make_csv = pd.DataFrame(make_word_cloud)
make_csv.to_csv("병준.csv")

# 한국 트윗 형태소 분석

In [3]:
# Part-of-speech tagging of the Korean tweets.
# Read through a context manager so the file handle is closed immediately
# instead of being left open.
with open("tweet_ko.txt", encoding='utf-8') as f:
    pos = f.read()
okt = Okt()

sentences_tag = okt.pos(pos)

# Keep only the words tagged as nouns or adjectives.
noun_adj_list = [word for word, tag in sentences_tag if tag in ("Noun", "Adjective")]

# Count the words and keep the 70 most frequent (word, count) pairs.
counts = Counter(noun_adj_list)
tags = counts.most_common(70)

# Save the frequency table as CSV.
make_csv = pd.DataFrame(tags)
make_csv.to_csv("tweet_ko_word.csv")

# 한국 뉴스 크롤링

In [2]:
class NaverNews:
    """Collect link texts from Naver news search result pages.

    The search URL is assembled once in ``__init__`` from the class-level base
    ``url`` plus the ``query`` and ``sort`` parameters. ``find_tum_text``
    appends the paging parameter and fetches each page.
    """

    url = 'https://search.naver.com/search.naver?where=news&sm=tab_pge'

    def __init__(self, search, sort, name=''):
        """Build the search URL.

        Args:
            search: Search term; converted with ``str`` and appended verbatim
                as the ``query`` parameter (no URL escaping is applied here).
            sort: Value for the ``sort`` parameter; converted with ``str``.
            name: Optional label stored on the instance; not used by this class.
        """
        self.search = '&query=' + str(search)
        self.sort = '&sort=' + str(sort)
        self.name = name
        self.search_url = NaverNews.url + self.search + self.sort

    def find_tum_text(self, maxpage, tumClass='api_txt_lines dsc_txt_wrap', timeout=10):
        """Return the text of every matching ``<a>`` element on pages 1..maxpage.

        Args:
            maxpage: Number of result pages to fetch; values below 1 fetch
                nothing and return an empty list.
            tumClass: Value of the ``class`` attribute used to select the
                ``<a>`` elements whose text is collected.
            timeout: Seconds passed to ``requests.get`` so a stalled connection
                cannot block the paging loop indefinitely.

        Returns:
            A list of strings, in page order.

        Raises:
            requests.exceptions.RequestException: If a page request fails or
                times out.
        """
        result_list = []

        for num in range(1, maxpage + 1):
            # The start offset goes 1, 11, 21, ... (presumably ten results per
            # page; confirm against the live site).
            full_url = self.search_url + '&start=' + str(10 * num - 9)
            req = requests.get(full_url, timeout=timeout)
            bs = BeautifulSoup(req.text, "html.parser")

            result_list.extend(tum.text for tum in bs.find_all('a', {'class': tumClass}))

        return result_list

In [3]:
def get_html(url, timeout=10):
    """Fetch *url* and return the response body parsed with BeautifulSoup.

    Args:
        url: Address to request.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            ends in an exception (handled below) instead of hanging.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend, or
        ``None`` when the request raises ``RequestException`` (which includes
        timeouts).
    """
    try:
        req = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        return None

    return BeautifulSoup(req.text, "html.parser")

In [4]:
def daum_web_find(search, maxpage=1):
    """Collect the text of ``<p class="desc">`` elements from Daum search pages.

    Args:
        search: Search term, appended verbatim to the query string.
        maxpage: Number of result pages to fetch, starting at page 1; values
            below 1 fetch nothing.

    Returns:
        A list of strings in page order. Pages whose request failed are
        skipped.
    """
    url = 'https://search.daum.net/search?nil_suggest=btn&w=fusion&DA=SBC&q=' + search
    result = []

    for page in range(1, maxpage + 1):
        html = get_html(url + '&p=' + str(page))
        if html is None:
            # get_html returns None when the request fails; skip that page
            # instead of raising AttributeError on None.
            continue

        result.extend(desc.text for desc in html.find_all('p', {'class': 'desc'}))

    return result

In [5]:
def to_txt(list, filename):
    """Write each string in *list* to *filename*, one per line, as UTF-8.

    Any existing file is overwritten. The parameter name ``list`` shadows the
    builtin inside this function; it is kept because it is part of the
    signature.
    """
    with open(filename, 'w', encoding='UTF-8') as out:
        out.writelines(txt + '\n' for txt in list)

In [6]:
# Crawl 100 pages of Naver news results for the search term and save the
# collected texts, one per line.
naver_news = NaverNews('동물+등록제', 0)
result = naver_news.find_tum_text(100)
to_txt(result, "병준2.txt")

In [7]:
# Part-of-speech tagging of the crawled news text.
# Read through a context manager so the file handle is closed immediately
# instead of being left open.
with open("병준2.txt", encoding='utf-8') as f:
    pos = f.read()
okt = Okt()

sentences_tag = okt.pos(pos)

# Keep only the words tagged as nouns or adjectives.
noun_adj_list = [word for word, tag in sentences_tag if tag in ("Noun", "Adjective")]

# Count the words and keep the 70 most frequent (word, count) pairs.
counts = Counter(noun_adj_list)
tags = counts.most_common(70)

# Save the frequency table as CSV.
make_csv = pd.DataFrame(tags)
make_csv.to_csv("병준2.csv")