In [1]:
# Environment setup: fetch the Mecab-ko-for-Google-Colab repository and run its installer.
# The installer's log shows it installing konlpy; presumably it also installs the
# Mecab-ko backend that `konlpy.tag.Mecab` needs — confirm against the script.
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
# NOTE: %cd changes the kernel's working directory, which stays in effect for later cells.
%cd Mecab-ko-for-Google-Colab
!bash install_mecab-ko_on_colab190912.sh

Cloning into 'Mecab-ko-for-Google-Colab'...
remote: Enumerating objects: 91, done.[K
remote: Total 91 (delta 0), reused 0 (delta 0), pack-reused 91[K
Unpacking objects: 100% (91/91), done.
/content/Mecab-ko-for-Google-Colab
Installing konlpy.....
Collecting konlpy
  Downloading konlpy-0.5.2-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 26.1 MB/s 
[?25hCollecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 29.7 MB/s 
[?25hCollecting beautifulsoup4==4.6.0
  Downloading beautifulsoup4-4.6.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.1 MB/s 
[?25hCollecting colorama
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: JPype1, colorama, beautifulsoup4, konlpy
  Attempting uninstall: beautifulsoup4
    Found existing installation: beautifulsoup4 4.6.3
    Uninstalling beautif

In [3]:
# Mount Google Drive at /content/drive; the review CSV files read later live under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import operator
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import networkx as nx
from matplotlib import rc
from collections import Counter
from konlpy.tag import Mecab
from wordcloud import WordCloud
from gensim.models import Word2Vec



In [None]:
# Directory on the mounted Google Drive that holds the review CSV files.
# Change this single constant when the data lives elsewhere.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/data'

# Raw review tables; pre_processing() expects 'comment' and 'score' columns in each.
df_france = pd.read_csv(f'{DATA_DIR}/naver_movie_reviews_fr.csv')
df_korea = pd.read_csv(f'{DATA_DIR}/naver_movie_reviews_KR.csv')

# Morphological analyzer used by tokenize() to extract nouns.
mecab = Mecab()

# Nouns excluded from the analysis (the duplicated '최고' entry was removed).
stopwords = [
    '영화', '점', '최고', '평점', '거', '것', '돈', '수', '게', '말',
    '최악', '쓰레기', '년', '듯', '나', '똥', '때', '마지막', '내', '만',
    '번', '개', '끝', '사람', '내용', '시간', '재미', '생각', '이건', '사랑',
    '식', '코미디',
]


def tokenize(df):
    """Add a 'tokenized' column holding each comment's nouns minus stopwords.

    Mutates ``df`` in place and returns None. Relies on the module-level
    ``mecab`` tagger (its ``nouns`` method) and the ``stopwords`` collection.

    Args:
        df: DataFrame with a 'comment' column that ``mecab.nouns`` can process.
    """
    # Build the lookup set once per call so each membership test is O(1)
    # instead of a scan over the stopword list.
    excluded = set(stopwords)

    def extract_nouns(comment):
        return [noun for noun in mecab.nouns(comment) if noun not in excluded]

    # Single pass over the column: noun extraction and stopword filtering together.
    df['tokenized'] = df['comment'].apply(extract_nouns)


def pre_processing(df):
    """Label, de-duplicate, clean and tokenize a review DataFrame.

    Works on a copy, so the caller's frame is not modified.

    Args:
        df: DataFrame with a numeric 'score' column and a text 'comment' column.

    Returns:
        A new DataFrame with:
        - 'label': 0 when score <= 3, 1 when score > 6, otherwise 2;
        - duplicate comments removed (first occurrence kept);
        - 'comment' reduced to Hangul characters and spaces, leading spaces stripped;
        - rows whose comment is empty or missing after cleaning dropped;
        - whatever ``tokenize`` adds to the frame.
    """
    _df = df.copy()
    _df['label'] = np.select([_df.score <= 3, _df.score > 6], [0, 1], default=2)
    _df.drop_duplicates(subset=['comment'], inplace=True)  # remove duplicate comments

    # regex=True must be explicit: since pandas 2.0 Series.str.replace treats the
    # pattern as a literal string by default, which would silently skip the cleaning.
    cleaned = _df['comment'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '', regex=True)  # keep Hangul and spaces only
    cleaned = cleaned.str.replace('^ +', '', regex=True)  # strip leading spaces

    # Assign the result back instead of calling replace(inplace=True) on the column
    # selection, which acts on a temporary object under pandas copy-on-write.
    _df['comment'] = cleaned.replace('', np.nan)
    _df.dropna(subset=['comment'], inplace=True)

    tokenize(_df)

    return _df



def _count_tokens(token_lists):
    """Return a Counter of every word across an iterable of token lists."""
    # Flatten with a generator rather than np.hstack: hstack raises when the
    # selection is empty and converts the words into a NumPy string array.
    return Counter(word for tokens in token_lists for word in tokens)


# Clean, label and tokenize both review tables (see pre_processing).
df_france, df_korea = map(pre_processing, [df_france, df_korea])

# Token lists per group: label 1 = score > 6, label 0 = score <= 3.
positive_words_fr = df_france[df_france.label == 1]['tokenized'].values
negative_words_fr = df_france[df_france.label == 0]['tokenized'].values

positive_word_count_fr = _count_tokens(positive_words_fr)
negative_word_count_fr = _count_tokens(negative_words_fr)

positive_words_kr = df_korea[df_korea.label == 1]['tokenized'].values
negative_words_kr = df_korea[df_korea.label == 0]['tokenized'].values

positive_word_count_kr = _count_tokens(positive_words_kr)
negative_word_count_kr = _count_tokens(negative_words_kr)

print(positive_word_count_fr)
print(negative_word_count_fr)
print(positive_word_count_kr)
print(negative_word_count_kr)

In [None]:
# Ten most frequent words of each group, keeping the word and dropping its count.
positive_words_fr_10 = [word for word, _count in positive_word_count_fr.most_common(10)]
negative_words_fr_10 = [word for word, _count in negative_word_count_fr.most_common(10)]
positive_words_kr_10 = [word for word, _count in positive_word_count_kr.most_common(10)]
negative_words_kr_10 = [word for word, _count in negative_word_count_kr.most_common(10)]

# Pool the four lists into one array; a word shared by several groups appears more than once.
top_10_words = np.concatenate([
    positive_words_fr_10,
    negative_words_fr_10,
    positive_words_kr_10,
    negative_words_kr_10,
])


In [None]:
# Use a Hangul-capable font for all matplotlib text so Korean labels render.
plt.rc('font', family='NanumBarunGothic')

In [None]:
def get_related_words(tokens_list, center_word):
    """Rank the words that co-occur with ``center_word``.

    Only token lists that contain ``center_word`` are considered. Words found in
    the module-level ``top_10_words`` are skipped, except ``center_word`` itself,
    which is always counted and therefore typically ends up as the first row.

    Args:
        tokens_list: Iterable of token lists.
        center_word: Word whose co-occurring words are wanted.

    Returns:
        DataFrame with columns '중심단어' (center word), '연관단어' (related word)
        and '빈도' (frequency): at most 10 rows in descending frequency order.
        The frame is empty (columns only) when no token list contains
        ``center_word``.
    """
    # Count directly instead of flattening with np.hstack, which raises
    # ValueError when no token list matched.
    co_counts = Counter()
    for tokens in tokens_list:
        if center_word in tokens:
            co_counts.update(
                word for word in tokens
                if word == center_word or word not in top_10_words
            )

    # most_common() already yields descending counts, so no further sort is needed.
    rows = [[center_word, word, count] for word, count in co_counts.most_common(10)]
    return pd.DataFrame(rows, columns=['중심단어', '연관단어', '빈도'])


def draw_nx_visualization(df, seed=None):
    """Draw a center word and its related words as a network and show the figure.

    Args:
        df: DataFrame with columns '중심단어' (center word), '연관단어'
            (related word) and '빈도' (frequency), as built by
            ``get_related_words``.
        seed: Optional seed passed to ``nx.spring_layout`` so the layout can be
            reproduced. The default (None) keeps the layout random.

    Returns:
        None. Nothing is drawn when ``df`` has no rows.
    """
    if len(df) == 0:
        # Nothing to draw, e.g. the center word never occurred.
        return

    G = nx.Graph()

    # Iterate over values rather than df[col][i], which depends on a 0..n-1 index.
    for related, frequency in zip(df['연관단어'], df['빈도']):
        G.add_node(related, nodesize=frequency)

    for center, related in zip(df['중심단어'], df['연관단어']):
        # The center word is usually listed among its own related words;
        # skip that pair so no self-loop is drawn.
        if related != center:
            G.add_edge(center, related)

    # Derive sizes from the graph's own node order so they always line up with
    # the nodes, wherever the center word sits in df.
    center_words = set(df['중심단어'])
    sizes = []
    for node, frequency in G.nodes(data='nodesize'):
        if frequency is None:
            # Center word missing from the related-word column: no frequency
            # is known, so fall back to networkx's default node size.
            sizes.append(300)
        elif node in center_words:
            # Scale the center word down so it does not dwarf the other nodes.
            sizes.append(frequency * 20 / 10)
        else:
            sizes.append(frequency * 20)

    options = {
        'with_labels': True,
        'node_color': '#FFFF00'
    }

    layout = nx.spring_layout(G, k=3.5, iterations=100, seed=seed)
    nx.draw(G, node_size=sizes, pos=layout, **options, font_family='NanumBarunGothic')
    plt.show()


In [None]:
# Words co-occurring with '액션' in positive_words_fr: table first, then network plot.
df_related_words_with = get_related_words(tokens_list=positive_words_fr, center_word='액션')
print(df_related_words_with)
draw_nx_visualization(df=df_related_words_with)

In [None]:
# Words co-occurring with '프랑스' in positive_words_fr: table first, then network plot.
df_related_words_with = get_related_words(tokens_list=positive_words_fr, center_word='프랑스')
print(df_related_words_with)
draw_nx_visualization(df=df_related_words_with)

In [None]:
# Words co-occurring with '음악' in positive_words_fr: table first, then network plot.
df_related_words_with = get_related_words(tokens_list=positive_words_fr, center_word='음악')
print(df_related_words_with)
draw_nx_visualization(df=df_related_words_with)

In [None]:
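# Optional one-time setup, kept commented out: install the Nanum fonts, refresh the
# system font cache and clear matplotlib's cache so that 'NanumBarunGothic' can be
# used in plots. Uncomment and run if Korean labels do not render
# (a runtime restart afterwards may be needed — TODO confirm).
# !sudo apt-get install -y fonts-nanum
# !sudo fc-cache -fv
# !rm ~/.cache/matplotlib -rf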