In [None]:
# Colab-only setup: fetch the Mecab-ko installer repository and run its install script.
# NOTE(review): the clone is not pinned to a commit/tag, so a later re-run may install a different version.
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
# %cd changes the kernel's working directory for every following cell, not just this one.
%cd Mecab-ko-for-Google-Colab
!bash install_mecab-ko_on_colab190912.sh

In [2]:
# Mount Google Drive at /content/drive; the review CSVs are read from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from collections import Counter
from konlpy.tag import Mecab
from wordcloud import WordCloud
from gensim.models import Word2Vec



In [4]:
# Directory on the mounted Drive that holds the Naver movie review CSVs.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/data'

# NOTE(review): "fr" / "KR" presumably mean reviews of French / Korean films - confirm.
df_france = pd.read_csv(f'{DATA_DIR}/naver_movie_reviews_fr.csv')
df_korea = pd.read_csv(f'{DATA_DIR}/naver_movie_reviews_KR.csv')

# Morphological analyzer; tokenize() uses it to extract nouns from each comment.
mecab = Mecab()

# Nouns removed from every tokenized review. Kept as a list for backward
# compatibility; the duplicated '최고' entry was dropped (membership is unchanged).
stopwords = [
    '영화', '점', '최고', '평점', '거', '것', '돈', '수', '게', '말',
    '최악', '쓰레기', '년', '듯', '나', '똥', '때', '마지막', '내', '만',
    '번', '개', '끝', '사람', '내용', '시간', '재미', '생각', '이건', '사랑',
]


def tokenize(df, tagger=None, stop_words=None):
  """Add a ``tokenized`` column of stopword-filtered nouns to ``df`` in place.

  Args:
    df: DataFrame with a ``comment`` column of strings. It is mutated: a
      ``tokenized`` column (one list of nouns per row) is added or overwritten.
    tagger: Object exposing ``nouns(text)`` that returns a list of strings.
      Defaults to the module-level ``mecab``.
    stop_words: Iterable of nouns to discard. Defaults to the module-level
      ``stopwords``.

  Returns:
    The same ``df`` object, so the call can be chained. Callers that ignore
    the return value behave exactly as before.
  """
  if tagger is None:
    tagger = mecab
  if stop_words is None:
    stop_words = stopwords

  # Hash-based lookup instead of scanning a list for every noun.
  blocked = frozenset(stop_words)

  def _filtered_nouns(text):
    return [noun for noun in tagger.nouns(text) if noun not in blocked]

  # Single pass over the comments: extract nouns and filter in one step.
  df['tokenized'] = df['comment'].apply(_filtered_nouns)
  return df


def pre_processing(df, tokenizer=None):
    """Return a cleaned, labelled and tokenized copy of a review DataFrame.

    Steps, in order:
      1. Add ``label``: 0 when ``score <= 3``, 1 when ``score > 6``, else 2.
      2. Drop rows whose raw ``comment`` duplicates an earlier row.
      3. Keep only Hangul characters and spaces in ``comment``, then strip
         leading spaces (trailing spaces are left as they are).
      4. Drop rows whose comment is empty or missing after cleaning.
      5. Run the tokenizer on the cleaned frame.

    Args:
        df: DataFrame with ``score`` and ``comment`` columns. Not modified.
        tokenizer: Callable that receives the cleaned frame and adds its token
            column in place. Defaults to the module-level ``tokenize``.

    Returns:
        A new DataFrame; the original index values of surviving rows are kept.
    """
    if tokenizer is None:
        tokenizer = tokenize

    _df = df.copy()
    _df['label'] = np.select([_df['score'] <= 3, _df['score'] > 6], [0, 1], default=2)
    _df = _df.drop_duplicates(subset=['comment'])

    # regex=True must be explicit: since pandas 2.0 str.replace matches the
    # pattern literally by default, which would silently skip all cleaning.
    cleaned = _df['comment'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '', regex=True)
    cleaned = cleaned.str.replace('^ +', '', regex=True)

    # Assign the result instead of calling replace(..., inplace=True) on the
    # column selection, which does not update the frame under Copy-on-Write.
    _df['comment'] = cleaned.replace('', np.nan)
    _df = _df.dropna(subset=['comment'])

    tokenizer(_df)

    return _df



# Replace the raw frames with their cleaned, labelled and tokenized versions.
df_france, df_korea = map(pre_processing, [df_france, df_korea])


def _count_tokens(token_lists):
    """Count every token across an iterable of token lists.

    Plain iteration is used instead of ``np.hstack`` so that an empty
    selection yields an empty Counter (``np.hstack`` raises ValueError on it)
    and the keys stay ordinary Python ``str`` objects.
    """
    return Counter(token for tokens in token_lists for token in tokens)


# label == 1 is the high-score ("positive") group, label == 0 the low-score one.
positive_words_fr = df_france[df_france.label == 1]['tokenized'].values
negative_words_fr = df_france[df_france.label == 0]['tokenized'].values

positive_word_count_fr = _count_tokens(positive_words_fr)
negative_word_count_fr = _count_tokens(negative_words_fr)

positive_words_kr = df_korea[df_korea.label == 1]['tokenized'].values
negative_words_kr = df_korea[df_korea.label == 0]['tokenized'].values

positive_word_count_kr = _count_tokens(positive_words_kr)
negative_word_count_kr = _count_tokens(negative_words_kr)

print(positive_word_count_fr)
print(negative_word_count_fr)
print(positive_word_count_kr)
print(negative_word_count_kr)

Counter({'스토리': 1033, '액션': 804, '에이즈': 644, '이해': 616, '감독': 604, '기대': 603, '생각': 599, '이건': 588, '이게': 554, '여자': 521, '처음': 483, '뭐': 469, '건': 468, '이거': 458, '실망': 457, '동성애': 454, '정도': 449, '하나': 447, '프랑스': 446, '줄': 436, '바': 432, '중': 409, '분': 393, '배우': 383, '충': 377, '사랑': 376, '걸': 372, '감동': 368, '주인공': 361, '편': 358, '짜증': 346, '잼': 335, '이상': 325, '전개': 311, '장면': 310, '게이': 309, '반전': 309, '남자': 298, '때문': 292, '애': 290, '중간': 277, '이야기': 266, '뿐': 265, '결말': 258, '미': 250, '데': 249, '수준': 240, '건지': 239, '연기': 235, '난': 233, '소재': 229, '느낌': 226, '이': 222, '기분': 220, '개연': 211, '정신': 210, '노': 205, '친구': 200, '일': 194, '지': 192, '예술': 192, '작품': 188, '치료': 185, '남': 185, '테러': 184, '자체': 182, '영상': 181, '레옹': 181, '연출': 180, '예고편': 178, '뻔': 177, '새끼': 175, '인생': 174, '완전': 172, '점수': 171, '동성애자': 164, '어디': 164, '우리': 163, 'ㅅ': 163, '이걸': 162, '후회': 162, '뭔가': 159, '시리즈': 159, '추천': 158, '뭘': 157, '공포': 157, '인간': 156, '눈': 156, '니': 155, '후': 155, '나라': 154, '진심':

['생각', '이건', '']

In [38]:
# The ten most frequent nouns of each (country, sentiment) group, counts discarded.
positive_words_fr_10 = [word for word, _count in positive_word_count_fr.most_common(10)]
negative_words_fr_10 = [word for word, _count in negative_word_count_fr.most_common(10)]
positive_words_kr_10 = [word for word, _count in positive_word_count_kr.most_common(10)]
negative_words_kr_10 = [word for word, _count in negative_word_count_kr.most_common(10)]

# All four top-10 lists joined into one flat array (duplicates are kept).
top_10_words = np.concatenate([
    positive_words_fr_10,
    negative_words_fr_10,
    positive_words_kr_10,
    negative_words_kr_10,
])

def get_related_words(center_word, token_lists=None, exclude=None, top_n=10):
  """Return the words that most often co-occur with ``center_word``.

  Only token lists containing ``center_word`` are considered; within them,
  every token not in ``exclude`` is counted.

  Args:
    center_word: Word that a review's token list must contain to be counted.
    token_lists: Iterable of token lists to search. Defaults to the
      module-level ``positive_words_fr``.
    exclude: Words left out of the count. Defaults to the module-level
      ``top_10_words``.
    top_n: Number of (word, count) pairs to return.

  Returns:
    A list of ``(word, count)`` tuples, most frequent first. It is empty when
    no token list contains ``center_word``.
  """
  if token_lists is None:
    token_lists = positive_words_fr
  if exclude is None:
    exclude = top_10_words

  excluded = set(exclude)
  # Counting from a generator avoids np.hstack, which raises ValueError when
  # nothing matched.
  counts = Counter(
      word
      for tokens in token_lists
      if center_word in tokens
      for word in tokens
      if word not in excluded
  )
  # The original computed this result but never returned it.
  return counts.most_common(top_n)


# The original printed `top`, a name local to the function (NameError on a fresh kernel).
# NOTE(review): the center word was not recorded in the notebook; '프랑스' is inferred
# from the saved output of this cell - confirm before relying on it.
print(get_related_words('프랑스'))

[('특유', 207), ('식', 183), ('매력', 174), ('느낌', 142), ('중', 136), ('코미디', 133), ('유머', 116), ('이야기', 115), ('배경', 93), ('감성', 85)]


In [7]:
# Start from an empty undirected graph; presumably nodes and edges are added further down (not visible here).
G_centrality = nx.Graph()
