## Preprocessing with NSMC

* For sentiment analysis, we use the Naver Sentiment Movie Corpus (NSMC): https://github.com/e9t/nsmc/

In [1]:
def read_documents(filename):
    """Load an NSMC ratings file as a list of tab-separated rows.

    The file is read as UTF-8 and split into lines; the first line is
    discarded (presumably the column header row) and every remaining line is
    split on tab characters.

    Args:
        filename: Path of the tab-separated text file to read.

    Returns:
        A list with one entry per remaining line; each entry is the list of
        that line's tab-separated fields.
    """
    with open(filename, encoding="utf-8") as f:
        raw_lines = f.read().splitlines()

    rows = []
    for raw_line in raw_lines[1:]:  # index 0 is the line being discarded
        rows.append(raw_line.split('\t'))
    return rows
    
# Load the training and test splits with read_documents (first line of each file is dropped).
# NOTE(review): these absolute paths point into one user's home directory;
# adjust them before running the notebook on another machine.
train_docs = read_documents("/Users/shim/dl-python-SentimentAnalysis/data/ratings_train.txt")
test_docs = read_documents("/Users/shim/dl-python-SentimentAnalysis/data/ratings_test.txt")

In [2]:
# Report the number of rows in each split, one count per line (train first, then test).
print(len(train_docs), len(test_docs), sep="\n")

150000
50000


### Function definitions

In [5]:
from konlpy.tag import Mecab
from konlpy.tag import Okt
import json
import os
import re
from pprint import pprint

In [7]:
def text_cleaning(doc):
    """Delete every character that is not Hangul or a space.

    Characters kept: the jamo ranges ㄱ-ㅎ and ㅏ-ㅣ, the syllable range 가-힣,
    and the plain space character. Anything else (digits, Latin letters,
    punctuation, tabs, newlines, ...) is removed outright, not replaced by a
    space.

    Args:
        doc: The text to clean.

    Returns:
        The text with all other characters removed.
    """
    non_korean = re.compile("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]")
    return non_korean.sub("", doc)

def define_stopwords(path, encoding="utf-8"):
    """Load a set of stopwords from a text file with one stopword per line.

    Surrounding whitespace is stripped from every line. This includes the
    trailing newline that iterating over a file leaves on each line; without
    stripping, an entry such as "word\\n" would never compare equal to a
    token. Blank lines are skipped.

    Args:
        path: Path of the stopword file.
        encoding: Text encoding used to read the file. Defaults to "utf-8",
            the same encoding read_documents uses, instead of the
            platform-dependent default.

    Returns:
        A set containing the stopwords found in the file.
    """
    SW = set()
    # Way 1 to add a stopword: add it here in code, e.g.
    # SW.add("there is")

    # Way 2 to add a stopword: append it as a new line to the file at `path`.
    with open(path, encoding=encoding) as f:
        for line in f:
            word = line.strip()
            if word:
                SW.add(word)

    return SW

def text_tokenizing(doc):
    """Split text into morphemes, dropping stopwords and one-character tokens.

    Relies on two module-level names: `mecab`, whose `morphs` method performs
    the splitting, and `SW`, the collection of stopwords to exclude.

    Args:
        doc: The text to tokenize.

    Returns:
        A list, in original order, of the morphemes that are not in `SW` and
        are longer than one character.
    """
    kept = []
    for morpheme in mecab.morphs(doc):
        if morpheme in SW or len(morpheme) <= 1:
            continue
        kept.append(morpheme)
    return kept

### Tokenizes the imported data with part-of-speech tags.

In [8]:
# Instantiate the two KoNLPy analyzers imported above.
# `mecab` is the one text_tokenizing calls; `okt` is not referenced in the cells visible here.
okt = Okt()
mecab = Mecab()

# Module-level stopword set consulted by text_tokenizing.
# NOTE(review): absolute path into one user's home directory; adjust before running elsewhere.
SW = define_stopwords("/Users/shim/dl-python-SentimentAnalysis/Konlpy/stopwords-ko.txt")

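# There are 200,000 texts in total, so cleaning and tokenizing them takes a while.
# Organize the code so that the processed result is saved the first time it is built
# and simply loaded on later runs, instead of being processed all over again.
# The analysis cells below read the result from `train_data`, where element 0 of
# each entry is that document's token list.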




### Analyze the histogram using NLTK

* Check basic information to analyze the data.

* Perform preprocessing using the NLTK library.

In [None]:
import nltk

# Flatten the per-document token lists (element 0 of every train_data entry)
# into one corpus-wide list, then report how many tokens there are in total.
# NOTE(review): `train_data` is not defined in any cell visible here; it must
# be created before this cell runs.
total_tokens = [tok for entry in train_data for tok in entry[0]]
print(len(total_tokens))

In [None]:
# Wrap the flat token list in an nltk.Text so its frequency tools can be used.
# The label is the corpus abbreviation NSMC (it was previously misspelled 'NMSC').
text = nltk.Text(total_tokens, name='NSMC')
# Vocabulary size: the number of distinct tokens.
print(len(set(text.tokens)))
# The ten most frequent tokens together with their counts.
pprint(text.vocab().most_common(10))

### Drawing the histogram

In [None]:
import matplotlib.pyplot as plt
import platform
from matplotlib import font_manager, rc
%matplotlib inline

# Point matplotlib at a platform font before plotting: AppleGothic on macOS,
# the Malgun font file on Windows. Any other system only gets a notice and
# keeps matplotlib's current font settings.
path = "c:/Windows/Fonts/malgun.ttf"
current_os = platform.system()
if current_os not in ('Darwin', 'Windows'):
    print('Unknown system... sorry~~~~')
elif current_os == 'Windows':
    # Look up the family name stored in the font file and register it.
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:
    rc('font', family='AppleGothic')

# Frequency plot of the corpus tokens, limited to 50 entries, on a 16x10 figure.
plt.figure(figsize=(16, 10))
text.plot(50)

### Drawing the word cloud

In [None]:
# Install the wordcloud library.


In [None]:
from wordcloud import WordCloud

# The 50 most frequent tokens, as (token, count) pairs, are the cloud's input.
data = text.vocab().most_common(50)

# WordCloud is given an explicit font file. Use the Malgun font on Windows
# (the same file the histogram cell uses) so this cell no longer fails there;
# every other system keeps the original macOS path.
if platform.system() == 'Windows':
    wordcloud_font_path = 'c:/Windows/Fonts/malgun.ttf'
else:
    wordcloud_font_path = '/Library/Fonts/AppleGothic.ttf'

wordcloud = WordCloud(font_path=wordcloud_font_path,
                      relative_scaling=0.2,
                      background_color='white',
                      ).generate_from_frequencies(dict(data))

# Show the rendered cloud as an image without axes.
plt.figure(figsize=(16, 8))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()