## 1. 개발 환경 설정

### 1-0. 구글드라이브 연결(Colab)

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

### 1-1. 라이브러리 설치

In [None]:
# Refresh the apt package index, then install the listed Python packages with pip.
!apt-get update
!pip install konlpy pandas seaborn gensim wordcloud python-mecab-ko wget
# koreanize-matplotlib is imported below as koreanize_matplotlib.
!pip install koreanize-matplotlib

In [None]:
# Download mecab-colab.sh from GitHub and pipe it straight into bash.
# Presumably this installs MeCab for Colab (judging by the script name) — verify.
# NOTE(review): this executes an unpinned remote script from the master branch;
# consider pinning the URL to a specific commit and reviewing it first.
!curl -s https://raw.githubusercontent.com/teddylee777/machine-learning/master/99-Misc/01-Colab/mecab-colab.sh | bash

### 1-2. 라이브러리 import

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import koreanize_matplotlib

from IPython.display import display
from wordcloud import WordCloud

## 2. 데이터 불러오기

In [None]:
# Load the training CSV from the mounted Drive folder into a DataFrame.
df = pd.read_csv('/content/drive/MyDrive/text_dataset/train.csv')

In [None]:
# (number of rows, number of columns) of the loaded data.
df.shape

In [None]:
# Show the last two rows.
df.tail(2)

## 3. 데이터 확인, 분석하기

In [None]:
# Distinct values found in the 'label' column.
df['label'].unique()

In [None]:
# Number of rows for each label value.
df['label'].value_counts()

In [None]:
# Bar chart of the number of rows per label (x = label value, y = row count).
label_counts = df['label'].value_counts()
plt.bar(label_counts.index, label_counts)
plt.show()

In [None]:
# Preview the first rows. A notebook only echoes the final expression of a
# cell, and this call is not the last statement, so it must go through
# display() to be shown at all.
display(df.head())

# Split each text on whitespace and record how many tokens each row has.
text = list(df['text'])
tokenized_text = [t.split() for t in text]
text_len_by_token = [len(tokens) for tokens in tokenized_text]

In [None]:
# Summary statistics of the per-row whitespace-token counts, printed one per line.
length_stats = [
    ('문장 최대 길이: ', np.max),
    ('문장 최소 길이: ', np.min),
    ('문장 평균 길이: ', np.mean),
    ('문장 길이 표준편차: ', np.std),
    ('문장 중간 길이: ', np.median),
    ('제1사분위 길이: ', lambda values: np.percentile(values, 25)),
    ('제3사분위 길이: ', lambda values: np.percentile(values, 75)),
]
for caption, statistic in length_stats:
    print(caption, statistic(text_len_by_token))

### 3-1. 문의 내용 길이 분포

* 문의 내용의 길이 분포를 확인합니다.
* 최소, 최대 길이의 index 및 문의 내용을 확인합니다.

In [None]:
# Character length of each text (Series.str.len counts characters, not tokens).
df['text'].str.len()

In [None]:
# How many texts share each character length.
df['text'].str.len().value_counts()

In [None]:
# Bar chart of text lengths: x = character length, y = number of texts with that length.
char_len_counts = df['text'].str.len().value_counts()
plt.bar(char_len_counts.index, char_len_counts)
plt.show()

### 3-2. 단어 등장 빈도 시각화

In [None]:
# Concatenate every text into a single space-separated string for corpus-level analysis.
one_line_text = " ".join(df['text'])

In [None]:
from collections import Counter
from konlpy.tag import Okt

okt = Okt()

# Tokens to exclude from the frequency count.
del_list = ['를', '이', '은', '는', '있다', '하다', '에']

# okt.morphs() returns plain strings, so each token has to be tested as a
# whole. The earlier filter indexed word[0] (the first character), whose
# length is always 1, so every token was rejected and `words` came out empty.
words = [
    word
    for word in okt.morphs(one_line_text)
    if len(word) > 1 and word not in del_list
]

words_count = Counter(words)
word_dic = dict(words_count)

# (word, count) pairs ordered from most to least frequent.
sorted_word_dic = sorted(word_dic.items(), key=lambda item: item[1], reverse=True)

In [None]:
import nltk

# sorted_word_dic already holds (word, count) pairs. Passing the pairs
# themselves to FreqDist would treat each pair as one sample and report a
# frequency of 1 for every word, so hand it a word -> count mapping instead.
word_freq = nltk.FreqDist(dict(sorted_word_dic))

# Keep the table under its own name: rebinding `df` here would discard the
# loaded dataset and break any cell that reads df['text'] / df['label'] afterwards.
word_freq_df = pd.DataFrame(list(word_freq.values()), index=list(word_freq.keys()))

# Bar chart of the 50 most frequent words.
result = word_freq_df.sort_values(0, ascending=False).head(50)
result.plot(kind='bar', legend=False, figsize=(15, 5))
plt.show()

In [None]:
from nltk import Text

# Extract nouns from the whole corpus, wrap them in an nltk Text, and plot the
# 30 most common ones (Text.plot forwards to the frequency-distribution plot).
corpus_nouns = okt.nouns(one_line_text)
kolaw = Text(corpus_nouns, name="kolaw")
kolaw.plot(30)
plt.show()

### 3-3. 워드클라우드

In [None]:
from wordcloud import WordCloud, STOPWORDS

# WordCloud's bundled default font has no Hangul glyphs, so Korean words are
# drawn as empty boxes unless font_path points at a Korean-capable font.
# Reuse the font file matplotlib currently resolves as its default.
# NOTE(review): assumes the koreanize_matplotlib import has switched
# matplotlib's default family to a Hangul font — confirm on the target runtime.
FONT_PATH = fm.findfont(fm.FontProperties())

# STOPWORDS is wordcloud's built-in stopword set; it is kept as in the original call.
wordcloud = WordCloud(
    max_font_size=100,
    stopwords=STOPWORDS,
    background_color='black',
    font_path=FONT_PATH,
    width=800,
    height=600,
).generate(one_line_text)

# Render the generated image without axes.
plt.figure(figsize=(15, 10))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()