# Load Data

In [1]:
import pandas as pd
import ast

In [2]:
def row_str_to_list(row):
    """Flatten the stringified token lists of one review row into plain text.

    The ``'outreview'`` and ``'outtitle'`` fields are expected to hold the
    string form of a Python list of tokens (e.g. ``"['love', 'dress']"``).
    Each is parsed with ``ast.literal_eval`` and re-joined with single
    spaces, so despite the function's name the resulting fields are strings,
    not lists.

    Parameters
    ----------
    row : pandas.Series or dict
        One record containing the ``'outreview'`` and ``'outtitle'`` keys.
        Any other keys are passed through untouched.

    Returns
    -------
    Same type as ``row``
        A copy of ``row`` with the two text fields replaced. The input object
        is left unmodified.
    """
    # Work on a copy: pandas does not support functions passed to
    # ``DataFrame.apply`` mutating the object they receive.
    row = row.copy()
    for column in ('outreview', 'outtitle'):
        row[column] = ' '.join(ast.literal_eval(row[column]))
    return row

In [3]:
# Read the training split and, row by row, collapse the stringified token
# lists in the title/review columns into space-joined text.
data = (
    pd.read_csv('data/multi/train.csv')
    .apply(row_str_to_list, axis=1)
)

In [4]:
# Preview only the first rows instead of dumping the whole frame.
data.head(10)

Unnamed: 0,outtitle,outreview,label
0,wonderful dress,love dress think cut fabric embroidery flatter...,2
1,no brainer,dress truly no brainer purchase stunning dress...,2
2,nice tank,like look tank ran bit larger expected neck bu...,2
3,love,perfect end summer going fall soft pairs great...,2
4,vintage vibe builtin support,good dress hourglass figures runs bit narrow r...,2
5,cute comfy,really love soft nice quality complaint straps...,2
6,disappointed retailers descriptions lately,one proved no different short end dress pretty...,0
7,beautiful dress,beautiful dress delicately well made 51 95lb x...,2
8,swinging dress,thought dress store true size bit more swingin...,2
9,gorgeous dress,worse dress rehersal dinner received many comp...,2


# Compute Frequency

리뷰 유형(긍정: label 2, 중립: label 1, 부정: 그 외)별로, 그리고 제목과 본문 간에 상위 빈도 단어가 어떻게 다른지 분석합니다.

In [5]:
# Plain array of (title, review, label) rows. Columns are selected by name so
# the positional indexing used downstream does not depend on the CSV's column
# order. ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values`` is
# the equivalent that works on both old and current pandas.
matrixdata = data[['outtitle', 'outreview', 'label']].values

In [6]:
# Flat word lists, one per (sentiment, text field) combination.
pos_title, pos_review = [], []
neg_title, neg_review = [], []
neu_title, neu_review = [], []

# Label 2 feeds the positive lists and label 1 the neutral ones; every other
# label value falls through to the negative lists.
token_buckets = {
    2: (pos_title, pos_review),
    1: (neu_title, neu_review),
}

for record in matrixdata:
    title_text, review_text, sentiment = record[0], record[1], record[2]
    title_words, review_words = token_buckets.get(sentiment, (neg_title, neg_review))
    # Texts are whitespace-separated tokens, so a bare split() recovers them.
    title_words.extend(title_text.split())
    review_words.extend(review_text.split())

In [7]:
import nltk

In [8]:
# Word-frequency distributions, one per (sentiment, text field) pair:
# suffix "t" = titles, suffix "r" = review bodies.
post, posr = nltk.FreqDist(pos_title), nltk.FreqDist(pos_review)
negt, negr = nltk.FreqDist(neg_title), nltk.FreqDist(neg_review)
neut, neur = nltk.FreqDist(neu_title), nltk.FreqDist(neu_review)

In [31]:
def freqdist_to_table(freqdist, top_n=10):
    """Return the most frequent entries of ``freqdist`` as a one-column table.

    Parameters
    ----------
    freqdist : mapping
        Maps each sample (here: a word) to its count, e.g. an
        ``nltk.FreqDist`` or a plain ``dict``.
    top_n : int or None, default 10
        Number of rows to keep. ``None`` keeps every row.

    Returns
    -------
    pandas.DataFrame
        Indexed by sample, with a single ``'frequency'`` column sorted in
        descending order. An empty ``freqdist`` yields an empty table that
        still carries the ``'frequency'`` column.
    """
    # Build the column explicitly. ``DataFrame.from_dict(..., orient='index')``
    # yields a frame without columns for an empty mapping, so assigning the
    # column name afterwards would raise a length-mismatch ValueError.
    df = pd.DataFrame(
        {'frequency': list(freqdist.values())},
        index=list(freqdist.keys()),
    )
    # A stable sort keeps tied samples in their insertion order, which makes
    # the top-N cut reproducible from run to run.
    ranked = df.sort_values('frequency', ascending=False, kind='mergesort')
    return ranked.iloc[:top_n]

## 전체 상위 빈도 단어

In [35]:
# Merge all six distributions into one corpus-wide count, then show its top rows.
total = (
    post + posr
    + negt + negr
    + neut + neur
)
freqdist_to_table(total)

Unnamed: 0,frequency
but,12883
dress,8768
not,7698
love,7598
size,6429
top,6060
fit,5666
great,5599
like,5242
too,4703


## 리뷰 유형별 상위 빈도 단어 비교

In [9]:
# Per-sentiment counts: title and review-body distributions added together.
pos, neg, neu = post + posr, negt + negr, neut + neur

In [32]:
# Ten most frequent words in positive (label 2) titles and review bodies combined.
freqdist_to_table(pos)

Unnamed: 0,frequency
but,9029
dress,6800
love,6622
size,5175
great,4976
not,4929
top,4561
fit,4219
wear,3872
like,3571


In [33]:
# Ten most frequent words in negative titles and review bodies combined
# (negative = any label other than 2 or 1).
freqdist_to_table(neg)

Unnamed: 0,frequency
but,1516
not,1256
dress,860
like,802
too,656
top,617
fit,586
fabric,548
would,512
size,504


In [34]:
# Ten most frequent words in neutral (label 1) titles and review bodies combined.
freqdist_to_table(neu)

Unnamed: 0,frequency
but,2338
not,1513
dress,1108
too,999
top,882
like,869
fit,861
size,750
would,673
fabric,663


## 제목/리뷰별 상위 빈도 단어 비교

In [36]:
# Per-text-field counts: pool the three sentiment classes for titles and,
# separately, for review bodies.
title, review = post + negt + neut, posr + negr + neur

In [37]:
# Ten most frequent words in review titles, all sentiment classes pooled.
freqdist_to_table(title)

Unnamed: 0,frequency
love,1473
great,1417
dress,1331
but,1234
cute,1226
beautiful,1102
top,914
not,817
perfect,644
pretty,532


In [38]:
# Ten most frequent words in review bodies, all sentiment classes pooled.
freqdist_to_table(review)

Unnamed: 0,frequency
but,11649
dress,7437
not,6881
size,6268
love,6125
fit,5184
top,5146
like,5041
wear,4526
too,4320
