In [1]:
import pandas as pd
import ast
import nltk

# Load Data

In [2]:
def row_str_to_list(row):
    """
    Turn the stringified token lists of one row into plain text.

    The 'outreview' and 'outtitle' fields are parsed with
    ``ast.literal_eval`` (so each must hold the string form of a Python
    literal such as "['love', 'dress']") and the parsed items are joined
    with single spaces.

    The conversion is done on a copy of ``row``: pandas documents that
    functions passed to ``DataFrame.apply`` must not mutate the object
    they receive, so the input is left untouched.

    Parameters
    ----------
    row : pandas.Series
        One record containing at least 'outreview' and 'outtitle'.

    Returns
    -------
    pandas.Series
        A copy of ``row`` whose 'outreview' and 'outtitle' values are the
        space-joined strings; every other field is carried over as is.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a field is not a valid
        Python literal.
    TypeError
        Propagated from ``str.join`` when the parsed items are not strings.
    """
    converted = row.copy()
    for column in ('outreview', 'outtitle'):
        converted[column] = ' '.join(ast.literal_eval(row[column]))
    return converted

In [3]:
# Read data/multi/train.csv and convert the stringified list columns of
# every row to plain text in a single chained expression.
data = pd.read_csv('data/multi/train.csv').apply(row_str_to_list, axis=1)

In [4]:
# Preview only the first rows instead of echoing the whole frame.
data.head(10)

Unnamed: 0,outtitle,outreview,label
0,wonderful dress,love dress think cut fabric embroidery flatter...,2
1,no brainer,dress truly no brainer purchase stunning dress...,2
2,nice tank,like look tank ran bit larger expected neck bu...,2
3,love,perfect end summer going fall soft pairs great...,2
4,vintage vibe builtin support,good dress hourglass figures runs bit narrow r...,2
5,cute comfy,really love soft nice quality complaint straps...,2
6,disappointed retailers descriptions lately,one proved no different short end dress pretty...,0
7,beautiful dress,beautiful dress delicately well made 51 95lb x...,2
8,swinging dress,thought dress store true size bit more swingin...,2
9,gorgeous dress,worse dress rehersal dinner received many comp...,2


# Compute Frequency

데이터 간 빈도 상위 단어 차이를 분석합니다.

In [5]:
def get_freqdist(series):
    """
    Count whitespace-separated tokens across every text in ``series``.

    Each element is split with ``str.split()`` (no arguments) and all the
    resulting tokens are tallied into a single ``nltk.FreqDist``.
    """
    tokens = [token for text in series for token in text.split()]
    return nltk.FreqDist(tokens)

In [6]:
def freqdist_to_table(freqdist, top_n=10):
    """
    Summarise a frequency distribution as a table of its most common words.

    Parameters
    ----------
    freqdist : mapping of word -> count
        For example an ``nltk.FreqDist`` or a ``collections.Counter``.
    top_n : int or None, default 10
        Number of rows kept after sorting; ``None`` keeps every word.

    Returns
    -------
    pandas.DataFrame
        Indexed by word and sorted by descending 'frequency', with columns
        'frequency' (the count) and 'percentage'. 'percentage' is the
        count divided by the total count over *all* words in ``freqdist``
        (a fraction, not multiplied by 100), not only over the rows
        returned. An empty ``freqdist`` yields an empty table with the
        same two columns.
    """
    if len(freqdist) == 0:
        # from_dict produces a frame with zero columns for an empty
        # mapping, so naming the single column below would raise
        # ValueError; return an empty, correctly-shaped table instead.
        return pd.DataFrame(columns=['frequency', 'percentage'])

    df = pd.DataFrame.from_dict(freqdist, orient='index')
    df.columns = ['frequency']
    # Shares are computed before truncation so they refer to the full vocabulary.
    df['percentage'] = df['frequency'] / df['frequency'].sum()
    return df.sort_values('frequency', ascending=False).iloc[:top_n]

## 전체 상위 빈도 단어

In [7]:
# Corpus-wide counts: review-body tokens with the title tokens added on top.
total = (
    get_freqdist(data['outreview'])
    + get_freqdist(data['outtitle'])
)
freqdist_to_table(total)

Unnamed: 0,frequency,percentage
but,12883,0.02351
dress,8768,0.016
not,7698,0.014048
love,7598,0.013865
size,6429,0.011732
top,6060,0.011059
fit,5666,0.01034
great,5599,0.010217
like,5242,0.009566
too,4703,0.008582


## 리뷰 유형별 상위 빈도 단어 비교

In [8]:
# Rows with label 0, then combined title + review token counts for them.
neg_df = data.loc[data['label'] == 0]
neg = (
    get_freqdist(neg_df['outtitle'])
    + get_freqdist(neg_df['outreview'])
)
freqdist_to_table(neg)

Unnamed: 0,frequency,percentage
but,1516,0.025927
not,1256,0.021481
dress,860,0.014708
like,802,0.013716
too,656,0.011219
top,617,0.010552
fit,586,0.010022
fabric,548,0.009372
would,512,0.008756
size,504,0.00862


In [9]:
# Rows with label 1, then combined title + review token counts for them.
neu_df = data.loc[data['label'] == 1]
neu = (
    get_freqdist(neu_df['outtitle'])
    + get_freqdist(neu_df['outreview'])
)
freqdist_to_table(neu)

Unnamed: 0,frequency,percentage
but,2338,0.031567
not,1513,0.020428
dress,1108,0.01496
too,999,0.013488
top,882,0.011909
like,869,0.011733
fit,861,0.011625
size,750,0.010126
would,673,0.009087
fabric,663,0.008952


In [10]:
# Rows with label 2, then combined title + review token counts for them.
pos_df = data.loc[data['label'] == 2]
pos = (
    get_freqdist(pos_df['outtitle'])
    + get_freqdist(pos_df['outreview'])
)
freqdist_to_table(pos)

Unnamed: 0,frequency,percentage
but,9029,0.021733
dress,6800,0.016368
love,6622,0.015939
size,5175,0.012456
great,4976,0.011977
not,4929,0.011864
top,4561,0.010978
fit,4219,0.010155
wear,3872,0.00932
like,3571,0.008595


## 제목 / 리뷰별 상위 빈도 단어 비교

In [11]:
# Token counts over the title column only.
title = get_freqdist(data.loc[:, 'outtitle'])
freqdist_to_table(title)

Unnamed: 0,frequency,percentage
love,1473,0.036302
great,1417,0.034922
dress,1331,0.032803
but,1234,0.030412
cute,1226,0.030215
beautiful,1102,0.027159
top,914,0.022526
not,817,0.020135
perfect,644,0.015871
pretty,532,0.013111


In [12]:
# Token counts over the review-body column only.
review = get_freqdist(data.loc[:, 'outreview'])
freqdist_to_table(review)

Unnamed: 0,frequency,percentage
but,11649,0.022958
dress,7437,0.014657
not,6881,0.013561
size,6268,0.012353
love,6125,0.012071
fit,5184,0.010217
top,5146,0.010142
like,5041,0.009935
wear,4526,0.00892
too,4320,0.008514


## 테스트 데이터별 상위 빈도 단어 비교

In [13]:
# Load data/binary/cosmetic.csv, flatten its token-list columns to text,
# and show the most frequent words across titles and reviews.
cosmetic_df = (
    pd.read_csv('data/binary/cosmetic.csv')
    .apply(row_str_to_list, axis=1)
)
cosmetic = (
    get_freqdist(cosmetic_df['outtitle'])
    + get_freqdist(cosmetic_df['outreview'])
)
freqdist_to_table(cosmetic)

Unnamed: 0,frequency,percentage
sephora,272,0.019004
but,189,0.013205
makeup,166,0.011598
store,147,0.01027
not,147,0.01027
like,127,0.008873
one,106,0.007406
love,101,0.007057
help,92,0.006428
dont,90,0.006288


In [14]:
# Load data/binary/food.csv, flatten its token-list columns to text,
# and show the most frequent words across titles and reviews.
food_df = (
    pd.read_csv('data/binary/food.csv')
    .apply(row_str_to_list, axis=1)
)
food = (
    get_freqdist(food_df['outtitle'])
    + get_freqdist(food_df['outreview'])
)
freqdist_to_table(food)

Unnamed: 0,frequency,percentage
karaoke,551,0.018615
but,418,0.014122
food,339,0.011453
room,316,0.010676
place,287,0.009696
good,278,0.009392
not,252,0.008514
sushi,244,0.008244
rooms,236,0.007973
yamasho,201,0.006791


In [15]:
# Load data/binary/man.csv, flatten its token-list columns to text,
# and show the most frequent words across titles and reviews.
man_df = (
    pd.read_csv('data/binary/man.csv')
    .apply(row_str_to_list, axis=1)
)
man = (
    get_freqdist(man_df['outtitle'])
    + get_freqdist(man_df['outreview'])
)
freqdist_to_table(man)

Unnamed: 0,frequency,percentage
shirt,692,0.033126
shirts,586,0.028052
fit,398,0.019052
proper,292,0.013978
cloth,289,0.013834
great,267,0.012781
but,193,0.009239
service,190,0.009095
get,186,0.008904
not,171,0.008186


In [16]:
# Load data/binary/woman.csv, flatten its token-list columns to text,
# and show the most frequent words across titles and reviews.
woman_df = (
    pd.read_csv('data/binary/woman.csv')
    .apply(row_str_to_list, axis=1)
)
woman = (
    get_freqdist(woman_df['outtitle'])
    + get_freqdist(woman_df['outreview'])
)
freqdist_to_table(woman)

Unnamed: 0,frequency,percentage
but,182,0.01642
store,158,0.014255
primark,117,0.010556
clothes,105,0.009473
not,101,0.009112
quality,96,0.008661
like,83,0.007488
place,77,0.006947
cheap,74,0.006676
great,70,0.006315
