In [3]:
from collections import Counter
import pandas as pd

df = pd.read_csv('sbike_google_sept.csv')

reviews = df['review_text']

In [4]:
len(reviews)

1938

#### Preprocessing

In [56]:
from ckonlpy.tag import Postprocessor #전처리 라이브러리
from ckonlpy.tag import Twitter
import warnings
warnings.simplefilter("ignore")

ctwitter = Twitter()
ctwitter.add_dictionary('해주시면','Verb')
#ctwitter.add_dictionary(['타고','요청드립니다','부탁드립니다', '요청 드립니다','부탁 드립니다','해','주시면','해주시면'],'Verb')
ctwitter.add_dictionary(['따릉이','자출',
                       '대여소','연결거치','거치대','고객센터',
                        '다시','퇴근시간','대여시간',
                        '출근시간','한시간','두시간','임시폐쇄',
                         '자동로그인',
                        ], 'Noun')
#조사
ctwitter.add_dictionary(['까지','하냐'], 'Josa')
#형용사
ctwitter.add_dictionary(['레알','있어요'], 'Adjective')
#부사

ctwitter.add_dictionary(['왜', '좀','진짜','최악'], 'Adverb', force=True)

ctwitter.add_dictionary(['타고','요청드립니다','부탁드립니다',
                         '요청 드립니다','부탁 드립니다','하기'],'Verb')

#무의미한 형태소 및 단어 제거
stopwords = {'수가','까지','드립','번','수','것', '앞','해주', '어요',
             '곳','떄','좀','요','이','감사','이','그','고','제','쪽','더','안','후','시','거','전',
'려고','적','저','데','등','역','중','해주시','내','면서','어서','때','뭐','못','마다','더니','해도','해도','다가','어가','개','하라','하나','걸'}
postprocessor = Postprocessor(ctwitter, stopwords = stopwords)


postprocessor.pos('되있어요')

[('되있', 'Verb')]

## Grouping 1~2, 3~5

In [15]:
df['sentimental'] = df['rating'].apply(lambda x: "good" if x>=3 else "bad"  )
df.head()



Unnamed: 0.1,Unnamed: 0,user_name,date,rating,thumbs_up,review_text,sentimental
0,0,hayeon,2019년 9월 29일,1,11,뭐 누르기만 해도 앱이 계속 로그아웃되고 네이버로그인을 해봤자 어차피 여기 가입을 ...,bad
1,1,신아현,2019년 9월 23일,2,4,1. 로그인 화면에서 자동 로그인 체크하고 재로그인하면 이상한 비번이 저장되서 자동...,bad
2,2,Luis sy park,2019년 9월 29일,5,1,너무 좋아요 . 정말 유용하게 잘 사용 하고있음 그런데 많은 자전거가 다 어디로 사...,good
3,3,임제나,2019년 9월 28일,1,4,어플 잘 못 만들겠으면 외주를 주시죠 따릉이 단점 1. 로그인 불편하게해놓음 매번 ...,bad
4,4,SK A,2019년 10월 6일,1,0,이거 쓰시려는 분 혹시 회원가입시 휴대폰 인증번호 안날아오는 오류 있으시면 와이파이...,bad


In [18]:
good_group = df[df['sentimental']=='good']
bad_group = df[df['sentimental']=='bad']



In [57]:
good_sentences_tag = []
for sentence in good_group['review_text']:
    morph = postprocessor.pos(sentence)
    good_sentences_tag.append(morph)

In [58]:
good_adj_list = []
for sentence in good_sentences_tag:
    for word, tag in sentence:
        if tag in ['Adjective']:
            
            good_adj_list.append(word)
            
counts_good_adj = Counter(good_adj_list)
good_df = pd.DataFrame(counts_good_adj.most_common(100))
good_df =good_df.rename(columns={0:'words', 1:'count'})

In [67]:
good_noun_list = []
for sentence in good_sentences_tag:
    for word, tag in sentence:
        if tag in ['Noun']:
            
            good_noun_list.append(word)
            
counts_noun_adj = Counter(good_noun_list)
noun_df = pd.DataFrame(counts_noun_adj.most_common(100))
noun_df =noun_df.rename(columns={0:'words', 1:'count'})

good_df_concat = pd.concat([noun_df,good_df] , axis=1)
good_df_concat.to_excel('Yoon/good_google.xlsx')

## Bad Scores

In [69]:
bad_sentences_tag = []
for sentence in bad_group['review_text']:
    morph = postprocessor.pos(sentence)
    bad_sentences_tag.append(morph)

In [71]:
bad_adj_list = []
for sentence in bad_sentences_tag:
    for word, tag in sentence:
        if tag in ['Adjective']:
            
            bad_adj_list.append(word)
            
counts_bad_adj = Counter(bad_adj_list)
bad_df = pd.DataFrame(counts_bad_adj.most_common(100))
bad_df =bad_df.rename(columns={0:'words', 1:'count'})

In [72]:
bad_noun_list = []
for sentence in bad_sentences_tag:
    for word, tag in sentence:
        if tag in ['Noun']:
            
            bad_noun_list.append(word)
            
counts_noun_adj = Counter(bad_noun_list)
noun_df = pd.DataFrame(counts_noun_adj.most_common(100))
noun_df =noun_df.rename(columns={0:'words', 1:'count'})

bad_df_concat = pd.concat([noun_df,bad_df] , axis=1)
bad_df_concat.to_excel('Yoon/bad_google.xlsx')

In [73]:
bad_df_concat

Unnamed: 0,words,count,words.1,count.1
0,앱,697,안되고,114
1,어플,315,어떻게,80
2,결제,278,입니다,69
3,로그인,265,안됨,68
4,자전거,238,이런,45
5,지도,219,같은,42
6,왜,207,있,42
7,대여,201,안되서,37
8,오류,193,있는,37
9,따릉이,192,좋은데,36


#### 형태소 분석을 진행함

In [25]:
sentences_tag = []
for sentence in reviews:
    morph = ctwitter.pos(sentence)
    

    sentences_tag.append(morph)
    


##### 형용사 추출

In [16]:
adj_list = []
for sentence in sentences_tag:
    for word, tag in sentence:
        if tag in ['Adjective']:
            
            adj_list.append(word)
            
counts_good_adj = Counter(verb_list)
print(counts_adj.most_common(100))

2993

In [27]:
verb_list = []
for sentence in sentences_tag:
    for word, tag in sentence:
        if tag in ['Verb']:
            
            verb_list.append(word)
            
len(adj_list)
counts_adj = Counter(verb_list)
print(counts_adj.most_common(100))

[('할', 191), ('잘', 154), ('들', 119), ('건지', 115), ('하는', 115), ('합니다', 107), ('했는데', 96), ('하면', 78), ('하게', 70), ('되고', 68), ('하기', 68), ('만든', 65), ('뜨고', 62), ('하는데', 59), ('가', 52), ('넘', 50), ('않고', 48), ('하네요', 48), ('참', 48), ('까', 46), ('쓰', 44), ('만들', 44), ('쓸', 42), ('했', 42), ('된', 40), ('서', 39), ('한', 39), ('들이', 38), ('하지', 37), ('되', 35), ('되는', 33), ('해야', 32), ('되는데', 32), ('하세요', 32), ('되지', 31), ('됨', 28), ('대', 27), ('되서', 26), ('떠서', 26), ('만들어', 25), ('지', 25), ('써', 23), ('누르면', 22), ('않아', 22), ('빌릴', 22), ('않음', 21), ('고쳐주세요', 21), ('나서', 21), ('돼요', 20), ('했습니다', 20), ('는', 19), ('됩니다', 19), ('들은', 19), ('되면', 18), ('오고', 18), ('돼서', 18), ('뜨', 18), ('받고', 18), ('하네', 17), ('않네요', 17), ('나오고', 17), ('되네요', 17), ('쳐', 17), ('쓰고', 17), ('째', 16), ('드려요', 16), ('빌리는데', 16), ('하다', 16), ('걸리고', 16), ('만드는', 16), ('찾', 16), ('깔', 15), ('내고', 15), ('하려', 15), ('할수', 15), ('하는거', 15), ('눌러도', 14), ('걸림', 14), ('날리고', 14), ('할거면', 14), ('쓰는', 14), ('와', 14), ('먹고', 1

In [17]:
counts_adj = Counter(adj_list)
print(counts_adj.most_common(100))

[('안되고', 116), ('어떻게', 87), ('입니다', 86), ('있', 77), ('안됨', 71), ('좋아요', 57), ('이런', 53), ('있는', 50), ('좋은데', 48), ('같은', 43), ('좋은', 43), ('없', 43), ('안되서', 39), ('없는', 37), ('없고', 37), ('같아요', 35), ('안되네요', 35), ('없음', 34), ('없네요', 34), ('많고', 34), ('같습니다', 31), ('새로', 29), ('있습니다', 29), ('아닌', 27), ('느리고', 27), ('있는데', 26), ('느려', 25), ('많은', 22), ('안되요', 22), ('안되는', 22), ('좋', 21), ('없다', 20), ('있는지', 20), ('좋겠습니다', 19), ('안됩니다', 17), ('없다고', 16), ('좋겠', 15), ('있고', 14), ('아깝다', 14), ('같네요', 14), ('있게', 14), ('어떤', 14), ('있음', 13), ('굉장히', 13), ('있으면', 13), ('느려요', 13), ('이럴거면', 13), ('이런거', 12), ('많음', 12), ('아까', 12), ('안된다', 12), ('있다', 12), ('좋겠네요', 12), ('없습니다', 11), ('많', 11), ('없네', 11), ('느려서', 11), ('많아요', 10), ('같음', 10), ('없어', 10), ('안되는데', 10), ('많습니다', 10), ('같다', 9), ('좋습니다', 9), ('있다고', 9), ('안되나요', 9), ('없는데', 9), ('안되', 9), ('좋으나', 9), ('같', 9), ('많아서', 9), ('같은데', 9), ('없게', 9), ('있는거', 8), ('없으면', 8), ('어렵고', 8), ('빠른', 8), ('안되면', 8), ('편하게', 8), ('있네요', 8), ('

In [18]:
import pandas as pd
df = pd.DataFrame(counts_adj.most_common(100))
df.to_excel('Yoon/s_bike_google_adj.xlsx')

#### 명사 추출후 상위 10개 항목 정렬

In [19]:
noun_list = []
for sentence in sentences_tag:
    for word, tag in sentence:
        if tag in ['Noun']:
            noun_list.append(word)
n_counts = Counter(noun_list)

print(n_counts.most_common(30))

[('앱', 785), ('어플', 338), ('결제', 329), ('로그인', 317), ('자전거', 316), ('따릉이', 275), ('지도', 245), ('대여', 234), ('좀', 227), ('왜', 221), ('오류', 216), ('사용', 208), ('업데이트', 200), ('진짜', 192), ('이용', 185), ('요', 179), ('불편', 169), ('안', 168), ('최악', 165), ('대여소', 160), ('시간', 158), ('어요', 156), ('거', 154), ('때', 152), ('다시', 148), ('로딩', 129), ('뭐', 126), ('회원', 125), ('돈', 121), ('이', 119)]


In [22]:
import pandas as pd

df = pd.DataFrame.from_dict(n_counts, orient='index').reset_index()
df = df.rename(columns={'index':'noun', 0:'count'})
df=df.sort_values(by=['count'], axis=0, ascending=False)


In [23]:
#엑셀로 저장
df.to_excel("Yoon/sbike_google_noun.xlsx")

In [18]:
import pandas as pd
df2 = pd.DataFrame.from_dict(counts_adj, orient='index').reset_index()
df2 = df2.rename(columns={'index':'noun', 0:'count'})
df2 = df2.sort_values(by=['count'], axis=0, ascending=False)
df2.head()
df2.to_excel('./google_adj_count.xlsx')