# 03-2. Sentiment Analysis NLP
> 감성 분석 모델 구축

## 환경 설정

konlpy 설치

In [None]:
!apt-get update
!apt-get install g++ openjdk-8-jdk 
!pip install konlpy

라이브러리

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from matplotlib import pyplot as plt
plt.rc('font', family='NanumBarunGothic') 

import re
import urllib.request
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

<br>
<br>

## 1. 훈련 데이터 이해

<br>

### 1-1. 데이터 로드

In [None]:
# Download the NSMC train/test rating files into the working directory.
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

('ratings_test.txt', <http.client.HTTPMessage at 0x7f8c877cf0d0>)

In [None]:
# Parse the two downloaded rating files into DataFrames.
train_data, test_data = (
    pd.read_table(path) for path in ('ratings_train.txt', 'ratings_test.txt')
)

In [None]:
# Report the number of rows in the training set.
print('훈련용 리뷰 개수 :', train_data.shape[0])

훈련용 리뷰 개수 : 150000


In [None]:
# Show the first five training rows.
train_data.head(5)

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [None]:
# Report the number of rows in the test set.
print('테스트용 리뷰 개수 :', test_data.shape[0])

테스트용 리뷰 개수 : 50000


In [None]:
# Show the first five test rows.
test_data.head(5)

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임... 돈주고 보기에는....,0
4,6723715,3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??,0


<br>

### 1-2. 데이터 정제


중복 데이터를 확인하고 제거해준다.

In [None]:
# Distinct review texts and distinct labels, shown as a (documents, labels) tuple.
tuple(train_data[column].nunique() for column in ('document', 'label'))

(146182, 2)

In [None]:
# Keep only the first occurrence of each review text.
train_data = train_data.drop_duplicates(subset=['document'])

In [None]:
# Row count after de-duplication.
print('총 샘플의 수 :', train_data.shape[0])

총 샘플의 수 : 146183


In [None]:
# Number of reviews per label, as a two-column (label, count) table.
label_counts = train_data.groupby('label').size().reset_index(name='count')
print(label_counts)

   label  count
0      0  73342
1      1  72841


In [None]:
# Count missing values per column (the original called the non-existent `insull`).
print(train_data.isnull().sum())

id          0
document    1
label       0
dtype: int64


Null 값을 가지는 index를 확인하고 제거한다.

In [None]:
# Rows whose review text is missing.
train_data[train_data['document'].isna()]

Unnamed: 0,id,document,label
25857,2172111,,1


In [None]:
# Remove every row containing a null, then confirm that none remain.
train_data = train_data.dropna()
print(train_data.isna().to_numpy().any())

False


In [None]:
# Row count after removing nulls.
print(train_data.shape[0])

146182


<br>

### 1-3. 데이터 전처리

한글과 공백을 제외한 모든 문자를 제거한다.

In [None]:
# Keep only Hangul (syllables and standalone jamo) and spaces; strip everything else.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string by default,
# which would leave the text unchanged.
train_data['document'] = train_data['document'].str.replace(r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
train_data[:5]

Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,3819312,흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 솔직히 재미는 없다평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화스파이더맨에서 늙어보이기만 했던 커스틴 던...,1


In [None]:
# Strip leading spaces so whitespace-only reviews collapse to the empty string
# (regex=True: the pattern must be interpreted as a regular expression).
train_data['document'] = train_data['document'].str.replace(r'^ +', "", regex=True)
# Turn empty reviews into NaN. Assign the result back instead of calling
# replace(..., inplace=True) on a column selection, which may act on a temporary copy.
train_data['document'] = train_data['document'].replace('', np.nan)
print(train_data.isnull().sum())

id            0
document    789
label         0
dtype: int64


In [None]:
# First five rows whose review text is now missing.
train_data[train_data['document'].isna()].head(5)

Unnamed: 0,id,document,label
404,4221289,,0
412,9509970,,1
470,10147571,,1
584,7117896,,0
593,6478189,,0


In [None]:
# Remove the rows that contain a null value and report the remaining row count.
train_data = train_data.dropna()
print(train_data.shape[0])

145393


In [None]:
# Apply the same cleaning to the test set as to the training set.
# Drop rows whose review text duplicates an earlier row.
test_data = test_data.drop_duplicates(subset=['document'])
test_data['document'] = (
    test_data['document']
    # Keep only Hangul and spaces (regex=True: pandas >= 2.0 defaults to literal matching).
    .str.replace(r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
    # Strip leading spaces so whitespace-only reviews become empty strings.
    .str.replace(r'^ +', "", regex=True)
    # Empty reviews become NaN so dropna() below removes them.
    .replace('', np.nan)
)
test_data = test_data.dropna(how='any')
print('전처리 후 테스트용 샘플의 개수 :', len(test_data))

전처리 후 테스트용 샘플의 개수 : 48852


<br>

### 1-4. 토큰화

토큰화한 후 불용어를 설정하여 제거한다.

In [None]:
# Stopwords removed from the token lists after tokenization.
stopwords = [
    '의', '가', '이', '은', '들', '는',
    '좀', '잘', '걍', '과', '도', '를',
    '으로', '자', '에', '와', '한', '하다',
]

In [None]:
# Create the Okt analyzer and try stemmed tokenization on one sample sentence.
okt = Okt()
sample_sentence = '와 이런 것도 영화라고 차라리 뮤직비디오를 만드는 게 나을 뻔'
okt.morphs(sample_sentence, stem=True)

['오다', '이렇다', '것', '도', '영화', '라고', '차라리', '뮤직비디오', '를', '만들다', '게', '나다', '뻔']

In [None]:
# Tokenize every training review (with stemming) and filter out the stopwords.
X_train = [
    [token for token in okt.morphs(review, stem=True) if token not in stopwords]
    for review in train_data['document']
]

In [None]:
# Peek at the first three tokenized training reviews.
print(X_train[:3])

In [None]:
# Tokenize every test review (with stemming) and filter out the stopwords.
X_test = [
    [token for token in okt.morphs(review, stem=True) if token not in stopwords]
    for review in test_data['document']
]

In [None]:
# Fit a Keras Tokenizer on the tokenized training reviews to build the word index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [None]:
# Show the word -> integer index mapping built by the tokenizer.
print(tokenizer.word_index)

<br>

### 1-5. 정수 인코딩

In [None]:
# Measure how much of the vocabulary (and of all token occurrences) is made up of
# words that appear fewer than `threshold` times in the training data.
threshold = 3
word_counts = tokenizer.word_counts  # word -> number of occurrences
total_cnt = len(tokenizer.word_index)  # number of distinct words

rare_values = [freq for freq in word_counts.values() if freq < threshold]
rare_cnt = len(rare_values)  # number of distinct rare words
rare_freq = sum(rare_values)  # total occurrences of rare words
total_freq = sum(word_counts.values())  # total occurrences of all words

print('단어 집합(vocabulary)의 크기 :',total_cnt)
print('등장 빈도가 %s번 이하인 희귀 단어의 수: %s'%(threshold - 1, rare_cnt))
print("단어 집합에서 희귀 단어의 비율:", (rare_cnt / total_cnt)*100)
print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율:", (rare_freq / total_freq)*100)

In [None]:
# Vocabulary size once rare words are excluded.
# NOTE(review): the +1 is presumably for index 0, which Keras reserves for padding — confirm.
vocab_size = total_cnt - rare_cnt + 1
print('단어 집합의 크기 :',vocab_size)

In [None]:
# Refit a tokenizer capped at vocab_size words and integer-encode both corpora.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)
encode = tokenizer.texts_to_sequences
X_train, X_test = encode(X_train), encode(X_test)

In [None]:
# Peek at the first three training reviews, now integer-encoded.
print(X_train[:3])

In [None]:
[[50, 454, 16, 260, 659], [933, 457, 41, 602, 1, 214, 1449, 24, 961, 675, 19], [386, 2444, 2315, 5671, 2, 222, 9]]

In [None]:
# Pull the label columns out as NumPy arrays.
y_train, y_test = (np.array(frame['label']) for frame in (train_data, test_data))

<br>

### 1-6. 빈 샘플 제거

In [None]:
# Positions of training reviews that ended up with no tokens after encoding.
drop_train = []
for position, encoded_review in enumerate(X_train):
    if not encoded_review:
        drop_train.append(position)

In [None]:
# Drop the empty reviews recorded in drop_train from both features and labels.
# X_train is a list of variable-length sequences, so filter it in Python:
# np.delete would first convert it to an array, which NumPy >= 1.24 rejects
# for ragged input.
empty_positions = set(drop_train)
X_train = [seq for position, seq in enumerate(X_train) if position not in empty_positions]
y_train = np.delete(y_train, drop_train, axis=0)
print(len(X_train))
print(len(y_train))

In [None]:
# Summarize and plot the distribution of review lengths (in tokens).
review_lengths = [len(review) for review in X_train]
print('리뷰의 최대 길이 :', max(review_lengths))
print('리뷰의 평균 길이 :', sum(review_lengths) / len(X_train))
plt.hist(review_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

<br>

### 1-7. 패딩

샘플들의 길이를 동일하게 맞추기 위해 패딩 작업을 진행한다.

In [None]:
def below_threshold_len(max_len, nested_list):
    """Print the percentage of samples whose length is at most ``max_len``.

    Args:
        max_len: Inclusive length cut-off.
        nested_list: Sized collection of sized samples.
    """
    within_limit = sum(1 for sample in nested_list if len(sample) <= max_len)
    share = (within_limit / len(nested_list)) * 100
    print('전체 샘플 중 길이가 %s 이하인 샘플의 비율: %s'%(max_len, share))

In [None]:
# Candidate padded length; report what share of training samples fit within it.
max_len = 30
below_threshold_len(max_len, X_train)

In [None]:
# Pad/truncate every encoded review to max_len so all samples share one length.
X_train, X_test = (
    pad_sequences(sequences, maxlen=max_len) for sequences in (X_train, X_test)
)

<br>
<br>

## 2. LSTM으로 감성 분류

In [None]:
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Embedding -> LSTM -> single sigmoid unit for binary sentiment classification.
model = Sequential([
    Embedding(vocab_size, 100),
    LSTM(128),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Stop once validation loss has not improved for 4 epochs.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=4,
    verbose=1,
)
# Save the model to best_model.h5 whenever validation accuracy reaches a new maximum.
mc = ModelCheckpoint(
    'best_model.h5',
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)

최대 15회 에포크로 모델을 학습시킨다. (검증 손실이 4회 연속 개선되지 않으면 조기 종료된다.)

In [None]:
# Compile for binary classification and train, holding out 20% of the data for validation.
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])
history = model.fit(
    X_train,
    y_train,
    batch_size=60,
    epochs=15,
    validation_split=0.2,
    callbacks=[es, mc],
)

Epoch 1/15

Epoch 00001: val_acc improved from -inf to 0.84680, saving model to best_model.h5
Epoch 2/15

Epoch 00002: val_acc improved from 0.84680 to 0.85344, saving model to best_model.h5
Epoch 3/15

Epoch 00003: val_acc improved from 0.85344 to 0.85857, saving model to best_model.h5
Epoch 4/15

Epoch 00004: val_acc improved from 0.85857 to 0.86064, saving model to best_model.h5
Epoch 5/15

Epoch 00005: val_acc did not improve from 0.86064
Epoch 6/15

Epoch 00006: val_acc did not improve from 0.86064
Epoch 7/15

Epoch 00007: val_acc did not improve from 0.86064
Epoch 8/15

Epoch 00008: val_acc did not improve from 0.86064
Epoch 00008: early stopping


In [None]:
# Reload the checkpointed model and report its accuracy on the test set.
loaded_model = load_model('best_model.h5')
test_metrics = loaded_model.evaluate(X_test, y_test)  # [loss, acc]
print("\n 테스트 정확도: %.4f" % (test_metrics[1]))


 테스트 정확도: 0.8561


<br>
<br>

## 3. 상권 감성 분석
네이버 블로그 크롤링한 데이터를 기반으로 각 지역에 대한 감성분석을 실시한 후, 긍정 비율을 저장한다.

<br>

### 3-1. 0번 상권 감성분석

In [None]:
# Load the crawled blog titles for area 0 (one row per title, keyed by region name).
area0 = pd.read_csv("gdrive/MyDrive/pjt-seoul-market-analysis/data/area0_crawl.csv")

In [None]:
from collections import defaultdict
# Only the number of positive titles per region is needed; unseen regions read as 0.
cnt = defaultdict(int)

defaultdict(int, {})

In [None]:
# Keep only Hangul and spaces in the titles; strip every other character.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string by default.
area0['title'] = area0['title'].str.replace(r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
print(area0.head())

  name                     title
0  쌍문동            여기다 싶었던 쌍문동 맛집
1  쌍문동  도봉구신축빌라 고급스럽고 넓은 쌍문동 방매매
2  쌍문동  쌍문동 조용한 주택가에 위치한 신축빌라 소개
3  쌍문동     쌍문역 카페 분위기 좋았던 쌍문동 커피
4  쌍문동         술이 술술 들어갔던 쌍문동 맛집


In [None]:
# Number of distinct regions (values of the 'name' column) in the crawl data.
length = area0['name'].nunique()
length

78

In [None]:
# Sentiment-score the blog titles region by region and count the positive ones.
# The crawl file is consumed as consecutive chunks of TITLES_PER_REGION rows,
# exactly as the original index arithmetic (1000 * i) did.
TITLES_PER_REGION = 1000
for i in range(length):
    base = TITLES_PER_REGION * i
    region = area0['name'][base]
    titles = area0['title'][base:base + TITLES_PER_REGION]
    # Tokenize and drop stopwords; a missing (non-string) title becomes an empty
    # token list instead of crashing the tokenizer.
    tokenized = [
        [word for word in okt.morphs(title, stem=True) if word not in stopwords]
        if isinstance(title, str) else []
        for title in titles
    ]
    encoded = tokenizer.texts_to_sequences(tokenized)  # integer encoding
    padded = pad_sequences(encoded, maxlen=max_len)  # padding
    # One batched predict per region instead of one model call per title.
    scores = loaded_model.predict(padded, verbose=0).ravel()
    # Store the number of titles scored as positive (> 0.5) for this region.
    cnt[region] = int((scores > 0.5).sum())
    print(cnt)

In [None]:
# Load the table the ratio will be written into (rebinds area0, replacing the crawl data).
area0 = pd.read_csv('gdrive/MyDrive/pjt-seoul-market-analysis/data/감성분석키워드_area0.csv')
area0.head()

Unnamed: 0,상권_코드_명,상권 분석 키워드,긍정 리뷰 비율,총_생활인구_수,총_직장_인구_수,아파트_단지_수,집객시설_수,교통시설_수,학교_수,총_상주인구_수,cluster
0,갈현로33길,갈현동,,0.195418,0.005773,0.165605,0.022167,0.045455,0.0,0.349827,3
1,강남대로8길,양재동,,0.283412,0.042312,0.184713,0.017241,0.0,0.0,0.323406,3
2,강동대로53길,성내동,,0.270944,0.024947,0.161359,0.029557,0.022727,0.0,0.362381,3
3,강서로15길,화곡동,,0.297673,0.00591,0.282378,0.027094,0.045455,0.0,0.496732,3
4,강서로18길,화곡동,,0.226992,0.015417,0.4862,0.046798,0.113636,0.0,0.660748,3


In [None]:
# Positive-review ratio per row: positive count of the row's keyword district * 0.001
# (the counting cell scores 1000 titles per region). Keywords never counted get 0.
# Assigning the whole column avoids chained indexing (df[col][i] = ...), which does
# not reliably write back to the DataFrame.
area0['긍정 리뷰 비율'] = area0['상권 분석 키워드'].map(
    lambda keyword: 0.001 * cnt.get(keyword, 0)
)

area0.head()

Unnamed: 0,상권_코드_명,상권 분석 키워드,긍정 리뷰 비율,총_생활인구_수,총_직장_인구_수,아파트_단지_수,집객시설_수,교통시설_수,학교_수,총_상주인구_수,cluster
0,갈현로33길,갈현동,0.474,0.195418,0.005773,0.165605,0.022167,0.045455,0.0,0.349827,3
1,강남대로8길,양재동,0.46,0.283412,0.042312,0.184713,0.017241,0.0,0.0,0.323406,3
2,강동대로53길,성내동,0.474,0.270944,0.024947,0.161359,0.029557,0.022727,0.0,0.362381,3
3,강서로15길,화곡동,0.401,0.297673,0.00591,0.282378,0.027094,0.045455,0.0,0.496732,3
4,강서로18길,화곡동,0.401,0.226992,0.015417,0.4862,0.046798,0.113636,0.0,0.660748,3


In [None]:
# Save the augmented table (the DataFrame index is written as the first column).
area0.to_csv('gdrive/MyDrive/pjt-seoul-market-analysis/data/final_area0.csv')

<br>

### 3-2. 1번 상권 감성분석

In [None]:
# Load the crawled blog titles for area 1 (one row per title, keyed by region name).
area1 = pd.read_csv("gdrive/MyDrive/pjt-seoul-market-analysis/data/area1_crawl.csv")

In [None]:
from collections import defaultdict
# Only the number of positive titles per region is needed; start from an empty counter.
cnt = defaultdict(int)

defaultdict(int, {})

In [None]:
# Keep only Hangul and spaces in the titles; strip every other character.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string by default.
area1['title'] = area1['title'].str.replace(r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
print(area1.head())

  name                     title
0  쌍문동            여기다 싶었던 쌍문동 맛집
1  쌍문동  도봉구신축빌라 고급스럽고 넓은 쌍문동 방매매
2  쌍문동  쌍문동 조용한 주택가에 위치한 신축빌라 소개
3  쌍문동     쌍문역 카페 분위기 좋았던 쌍문동 커피
4  쌍문동         술이 술술 들어갔던 쌍문동 맛집


In [None]:
# Number of distinct regions (values of the 'name' column) in the crawl data.
length = area1['name'].nunique()
length

78

In [None]:
# Sentiment-score the blog titles region by region and count the positive ones.
# The crawl file is consumed as consecutive chunks of TITLES_PER_REGION rows,
# exactly as the original index arithmetic (1000 * i) did.
TITLES_PER_REGION = 1000
for i in range(length):
    base = TITLES_PER_REGION * i
    region = area1['name'][base]
    titles = area1['title'][base:base + TITLES_PER_REGION]
    # Tokenize and drop stopwords; a missing (non-string) title becomes an empty
    # token list instead of crashing the tokenizer.
    tokenized = [
        [word for word in okt.morphs(title, stem=True) if word not in stopwords]
        if isinstance(title, str) else []
        for title in titles
    ]
    encoded = tokenizer.texts_to_sequences(tokenized)  # integer encoding
    padded = pad_sequences(encoded, maxlen=max_len)  # padding
    # One batched predict per region instead of one model call per title.
    scores = loaded_model.predict(padded, verbose=0).ravel()
    # Store the number of titles scored as positive (> 0.5) for this region.
    cnt[region] = int((scores > 0.5).sum())
    print(cnt)

In [None]:
# Load the table the ratio will be written into (rebinds area1, replacing the crawl data).
area1 = pd.read_csv('/content/gdrive/MyDrive/pjt-seoul-market-analysis/data/dong_data_area1.csv')
area1.head()

Unnamed: 0.1,Unnamed: 0,상권_코드_명,총_생활인구_수,총_직장_인구_수,아파트_단지_수,집객시설_수,교통시설_수,학교_수,총_상주인구_수,cluster,행정동_코드,행정동
0,0,가로공원로58길,0.093507,0.002016,0.012739,0.007389,0.0,0.0,0.235266,1,1147058000,신월3동
1,1,가로공원로76가길,0.136968,0.00378,0.161359,0.019704,0.0,0.0,0.312747,1,1150054000,화곡제1동
2,2,가로공원로80길,0.11764,0.006117,0.180467,0.007389,0.0,0.0,0.281615,1,1150054000,화곡제1동
3,7,가산로5길,0.132014,0.011363,0.063694,0.029557,0.181818,0.0,0.272768,1,1154551000,가산동
4,8,가재울로6길,0.150733,0.00378,0.091295,0.036946,0.045455,0.0,0.241588,1,1141070000,남가좌제2동


In [None]:
# Default ratio for every row. Use a float so the column stays numeric; the
# string '1' made it an object column mixing str and float values.
# NOTE(review): 1.0 also reads as "100% positive" — confirm this is the intended default.
area1['긍정 리뷰 비율'] = 1.0

In [None]:
# Positive-review ratio per row: 0.001 * positive count of the row's district when
# that district was scored; otherwise the value already in the column is kept.
# Building the whole column at once avoids chained indexing (df[col][i] = ...),
# which does not reliably write back to the DataFrame.
previous = area1['긍정 리뷰 비율']
area1['긍정 리뷰 비율'] = [
    0.001 * cnt[dong] if dong in cnt else current
    for dong, current in zip(area1['행정동'], previous)
]

area1.head()

Unnamed: 0.1,Unnamed: 0,상권_코드_명,총_생활인구_수,총_직장_인구_수,아파트_단지_수,집객시설_수,교통시설_수,학교_수,총_상주인구_수,cluster,행정동_코드,행정동,긍정 리뷰 비율
0,0,가로공원로58길,0.093507,0.002016,0.012739,0.007389,0.0,0.0,0.235266,1,1147058000,신월3동,0.399
1,1,가로공원로76가길,0.136968,0.00378,0.161359,0.019704,0.0,0.0,0.312747,1,1150054000,화곡제1동,0.339
2,2,가로공원로80길,0.11764,0.006117,0.180467,0.007389,0.0,0.0,0.281615,1,1150054000,화곡제1동,0.339
3,7,가산로5길,0.132014,0.011363,0.063694,0.029557,0.181818,0.0,0.272768,1,1154551000,가산동,0.371
4,8,가재울로6길,0.150733,0.00378,0.091295,0.036946,0.045455,0.0,0.241588,1,1141070000,남가좌제2동,1.0


In [None]:
# Save the augmented table (the DataFrame index is written as the first column).
area1.to_csv('/content/gdrive/MyDrive/pjt-seoul-market-analysis/data/final_area1.csv')

<br>

### 3-3. 2번 상권 감성분석

In [None]:
# Load the crawled blog titles for area 2 (one row per title, keyed by region name).
area2 = pd.read_csv("gdrive/MyDrive/pjt-seoul-market-analysis/data/area2_crawl.csv")

In [None]:
from collections import defaultdict
# Only the number of positive titles per region is needed; unseen regions read as 0.
cnt = defaultdict(int)

defaultdict(int, {})

In [None]:
# Keep only Hangul and spaces in the titles; strip every other character.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string by default.
area2['title'] = area2['title'].str.replace(r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
print(area2.head())

  name                     title
0  쌍문동            여기다 싶었던 쌍문동 맛집
1  쌍문동  도봉구신축빌라 고급스럽고 넓은 쌍문동 방매매
2  쌍문동  쌍문동 조용한 주택가에 위치한 신축빌라 소개
3  쌍문동     쌍문역 카페 분위기 좋았던 쌍문동 커피
4  쌍문동         술이 술술 들어갔던 쌍문동 맛집


In [None]:
# Number of distinct regions (values of the 'name' column) in the crawl data.
length = area2['name'].nunique()
length

78

In [None]:
# Sentiment-score the blog titles region by region and count the positive ones.
# The crawl file is consumed as consecutive chunks of TITLES_PER_REGION rows,
# exactly as the original index arithmetic (1000 * i) did.
TITLES_PER_REGION = 1000
for i in range(length):
    base = TITLES_PER_REGION * i
    region = area2['name'][base]
    titles = area2['title'][base:base + TITLES_PER_REGION]
    # Tokenize and drop stopwords; a missing (non-string) title becomes an empty
    # token list instead of crashing the tokenizer.
    tokenized = [
        [word for word in okt.morphs(title, stem=True) if word not in stopwords]
        if isinstance(title, str) else []
        for title in titles
    ]
    encoded = tokenizer.texts_to_sequences(tokenized)  # integer encoding
    padded = pad_sequences(encoded, maxlen=max_len)  # padding
    # One batched predict per region instead of one model call per title.
    scores = loaded_model.predict(padded, verbose=0).ravel()
    # Store the number of titles scored as positive (> 0.5) for this region.
    cnt[region] = int((scores > 0.5).sum())
    print(cnt)

In [None]:
# Load the table the ratio will be written into (rebinds area2, replacing the crawl data).
area2 = pd.read_csv('/content/gdrive/MyDrive/pjt-seoul-market-analysis/data/dong_data_area2.csv')
area2.head()

Unnamed: 0,상권_코드_명,상권 분석 키워드,긍정 리뷰 비율,총_생활인구_수,총_직장_인구_수,아파트_단지_수,집객시설_수,교통시설_수,학교_수,총_상주인구_수,cluster
0,갈현로33길,갈현동,,0.195418,0.005773,0.165605,0.022167,0.045455,0.0,0.349827,3
1,강남대로8길,양재동,,0.283412,0.042312,0.184713,0.017241,0.0,0.0,0.323406,3
2,강동대로53길,성내동,,0.270944,0.024947,0.161359,0.029557,0.022727,0.0,0.362381,3
3,강서로15길,화곡동,,0.297673,0.00591,0.282378,0.027094,0.045455,0.0,0.496732,3
4,강서로18길,화곡동,,0.226992,0.015417,0.4862,0.046798,0.113636,0.0,0.660748,3


In [None]:
# Create the ratio column with a numeric placeholder. A float keeps the column
# numeric; the string '1' made it an object column mixing str and float values.
area2['긍정 리뷰 비율'] = 1.0

In [None]:
# Positive-review ratio per row: positive count of the row's district * 0.001
# (the counting cell scores 1000 titles per region). Districts never counted get 0.
# Assigning the whole column avoids chained indexing (df[col][i] = ...), which does
# not reliably write back to the DataFrame.
area2['긍정 리뷰 비율'] = area2['행정동'].map(lambda dong: 0.001 * cnt.get(dong, 0))

area2.head()

Unnamed: 0,상권_코드_명,총_생활인구_수,총_직장_인구_수,아파트_단지_수,집객시설_수,교통시설_수,학교_수,총_상주인구_수,cluster,행정동_코드,행정동,긍정 리뷰 비율
0,가마산로61길,0.148102,0.001008,0.016985,0.009852,0.022727,0.0,0.161983,2,1156065000,신길제3동,0.392
1,가산로3길,0.041782,0.022817,0.021231,0.014778,0.113636,0.0,0.098591,2,1154561000,독산제1동,0.374
2,강남골목시장,0.052822,0.00378,0.008493,0.0,0.0,0.0,0.053743,2,1162072500,조원동,0.674
3,강남구 논현역_3,0.045345,0.056424,0.0,0.03202,0.068182,0.0,0.004558,2,1168052100,논현1동,0.416
4,강남구 신사역_1,0.082954,0.025314,0.025478,0.078818,0.045455,0.0,0.087372,2,1165054000,잠원동,0.498


In [None]:
# Save the augmented table (the DataFrame index is written as the first column).
area2.to_csv('/content/gdrive/MyDrive/pjt-seoul-market-analysis/data/final_area2.csv')

<br>

### 3-4. 3번 상권 감성분석

In [None]:
# Load the crawled blog titles for area 3 (one row per title, keyed by region name).
area3 = pd.read_csv("gdrive/MyDrive/pjt-seoul-market-analysis/data/area3_crawl.csv")

In [None]:
from collections import defaultdict
# Only the number of positive titles per region is needed; unseen regions read as 0.
cnt = defaultdict(int)

defaultdict(int, {})

In [None]:
# Keep only Hangul and spaces in the titles; strip every other character.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string by default.
area3['title'] = area3['title'].str.replace(r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
print(area3.head())

  name                     title
0  쌍문동            여기다 싶었던 쌍문동 맛집
1  쌍문동  도봉구신축빌라 고급스럽고 넓은 쌍문동 방매매
2  쌍문동  쌍문동 조용한 주택가에 위치한 신축빌라 소개
3  쌍문동     쌍문역 카페 분위기 좋았던 쌍문동 커피
4  쌍문동         술이 술술 들어갔던 쌍문동 맛집


In [None]:
# Number of distinct regions (values of the 'name' column) in the crawl data.
length = area3['name'].nunique()
length

78

In [None]:
# Sentiment-score the blog titles region by region and count the positive ones.
# The crawl file is consumed as consecutive chunks of TITLES_PER_REGION rows,
# exactly as the original index arithmetic (1000 * i) did.
TITLES_PER_REGION = 1000
for i in range(length):
    base = TITLES_PER_REGION * i
    region = area3['name'][base]
    titles = area3['title'][base:base + TITLES_PER_REGION]
    # Tokenize and drop stopwords; a missing (non-string) title becomes an empty
    # token list instead of crashing the tokenizer.
    tokenized = [
        [word for word in okt.morphs(title, stem=True) if word not in stopwords]
        if isinstance(title, str) else []
        for title in titles
    ]
    encoded = tokenizer.texts_to_sequences(tokenized)  # integer encoding
    padded = pad_sequences(encoded, maxlen=max_len)  # padding
    # One batched predict per region instead of one model call per title.
    scores = loaded_model.predict(padded, verbose=0).ravel()
    # Store the number of titles scored as positive (> 0.5) for this region.
    cnt[region] = int((scores > 0.5).sum())
    print(cnt)

In [None]:
# Load the table the ratio will be written into (rebinds area3, replacing the crawl data).
area3 = pd.read_csv('gdrive/MyDrive/pjt-seoul-market-analysis/data/감성분석키워드_area3.csv')
area3.head()

Unnamed: 0,상권_코드_명,상권 분석 키워드,긍정 리뷰 비율,총_생활인구_수,총_직장_인구_수,아파트_단지_수,집객시설_수,교통시설_수,학교_수,총_상주인구_수,cluster
0,갈현로33길,갈현동,,0.195418,0.005773,0.165605,0.022167,0.045455,0.0,0.349827,3
1,강남대로8길,양재동,,0.283412,0.042312,0.184713,0.017241,0.0,0.0,0.323406,3
2,강동대로53길,성내동,,0.270944,0.024947,0.161359,0.029557,0.022727,0.0,0.362381,3
3,강서로15길,화곡동,,0.297673,0.00591,0.282378,0.027094,0.045455,0.0,0.496732,3
4,강서로18길,화곡동,,0.226992,0.015417,0.4862,0.046798,0.113636,0.0,0.660748,3


In [None]:
# Positive-review ratio per row: positive count of the row's keyword district * 0.001
# (the counting cell scores 1000 titles per region). Keywords never counted get 0.
# Assigning the whole column avoids chained indexing (df[col][i] = ...), which does
# not reliably write back to the DataFrame.
area3['긍정 리뷰 비율'] = area3['상권 분석 키워드'].map(
    lambda keyword: 0.001 * cnt.get(keyword, 0)
)

area3.head()

Unnamed: 0,상권_코드_명,상권 분석 키워드,긍정 리뷰 비율,총_생활인구_수,총_직장_인구_수,아파트_단지_수,집객시설_수,교통시설_수,학교_수,총_상주인구_수,cluster
0,갈현로33길,갈현동,0.474,0.195418,0.005773,0.165605,0.022167,0.045455,0.0,0.349827,3
1,강남대로8길,양재동,0.46,0.283412,0.042312,0.184713,0.017241,0.0,0.0,0.323406,3
2,강동대로53길,성내동,0.474,0.270944,0.024947,0.161359,0.029557,0.022727,0.0,0.362381,3
3,강서로15길,화곡동,0.401,0.297673,0.00591,0.282378,0.027094,0.045455,0.0,0.496732,3
4,강서로18길,화곡동,0.401,0.226992,0.015417,0.4862,0.046798,0.113636,0.0,0.660748,3


In [None]:
# Save the augmented table (the DataFrame index is written as the first column).
area3.to_csv('gdrive/MyDrive/pjt-seoul-market-analysis/data/final_area3.csv')