In [None]:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import re
import requests
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from hanspell import spell_checker
from konlpy.tag import Okt, Hannanum
import urllib.request
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Embedding, Dropout, MaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from krwordrank.sentence import summarize_with_sentences
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression 
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
import matplotlib
from wordcloud import WordCloud
from collections import Counter
from sklearn.linear_model import LogisticRegression
import xgboost
import lightgbm as lgb
from sklearn.metrics import explained_variance_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score
from sklearn.metrics import mean_absolute_error
from IPython.display import Image
from mlxtend.plotting import scatterplotmatrix

In [None]:
# Load the labelled Sonata comments and discard the index column left behind by the CSV export.
sonata = pd.read_csv('/Users/parkjubro/Desktop/파이널/합본1/현대+쏘나타_label.csv').drop(columns='Unnamed: 0')
sonata.head()

In [None]:
okt = Okt()

# Emoji removal (iPhone emoji live in their own code ranges).
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00010000-\U0010FFFF"
                           "]+", flags=re.UNICODE)

# Noise that should not reach the analysis: lone Hangul jamo (e.g. onomatopoeia such as "ㅋㅋ"),
# basic punctuation, line breaks, BOM, zero-width joiner and non-breaking space.
han = re.compile(r'[ㄱ-ㅎㅏ-ㅣ!?~,".\n\r#\ufeff\u200d\xa0]')

# Remaining special characters; each match is replaced by a single space.
# Written as a raw string so the regex escapes (\?, \(, \[ ...) are not parsed as
# invalid Python string escapes, and compiled once instead of on every iteration.
special_char_pattern = re.compile(r'[-=+,#/\?:^.@*\"※%~∼ㆍ!【】』㈜©囹圄秋 ■◆◇▷▶◁◀ △▲▽▼<>‘|\(\)\[\]`\'…》→←↑↓↔〓♤♠♡♥♧♣⊙◈▣◐◑☆★\”\“\’·※~ ! @ # $ % ^ & * \ " ]')

# Comments of the training data (sonata) as a plain list, ready for preprocessing.
comment_list = sonata['comments'].tolist()

# Preprocessing 1: strip emoji and symbols, then normalise with Okt.
comment_result = []
for raw_comment in comment_list:
    tokens = emoji_pattern.sub("", raw_comment)      # remove emoji
    tokens = han.sub("", tokens)                     # remove jamo / punctuation noise
    tokens = special_char_pattern.sub(' ', tokens)   # blank out other special characters
    tokens = okt.normalize(tokens)                   # Okt text normalisation
    comment_result.append(tokens)

# Preprocessing 2: fix typos with spell_checker.
# On failure the un-corrected sentence is kept so the output stays aligned with the input.
# `except Exception` (rather than a bare except) keeps Ctrl-C / kernel interrupt working
# during this long-running loop.
checked_list = []
for sent in tqdm(comment_result):
    try:
        checked_list.append(spell_checker.check(sent).checked)
    except Exception as err:
        print(f'spell check failed ({err!r}): {sent}')
        checked_list.append(sent)

# New frame holding only the cleaned comment and its label, used for training.
sample = pd.DataFrame()
sample['comments'] = checked_list
sample['label'] = sonata['label']

In [None]:
# Tokenise the training data: fit the vocabulary on X_train, then map every text to a list of word indices.
# NOTE(review): X_train is not created in any cell shown here — confirm which cell defines it.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_encoded = tokenizer.texts_to_sequences(X_train)

# Word -> integer index lookup learned by the tokenizer
word_to_index = tokenizer.word_index

In [None]:
# Pad every training sequence to the length of the longest encoded comment.
# The longest length occasionally changes between preprocessing runs, so it is measured
# (and kept in `long`) instead of being hard-coded.
long = max(map(len, X_train_encoded))
max_len = long
X_train_padded = pad_sequences(X_train_encoded, maxlen=max_len)

# long = 137
# training data shape : (2429, 137)

In [None]:
# Modelling: 1D-CNN binary classifier over the padded comment sequences


embedding_dim = 32
dropout_ratio = 0.3
num_filters = 32
kernel_size = 4
# Defined here so this cell runs on a fresh kernel; before, only a later cell set vocab_size = 30000.
# The embedding table has to cover the padding index 0 plus every index in word_to_index,
# so the value is raised if the fitted vocabulary ever exceeds 30000.
vocab_size = max(30000, len(word_to_index) + 1)

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(Dropout(dropout_ratio))
model.add(Conv1D(num_filters, kernel_size, padding='valid', activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(dropout_ratio))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# To limit over-fitting, stop training once validation loss has failed to improve for 3 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
# Keep only the weights of the epoch with the best validation accuracy.
mc = ModelCheckpoint('best_model.h5', monitor = 'val_acc', mode='max', verbose=1, save_best_only=True)

history = model.fit(X_train_padded, y_train, epochs=13, batch_size=137, validation_split=0.2, callbacks=[es, mc])

In [None]:
# Load the unlabelled test comments; every column is cast to str, so missing values become the text 'nan'.
test_df = pd.read_csv('/Users/parkjubro/Desktop/파이널/데이터들/test.csv').astype(str)
# Plain list of comment strings for the tokenizer
x_test = test_df['comments'].tolist()

In [None]:
vocab_size = 30000
# Encode the test comments with the tokenizer that was fitted on the training data.
# Fitting a fresh Tokenizer on the test set would assign different integer ids to the same
# words, so the trained embedding would look up unrelated vectors; it would also overwrite
# the `tokenizer` that sentiment_predict relies on. Words unseen during training are dropped.
X_test_encoded = tokenizer.texts_to_sequences(x_test)

In [None]:
# Pad the encoded test comments to the length of the longest test comment.
# NOTE(review): the training data and sentiment_predict pad to max_len instead — confirm the difference is intended.
test_long = max(map(len, X_test_encoded))
X_test_padded = pad_sequences(X_test_encoded, maxlen=test_long)

In [None]:
# Attach the predicted labels and drop the index column left behind by the CSV export.
# NOTE(review): `prediction` is not created in any cell shown here — confirm which cell defines it.
test_df = test_df.assign(label=prediction).drop(columns='Unnamed: 0')
test_df

In [None]:


texts = test_df['comments'].tolist()


def penalty(x):
    """Return 0 for sentences of 10 to 150 characters and 1 for anything shorter or longer."""
    return 0 if (10 <= len(x) <= 150) else 1


# Words excluded from the keyword ranking.
# Cleaned up: the entry ' 아무리' (leading space) and a repeated '정도' were removed;
# '아무리' itself is still in the set.
stopwords = {'너무', '리뷰', '진짜', '기자님', '정말', '많이', '영상', '보고',
             '이번', '같은', '그냥', '시승기', '보면', '하는', '김한용', '근데',
             '아니', '다른', '역시', '항상', '한상기', '라이', '있는', '생각',
             '정도', '나오', '지금', '같아', '이제', '있습니다', '요즘', '그리고',
             '어떤', '이쁘', '보이', '보니', '때문에', '솔직히', '뭔가', '설명',
             '같네요', '하고', '감사', '특히', '봤습니', '차가', '차를',
             '엄청', '이런', '보는', '훨씬', '까요', '아닌', '20', '현대', '벤츠',
             '없는', '현기', '한국', '그런', '10', '아반떼', '운전', '하나', '저도',
             '저는', '이거', '소리', '이상', '그래', '차는', '있어', '이렇게', '아직',
             '들어', '만들', '타는', '합니다', '우리', '차이', '일본', '아우디', '볼보',
             '갑니다', '어떻게', '다시', '구매', '제가', '가장', '조금', '하면', '미국',
             '국내', '한번', '기아', '신형', '그랜저', '쌍용', '30', '거의', '하지', 'the',
             '스포', '사는', '많은', '봤습니다', '없어', '바로', '되는', '혹시', '계속',
             '확실히', '같네', '텐데', '건가', '그렇', '오늘', '무슨', '국산', '해도', '없고',
             '광고', '대한', '아주', '싶네요', '부분', '얼마나', '제일', '궁금', '모르',
             '개인', '아무리', '타고', '것도', '나온'}

keywords, sents = summarize_with_sentences(
    texts,
    penalty=penalty,
    stopwords=stopwords,
    diversity=0.5,
    num_keywords=100,
    num_keysents=10,  # number of key sentences extracted through the keyword analysis
    verbose=False
)

# Show the 30 highest-ranked keywords, best first.
top_keywords = sorted(keywords.items(), key=lambda item: item[1], reverse=True)[:30]
for word, rank in top_keywords:
    print('%8s:\t%.4f' % (word, rank))

In [None]:
# Label each comment 0 when it appears in checked_list and 1 otherwise.
# A set makes each membership test O(1); scanning the list made the loop quadratic.
# NOTE(review): in the cells shown, sample['comments'] is built from checked_list itself, which
# would label every row 0; the counts recorded below suggest other objects were in scope — confirm.
checked_set = set(checked_list)
new_label = [0 if comment in checked_set else 1 for comment in sample['comments']]

sonata['new_label'] = new_label
sonata['new_label'].value_counts()

# 1    1556
# 0    1481

In [None]:
# Used-car listings (preprocessed KB data)
car_df = pd.read_csv('/Users/parkjubro/Desktop/파이널/데이터들/preprocessed_kb_0518.csv')

# Remove duplicated rows
df2 = car_df.drop_duplicates()

# Keep only the key explanatory variables (author's description: car model, price, model year,
# years in use, mileage, car type, accident history) and renumber the rows from 0.
df3 = df2[[
    'car_name',
    'depreciation',
    'year',
    'use',
    'mileage',
    'car_type',
    'insurance',
]].reset_index(drop=True)
df3

In [None]:


# Standardise every column of df5 with StandardScaler.
# NOTE(review): df5 is not created in any cell shown here, and train_scaled is not referenced by the cells shown — confirm both.
std = StandardScaler()

train_scaled = std.fit_transform(df5)

In [None]:


# Features: every column of df5 except the target; target: 'depreciation'.
x_data = df5.drop(columns = 'depreciation')
y_data = df5['depreciation']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2 , random_state = 0)

In [None]:

# Linear regression baseline
linear = LinearRegression()
linear.fit(x_train, y_train)

# Only the last expression of a cell is displayed, so each score is printed explicitly;
# previously the train and cross-validation scores were computed and silently discarded.
print('LinearRegression train R^2:', linear.score(x_train, y_train))
# recorded: 0.9145135460000929

print('LinearRegression 3-fold CV R^2:', cross_val_score(linear, x_train, y_train, cv = 3))
# recorded: array([0.90714164, 0.90488488, 0.90672457])

print('LinearRegression test R^2:', linear.score(x_test, y_test))
# recorded: -67864608.4408464 -> accurate on the training data but wildly inaccurate on the held-out data

In [None]:

# Random forest baseline (fixed seed for reproducibility)
rfr = RandomForestRegressor(random_state = 0)
rfr.fit(x_train, y_train)

# Only the last expression of a cell is displayed, so each score is printed explicitly;
# previously the train and cross-validation scores were computed and silently discarded.
print('RandomForest train R^2:', rfr.score(x_train, y_train))
# recorded: 0.9887830386703866

print('RandomForest 3-fold CV R^2:', cross_val_score(rfr, x_train, y_train, cv = 3))
# recorded: array([0.91676753, 0.91665102, 0.91727352])

print('RandomForest test R^2:', rfr.score(x_test, y_test))
# recorded: 0.9173439359185369

In [None]:
# Model using the numeric variables only.
# Author's note: kb contains use, mileage, year, new_price, depreciation, forecast_min, forecast_max, car_cc.
# NOTE(review): kb is not created in any cell shown here. The second line rebinds kb without 'price'
# (so re-running this cell raises KeyError) while a later cell still reads kb['price'] — confirm the intended order.
y=kb[['price']].to_numpy()
kb=kb.drop(columns=['price','trans','loss','flood','usage','change','insurance','sales_corp','sales_loca','options','car_area','car_no','car_brand','car_name','name_datailed','fuel','car_type','color'])

In [None]:
# Split into train and test sets (85% / 15%)
# NOTE(review): `x` is not created by the cells immediately above — confirm which cell defines it.
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.85, random_state=1)

# `normalize=True` was deprecated in scikit-learn 1.0 and removed in 1.2, where it raises TypeError.
# For plain least squares the fitted predictions do not depend on feature scaling, so the
# argument is simply dropped.
lr = LinearRegression(fit_intercept=True, copy_X=True)
lr.fit(x_train, y_train)

In [None]:
# Gradient boosting regressor; a fixed seed makes repeated runs comparable.
gb = GradientBoostingRegressor(min_samples_leaf=10, min_samples_split=5, learning_rate=0.5, max_depth=3,
                               n_estimators=1000, random_state=0)
# The target is a single-column 2-D object in this notebook; flatten it to 1-D, which is the
# shape the estimator expects (a column vector triggers DataConversionWarning).
gb.fit(x_train, np.ravel(y_train))

In [None]:
# Multiple linear regression of price on year, use, depreciation and new_price
# NOTE(review): an earlier cell rebinds kb without its 'price' column — confirm kb still has it when this runs.
lm = LinearRegression()
X = kb[['mileage']]  # single-feature frame, kept because other cells may read it
Z = kb[['year', 'use', 'depreciation', 'new_price']]
# The former single-feature fit `lm.fit(X, Y)` was overwritten immediately by this fit, so it was removed.
lm.fit(Z, kb['price'])
# Y holds the in-sample fitted prices
Y = lm.predict(Z)

In [None]:
# Compare the distribution of actual prices with the distribution of fitted prices.
# NOTE(review): `width` and `height` are not defined in any cell shown here — confirm.
plt.figure(figsize=(width, height))

# sns.distplot is deprecated; kdeplot draws the same density-only curve as distplot(hist=False).
ax1 = sns.kdeplot(kb['price'], color="r", label="Actual Value")
sns.kdeplot(Y, color="b", label="Fitted Values", ax=ax1)
ax1.legend()  # without a legend the two labels are never shown

plt.title('Actual vs Fitted Values for Price')
plt.xlabel('Price (in dollars)')
plt.ylabel('Proportion of Cars')

plt.show()
plt.close()

In [None]:
# load_model is not among the notebook's top-level imports, so it is brought into scope here;
# without this the cell fails with NameError.
from tensorflow.keras.models import load_model

# Reload the checkpoint written by ModelCheckpoint and report its accuracy.
# NOTE(review): X_test / y_test are not produced by the sentiment cells shown — confirm they
# hold the padded comment sequences and labels rather than the regression split.
loaded_model = load_model('best_model.h5')
print("\n 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))

In [None]:
def sentiment_predict(new_sentence):
    """Classify one comment with the loaded sentiment model and describe the result.

    The sentence is stripped of everything except ASCII letters, digits, Hangul and spaces,
    split into stemmed morphemes with `okt`, filtered against the global `stopwords`,
    integer-encoded with the global `tokenizer`, padded to `max_len` and scored by
    `loaded_model`.

    Args:
        new_sentence: raw comment text.

    Returns:
        A Korean message ending in a newline: "<p>% 확률로 긍정 댓글입니다." when the score is
        above 0.5, otherwise "<1-p>% 확률로 부정 댓글입니다.".
    """
    cleaned = re.sub(r'[^A-Za-z0-9ㄱ-ㅎㅏ-ㅣ가-힣 ]', '', new_sentence)
    tokens = okt.morphs(cleaned, stem=True)  # tokenise
    tokens = [word for word in tokens if word not in stopwords]  # remove stopwords
    encoded = tokenizer.texts_to_sequences([tokens])  # integer encoding
    pad_new = pad_sequences(encoded, maxlen=max_len)  # padding
    # predict() returns a 2-D array; calling float() on it directly is deprecated in NumPy,
    # so the single score is extracted explicitly.
    score = float(np.ravel(loaded_model.predict(pad_new))[0])
    if score > 0.5:
        return "{:.2f}% 확률로 긍정 댓글입니다.\n".format(score * 100)
    return "{:.2f}% 확률로 부정 댓글입니다.\n".format((1 - score) * 100)

In [None]:
words = youtube_df['comments']
youtube_df['labeling'] = ''
# Write by position into the frame itself. The former chained form
# `youtube_df['labeling'][idx] = ...` raises SettingWithCopyWarning, has no effect under pandas
# copy-on-write, and treated the positional counter from enumerate() as an index label.
label_col = youtube_df.columns.get_loc('labeling')

for idx, comment in enumerate(words):
    if '궁금' in comment:  # TODO: consider also treating '?' as a question marker
        label = -1  # question rather than sentiment
    else:
        labeling = sentiment_predict(comment)
        if '긍정' in labeling:
            label = 0  # positive
        elif '부정' in labeling:
            label = 1  # negative
        else:
            continue  # neither marker found: leave the row unlabelled, as before
    youtube_df.iat[idx, label_col] = label
    print(youtube_df.iloc[idx])

In [None]:
# Keep only ASCII letters, digits, Hangul and spaces.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a literal string
# by default, which would leave every comment unchanged.
# NOTE(review): `words`, captured in an earlier cell, still refers to the un-cleaned column — confirm the intended order.
youtube_df['comments'] = youtube_df['comments'].str.replace("[^A-Za-z0-9ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
youtube_df['comments']

In [None]:
# Spell-check and fix spacing for the first 100 comments.
hanspell_sent_lst = []

for comment in words[:100]:
    # The checker is a remote call; on failure keep the original text so one bad request does
    # not abort the loop (same fallback as the training-data spell-check loop).
    try:
        hanspell_sent = spell_checker.check(comment).checked  # spelling and spacing corrected
    except Exception as err:
        print(f'spell check failed ({err!r}); keeping the original text')
        hanspell_sent = comment
    hanspell_sent_lst.append(hanspell_sent)

    print(hanspell_sent)

In [None]:
# Stopword setup
# NOTE: this rebinds `stopwords` (a set in the keyword-summary cell) to a list, and `han`
# (a compiled regex in the preprocessing cell) to a Hannanum tagger.

stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

han = Hannanum()

youtube_review_han_nouns = []

for sentence in hanspell_sent_lst:
    kept_nouns = []
    for noun in han.nouns(sentence):
        if noun in stopwords:
            continue
        # Single-character nouns and anything containing 'ㅋ' are noise: record them as stopwords
        # and drop them immediately. Before, they were appended only after the sentence had been
        # filtered, so the first occurrence survived and repeats were added more than once.
        if len(noun) <= 1 or 'ㅋ' in noun:
            stopwords.append(noun)
            continue
        kept_nouns.append(noun)
    youtube_review_han_nouns.append(kept_nouns)

In [None]:
print(stopwords)

# Result pasted from an earlier run (the bare list literal below is evaluated and discarded)
['의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과', '도', '를', '으로', '자', '에', '와', '한', '하다', '백', '차', '대', '뒤', '나', '것', '편', '걸', '전', '보', '듯', '님', '중', '뭐', '저', '저', '둘', '유', '데', '수', '리', '카', '그', '파', '지', '형', '화', '첨', '거', '두', '후', '킹', '개', '욕', '잠', '확', '쪽', 'ㅋ', '흠', '되', '4', '옆', '돈', '타', '칸', '눈', '내', '줄', '분', '손', '등', '동', '체', '너', 'ㅎ', '낫', '생', '밖', '맛', '드', '앞', '핫', 'ㅋㅋㅋ', 'ㅋㅋㅋㅋㅋ', '궁금하네욬ㅋ', '있을깤ㅋ', '못해욧ㅋㅋㅋㅋ', '로맠ㅋㅋ', '2억ㅋ', '소문잌ㅋ']


print(youtube_review_han_nouns)

# Result pasted from an earlier run (the bare list literal below is evaluated and discarded)
[['파나메라', '패스트', '스타일'], ['진짜', '최고다ㅠㅠ타보'], [], ['기업', '총수', '대형', '세단', '기사님', '운전'], ['날씨', '고생', '영상', '감사'], ['간결', '설명'], ['1255', '이차', '전동', '트렁크', '특유', '과감', '한상기', '트렁크'], ['ㅎㅎㅎ'], ['잔고장', '정도'], ['최고'], ['전면', '디자인', '터보', '전용', '일반', '파나메라', '스포츠', '범퍼', '옵션', '선택', '궁금'], ['날씨'], ['며칠', '시내', '주행', '결함', '신형', '포르쉐'], ['진짜', '파라네라', '완전', '실용적', 'ㅜㅜ'], ['한상기', '400d', '이젝큐티브', '선택', '이번', '구입', '10년', '이상', '생각', '시간', '디젤'], ['상기형', '리뷰'], ['만약', '포르쉐', '911', '근데', '파나메라', '진짜', '이쁘긴하다ㅠㅠ'], ['형님', '리뷰', '감사'], ['궁금', '20년식', '파나메라', '하이브리드', '기본인가요'], ['포르쉐'], ['1753', '1839', '한상기', '코너링'], ['뒷좌석', '터널', '공간', '저걸', '이유', '뭔가요'], ['운전석', '그날'], ['휴가', '때문', '리플', '수고'], ['포르쉐', '네오'], ['옵션', '2억', '이거', '15억', '설명', '옵션', '깡통', 'ㅠㅠ'], ['포르쉐', '형님', '볼게요'], ['정식', '수입', '직수', '차량', '한글화', '프로그램', '업데이트하', '파노라마'], ['클레스랑', '가격', '비슷', '추천하시'], ['이그제큐티브', '익제큐티브', '시승기'], ['8시리즈', '쿠페', '40i랑', '비교', '생각', '승차감', '운동성능이요'], ['0609', '로마', '주변', '한눈', '파노라마', '기능', '오타', '맞는듯싶네요'], ['포르쉐', '남자', '로망'], ['포르쉐', '신형', '아우디'], ['디자인', '현대차'], ['파나메', '시승', '구매', '이걸', '레인지로버', '보그', '이후', '극찬'], ['오우', '3시간'], ['기본', '옵션'], ['파나메라', '기존', '세그먼트', '세단', '클래스', '비교', '파나메라', '장점', '단점', '가지', '있을까요'], ['한국', '주차라인', '언제쯤'], ['아이폰', '처음', '기자님', '영상', '보네욯ㅎ'], ['파노라마'], ['롱휠베이스', '뒷자리', '승차감'], [], ['포르쉐', '파나메'], ['오오', '독국', '스팅어'], ['진짜', '정도', 'ㅡㅡ진짜'], ['14년식', '12', '파나메라', '중고차', '구매', '고장', '나긴', '정도', '가서요'], ['상기', '뮤ㅠ'], ['뭔가', '옛날', '모습', '발전', '무엇'], ['043', '매미', '리듬감', '뭔데'], ['파노라마'], [], ['장거리', '주행', '파나메', '스포츠', '투리스모', '아우', 'RS6', '가요'], ['파노라마', '1억', '5천'], ['상기', '뮤ㅠ'], ['번창하세요'], ['2억', '1억', '9천500'], ['2열', '리클라이닝'], ['근데', '가격', '시승', '차량', '1억', '8천', '1억', '5천', '같은데요'], ['파나메라', '10주년', '보군요'], ['고급', '세단', '카이엔보단', '조용해야조ㅡㅡ'], ['오늘', '계약금', '이그제큐티브'], ['330마력', '15억'], ['기본', '뒷좌석', '매력'], ['후방', '카메라', '해상', '실화'], ['1627', '볼트'], ['정도'], ['전기차', '시대', '성능', '디자인과', '브랜드'], ['형님', '뒷자리', '장시간', '답답', '궁금'], ['가격', '진짜'], 
['포르쉐', '할인'], ['포르쉐', '파나메라', '시승기'], ['시승차량', '현금', '구매', '방법', '없겠죠ㅠ'], [], ['올림픽', '폐막식', '감사'], ['1220', '눈앞', '날파리'], ['지금', '조선', '파나메라', '대리만족', 'ㅜㅜ'], ['세단용', '포르쉐'], ['포르쉐', '실내', '앰비언트', '무드', '포르쉐', '안사', '마티즈', '이유'], [], ['오늘'], ['제로백', '52초', '아반떼'], ['718'], [], ['파나메라야', '미안'], ['선댓후감'], ['후방카메라'], ['파노라마'], ['검수', '하나'], ['1억', '오천', '벤츠', '가격', '생각'], ['이번'], ['옵션'], ['1억', '5천'], [], ['포르쉐', '점심'], ['1억', 'ㅎㅎ'], ['파나메라'], ['이걸'], ['돈값']]

In [None]:
# Flatten the per-comment noun lists in one pass; sum(lists, []) copies the growing list on
# every addition and is quadratic in the number of comments.
word_list = [noun for nouns in youtube_review_han_nouns for noun in nouns]
count = Counter(word_list)
word_count = dict(count.most_common())  # word -> frequency, most frequent first

# To use a font installed locally, put its path in font_path.
# NOTE(review): `fontpath` is not defined in any cell shown here — confirm.
wc = WordCloud(font_path=fontpath, background_color = 'white',colormap=matplotlib.cm.inferno,  max_words=100, width=800, height=800, prefer_horizontal = True)
cloud = wc.fit_words(word_count)
cloud.to_image()

In [None]:
# Features: everything in car_df except the target and the columns listed below.
excluded_columns = [
    'price', 'car_area', 'car_no', 'car_brand', 'car_name', 'name_datailed',
    'fuel', 'car_type', 'color', 'trans', 'loss', 'flood', 'usage',
    'insurance', 'sales_corp', 'sales_loca', 'options',
]
x = car_df.drop(columns=excluded_columns)
# Target kept as a single-column DataFrame
y = car_df[['price']]

# 80% train / 20% test with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=11)

In [None]:
# Linear regression on the price split.
# `normalize=True` was deprecated in scikit-learn 1.0 and removed in 1.2, where it raises TypeError.
# For plain least squares the fitted predictions do not depend on feature scaling, so the
# argument is simply dropped.
lr = LinearRegression(fit_intercept=True, copy_X=True)
lr.fit(x_train, y_train)
y_predict = lr.predict(x_test)
print('LinearRegression trian 정확도 :', lr.score(x_train, y_train))
print('LinearRegression test 정확도:', lr.score(x_test, y_test))
# Same R^2 computed through r2_score, reusing the predictions made above instead of predicting again.
print('LinearRegression test 정확도:', r2_score(y_test, y_predict))

In [None]:
# Mean absolute error of the linear-regression predictions on the held-out split
mean_absolute_error(y_test, y_predict)

In [None]:
# Gradient boosting regressor; a fixed seed makes repeated runs comparable.
gb = GradientBoostingRegressor(min_samples_leaf=10, min_samples_split=5, learning_rate=0.5, max_depth=3,
                               n_estimators=1000, random_state=0)
# y_train is a single-column frame here; flatten it to 1-D, which is the shape the estimator
# expects (a column vector triggers DataConversionWarning).
gb.fit(x_train, np.ravel(y_train))
y_gb_predict = gb.predict(x_test)

print('gb train 정확도:', gb.score(x_train, y_train))
print('gb test 정확도:', gb.score(x_test, y_test))
# Same R^2 computed through r2_score, reusing the predictions made above instead of predicting again.
print('gb test 정확도:', r2_score(y_test, y_gb_predict))

In [None]:
# Mean absolute error of the gradient-boosting predictions on the held-out split
mean_absolute_error(y_test, y_gb_predict)

In [None]:
# Random forest regressor (despite the `_clf` suffix in its name) with 50 trees and a fixed seed.
rf_clf = RandomForestRegressor(n_estimators = 50, random_state  = 42)
# y_train is a single-column frame here; flatten it to 1-D, which is the shape the estimator
# expects (a column vector triggers DataConversionWarning).
rf_clf.fit(x_train, np.ravel(y_train))
pred = rf_clf.predict(x_test)

print('RandomForest train 정확도:', rf_clf.score(x_train, y_train))
print('RandomForest test 정확도:', rf_clf.score(x_test, y_test))
# Same R^2 computed through r2_score, reusing the predictions made above instead of predicting again.
print('RandomForest test 정확도:', r2_score(y_test, pred))

In [None]:
# Mean absolute error of the random-forest predictions on the held-out split
mean_absolute_error(y_test, pred)

In [None]:
# XGBoost regressor
xgb_model = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)

xgb_model.fit(x_train,y_train)
predictions = xgb_model.predict(x_test)

r_sq = xgb_model.score(x_train, y_train)
print('XBoost train 정확도: ',r_sq)
# explained_variance_score expects (y_true, y_pred). The arguments were swapped, and the metric
# is not symmetric, so the reported value was normalised by the variance of the predictions
# rather than of the true prices.
print('XBoost test 정확도: ',explained_variance_score(y_test, predictions))

In [None]:
# Mean absolute error of the XGBoost predictions on the held-out split
mean_absolute_error(y_test, predictions)