In [None]:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import re
import requests
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from hanspell import spell_checker
from konlpy.tag import Okt, Hannanum
import urllib.request
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Embedding, Dropout, MaxPooling1D, LSTM
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from krwordrank.sentence import summarize_with_sentences
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression 
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
import matplotlib
from wordcloud import WordCloud
from collections import Counter
from sklearn.linear_model import LogisticRegression
import xgboost
import lightgbm as lgb
from sklearn.metrics import explained_variance_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score
from sklearn.metrics import mean_absolute_error
from IPython.display import Image
from mlxtend.plotting import scatterplotmatrix

In [None]:
# Load the labelled Sonata comments and discard the 'Unnamed: 0' column
# (presumably an index column written out by an earlier export — confirm).
sonata = (
    pd.read_csv('/Users/parkjubro/Desktop/파이널/합본1/현대+쏘나타_label.csv')
    .drop(columns=['Unnamed: 0'])
)

# Shared Okt analyzer; preprocessing1() reads this module-level instance.
okt = Okt()

def preprocessing1(text):
    """Clean raw comments and run them through the hanspell spell checker.

    Each comment has emoji, stand-alone jamo / control characters and special
    symbols stripped, is normalized with the module-level ``okt`` analyzer and
    is finally corrected with ``spell_checker.check``.

    Args:
        text: Iterable of raw comments. Entries that are not ``str`` (for
            example ``NaN`` coming from an empty CSV cell) are treated as
            empty comments instead of raising ``TypeError`` inside ``re``.

    Returns:
        list: One cleaned comment per input entry, in input order. When the
        spell checker raises for a comment, the normalized but unchecked text
        is kept for that entry.
    """
    # Emoji removal (iPhone emoji have their own code points).
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00010000-\U0010FFFF"
                           "]+", flags=re.UNICODE)

    # Noise that hurts the analysis: lone jamo, basic punctuation, line breaks
    # and invisible characters (BOM, zero-width joiner, non-breaking space).
    han = re.compile(r'[ㄱ-ㅎㅏ-ㅣ!?~,".\n\r#\ufeff\u200d\xa0]')

    # Remaining special characters, replaced by a space. Written as a raw
    # string so the regex escapes are not parsed as (invalid) Python string
    # escapes; the set of matched characters is unchanged.
    special_chars = re.compile(r'[-=+,#/\?:^.@*\"※%~∼ㆍ!【】』㈜©囹圄秋 ■◆◇▷▶◁◀ △▲▽▼<>‘|\(\)\[\]`\'…》→←↑↓↔〓♤♠♡♥♧♣⊙◈▣◐◑☆★\”\“\’·※~ ! @ # $ % ^ & * \ " ]')

    # Pass 1: strip emoji and symbols, then normalize.
    comment_result = []
    for raw in text:
        cleaned = raw if isinstance(raw, str) else ""
        cleaned = emoji_pattern.sub("", cleaned)
        cleaned = han.sub("", cleaned)
        cleaned = special_chars.sub(" ", cleaned)
        comment_result.append(okt.normalize(cleaned))

    # Pass 2: fix typos with spell_checker.
    checked_list = []
    for sent in tqdm(comment_result):
        try:
            checked_list.append(spell_checker.check(sent).checked)
        except Exception as exc:
            # Best effort: keep the unchecked text. Catching Exception (not a
            # bare except) lets Ctrl-C still interrupt this loop.
            print(sent)
            print('spell check failed: %r' % (exc,))
            checked_list.append(sent)

    return checked_list

In [None]:
# Clean and spell-check every Sonata comment; the result is a plain list with
# one entry per row of sonata['comments'].
checked_list = preprocessing1(list(sonata['comments']))

In [None]:
# Build a frame that holds only the cleaned comments and their labels for training.
sample = pd.DataFrame()
sample['comments'] = checked_list
sample['label'] = sonata['label']

# NOTE(review): X_train / y_train were consumed below without being defined by
# any earlier cell. They are assumed to be the comments and labels of `sample`
# (the test comments come from a separate file) — confirm against the original run.
X_train = sample['comments'].tolist()
y_train = sample['label'].to_numpy()

# Tokenize the training comments.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_encoded = tokenizer.texts_to_sequences(X_train)

# Word -> integer index mapping learned by the tokenizer.
word_to_index = tokenizer.word_index

# Pad to the longest encoded comment. The maximum length can vary between
# preprocessing runs, so it is measured instead of hard-coded.
long = max(len(encoded) for encoded in X_train_encoded)
max_len = long
X_train_padded = pad_sequences(X_train_encoded, maxlen = max_len)

In [None]:
embedding_dim = 32
dropout_ratio = 0.3
num_filters = 32
kernel_size = 4

# Size of the embedding table. vocab_size was not assigned before this cell
# (its first assignment sits in a later cell), so derive it from the fitted
# tokenizer: every word index plus index 0, which is used for padding.
vocab_size = len(word_to_index) + 1

# 1D-CNN binary sentiment classifier.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(Dropout(dropout_ratio))
model.add(Conv1D(num_filters, kernel_size, padding='valid', activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(dropout_ratio))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# Guard against overfitting: stop once validation loss has not improved for 3 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
# Keep the weights with the best validation accuracy. The LSTM cell further
# down writes to the same file name, so that later run overwrites this checkpoint.
mc = ModelCheckpoint('best_model.h5', monitor = 'val_acc', mode='max', verbose=1, save_best_only=True)

history = model.fit(X_train_padded, y_train, epochs=13, batch_size=137, validation_split=0.2, callbacks=[es, mc])

In [None]:
test_df = pd.read_csv('/Users/parkjubro/Desktop/파이널/데이터들/test.csv').astype(str)
x_test = test_df['comments'].tolist()

# NOTE(review): not used by this cell; kept because vocab_size is a notebook-level
# global that other cells read.
vocab_size = 30000

# Encode with the tokenizer that was fitted on the TRAINING comments. Fitting a
# fresh Tokenizer on the test text (as this cell used to) assigns different
# integer ids to the same words, so the trained embedding would look up the
# wrong rows. Pad to the training length for the same reason.
X_test_encoded = tokenizer.texts_to_sequences(x_test)
X_test_padded = pad_sequences(X_test_encoded, maxlen = max_len)

# NOTE(review): `prediction` was assigned to the frame without ever being
# computed. It is assumed to be the thresholded output of the checkpointed
# CNN — confirm the model and the 0.5 cut-off against the original run.
cnn_model = load_model('best_model.h5')
prediction = (cnn_model.predict(X_test_padded) > 0.5).astype(int).ravel()

test_df['label'] = prediction
test_df = test_df.drop(columns = 'Unnamed: 0')

In [None]:


texts = test_df['comments'].tolist()


def penalty(x):
    """Sentence penalty for krwordrank: 0 for 10-150 characters, otherwise 1."""
    return 0 if (10 <= len(x) <= 150) else 1


stopwords = {'너무', '리뷰', '진짜', '기자님', '정말', '많이', '영상', '보고', 
             '이번', '같은', '그냥', '시승기', '보면', '하는', '김한용', '근데', 
             '아니', '다른', '역시', '항상', '한상기', '라이', '있는', '생각', 
             '정도', '나오', '지금', '같아', '이제', '있습니다', '요즘', '그리고',
            '어떤', '이쁘', '보이', '보니', '때문에', '솔직히', '뭔가', '설명',
            '같네요', '하고', '감사', '특히', '봤습니', '정도', '차가', '차를', 
             '엄청', '이런', '보는', '훨씬', '까요', '아닌', '20', '현대', '벤츠',
             '없는', '현기', '한국', '그런', '10', '아반떼', '운전', '하나', '저도',
             '저는', '이거', '소리', '이상', '그래', '차는', '있어', '이렇게', '아직',
             '들어', '만들', '타는', '합니다', '우리', '차이' ,'일본', '아우디', '볼보', 
            '갑니다', '어떻게', '다시', '구매', '제가', '가장', '조금', '하면', '미국',
             '국내', '한번', '기아', '신형', '그랜저', '쌍용', '30', '거의', '하지', 'the',
             '스포', '사는', '많은', '봤습니다', '없어', '바로', '되는', '혹시', '계속',
             '확실히', '같네', '텐데', '건가', '그렇', '오늘', '무슨', '국산', '해도', '없고',
            '광고', '대한', '아주', '싶네요', '부분', '얼마나', '제일',' 아무리', '궁금', '모르',
            '개인', '아무리', '타고', '것도', '나온'}

keywords, sents = summarize_with_sentences(
    texts,
    penalty=penalty,
    stopwords = stopwords,
    diversity=0.5,
    num_keywords=100,
    num_keysents=10, # number of key sentences to extract from the keyword analysis
    verbose=False
)

# Show the 30 highest-ranked keywords.
for word, r in sorted(keywords.items(), key = lambda x:x[1], reverse=True)[:30]:
    print('%8s:\t%.4f' % (word, r))

# new_label used to be (re)initialised inside the print loop above, so it was
# undefined whenever `keywords` came back empty. Build it once, with a set for
# O(1) membership tests instead of scanning the list per comment.
# NOTE(review): sample['comments'] was populated from checked_list, so unless
# checked_list is replaced by a filtered list in between, every label here is 0.
checked_set = set(checked_list)
new_label = [0 if comment in checked_set else 1 for comment in sample['comments']]

sonata['new_label'] = new_label
sonata['new_label'].value_counts()

In [None]:
car_df = pd.read_csv('/Users/parkjubro/Desktop/파이널/데이터들/preprocessed_kb_0518.csv')
df2 = car_df.drop_duplicates() # remove duplicate rows
# Keep only the influential variables: car model, price, model year, years of
# use, mileage, car type and accident history; then renumber the rows.
df3 = df2[['car_name','depreciation','year','use','mileage','car_type','insurance']].reset_index(drop=True)

# One-hot encode car type, car model and accident history.
# NOTE(review): this line used to read `df4`, which no cell defines; df3 is the
# latest frame in the visible lineage. If a df3 -> df4 step (e.g. outlier
# removal) was lost, restore it here.
df5 = pd.get_dummies(data = df3, columns = ['car_type', 'car_name', 'insurance'])

std = StandardScaler()

# NOTE(review): train_scaled is not used by the models below, and it is fitted
# on every column including the 'depreciation' target.
train_scaled = std.fit_transform(df5)

x_data = df5.drop(columns = 'depreciation')
y_data = df5['depreciation']

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2 , random_state = 0)

linear = LinearRegression()
linear.fit(x_train, y_train)
rfr = RandomForestRegressor(random_state = 0)
rfr.fit(x_train, y_train)

# R^2 on train, 3-fold CV on train, and R^2 on the held-out test split.
print(linear.score(x_train, y_train))
print(cross_val_score(linear, x_train, y_train, cv = 3))
print(linear.score(x_test, y_test))
print(rfr.score(x_train, y_train))
print(cross_val_score(rfr, x_train, y_train, cv = 3))
print(rfr.score(x_test, y_test))

In [None]:
# Model with the numeric variables only.
# After the drop, kb is expected to contain use, mileage, year, new_price,
# depreciation, forecast_min, forecast_max and car_cc.
# NOTE(review): `kb` is not created by any cell visible above — confirm where it is loaded.
columns_to_drop = [
    'price', 'trans', 'loss', 'flood', 'usage', 'change', 'insurance',
    'sales_corp', 'sales_loca', 'options', 'car_area', 'car_no', 'car_brand',
    'car_name', 'name_datailed', 'fuel', 'car_type', 'color',
]

# The target must be captured before 'price' is removed from the frame.
y = kb[['price']].to_numpy()
kb = kb.drop(columns=columns_to_drop)

In [None]:
# Split into train and test data.
# `x` was read here without being assigned by any earlier cell; the previous
# cell leaves the numeric-only feature frame in `kb` and the target in `y`.
x = kb
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.85, random_state=1)

# The `normalize` argument was removed from LinearRegression in scikit-learn
# 1.2 and now raises TypeError. For an unregularised least-squares fit the
# predictions do not depend on it, so it is simply dropped.
lr = LinearRegression(fit_intercept = True, copy_X=True)
lr.fit(x_train, y_train)

gb = GradientBoostingRegressor(min_samples_leaf=10, min_samples_split=5,learning_rate=0.5,max_depth=3, n_estimators=1000)
# y is an (n, 1) array; flatten it so the regressor receives a 1-D target.
gb.fit(x_train, y_train.ravel())

# Multiple regression.
lm = LinearRegression()
# 'price' was dropped from kb in the previous cell, so kb['price'] raised
# KeyError here; reuse the target captured in `y` (same row order as kb).
price = y.ravel()
X = kb[['mileage']]
Y = price
Z = kb[['year', 'use', 'depreciation', 'new_price']]
lm.fit(X,Y)  # NOTE(review): this single-feature fit is overwritten by the next line
lm.fit(Z, price)
Y = lm.predict(Z)

In [None]:
# NAVER movie review data (NSMC).
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

train_data_movie = pd.read_table('ratings_train.txt')
test_data_movie = pd.read_table('ratings_test.txt')

# NAVER shopping review data.
urllib.request.urlretrieve("https://raw.githubusercontent.com/bab2min/corpus/master/sentiment/naver_shopping.txt", filename="ratings_total.txt")

# The downloaded file was never read, leaving total_data undefined.
# NOTE(review): assumes a header-less, tab-separated "rating<TAB>review" file;
# the column names match the `ratings` / `reviews` references below — confirm.
total_data = pd.read_table('ratings_total.txt', names=['ratings', 'reviews'])
total_data = total_data.rename(columns={'reviews':'document'})
# Ratings above 3 count as positive (1), everything else as negative (0).
total_data['label'] = np.select([total_data.ratings > 3], [1], default=0)

# train_data_shopping / test_data_shopping were used without being created.
# NOTE(review): the 75/25 split and the seed are assumptions — confirm.
train_data_shopping, test_data_shopping = train_test_split(total_data, test_size=0.25, random_state=42)

# Combine both corpora. reset_index(drop=True) renumbers the rows; previously
# the train reset was discarded and the test reset added a stray 'index' column.
train_data = pd.concat([train_data_movie,train_data_shopping]).reset_index(drop=True)

# Combine the test data the same way.
test_data = pd.concat([test_data_movie,test_data_shopping]).reset_index(drop=True)

def _clean_documents(frame):
    """Return a cleaned copy of ``frame`` based on its 'document' column.

    Duplicate documents are removed, every character other than ASCII letters,
    digits, Hangul and spaces is stripped, leading spaces are removed, and rows
    whose document ends up empty or missing are dropped.
    """
    cleaned = frame.drop_duplicates(subset=['document']).copy()
    # regex=True is explicit: since pandas 2.0 Series.str.replace treats the
    # pattern as a literal string by default, which silently disabled both
    # substitutions. Assigning the result also avoids the chained
    # `inplace=True` replace on a column.
    cleaned['document'] = (
        cleaned['document']
        .str.replace(r"[^A-Za-z0-9ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
        .str.replace(r"^ +", "", regex=True)
        .replace('', np.nan)
    )
    return cleaned.dropna(subset=['document'])


# Same cleaning for both splits.
train_data = _clean_documents(train_data)
test_data = _clean_documents(test_data)

# Particles and other function words that are dropped after tokenisation.
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

okt = Okt()

# Stemmed morpheme tokens per training review, with stopwords removed.
X_train = [
    [token for token in okt.morphs(document, stem=True) if token not in stopwords]
    for document in tqdm(train_data['document'])
]

# Same treatment for the test reviews.
X_test = [
    [token for token in okt.morphs(document, stem=True) if token not in stopwords]
    for document in tqdm(test_data['document'])
]

# First pass: fit on the full vocabulary to collect word frequencies.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

threshold = 3
frequencies = list(tokenizer.word_counts.values())
rare_frequencies = [freq for freq in frequencies if freq < threshold]

total_cnt = len(tokenizer.word_index)  # number of distinct words
rare_cnt = len(rare_frequencies)       # words that occur fewer than `threshold` times
total_freq = sum(frequencies)          # total number of word occurrences in the training data
rare_freq = sum(rare_frequencies)      # occurrences that belong to the rare words

# Drop the words seen at most twice; +1 accounts for padding index 0.
vocab_size = total_cnt - rare_cnt + 1

# Second pass: refit with the vocabulary capped, then integer-encode both splits.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)


y_train = np.array(train_data['label'])
y_test = np.array(test_data['label'])

# Positions of training samples that have no tokens left after encoding.
drop_train = [index for index, sentence in enumerate(X_train) if len(sentence) < 1]

# Remove the empty samples. X_train is a ragged list of lists, and np.delete
# first converts its input to an array, which raises ValueError for ragged
# input on NumPy >= 1.24 — so the list is filtered directly. The labels form a
# regular 1-D array and can still use np.delete.
drop_positions = set(drop_train)
X_train = [sentence for index, sentence in enumerate(X_train) if index not in drop_positions]
y_train = np.delete(y_train, drop_train, axis=0)

def below_threshold_len(max_len, nested_list):
  """Report what share of the samples is at most ``max_len`` items long.

  The count used to be computed and then discarded (nothing was returned or
  printed), so calling the function had no observable effect.

  Args:
      max_len: Length limit, inclusive.
      nested_list: Sized collection of sized samples (e.g. token-id lists).

  Returns:
      float: Percentage (0-100) of samples with ``len(sample) <= max_len``;
      ``0.0`` when ``nested_list`` is empty.
  """
  total = len(nested_list)
  if total == 0:
    return 0.0
  count = sum(1 for sentence in nested_list if len(sentence) <= max_len)
  ratio = count / total * 100
  print('samples with length <= %s: %.2f%%' % (max_len, ratio))
  return ratio

# Fixed sequence length for the LSTM input.
max_len = 30
below_threshold_len(max_len, X_train)

# Pad / truncate both splits to max_len.
X_train, X_test = (
    pad_sequences(sequences, maxlen=max_len) for sequences in (X_train, X_test)
)

In [None]:
embedding_dim = 100
hidden_units = 128

# Embedding -> LSTM -> sigmoid binary sentiment classifier.
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    LSTM(hidden_units),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

# Stop after 4 epochs without validation-loss improvement, and keep the
# weights with the best validation accuracy in best_model.h5 (the same file
# name the CNN cell above writes to).
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

history = model.fit(
    X_train,
    y_train,
    epochs=15,
    callbacks=[es, mc],
    batch_size=64,
    validation_split=0.2,
)

In [None]:
# Reload the checkpointed weights and report accuracy on the held-out reviews.
loaded_model = load_model('best_model.h5')
evaluation = loaded_model.evaluate(X_test, y_test)
print("\n 테스트 정확도: %.4f" % (evaluation[1]))

In [None]:
def sentiment_predict(new_sentence):
  """Classify one sentence as positive (1) or negative (0).

  The sentence is cleaned, tokenised with the module-level ``okt``, stripped of
  the module-level ``stopwords``, encoded with the module-level ``tokenizer``,
  padded to ``max_len`` and scored by ``loaded_model``. Those globals must be
  the ones produced by the training cells of the loaded model.

  Args:
      new_sentence: Raw sentence text.

  Returns:
      int: 1 when the model score is above 0.5, otherwise 0.
  """
  cleaned = re.sub(r'[^A-Za-z0-9ㄱ-ㅎㅏ-ㅣ가-힣 ]','', new_sentence)
  tokens = [word for word in okt.morphs(cleaned, stem=True) if word not in stopwords]  # tokenize, drop stopwords
  encoded = tokenizer.texts_to_sequences([tokens])  # integer encoding
  pad_new = pad_sequences(encoded, maxlen = max_len)  # padding
  # predict() returns a 2-D array for the single sample. Index the scalar out
  # explicitly: float() on an array with ndim > 0 is deprecated since NumPy
  # 1.25. verbose=0 keeps per-call progress bars out of the notebook output
  # when this runs once per comment.
  score = float(loaded_model.predict(pad_new, verbose=0)[0][0])
  # Returned directly so no local shadows the sibling labeling() function.
  return 1 if score > 0.5 else 0

In [None]:
def labeling(df,df_col):
    """Attach a sentiment label to every comment and clean the comment text.

    Args:
        df: DataFrame with a 'comments' column. It is modified in place: a
            'labeling' column is added and 'comments' is stripped of every
            character other than ASCII letters, digits, Hangul and spaces.
        df_col: The comments to classify, one per row of ``df`` and in the
            same order (normally ``df['comments']``).

    Returns:
        The same DataFrame. Previously only the 'comments' Series was
        returned, so the caller's ``youtube_df = labeling(...)`` replaced the
        frame with a Series and lost the new 'labeling' column.
    """
    # Build the whole column at once. The old `df['labeling'][idx] = ...` was
    # chained assignment, which pandas may apply to a temporary copy.
    df['labeling'] = [sentiment_predict(comment) for comment in df_col]

    # regex=True is explicit: pandas >= 2.0 treats the pattern literally by default.
    df['comments'] = df['comments'].str.replace(r"[^A-Za-z0-9ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)

    return df

In [None]:
# Run the sentiment labelling over the YouTube comments.
# NOTE(review): youtube_df is not created by any cell visible above — confirm where it is loaded.
youtube_df = labeling(youtube_df, youtube_df['comments'])

In [None]:
def filtering(df_col, limit=100):
  """Spell-check comments and extract their nouns with Hannanum.

  Args:
      df_col: Sliceable collection of comment strings (e.g. a Series).
      limit: How many leading comments to process. The default of 100 matches
          the slice that used to be hard-coded; pass ``None`` to process all.

  Returns:
      list: One list of nouns per processed comment. Stopwords, one-character
      words and words containing 'ㅋ' are excluded from every comment.
      Previously such words were only appended to the stopword list after the
      comment had already been stored, so their first occurrence leaked into
      the result.
  """
  comments = df_col if limit is None else df_col[:limit]

  hanspell_sent_lst = []
  for comment in comments:
    try:
      # Spell check; .checked holds the corrected text (including spacing).
      hanspell_sent_lst.append(spell_checker.check(comment).checked)
    except Exception as exc:
      # Same best-effort fallback as preprocessing1: keep the unchecked text.
      print('spell check failed: %r' % (exc,))
      hanspell_sent_lst.append(comment)

  # Stopwords (a set: membership is tested for every noun).
  stopwords = {'의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다'}

  han = Hannanum()

  han_nouns = []
  for sentence in hanspell_sent_lst:
    han_nouns.append([
      word for word in han.nouns(sentence)
      if word not in stopwords and len(word) > 1 and 'ㅋ' not in word
    ])

  return han_nouns

In [None]:
# Extract the noun lists from the YouTube comments for the word cloud below.
youtube_review_han_nouns = filtering(youtube_df['comments'])

In [None]:
# Flatten the per-comment noun lists. A comprehension is linear, whereas
# sum(list_of_lists, []) copies the growing accumulator on every step.
word_list = [word for nouns in youtube_review_han_nouns for word in nouns]
count = Counter(word_list)
word_count = dict(count.most_common())

# To use a local font, set `fontpath` to the font file path.
# NOTE(review): fontpath is not defined in any cell visible above.
wc = WordCloud(font_path=fontpath, background_color = 'white',colormap=matplotlib.cm.inferno,  max_words=100, width=800, height=800, prefer_horizontal = True)
cloud = wc.fit_words(word_count)
cloud.to_image()

In [None]:
# Columns removed from the feature matrix: the target plus the listed descriptive fields.
non_feature_columns = [
    'price', 'car_area', 'car_no', 'car_brand', 'car_name', 'name_datailed',
    'fuel', 'car_type', 'color', 'trans', 'loss', 'flood', 'usage',
    'insurance', 'sales_corp', 'sales_loca', 'options',
]
x = car_df.drop(columns=non_feature_columns)
y = car_df[['price']]

# 80/20 split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=11
)

In [None]:
# The `normalize` argument was removed from LinearRegression in scikit-learn
# 1.2 and now raises TypeError. For an unregularised least-squares fit the
# predictions (and therefore the scores below) do not depend on it.
lr = LinearRegression(fit_intercept=True, copy_X = True)
lr.fit(x_train, y_train)
y_predict = lr.predict(x_test)
# .score() is R^2, so the last two lines print the same test metric.
print('LinearRegression trian 정확도 :', lr.score(x_train, y_train))
print('LinearRegression test 정확도:', lr.score(x_test, y_test))
print('LinearRegression test 정확도:', r2_score(y_test, y_predict))

In [None]:
# Mean absolute error of the linear-regression predictions on the test split.
mean_absolute_error(y_test, y_predict)

In [None]:
# Gradient-boosting regressor on the same split.
gb_params = dict(
    min_samples_leaf=10,
    min_samples_split=5,
    learning_rate=0.5,
    max_depth=3,
    n_estimators=1000,
)
gb = GradientBoostingRegressor(**gb_params)
gb.fit(x_train, y_train)
y_gb_predict = gb.predict(x_test)

# .score() is R^2, so the last two lines report the same test metric.
print('gb train 정확도:', gb.score(x_train, y_train))
print('gb test 정확도:', gb.score(x_test, y_test))
print('gb test 정확도:', r2_score(y_test, y_gb_predict))

In [None]:
# Mean absolute error of the gradient-boosting predictions on the test split.
mean_absolute_error(y_test, y_gb_predict)

In [None]:
# Random-forest regressor (despite the `_clf` name) with 50 trees and a fixed seed.
rf_clf = RandomForestRegressor(n_estimators=50, random_state=42)
rf_clf.fit(x_train, y_train)
pred = rf_clf.predict(x_test)

# .score() is R^2, so the last two lines report the same test metric.
train_r2 = rf_clf.score(x_train, y_train)
test_r2 = rf_clf.score(x_test, y_test)
print('RandomForest train 정확도:', train_r2)
print('RandomForest test 정확도:', test_r2)
print('RandomForest test 정확도:', r2_score(y_test, pred))

In [None]:
# Mean absolute error of the random-forest predictions on the test split.
mean_absolute_error(y_test, pred)

In [None]:
# XGBoost regressor on the same split.
xgb_model = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)

xgb_model.fit(x_train,y_train)
predictions = xgb_model.predict(x_test)

r_sq = xgb_model.score(x_train, y_train)
print('XBoost train 정확도: ',r_sq)
# explained_variance_score expects (y_true, y_pred). The arguments were
# swapped, and the metric is not symmetric, so the reported value was wrong.
# NOTE(review): the other models report R^2 on the test split; this cell
# reports explained variance — consider r2_score for comparability.
print('XBoost test 정확도: ',explained_variance_score(y_test, predictions))

In [None]:
# Mean absolute error of the XGBoost predictions on the test split.
mean_absolute_error(y_test, predictions)