In [279]:
import numpy as np
import pandas as pd

from time import process_time
import time

import datetime 
import arrow

import sys
import os
import re 
import math

import tensorflow as tf
from scipy.linalg import norm

In [280]:
import matplotlib.pyplot as plt
plt.style.use('seaborn-darkgrid')
plt.rcParams["axes.unicode_minus"]=False #用來正常顯示負號

from matplotlib.font_manager import FontProperties
myfont = FontProperties(fname="C:/Users/s0970/python/113碩二上讀書會/吳蒨芸/TaipeiSansTCBeta-Regular.ttf")

import seaborn as sns
sns.set(font="Microsoft JhengHei") #正常顯示中文

In [281]:
from nltk.tokenize import RegexpTokenizer #正則化斷詞
from nltk.tokenize import word_tokenize #斷詞

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.word2vec import Word2Vec

from sklearn.model_selection import train_test_split
from sklearn import metrics 
from sklearn.metrics import confusion_matrix

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

from imblearn.over_sampling import SMOTE

In [282]:
iceland = pd.read_csv("iceland_real.csv",index_col=0)
greece = pd.read_csv("greece_real.csv",index_col=0)

In [277]:
len(iceland["name"].value_counts())

84

In [278]:
len(greece["name"].value_counts())

70

In [274]:
iceland["reviews_clear"] = iceland["reviews_clear"].apply(lambda x : np.str(x))

In [406]:
greece["reviews_clear"] = greece["reviews_clear"].apply(lambda x : np.str(x))

In [76]:
cbow = Word2Vec.load('cbow_iceland.model')

In [396]:
skip = Word2Vec.load('skip_greece.model')

## 算向量

In [401]:
def vector_similarity(s1, s2):
    def sentence_vector(s):
        v = np.zeros(1000)
        for word in word_tokenize(s):
            try:
                v += cbow.wv[word.lower()]
            except:
                continue
        v /= len(s)
        return v

    v1, v2 = sentence_vector(s1), sentence_vector(s2)
    return np.dot(v1, v2) / (norm(v1) * norm(v2))

In [408]:
def vector_similarity2(s1, s2):
    def sentence_vector(s):
        v = np.zeros(1000)
        for word in word_tokenize(s):
            try:
                v += skip.wv[word.lower()]
            except:
                continue
        v /= len(s)
        return v

    v1, v2 = sentence_vector(s1), sentence_vector(s2)
    return np.dot(v1, v2) / (norm(v1) * norm(v2))

In [405]:
vector_similarity2("hotel", "room")

0.12808228886087877

In [415]:
vector_similarity2("hotel", re.sub("\n","",greece.loc[(greece.sentiment==1)&(greece["name"]=="Metropolis Hotel（大都市酒店）"),:].translate_reviews[147]))

0.26757553085193997

## 特徵字

In [409]:
token_re = RegexpTokenizer(r'[a-zA-Z0-9]+')

def keyword_20_1(text):
    tfc = TfidfVectorizer(max_features=20,stop_words="english",max_df=0.3,ngram_range = (1,1),tokenizer = token_re.tokenize,smooth_idf=True, norm='l2')
    text_tfc= tfc.fit_transform(text)
    
    return tfc.vocabulary_.keys()

def keyword_20_2(text):
    tfc = TfidfVectorizer(max_features=20,stop_words="english",max_df=0.3,ngram_range = (2,2),tokenizer = token_re.tokenize,smooth_idf=True, norm='l2')
    text_tfc= tfc.fit_transform(text)
    
    return tfc.vocabulary_.keys()

def keyword_50_1(text):
    tfc = TfidfVectorizer(max_features=50,stop_words="english",max_df=0.3,ngram_range = (1,1),tokenizer = token_re.tokenize,smooth_idf=True, norm='l2')
    text_tfc= tfc.fit_transform(text)
    
    return tfc.vocabulary_.keys()

def keyword_50_2(text):
    tfc = TfidfVectorizer(max_features=50,stop_words="english",max_df=0.3,ngram_range = (2,2),tokenizer = token_re.tokenize,smooth_idf=True, norm='l2')
    text_tfc= tfc.fit_transform(text)
    
    return tfc.vocabulary_.keys()

def keyword_50_23(text):
    tfc = TfidfVectorizer(max_features=50,stop_words="english",max_df=0.3,ngram_range = (2,3),tokenizer = token_re.tokenize,smooth_idf=True, norm='l2')
    text_tfc= tfc.fit_transform(text)
    
    return tfc.vocabulary_.keys()

In [410]:
def hotel_keyword(country_data,hotel_name):
    
    df = country_data.loc[country_data["name"]==hotel_name,:].reviews_clear
    df_n = country_data.loc[(country_data["name"]==hotel_name)&(country_data.sentiment==1),:].reviews_clear
    df_p = country_data.loc[(country_data["name"]==hotel_name)&(country_data.sentiment==0),:].reviews_clear

    def cond_entropy(word):
        
        count = 0
        for text in df:
            if word in str(text):
                count+=1
                
        count_1 = 0
        for text in df_n:
            if word in str(text):
                count_1+=1
        
        count_2 = 0
        for text in df_p:
            if word in str(text):
                count_2+=1

        n1 = len(df_n)
        n2 = len(df_p)

        p1 = (count_1/n1)/(count_1/n1+count_2/n2)
        p2 = (count_2/n2)/(count_1/n1+count_2/n2)

        ans =  abs (- (p1 * math.log(p1+0.0000000001,2) + p2 * math.log(p2+0.0000000001,2)))

        return ans

    mean_keypoint = {}
    for i in keyword_50_23(df_n):
        try:
            mean_keypoint.update({i:round(cond_entropy((i)),5)})
        except:
            continue

    sorted(mean_keypoint.items(), key = lambda x : x[1] )


    return list(dict(sorted(mean_keypoint.items(), key = lambda x : x[1])).keys())[:30]

    

## 每個旅館得分

In [416]:
def Recommended_hotel(hotel_list,negative_reviews):

    hotel_score = {}
    for name in hotel_list:

        try:
            strings = hotel_keyword(greece,name)
            target = negative_reviews 

            score = {}
            for string in strings:
                score.update({string:vector_similarity2(string, target)})

            score_mean = np.mean([i[1] for i in list(sorted(score.items(), key = lambda x : x[1],reverse=True))[:5]])

            hotel_score.update({name:score_mean})
        except:
            continue
        
    return sorted(hotel_score.items(), key = lambda x : x[1] )

### 冰島

In [381]:
re.sub("\n","",iceland.loc[(iceland.sentiment==1)&(iceland["name"]=="Hotel Kjarnalundur（賈爾納倫杜酒店）"),:].translate_reviews[7968])

'When we arrived they gave us two rooms. One of them was not cleaned from the previous client. After that, we complained to the night swift receptionist and she gave us another room which was underground. The second room was totally different from the first one, there were two single beds instead of one double ( which we asked when we booked the rooms), the room was full of flies and the wifi connection was weak! !  The worst part of our experience was the rude behaviour of the receptionist. When we complained about the room condition she told us "go away". Unfortunately the behaviour was totally unprofessional. I strongly not recommend the hotel not for the flies neither for the dirty room as mistakes happen  but for the rude behavior.'

In [382]:
n_list = ["The room is dirty","The hotel is very good and highly recommended","Bad wifi signal","There is a rude receptionist","The location of this hotel is good","The weather today is good","Breakfast is really bad"]

score = {}
for string in n_list:
    print(vector_similarity(string, iceland.loc[(iceland.sentiment==1)&(iceland["name"]=="Hotel Kjarnalundur（賈爾納倫杜酒店）"),:].translate_reviews[7968]))


0.23143411523420696
-0.15019424415830568
0.06375268754530987
0.28184846590156454
-0.021777759775452717
-0.07548941166533191
-0.07792186026741661


In [383]:
Recommended_hotel(iceland["name"].value_counts().keys(),re.sub("\n","",iceland.loc[(iceland.sentiment==1)&(iceland["name"]=="Hotel Kjarnalundur（賈爾納倫杜酒店）"),:].translate_reviews[7968]))

  return np.dot(v1, v2) / (norm(v1) * norm(v2))


[('Hótel Kría（科里亞酒店）', 0.05993370986338185),
 ('Eyja Guldsmeden Hotel（艾加古斯米登酒店）', 0.06719243757300408),
 ('Midgardur by Center Hotels', 0.09090616478287031),
 ('Reykjavik Residence Apartment Hotel（雷克雅未克公寓酒店）', 0.09829398531618319),
 ('Icelandair Hotel Myvatn（米湖冰島之空酒店）', 0.09986855384373382),
 ('Hali Country Hotel（哈利鄉村酒店）', 0.10310890344094152),
 ('Fosshotel Glacier Lagoon（冰河潟湖福斯酒店）', 0.10336931477076339),
 ('Eric the Red Guesthouse（埃里克紅旅館）', 0.10487298258358058),
 ('Guesthouse Carina（卡瑞納旅館）', 0.1100183986846102),
 ('Lilja Guesthouse（麗杰旅館）', 0.11315874837534427),
 ('BB Hótel by Reykjavik Keflavik Airport', 0.12934515398824545),
 ('The Barn', 0.15821502679147503),
 ('ODDSSON Hotel', 0.16226036328651017),
 ('Aurora Hotel at Reykjavik-Keflavik Airport Terminal KEF',
  0.16860603701959714),
 ('Center Hotels Plaza', nan),
 ('Nupan Deluxe（豪華努潘住宿加早餐旅館）', 0.09645825947041427),
 ('Hótel Laxá（拉夏酒店）', 0.11497367970406266),
 ('Icelandair Hotel Reykjavik Natura（雷克雅未克納圖拉冰島航空酒店）', 0.1208440093314759),

### 希臘

In [412]:
greece.loc[(greece.sentiment==1)&(greece["name"]=="Metropolis Hotel（大都市酒店）"),:].translate_reviews[147]

'Clean Very small rooms and shower room \nFamily room with 3 beds given so had even less room. No breakfast due to covid and staff shortage'

In [413]:
re.sub("\n","",greece.loc[(greece.sentiment==1)&(greece["name"]=="Metropolis Hotel（大都市酒店）"),:].translate_reviews[147])

'Clean Very small rooms and shower room Family room with 3 beds given so had even less room. No breakfast due to covid and staff shortage'

In [417]:
Recommended_hotel(greece["name"].value_counts().keys(),re.sub("\n","",greece.loc[(greece.sentiment==1)&(greece["name"]=="Metropolis Hotel（大都市酒店）"),:].translate_reviews[147]))

  return np.dot(v1, v2) / (norm(v1) * norm(v2))


[('Athens Hawks', 0.2751253206337004),
 ('Athinaiko Hotel（阿西恩尼克酒店）', 0.2831278183561404),
 ('The Athens Gate Hotel（雅典門酒店）', 0.2842053500925553),
 ('Minoa Athens Hotel（雅典彌諾阿酒店）', 0.29753424000272377),
 ('Centrotel Hotel（薩特洛泰酒店）', 0.2996051224202073),
 ('Babis Hotel（巴比酒店）', 0.30310625553214565),
 ('Evripides Hotel（艾弗瑞派德酒店）', 0.313341780411199),
 ('Hotel Fresh（弗萊士酒店）', 0.3223388182313867),
 ("Adam's Hotel（亞當酒店）", 0.32247320927331624),
 ('Infinity City Boutique Hotel（無限城精品酒店）', 0.3274004804878092),
 ('Metropolis Hotel（大都市酒店）', 0.3367421323613418),
 ('Athens Studios（雅典一室公寓）', 0.3426597255604416),
 ('Cecil Hotel（塞西爾酒店）', 0.3451081573811564),
 ('Plaka Hotel（普拉卡酒店）', 0.349030467561228),
 ('Nea Metropolis（尼梅特波利斯酒店）', 0.3490803931783789),
 ('Hotel London（倫敦酒店）', 0.3503594087269247),
 ('Hotel Lozenge（菱形酒店）', 0.35267514679439105),
 ('Moxy Patra Marina', 0.356414215986862),
 ('Athens21', 0.3576697261582492),
 ('Attalos Hotel（阿塔洛斯酒店）', 0.35786931368292696),
 ('Amazon Hotel', 0.3628661567922961),
 ('

In [395]:
re.sub("\n","",greece.loc[(greece.sentiment==1)&(greece["name"]=="Metropolis Hotel（大都市酒店）"),:].translate_reviews[147])

'Clean Very small rooms and shower room Family room with 3 beds given so had even less room. No breakfast due to covid and staff shortage'