## Unsupervised learning & Recommendation System

The main idea behind unsupervised learning is that you don’t give any previous assumptions and definitions to the model about the outcome of variables you feed into it — you simply insert the data (of course preprocessed before), and want the model to learn the structure of the data itself.

In this case, the customer reviews do not come with ratings, so we build an unsupervised learning model.

In [14]:
import pandas as pd
import numpy as np
import re
from re import sub
import multiprocessing
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from time import time 

## Data Processing

In [50]:
reviews= pd.read_csv('reviews.csv')

In [51]:
reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,1178162,4724140,2013-05-21,4298113,Olivier,My stay at islam's place was really cool! Good...
1,1178162,4869189,2013-05-29,6452964,Charlotte,Great location for both airport and city - gre...
2,1178162,5003196,2013-06-06,6449554,Sebastian,We really enjoyed our stay at Islams house. Fr...
3,1178162,5150351,2013-06-15,2215611,Marine,The room was nice and clean and so were the co...
4,1178162,5171140,2013-06-16,6848427,Andrew,Great location. Just 5 mins walk from the Airp...


In [52]:
def text_to_word_list(text):
    """Normalise one raw review into a list of lower-case tokens.

    Parameters
    ----------
    text : object
        The raw review. Anything that is not None/NaN is converted with
        ``str()`` before cleaning.

    Returns
    -------
    list of str
        The cleaned tokens. "+" becomes the word "plus", "!" and "?" are kept
        as standalone tokens, and commas, periods and apostrophes are dropped.
        A missing review (None or float NaN) yields an empty list.
    """
    # A missing comment would otherwise be stringified into the bogus token
    # "nan"; report "no tokens" instead.
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = str(text).lower()

    # Every character outside the allowed set (this includes ":") is turned
    # into a space here, so no separate ":" handling is needed afterwards.
    text = sub(r"[^A-Za-z0-9^,!?.\/'+]", " ", text)
    text = sub(r"\+", " plus ", text)
    text = sub(r",", " ", text)
    text = sub(r"\.", " ", text)
    text = sub(r"!", " ! ", text)
    text = sub(r"\?", " ? ", text)
    text = sub(r"'", " ", text)

    # split() without arguments already collapses runs of whitespace.
    return text.split()

In [53]:
# Reviewer names and dates are not needed for the model. Rebinding the result
# (instead of inplace=True) together with errors='ignore' keeps this cell safe
# to re-run: a second execution no longer raises KeyError.
reviews = reviews.drop(columns=['reviewer_name', 'date'], errors='ignore')

In [54]:
# Tokenise every review: each entry of `comments` becomes a list of tokens.
reviews['comments'] = reviews['comments'].apply(text_to_word_list)

## Word2Vec

In [55]:
reviews_model = reviews.copy()

In [56]:
# `comments` holds token lists, so `.str.len()` is the number of tokens.
# Keep only the reviews that contain more than one token.
has_multiple_tokens = reviews_model.comments.str.len() > 1
reviews_model = reviews_model[has_multiple_tokens]

In [57]:
# One token list per review, in the order of the DataFrame rows.
sent = reviews_model.comments.tolist()

# Detect token pairs over the whole corpus and wrap the result in a Phraser;
# applying it joins detected pairs with "_" (e.g. 'even_though' in the output).
phrases = Phrases(sent, min_count=1, progress_per=50000)
bigram = Phraser(phrases)
sentences = bigram[sent]

# Show the second review after phrase detection as a sanity check.
sentences[1]

['great',
 'location',
 'for',
 'both',
 'airport',
 'and',
 'city',
 'great',
 'amenities',
 'in',
 'the',
 'house',
 'plus',
 'islam',
 'was',
 'always',
 'very',
 'helpful',
 'even_though',
 'he',
 'was',
 'away']

In [58]:
# Leave one core free for the rest of the system, but never request zero
# worker threads: cpu_count() - 1 evaluates to 0 on a single-core machine.
n_workers = max(1, multiprocessing.cpu_count() - 1)

# Word2Vec hyper-parameters (gensim 3.x argument names, e.g. `size`).
w2v_model = Word2Vec(min_count=3,
                     window=4,
                     size=300,
                     sample=1e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=n_workers)
start = time()

# Build the vocabulary from the phrase-merged sentences before training.
w2v_model.build_vocab(sentences, progress_per=50000)

print('Time to build vocab: {} mins'.format(round((time() - start) / 60, 2)))

Time to build vocab: 0.27 mins


In [62]:
# Time the training run so readers know the cost of re-executing this cell.
start = time()

# Train for 30 epochs on the same sentences the vocabulary was built from;
# total_examples is taken from the model's own corpus_count.
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - start) / 60, 2)))

# NOTE(review): init_sims(replace=True) presumably normalises the vectors in
# place (gensim 3.x API), after which the model cannot be trained further -
# confirm against the installed gensim version.
w2v_model.init_sims(replace=True)

Time to train the model: 5.89 mins


In [63]:
w2v_model.save("word2vec.model")

## K-means Clustering

In [64]:
word_vectors = Word2Vec.load('word2vec.model').wv

In [65]:
# Split the word vectors into two clusters (intended as positive / negative).
# random_state expects an integer seed; the boolean True used previously is
# just the integer 1 in disguise, so the explicit seed 1 keeps the same
# clustering while making the intent readable.
model = KMeans(n_clusters=2, max_iter=1000, random_state=1, n_init=50).fit(X=word_vectors.vectors)

In [66]:
word_vectors.similar_by_vector(model.cluster_centers_[0], topn=10, restrict_vocab=None)

[('serious_cleaning', 0.8859721422195435),
 ('touristy_spots', 0.8813823461532593),
 ('maintained_daily', 0.879956841468811),
 ('jazmyne', 0.8755927681922913),
 ('margarita', 0.875381350517273),
 ('square_footage', 0.8750718832015991),
 ('aiming', 0.8669908046722412),
 ('becouse', 0.8666425347328186),
 ('quick_stopover', 0.8605995178222656),
 ('go_freely', 0.8604874610900879)]

In [67]:
# NOTE(review): cluster 0 is labelled "positive" and cluster 1 "negative" by
# assumption only; KMeans cluster numbering carries no meaning. Confirm by
# inspecting the words nearest to each centre before trusting the sign.
positive_cluster_center = model.cluster_centers_[0]
negative_cluster_center = model.cluster_centers_[1]

In [68]:
# One row per vocabulary word, with its embedding and its KMeans cluster.
words = pd.DataFrame({'words': list(word_vectors.vocab.keys())})

# `word_vectors` already is the KeyedVectors object, so index it directly;
# going through `.wv` again is deprecated and triggered a warning.
words['vectors'] = words.words.apply(lambda word: word_vectors[word])

# Predict all clusters in a single call instead of one call per word.
# The rows of the stacked matrix follow the row order of `words`.
words['cluster'] = model.predict(np.vstack(words.vectors.values))

  This is separate from the ipykernel package so we can avoid doing imports until


In [69]:
# Sign of the word: +1 for cluster 0, -1 otherwise.
# NOTE(review): this relies on cluster 0 being the positive cluster - confirm.
words['cluster_value'] = np.where(words.cluster == 0, 1, -1)

# Distance of every word to both cluster centres, computed in one call
# instead of one KMeans.transform call per row.
distances_to_centers = model.transform(np.vstack(words.vectors.values))

# Closeness is the inverse of the distance to the nearest centre, so words
# nearer to a centre get a larger weight.
words['closeness_score'] = 1 / distances_to_centers.min(axis=1)
words['sentiment_coeff'] = words.closeness_score * words.cluster_value

In [70]:
words[['words', 'sentiment_coeff']].to_csv('sentiment_dictionary.csv', index=False)

In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [72]:
# keep_default_na=False stops pandas from parsing vocabulary words such as
# "nan", "null", "na" or "n/a" as missing values; without it those words would
# end up as NaN keys and never match a token during the lookup.
sentiment_map = pd.read_csv('sentiment_dictionary.csv', keep_default_na=False)

# word -> signed sentiment coefficient
sentiment_dict = dict(zip(sentiment_map.words.values, sentiment_map.sentiment_coeff.values))

In [73]:
# Apply the bigram model to each token list and join the result back into a
# single space-separated string; `assign` returns a new frame, so
# `reviews_model` is left untouched.
reviews_output = reviews_model.assign(
    comments=reviews_model.comments.apply(lambda tokens: ' '.join(bigram[tokens]))
)

In [74]:
reviews_output.head()

Unnamed: 0,listing_id,id,reviewer_id,comments
0,1178162,4724140,4298113,my stay at islam s place was really cool ! goo...
1,1178162,4869189,6452964,great location for both airport and city great...
2,1178162,5003196,6449554,we really_enjoyed our stay at_islams house fro...
3,1178162,5150351,2215611,the room was nice and clean and so were the co...
4,1178162,5171140,6848427,great location just 5_mins walk from the airpo...


In [75]:
reviews_output.to_csv('review_last.csv',index=False)

## Prediction

In [76]:
final_file = pd.read_csv('review_last.csv')

In [77]:
# The comments are already space-separated tokens, so tokenise with a plain
# split; norm=None keeps the raw (un-normalised) tf-idf weights.
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
tfidf.fit(final_file.comments)

# get_feature_names() no longer exists in current scikit-learn releases, where
# get_feature_names_out() replaces it; use whichever the installed version has.
get_feature_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
features = pd.Series(get_feature_names())

# Sparse matrix with one row per review and one column per vocabulary term.
transformed = tfidf.transform(final_file.comments)

In [78]:
def create_tfidf_dictionary(x, transformed_file, features):
    """Map each term of one review to its tf-idf weight.

    Parameters
    ----------
    x : pandas.Series
        One row of the reviews frame. Only ``x.name`` is used, as the row
        position inside ``transformed_file`` (so the frame is expected to
        carry a default 0..n-1 index).
    transformed_file : scipy sparse matrix
        The tf-idf matrix, one row per review.
    features : pandas.Series
        Vocabulary terms, positioned by column index of ``transformed_file``.

    Returns
    -------
    dict
        ``{term: tf-idf weight}`` for the non-zero entries of that row.
    """
    row = transformed_file[x.name].tocoo()
    # Look the terms up in a separate array. The previous version overwrote
    # the sparse matrix's integer column-index array with strings, which
    # leaves the matrix object in an invalid state.
    terms = features.iloc[row.col].values
    return dict(zip(terms, row.data))

def replace_tfidf_words(x, transformed_file, features):
    """Replace every token of a review with its tf-idf weight.

    ``x`` is one row of the reviews frame; its ``comments`` string is split
    on whitespace and each token is looked up in the row's tf-idf dictionary.
    Returns the weights as a list, in token order.
    """
    weights_by_term = create_tfidf_dictionary(x, transformed_file, features)
    return [weights_by_term[token] for token in x.comments.split()]

In [79]:
# One list of tf-idf weights per review, aligned with the review's tokens.
replaced_tfidf_scores = final_file.apply(
    replace_tfidf_words, axis=1, args=(transformed, features)
)

In [80]:
def replace_sentiment_words(word, sentiment_dict):
    """Return the sentiment coefficient of ``word``.

    Words that are not present in ``sentiment_dict`` count as neutral and
    yield 0.
    """
    return sentiment_dict.get(word, 0)

In [81]:
# One list of sentiment coefficients per review, aligned with its tokens.
replaced_closeness_scores = final_file.comments.apply(
    lambda comment: [replace_sentiment_words(token, sentiment_dict) for token in comment.split()]
)

In [82]:
# Build the frame column by column. The previous list-of-Series constructor
# followed by .T forced every column (ids and scores included) to object dtype.
replacement_df = pd.DataFrame({
    'listing_id': final_file.listing_id,
    'id': final_file.id,
    'reviewer_id': final_file.reviewer_id,
    'comments': final_file.comments,
    'sentiment_coeff': replaced_closeness_scores,
    'tfidf_scores': replaced_tfidf_scores,
})

# Review score = dot product of per-token sentiment coefficients and tf-idf
# weights; both lists are aligned token by token.
replacement_df['sentiment_rate'] = [
    float(np.dot(coeffs, weights))
    for coeffs, weights in zip(replacement_df.sentiment_coeff, replacement_df.tfidf_scores)
]

# 1 when the overall score is positive, 0 otherwise.
replacement_df['prediction'] = (replacement_df.sentiment_rate > 0).astype('int8')

In [83]:
replacement_df.head()

Unnamed: 0,listing_id,id,reviewer_id,comments,sentiment_coeff,tfidf_scores,sentiment_rate,prediction
0,1178162,4724140,4298113,my stay at islam s place was really cool ! goo...,"[1.0337125022087763, 1.082421966999573, 1.0456...","[2.739815972569313, 1.9027477633752095, 2.3949...",221.928332,1
1,1178162,4869189,6452964,great location for both airport and city great...,"[1.0548826082965892, 1.0478655139351276, 1.048...","[3.663483429427597, 2.1612853767894076, 1.8429...",78.096883,1
2,1178162,5003196,6449554,we really_enjoyed our stay at_islams house fro...,"[1.0387367460304435, 1.0662849657623812, 1.029...","[3.856259860314039, 4.848497120190661, 4.93764...",423.292134,1
3,1178162,5150351,2215611,the room was nice and clean and so were the co...,"[1.0450771403570074, 1.042101691305315, 1.0659...","[3.69634184583942, 2.524888895273744, 1.361725...",142.634225,1
4,1178162,5171140,6848427,great location just 5_mins walk from the airpo...,"[1.0548826082965892, 1.0478655139351276, 1.026...","[1.8317417147137984, 2.1612853767894076, 6.154...",78.088891,1


In [84]:
# replacement_df.to_csv('UnsupervisedSentimentAnalysis.csv')

## User-based collaborative recommendation

In [308]:
# Work on an independent copy that holds only the columns the recommender
# needs: which user scored which listing, and how.
score_columns = ['listing_id', 'reviewer_id', 'sentiment_rate']
review_score = replacement_df[score_columns].copy()
review_score.head()

Unnamed: 0,listing_id,reviewer_id,sentiment_rate
0,1178162,4298113,221.928332
1,1178162,6452964,78.096883
2,1178162,6449554,423.292134
3,1178162,2215611,142.634225
4,1178162,6848427,78.088891


In [309]:
listing_id = review_score['listing_id'].drop_duplicates()
len(listing_id)

2827

In [310]:
user_id = review_score['reviewer_id'].drop_duplicates()
len(user_id)

63367

Because there are too many users and properties in our review_score table, we keep only the users who have more than 5 records in the dataset; otherwise the data would be too sparse.

In [312]:
# Find the users who have more than 5 reviews.
# The aggregation counts the rows per reviewer; the result is indexed by
# reviewer_id and has a single column (also named reviewer_id) with the count.
a = review_score.groupby('reviewer_id').agg({'reviewer_id':'count'})
# Keep only reviewers with strictly more than 5 reviews.
a = a[a['reviewer_id']>5]
a.head()

Unnamed: 0_level_0,reviewer_id
reviewer_id,Unnamed: 1_level_1
51538,10
114538,6
1089634,11
1399007,6
1695789,8


In [313]:
# Get the review_score rows that belong to the frequent reviewers.
users = a.index

# Select all matching rows in one pass instead of looping over a hard-coded
# number of users and growing a DataFrame with concat. The stable sort keeps
# the previous layout: rows grouped by reviewer_id in ascending order, with
# the original row order preserved inside each group.
df = (
    review_score[review_score['reviewer_id'].isin(users)]
    .sort_values('reviewer_id', kind='mergesort')
    .reset_index(drop=True)
)

In [314]:
review_score = df.copy(deep = True)
review_score.shape

(628, 3)

In [315]:
listing_id = review_score['listing_id'].drop_duplicates()
len(listing_id)

278

In [316]:
user_id = review_score['reviewer_id'].drop_duplicates()
len(user_id)

67

In [317]:
# Build an all-zero table whose index is the user ids and whose columns are
# the property ids. The shape is derived from the id lists rather than being
# hard-coded, so it stays correct when the data or the filter changes.
df_user_listing = pd.DataFrame(
    np.zeros((len(user_id), len(listing_id))),
    index=user_id,
    columns=listing_id,
)
df_user_listing.head()

listing_id,5729845,4000384,6513924,4331214,3392423,3693850,31796,1544702,1391215,4090224,...,7181950,11223924,6134145,2898226,4573388,14843783,3615760,7086825,7880828,2843445
reviewer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
51538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
114538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1089634,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1399007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1695789,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [318]:
# Fill the table with the sentiment rate of each (user_id, listing_id) pair.
# Sizes come from the table itself instead of being hard-coded.
rows, columns = df_user_listing.shape

# When a user reviewed the same listing more than once, keep the first review,
# which is what the previous cell-by-cell lookup did.
first_scores = review_score.drop_duplicates(subset=['reviewer_id', 'listing_id'], keep='first')

# Reshape to users x listings in one step rather than scanning the whole
# review table once per cell; pairs without a review stay at 0.
score_matrix = first_scores.pivot(index='reviewer_id', columns='listing_id', values='sentiment_rate')
df_user_listing = (
    score_matrix
    .reindex(index=df_user_listing.index, columns=df_user_listing.columns)
    .fillna(0.0)
    .astype(float)
)

df_user_listing.head()

listing_id,5729845,4000384,6513924,4331214,3392423,3693850,31796,1544702,1391215,4090224,...,7181950,11223924,6134145,2898226,4573388,14843783,3615760,7086825,7880828,2843445
reviewer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
51538,58.385623,478.422061,538.146928,49.615487,225.045388,133.642223,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
114538,0.0,0.0,0.0,0.0,0.0,0.0,6.890452,44.184787,53.623242,213.883931,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1089634,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1399007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1695789,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Cosine similarity measures how different two individuals are by using the cosine of the angle between their two vectors in the vector space. The closer the cosine value is to 1, the more similar the two vectors are.

<br/>
<center>
<img src="cosine_similar.png" width="200"/>
</center>

In [319]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the rows (users) of df_user_listing.
# NOTE(review): listings a user never reviewed are encoded as 0 in that table,
# so they enter the similarity exactly like a score of zero.
user_similarity = cosine_similarity(df_user_listing,dense_output=True)
user_similarity

array([[1.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 1.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 1.       , ..., 0.       , 0.       ,
        0.       ],
       ...,
       [0.       , 0.       , 0.       , ..., 1.       , 0.4362887,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.4362887, 1.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        1.       ]])

In [320]:
user_similarity = pd.DataFrame(user_similarity, index = user_id, columns = user_id )
user_similarity.head()

reviewer_id,51538,114538,1089634,1399007,1695789,1812574,1911807,3230853,3751402,4491477,...,44445333,46024609,46567743,48896613,52667131,56146959,61606767,61663856,63239764,77129698
reviewer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
51538,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
114538,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043669,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1089634,0.0,0.0,1.0,0.0,0.070807,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1399007,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1695789,0.0,0.0,0.070807,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [343]:
def find_top5_similar_users(user_id, user_similarity):
    """Return the (up to) five users most similar to ``user_id``.

    Parameters
    ----------
    user_id : hashable
        Row label of the user in ``user_similarity``.
    user_similarity : pandas.DataFrame
        Square user-by-user similarity table, labelled with user ids on both
        axes.

    Returns
    -------
    pandas.Series
        Similarity scores of the most similar other users, indexed by user id
        and sorted from most to least similar.
    """
    similarity = user_similarity.loc[user_id]
    # Drop the user's own entry explicitly. Skipping the first sorted element
    # only works when the user is the unique maximum; another user with a
    # similarity of 1.0 could otherwise push the user into their own result.
    others = similarity.drop(user_id, errors='ignore')
    # A stable sort makes the outcome deterministic when scores are tied.
    return others.sort_values(ascending=False, kind='mergesort').iloc[:5]

In [347]:
# Use user 51538 as an example.
# Top 5 most similar users:
similar_users = find_top5_similar_users(51538,user_similarity).index

# For each similar user, collect the ids of the properties they reviewed with
# a positive sentiment rate. A loop replaces five copy-pasted lookups, so it
# works for any number of similar users.
positive_reviews = review_score[review_score['sentiment_rate'] > 0]
liked_by_similar_users = [
    positive_reviews.loc[positive_reviews['reviewer_id'] == similar_user, 'listing_id'].tolist()
    for similar_user in similar_users
]

# Concatenate the similar users' recommended properties, in user order.
# Duplicates are kept: a property liked by several users appears several times.
union = [listing for listings in liked_by_similar_users for listing in listings]

In [350]:
# Get the ids of the properties the input user has already reviewed.
user = review_score['listing_id'][review_score['reviewer_id']== 51538].values
already_reviewed = set(user.tolist())

# Keep the recommended properties the input user has not stayed at yet.
# Duplicates are preserved on purpose: converting `union` to a set first
# collapsed them, which made every recommendation count equal to 1.
rest = pd.Series([listing for listing in union if listing not in already_reviewed])

# Recommended property ids and how many times each was recommended.
rest.value_counts().sort_values()

6765855     1
3703674     1
2378421     1
7988755     1
6347026     1
7823025     1
1067184     1
12309083    1
8193344     1
4530670     1
11397511    1
4583526     1
1391492     1
750438      1
8067585     1
4223387     1
7956634     1
2564544     1
dtype: int64