In [None]:
"""
This file contains the code used to analyse reviews for property listings in Istanbul.
"""

In [None]:
# import libraries
import pandas as pd; import numpy as np; import matplotlib.pyplot as plt; import seaborn as sns; import re;
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.corpus import stopwords; from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from scipy.sparse import coo_matrix
from collections import Counter
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
import multiprocessing
from time import time

%matplotlib inline

In [None]:
# read in the prepared review data; low_memory=False makes pandas parse each column in one pass
# (instead of in chunks), which avoids mixed-dtype inference on large CSVs
df = pd.read_csv("../Data/DataPrepQ2.csv", low_memory=False)

In [None]:
# preview the first five rows to check the columns loaded as expected
df.head()

In [None]:
# summarise column dtypes and non-null counts
df.info()

In [None]:
# remove rows with empty comments (rebinds df, so every later cell works on the filtered frame)
df = df.dropna(subset=['comments'], axis=0)

In [None]:
# clean the reviewer comments
def clean_comment(comment):
    """
    Desc: cleans text in review comments: converts to lowercase, drops every character that is not an ASCII letter
          or a space (punctuation, numbers, accented letters), tokenizes, and removes English stopwords.
    Args: comment -- String
    Output: comment_cleaned -- String -- the remaining tokens joined by single spaces
    """
    # Build the stopword set once and cache it on the function object, so it is not rebuilt for every review.
    stops = getattr(clean_comment, "_stops", None)
    if stops is None:
        stops = clean_comment._stops = frozenset(stopwords.words('english'))

    # Turn newlines/tabs into plain spaces BEFORE stripping non-letters; otherwise words separated only by a
    # line break would be glued together (e.g. "great\nhost" -> "greathost").
    comment = re.sub(r"\s+", " ", comment.lower())
    comment = re.sub(r"[^a-zA-Z ]", "", comment)

    comment_tokens = word_tokenize(comment)
    comment_cleaned = ' '.join(word for word in comment_tokens if word not in stops)
    return comment_cleaned


In [None]:
# run every raw review through clean_comment, keeping the results in row order
clean_comments = [clean_comment(raw_text) for raw_text in df.comments]

In [None]:
# new feature containing cleaned comments (clean_comments is in the same row order as df)
df['comment_cleaned'] = clean_comments

In [None]:
# merge the cleaned reviews into one corpus string, split it back into tokens
# and count how often each token occurs
comments = ' '.join(clean_comments)
comments_tokenized = word_tokenize(comments)
counter = Counter(comments_tokenized)

In [None]:
# top 10 most common terms in the cleaned review corpus, as (term, count) pairs
counter.most_common(10)

In [None]:
# rebuild the corpus string for the wordcloud, leaving out the token 'istanbul'
comments = ' '.join(token for token in comments_tokenized if token != 'istanbul')

In [None]:
# build the word cloud for the whole corpus, draw it without axes and write it to disk
wc = WordCloud(background_color='white').generate(comments)
fig, ax = plt.subplots(figsize=(18, 6))
ax.imshow(wc, interpolation='bilinear')
ax.axis("off")
fig.savefig("../Static/wc_1.png")

In [None]:
# plot wordcloud for kadikoy neighbourhood
# use the cleaned comments so this cloud is built from the same preprocessed text as the
# overall and Fatih clouds (the raw 'comments' column was used here before)
kadikoy_comments = ' '.join(df.loc[df.neighbourhood_cleansed == 'Kadikoy', 'comment_cleaned'])
kadikoy_wc = WordCloud(background_color='white').generate(kadikoy_comments)
plt.figure(figsize=(18,6))
plt.imshow(kadikoy_wc, interpolation='bilinear');
plt.axis("off");
plt.savefig("../Static/wc_2.png")

In [None]:
# plot wordcloud for fatih neighbourhood
fatih_comments = ' '.join(df.loc[df.neighbourhood_cleansed == 'Fatih', 'comment_cleaned'])
fatih_wc = WordCloud(background_color='white').generate(fatih_comments)
plt.figure(figsize=(18,6))
plt.imshow(fatih_wc, interpolation='bilinear')
plt.axis("off")
# save BEFORE show(): with the inline backend show() renders and releases the current figure,
# so a savefig() issued afterwards writes an empty canvas
plt.savefig("../Static/wc_3.png")
plt.show()

In [None]:
# comment quality features: token count, distinct-token count and their ratio
# (a ratio near 1 means almost no repeated words)
df['comment_len'] = df['comment_cleaned'].apply(lambda text: len(text.split()))
df['comment_unique_len'] = df['comment_cleaned'].apply(lambda text: len(set(text.split())))
df['comment_quality'] = df['comment_unique_len'].div(df['comment_len'])

In [None]:
# keep only comments with more than 3 cleaned tokens (i.e. drop those with 3 or fewer) -
# very short comments won't prove useful for the word2vec model
df_1 = df.loc[df.comment_len > 3].copy()

In [None]:
# tokenise each cleaned comment, giving the list-of-token-lists the phraser expects
comments = list(map(word_tokenize, df_1.comment_cleaned))

In [None]:
# find phrases in documents: collect co-occurrence statistics for candidate bigrams,
# ignoring any word or bigram seen fewer than 30 times in total
phrases = Phrases(comments, min_count=30)

In [None]:
# build bigram phraser - a lighter, frozen version of the phrase model used only for transforming text
bigrams = Phraser(phrases)

In [None]:
# transform corpus by identifying all relevant bigrams; detected pairs are merged into a single token
# NOTE(review): indexing the phraser with a whole corpus appears to return a lazy, re-iterable view -
# it is materialised into lists later, when it is stored on the frame
comments_w_bigrams = bigrams[comments]

In [None]:
# identify number of cores available to train word2vec model
# (only referenced by the commented-out training cell below)
cores = multiprocessing.cpu_count()

In [None]:
# create w2v model with params initialized (training is left commented out; the persisted model is loaded from disk further down)
#w2v_model = Word2Vec(sg=1, hs=1, min_count=50, window=2, size=300, sample=1e-5, alpha=0.03, min_alpha=0.0006, negative=20, workers=cores-1, iter=3, seed=37)
# build the vocab for w2v model
#t = time()

#w2v_model.build_vocab(comments_w_bigrams, progress_per=10000)

#print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))
# train the w2v model on the comment data
#t = time()

#w2v_model.train(comments_w_bigrams, total_examples=w2v_model.corpus_count, epochs=10)

#print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
# make the model memory efficient after training
#w2v_model.init_sims(replace=True)

In [None]:
# load the pre-trained w2v model from disk instead of retraining it
# NOTE(review): gensim's load() unpickles the file, so only point this at model files you trust
w2v_model = Word2Vec.load("../Models/word_2_vec_comments.model")

In [None]:
# sanity-check the model: nearest neighbours of 'great' by cosine similarity
w2v_model.wv.most_similar(positive=['great'])

In [None]:
# nearest neighbours of 'ferry'
w2v_model.wv.most_similar(positive=['ferry'])

In [None]:
# analogy-style query: top 3 terms close to 'culture' + 'food' and far from 'location'
w2v_model.wv.most_similar(positive=["culture", "food"], negative=["location"], topn=3)

In [None]:
# persist the w2v model
#w2v_model.save("../Models/word_2_vec_comments.model")

In [None]:
# apply kmeans model to word vectors to identify the language used to write reviews;
# 4 clusters, 20 random restarts, fixed seed for reproducibility
kmeans_model = KMeans(n_clusters=4, n_init=20, max_iter=1000, random_state=37)

In [None]:
# fit kmeans model on every word vector in the vocabulary
# (fit() returns the estimator itself, so cluster_comments and kmeans_model are the same object)
cluster_comments = kmeans_model.fit(w2v_model.wv.vectors)

In [None]:
# view results of kmeans: the 10 vocabulary terms closest to the centroid of cluster 3
w2v_model.wv.similar_by_vector(kmeans_model.cluster_centers_[3], topn=10, restrict_vocab=None)

In [None]:
# assign labels to words - build a frame with one row per vocabulary term, its vector and its cluster id
terms_df = pd.DataFrame({'term': list(w2v_model.wv.vocab.keys())})
terms_df['vector'] = terms_df.term.apply(lambda x: w2v_model.wv[x])
# predict every cluster id in ONE call on the stacked (n_terms, dim) matrix; the previous version called
# predict() once per term, paying scikit-learn's input-validation overhead for every vocabulary entry
terms_df['cluster'] = kmeans_model.predict(np.array(terms_df.vector.tolist()))

In [None]:
# filter english terms
# NOTE(review): cluster id 1 is hard-coded as the English cluster - presumably chosen by inspecting
# the centroids above; re-check it whenever the w2v or kmeans model changes
english_terms = set(terms_df.loc[terms_df.cluster == 1, 'term'].values)

In [None]:
# materialise the bigram-transformed corpus and store each token list next to its review
df_1['comment'] = list(comments_w_bigrams)

In [None]:
# per comment, count the tokens that belong to the English term set
df_1['language'] = df_1.comment.apply(lambda tokens: sum(token in english_terms for token in tokens))

In [None]:
# new df for english comments: keep comments containing at least one English-cluster term
english_df = df_1.loc[df_1.language > 0].copy()

In [None]:
# label a comment 'English' when at least half of its tokens are English terms, otherwise 'Foreign'
english_ratio = english_df['language'] / english_df['comment'].apply(len)
english_df['language'] = english_ratio.map(lambda ratio: 'English' if ratio >= .5 else 'Foreign')

In [None]:
# filter on language column; .copy() gives an independent frame (as the other .loc filters in this
# notebook do) so that adding columns later does not trigger pandas' SettingWithCopyWarning
english_df = english_df.loc[english_df.language == 'English'].copy()

In [None]:

# nearest neighbours of 'turkish' in the full (not yet restricted) model
w2v_model.wv.most_similar("turkish")

In [None]:
def restrict_w2v(w2v, restricted_word_set):
    """
    Desc: Restrict w2v model to a subset of terms. E.g. Restrict w2v model to only English terms. Function has been adapted from
          https://stackoverflow.com/questions/48941648/how-to-remove-a-word-completely-from-a-word2vec-model-in-gensim
          NOTE: the model is modified IN PLACE (vocab, vectors, index2entity, index2word and vectors_norm on w2v.wv are
          replaced) and the same object is returned; each surviving vocab entry has its .index rewritten to its new row.
    Args: w2v -- Gensim Word2Vec model
          restricted_word_set -- set or list -- terms to keep; terms not in the model are ignored
    Output: w2v -- Restricted Word2Vec model (the same object that was passed in)
    """
    # membership is tested once per vocabulary entry, so make sure it is a hash lookup even if a list was passed
    keep_terms = set(restricted_word_set)

    kept_rows = []
    new_vocab = {}
    new_index2entity = []

    for i in range(len(w2v.wv.vocab)):
        word = w2v.wv.index2entity[i]
        if word not in keep_terms:
            continue
        vocab = w2v.wv.vocab[word]
        vocab.index = len(new_index2entity)  # row of this word in the restricted matrices
        new_index2entity.append(word)
        new_vocab[word] = vocab
        kept_rows.append(i)

    rows = np.asarray(kept_rows, dtype=np.intp)

    # In gensim 3.x vectors_norm stays None until init_sims() has run (e.g. through most_similar()).
    # Indexing None used to crash here; leave it unset instead so gensim recomputes it lazily.
    old_norm = getattr(w2v.wv, 'vectors_norm', None)

    w2v.wv.vocab = new_vocab
    # fancy indexing copies the kept rows and keeps the (n_kept, dim) shape even when nothing is kept
    w2v.wv.vectors = np.asarray(w2v.wv.vectors)[rows]
    w2v.wv.index2entity = np.array(new_index2entity)
    w2v.wv.index2word = np.array(new_index2entity)
    w2v.wv.vectors_norm = None if old_norm is None else np.asarray(old_norm)[rows]

    return w2v

In [None]:
# create english w2v model
# NOTE: restrict_w2v edits the model it is given and returns it, so english_w2v and w2v_model
# are the same (now English-only) object after this cell
english_w2v = restrict_w2v(w2v_model, english_terms)

In [None]:
# create kmeans on the English-only word vectors; the 13 clusters are named in topics_map below
english_kmeans = KMeans(n_clusters=13, n_init=20, max_iter=1000, random_state=37)
cluster_english = english_kmeans.fit(english_w2v.wv.vectors)

In [None]:
# check results of model: the 20 terms nearest the centroid of cluster 7 (named 'transport' in topics_map)
english_w2v.wv.similar_by_vector(english_kmeans.cluster_centers_[7], topn=20)

In [None]:
# the 20 terms nearest the centroid of cluster 12 (named 'food and drink' in topics_map)
english_w2v.wv.similar_by_vector(english_kmeans.cluster_centers_[12], topn=20)

In [None]:
# human-readable name for each English k-means cluster; the position in the list is the cluster id
topics_map = dict(enumerate([
    'authentic, charming, beautiful neighbourhood',
    'helpful host',
    'general host',
    'difficulties and things to be aware',
    'negatives',
    'positive stay',
    'all about the views',
    'transport',
    'accommodation essentials',
    'proximity to tourist stuff',
    'good location',
    'turkish names',
    'food and drink',
]))

In [None]:
# create new df for clusters of english terms: one row per term with its vector and topic name
english_terms_df = pd.DataFrame({'terms': list(english_w2v.wv.vocab.keys())})
english_terms_df['vectors'] = english_terms_df.terms.apply(lambda x: english_w2v.wv[x])
# predict every cluster id in ONE call on the stacked (n_terms, dim) matrix instead of once per term,
# then translate the numeric ids into topic names
cluster_ids = english_kmeans.predict(np.array(english_terms_df.vectors.tolist()))
english_terms_df['cluster'] = pd.Series(cluster_ids, index=english_terms_df.index).map(topics_map)

In [None]:
# lookup table: English term -> topic name of the cluster it belongs to
term_clusters = english_terms_df.set_index('terms')['cluster'].to_dict()

In [None]:
# set of English terms to keep (set gives constant-time membership checks)
term_keys = set(term_clusters)

In [None]:
# for each comment, remove terms not recognised in english terms dictionary
# Rebuild each token list in one pass and write it back through slice assignment. Calling
# list.remove() while iterating over the same list shifts the remaining items left, so the
# token right after every removed one was skipped and unknown terms could survive.
for tokens in english_df.comment:
    tokens[:] = [token for token in tokens if token in term_keys]

In [None]:
# count the distinct tokens that remain across all English comments
max_features = len({token for tokens in english_df.comment for token in tokens})

In [None]:
# configure the tf-idf vectorizer: unigrams only, l2-normalised rows, log-scaled term frequency,
# smoothed idf, no document-frequency pruning, vocabulary capped at the number of distinct tokens
tfidf_vect = TfidfVectorizer(
    ngram_range=(1, 1),
    norm='l2',
    sublinear_tf=True,
    smooth_idf=True,
    min_df=1,
    max_df=1.,
    max_features=max_features,
)

In [None]:
# fit and transform tfidf model to english comments; each token list is re-joined with spaces
# so the vectorizer receives one string per comment
tfidf_comments = tfidf_vect.fit_transform(english_df.comment.str.join(' '))

In [None]:
# store the feature names from tfidf (position i is the term behind tfidf column i)
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of get_feature_names_out() - switch when the environment is upgraded
comment_features = tfidf_vect.get_feature_names()

In [None]:
def sort_coo(coo_matrix):
    """
    Desc: Pair every stored column index of a tfidf row with its score and order the pairs from highest to
          lowest score, ties broken by the higher column index first. Function has been taken from
          https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34
    Args: coo_matrix -- SparseMatrix in COO format (its .col and .data are read)
    Output: list of (column index, score) tuples, best score first
    """
    def by_score_then_column(pair):
        column, score = pair
        return (score, column)

    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=by_score_then_column, reverse=True)
    return pairs

In [None]:
def extract_topn_from_vector(feature_names, sorted_items, english_terms, topn=10, cluster_lookup=None):
    """
    Desc: Extract the top terms based on tfidf score from each comment, along with their cluster assignment and tfidf score. Function adapted from
          https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34
          The topn cut is applied BEFORE the English filter, so fewer than topn terms come back when some of the
          topn highest-scoring terms are not in english_terms.
    Args: feature_names -- list (indexable by column index) -- tfidf features
          sorted_items -- list of (column index, score) tuples -- tfidf comment indexes sorted by tfidf score
          english_terms -- set or list -- collection of terms considered as English
          topn: int -- number of top terms to consider
          cluster_lookup -- dict or None -- maps term -> cluster name; when None the notebook-level
                            term_clusters dictionary is used, which is what this function previously did implicitly
    Output: results -- dict -- term -> [tfidf score rounded to 3 decimals, cluster name], in descending score order
    Raises: KeyError -- if a kept term has no entry in the cluster lookup
    """
    if cluster_lookup is None:
        # backward-compatible fallback for existing calls that rely on the global dictionary
        cluster_lookup = term_clusters

    results = {}
    # word index and corresponding tf-idf score, limited to the topn best-scoring entries
    for idx, score in sorted_items[:topn]:
        term = feature_names[idx]
        if term in english_terms:
            results[term] = [round(score, 3), cluster_lookup[term]]

    return results

In [None]:
# for every comment row of the tfidf matrix, get its (column, score) pairs ordered by score
sorted_items_list = [sort_coo(row.tocoo()) for row in tfidf_comments]

In [None]:
# pick the top terms (with score and cluster) of every comment
keywords_list = [
    extract_topn_from_vector(comment_features, items, english_terms, 10)
    for items in sorted_items_list
]

In [None]:
# extract cluster assigned to top terms from each comment, dropping duplicates
# dict.fromkeys keeps the first occurrence of each cluster in tf-idf rank order, so the result is
# the same on every run; list(set(...)) ordered the names by string hash, which changes between
# kernel sessions and made the joined theme strings non-reproducible
themes = [
    list(dict.fromkeys(score_and_cluster[1] for score_and_cluster in keywords.values()))
    for keywords in keywords_list
]

In [None]:
# add themes of comment to df (themes holds one list per row of english_df, in row order)
english_df['comment_themes'] = themes

In [None]:
# join cluster names together into a single comma-separated string per comment
english_df.comment_themes = english_df.comment_themes.str.join(', ')

In [None]:
# remove redundant features
# NOTE(review): id_x / id_y look like leftovers of an earlier merge - confirm they are not needed downstream
english_df = english_df.drop(['id_x', 'id_y'], axis=1)

In [None]:
# reset index to 0..n-1 and discard the old index left over from the row filters
english_df = english_df.reset_index(drop=True)