# Import libraries

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer, SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Load the datasets and merge them. 
## Only keep relevant columns

In [2]:
df = pd.read_csv('reviews.csv')
safedf = df
df.drop(columns=["id"],inplace=True)

In [3]:
df_listings = pd.read_csv('listings.csv')
df_listings = df_listings[["id",'number_of_reviews',
       'number_of_reviews_ltm', 'number_of_reviews_l30d', 'first_review',
       'last_review', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value']]

In [4]:
df = pd.merge(df,df_listings,'left',left_on='listing_id',right_on= 'id',copy=False)
df.drop(columns=["id"],inplace=True)

## Check for missing values in 'comments' and 'review_scores_rating'. Drop them. Imputing is not reasonable in this case.

In [5]:
df.isna().sum()

listing_id                       0
date                             0
reviewer_id                      0
reviewer_name                    0
comments                       437
number_of_reviews                0
number_of_reviews_ltm            0
number_of_reviews_l30d           0
first_review                     0
last_review                      0
review_scores_rating             0
review_scores_accuracy         194
review_scores_cleanliness      191
review_scores_checkin          198
review_scores_communication    193
review_scores_location         203
review_scores_value            204
dtype: int64

In [6]:
df.dropna(axis=0, subset=["comments"],inplace=True)

# Preprocess the comments for the NLP process.

In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hamue\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Transform text into lower case.
## Keep only letters and numbers.
## Stem the words.
## Store the result in a list format.

In [8]:
stop_words = set(stopwords.words('english'))
lemmatizer = PorterStemmer()    #plug in here any other stemmer or lemmatiser you want to try out


def preprocess(raw_text):
    
    #regular expression keeping only letters 
    letnum_text =  re.sub("[^a-zA-Z0-9\s]+", " ",raw_text )

    # convert to lower case and split into words -> convert string into list ( 'hello world' -> ['hello', 'world'])
    words = letnum_text.lower().split()

    cleaned_words = []
    
    
    # remove stopwords    
    
    cleaned_words = [w for w in words if not w.lower() in stop_words]
    
    # stemm or lemmatise words
    stemmed_words = []
    for word in cleaned_words:
        word = lemmatizer.stem(word)   #dont forget to change stem to lemmatize if you are using a lemmatizer
        stemmed_words.append(word)
    
    # converting list back to string
    #return " ".join(stemmed_words)
    return stemmed_words

In [10]:
df["comments_processed"] = df.comments.apply(preprocess)
comments= df["comments_processed"].to_list()

## Save the resulting csv to reduce processing time when restarting.

df.to_csv("reviews_processed.csv",index=False)

# NLP modelling: Build the Word2Vec
## Save the words and their embeddings to reduce computation time when restarting.

In [15]:
import gensim
from gensim.models import KeyedVectors



SIZE = 100
WINDOW = 5
WORKERS = 4
MIN_COUNT = 2
# train word2vec model
model = gensim.models.Word2Vec(sentences=comments, size=SIZE, window=WINDOW, workers=WORKERS, min_count=MIN_COUNT)
# vocab size
words = list(model.wv.vocab)
print ('Vocabulary size: %d' % len(words))


# Store just the words + their trained embeddings.

word_vectors = model.wv

word_vectors.save("word2vec.wordvectors")

In [88]:
conda install -c conda-forge gensim

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\hamue\anaconda3

  added / updated specs:
    - gensim


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    conda-4.11.0               |   py39hcbf5309_2        16.9 MB  conda-forge
    ------------------------------------------------------------
                                           Total:        16.9 MB

The following packages will be UPDATED:

  conda              pkgs/main::conda-4.11.0-py39haa95532_0 --> conda-forge::conda-4.11.0-py39hcbf5309_2



Downloading and Extracting Packages

conda-4.11.0         | 16.9 MB   |            |   0% 
conda-4.11.0         | 16.9 MB   |            |   0% 
conda-4.11.0         | 16.9 MB   | #4         |  14% 
conda-4.11.0         | 16.9 MB   | ##8        |  29% 
conda-4.11.0         |

# Load the saved files; csv and Word2Vec dict.

In [None]:
df = pd.read_csv("reviews_processed.csv")

In [16]:
# Load back with memory-mapping = read-only, shared across processes.

wv = KeyedVectors.load("word2vec.wordvectors", mmap='r')

# Data processing pt.2 - Prepare the comments for ML model
## Remove words which are not in the Word2Vec dict.
## Calculate the mean vector for every comment (Interpretability?)

In [13]:
def RemoveUnknownWords(text):
    
    cleaned_text = []
    for w in text:
        if w in dict.keys(wv.vocab):
            cleaned_text.append(w)        
    return cleaned_text
    

In [17]:
df["comments_processed_2"] = df["comments_processed"].apply(RemoveUnknownWords)

In [25]:
def CalculateMean(text_list):
    vec = np.zeros(100).reshape((1, 100))
    count = 0    
    for word in text_list:
        try:
            vec += wv[word].reshape((1, 100))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
        
    return vec

In [28]:
df["comments_vector"]=df["comments_processed_2"].apply(CalculateMean)

## Flatten the lists to avoid nested lists.

In [66]:
df["comments_vector"] = [val for sublist in df["comments_vector"] for val in sublist]


ValueError: Length of values (56216000) does not match length of index (562160)

In [91]:
df_agg = df.groupby('listing_id')['comments_vector'].apply(np.mean)

In [99]:
df_agg = pd.merge(df_agg,df_listings[['id','review_scores_rating']],'left',left_on='listing_id',right_on= 'id',copy=False)

## Transform the ratings into categories

In [101]:
lower_bound= df_agg["review_scores_rating"].quantile(0.25)
upper_bound= df_agg["review_scores_rating"].quantile(0.75)

In [105]:
def CategorizeRating(rating,lower_bound,upper_bound):
    if rating >=upper_bound:
        cat = "good"
    #elif rating >=lower_bound:
        #cat = "medium"
    else:
        cat = "bad"
    return cat

# ML model: Predict categorical rating based on comments

In [106]:
from sklearn.model_selection import train_test_split

X = df_agg["comments_vector"].to_list()
y = df_agg["review_scores_rating"].apply(lambda x: CategorizeRating(x,lower_bound, upper_bound))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [107]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=42)

clf.fit(X_train, y_train)
clf.score(X_test,y_test)

0.7565591397849463