Lesson M909



In [5]:
# Import necessary libraries
import re  
import pandas as pd  
import gensim
from gensim.models import Word2Vec
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [6]:
# Download necessary packages
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
stop_words_set = set(stopwords.words("english"))

In [8]:
# The dataset was downloaded from Kaggle and can be found here: https://www.kaggle.com/andrewmvd/trip-advisor-hotel-reviews
# It contains 20k reviews crawled from Tripadvisor.

df = pd.read_csv('tripadvisor_hotel_reviews.csv')
print('Shape of initial dataframe:',df.shape)

Shape of initial dataframe: (20491, 2)


In [9]:
df

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5
...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",5
20487,great location price view hotel great quick pl...,4
20488,"ok just looks nice modern outside, desk staff ...",2
20489,hotel theft ruined vacation hotel opened sept ...,1


In [10]:
keys = df.keys() 
for key in keys:
    df_len = len(df[key].unique()) # the length of the unique values of each column
    print('{0:25}{1:10}'.format(key,df_len))

Review                        20491
Rating                            5


**DATA PRE-PROCESSING**

In [11]:
def data_cleaning(dataframe):
  
  print('\nData shape before removing missing values: ', dataframe.shape)
  print('\nWith missing values:')
  print(dataframe.isna().sum()) # check for missing values

  dataframe.dropna(inplace=True) # remove missing values and keep the dataFrame with valid entries in the same variable
  dataframe.reset_index(inplace=True,drop=True)

  print('\nWithout missing values:')
  print(dataframe.isna().sum())
  print('\nData shape after removing missing values: ', dataframe.shape)

  dataframe.duplicated() # check for duplicates
  return print('Number of duplicates in the dataframe:', dataframe.duplicated().sum())

In [12]:
def text_preprocessing(text):
                
        # Remove tags
        TAG_RE = re.compile(r'<[^>]+>')
        no_tags = TAG_RE.sub('',text)  
                
        # Remove unusual characters
        text = re.sub('<[^>]*>', '', no_tags)
    
        # Remove emoticons
        emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
                
        # Convert all words to lowercase
        text = re.sub('[\W]+', ' ', text.lower()) + " ".join(emoticons).replace('-', '')
    
        # Remove numbers
        text = re.sub(r'\d+', '', text)

        # Create tokens
        tokens = nltk.word_tokenize(text)

        # Remove punctuation
        no_punct = [word for word in tokens if word.isalpha()]
        
        # Remove stopwords
        words = [w for w in no_punct if not w in stop_words_set]
             
        return words

In [13]:
data_cleaning(df)


Data shape before removing missing values:  (20491, 2)

With missing values:
Review    0
Rating    0
dtype: int64

Without missing values:
Review    0
Rating    0
dtype: int64

Data shape after removing missing values:  (20491, 2)
Number of duplicates in the dataframe: 0


In [14]:
# Apply the function to preprocess the texts in the reviews
df['Review'] = df['Review'].apply(text_preprocessing)

# Convert the dataframe into a list containing lists
corpus = df['Review'].tolist()
print('Length of corpus: ', len(corpus))
print(corpus[1])

Length of corpus:  20491
['ok', 'nothing', 'special', 'charge', 'diamond', 'member', 'hilton', 'decided', 'chain', 'shot', 'th', 'anniversary', 'seattle', 'start', 'booked', 'suite', 'paid', 'extra', 'website', 'description', 'suite', 'bedroom', 'bathroom', 'standard', 'hotel', 'room', 'took', 'printed', 'reservation', 'desk', 'showed', 'said', 'things', 'like', 'tv', 'couch', 'ect', 'desk', 'clerk', 'told', 'oh', 'mixed', 'suites', 'description', 'kimpton', 'website', 'sorry', 'free', 'breakfast', 'got', 'kidding', 'embassy', 'suits', 'sitting', 'room', 'bathroom', 'bedroom', 'unlike', 'kimpton', 'calls', 'suite', 'day', 'stay', 'offer', 'correct', 'false', 'advertising', 'send', 'kimpton', 'preferred', 'guest', 'website', 'email', 'asking', 'failure', 'provide', 'suite', 'advertised', 'website', 'reservation', 'description', 'furnished', 'hard', 'copy', 'reservation', 'printout', 'website', 'desk', 'manager', 'duty', 'reply', 'solution', 'send', 'email', 'trip', 'guest', 'survey', 'f

**WORD2VEC MODEL 1**

**Set model parameters**

In [15]:
model1 = gensim.models.Word2Vec(size= 100, window = 3, min_count = 2, sg = 1, workers = 10, hs = 0, negative = 5)

# Size of dimensionality of word vectors: 100
# Window, the maximum distance between the target word and its neighboring word: 3
# Minimum count, ignoring all words with total frequency lower than this, words must appear this many times to be in vocab: 2
# Sg, training algorithm: Skip-Gram
# The number of worker threads used to train the model: 10
# 0: Negative sampling
# Number of negative samples: 5

**Build the vocabulary**

In [16]:
model1.build_vocab(corpus)

In [17]:
# Get the list of words in the vocabulary
words = model1.wv.vocab
print('Total number of unique words loaded in Model : ', len(model1.wv.vocab))

# Create a corresponding list of the count for each word
word_ranks = []
for word in words:
    word_ranks.append(model1.wv.vocab[word].count)
    
# Sort both lists by the word counts, descending (most frequent first)
word_ranks, words = map(list, zip(*sorted(zip(word_ranks, words), reverse=True)))

print('\nThe 30 most frequest words:\n')
print('    ---Rank---  ---Word---')

# For the 30 most frequent words
for i in range(30):
    # Print the rank with commas and pad it to 12 characters.
    print('{:>12,}     {:}'.format(word_ranks[i], words[i]))

print('\nThe 30 least frequest words:\n')
print('    ---Rank---   ---Word---')

# Go backwards through the last 10 indeces
for i in range(-1, -30, -1):
    # Print the count with commas and pad it to 12 characters.
    print('{:>12,}     {:}'.format(word_ranks[i], words[i]))

Total number of unique words loaded in Model :  25037

The 30 most frequest words:

    ---Rank---  ---Word---
      49,823     hotel
      35,341     room
      21,477     great
      19,102     n
      17,417     good
      16,637     staff
      15,413     stay
      12,644     nice
      12,406     rooms
      11,357     location
      10,502     stayed
      10,371     service
      10,157     night
      10,122     time
      10,065     beach
       9,976     day
       9,738     breakfast
       9,597     clean
       9,415     food
       8,254     like
       8,141     resort
       7,791     place
       7,790     really
       7,578     pool
       6,893     friendly
       6,839     people
       6,595     small
       6,260     little
       6,255     walk
       6,206     got

The 30 least frequest words:

    ---Rank---   ---Word---
           2     aahh
           2     aand
           2     aas
           2     abandon
           2     abandoning
           2     abeau

**Train the Word2Vec model**

In [18]:
model1.train(sentences=corpus, total_examples=len(corpus), epochs=model1.iter)

  """Entry point for launching an IPython kernel.


(9138370, 10143600)

In [19]:
word_vectors = model1.wv
print('The vocabulary includes {} unique words.'.format(len(word_vectors.vocab)))
vector = model1.wv['nice']
print('The length of a word vector according to the parameter "size": ',len(vector))
print('Numpy vector of the word "nice":\n',vector)

The vocabulary includes 25037 unique words.
The length of a word vector according to the parameter "size":  100
Numpy vector of the word "nice":
 [-0.16406265  0.23530467 -0.29546687  0.17013428  0.26952535  0.12750848
  0.3784242  -0.3520841  -0.42471138  0.39322042  0.01476069  0.55951273
  0.1652967   0.3257624   0.12523626 -0.16914952 -0.13219458  0.7724101
 -0.11426571 -0.03218948 -0.26684275  0.32217103  0.40112337 -0.19855571
  0.21553007 -0.04242864  0.32383937 -0.0914438  -0.10170554 -0.14913952
  0.18517165  0.08040129 -0.19676238 -0.23305827  0.03726035 -0.10574959
  0.13444589  0.2228283  -0.08959322  0.0409127  -0.4797448  -0.32173702
 -0.10882694  0.09740351  0.14658293  0.2032278   0.5547828  -0.32607338
 -0.23868999  0.2789851   0.25150564  0.10932046 -0.28914598 -0.0990825
 -0.5564771   0.43575716 -0.3317961   0.30235058 -0.47585264 -0.28579777
  0.00349328 -0.17990397  0.03916128  0.09382967 -0.2797046   0.14110936
 -0.09691735 -0.07747447  0.36744773 -0.32765758  0.2

**Find the most similar words**

In [20]:
print('Most similar words for "expensive"\n')
model1.wv.most_similar('expensive')

Most similar words for "expensive"



[('pricey', 0.867865264415741),
 ('pricy', 0.8189133405685425),
 ('costly', 0.8106584548950195),
 ('overpriced', 0.8106163740158081),
 ('cheaper', 0.7656259536743164),
 ('pricier', 0.7643506526947021),
 ('costs', 0.7536859512329102),
 ('outrageous', 0.7531921863555908),
 ('inflated', 0.7527035474777222),
 ('priced', 0.7451817989349365)]

In [21]:
print('Most similar words for "great"\n')
model1.wv.most_similar('great')

Most similar words for "great"



[('terrific', 0.8998247385025024),
 ('fantastic', 0.8552649021148682),
 ('phenomenal', 0.8421623706817627),
 ('excellent', 0.8375718593597412),
 ('good', 0.8166176080703735),
 ('brilliant', 0.8117712736129761),
 ('excellant', 0.8039320707321167),
 ('awesome', 0.8012272119522095),
 ('wonderful', 0.7968956232070923),
 ('tremendous', 0.7966464757919312)]

In [22]:
print('Most similar words for "bad"\n')
model1.wv.most_similar('bad')

Most similar words for "bad"



[('horrible', 0.7663941383361816),
 ('terrible', 0.7481241226196289),
 ('sucks', 0.7375690937042236),
 ('dreadful', 0.7321875095367432),
 ('kidding', 0.7221542596817017),
 ('awful', 0.7106975317001343),
 ('lousy', 0.7034407258033752),
 ('darn', 0.698879599571228),
 ('critical', 0.697730302810669),
 ('complains', 0.6964166760444641)]

**Compare similarity of words**

In [23]:
# Similarity between two words
score1 = model1.similarity('expensive', 'good')
print('Cosine similarity between "expensive" and "good" is: %.2f\n' % score1)

score2 = model1.similarity('nice', 'good')
print('Cosine similarity between "nice" and "good" is: %.2f\n' % score2)

# Similarity between lists of words
print('Similarity between the lists: ',model1.wv.n_similarity(['beautiful','lovely'],['sucks','awful']))
print('\nSimilarity between the lists: ',model1.wv.n_similarity(['beautiful','lovely'],['nice','great']))

Cosine similarity between "expensive" and "good" is: 0.50

Cosine similarity between "nice" and "good" is: 0.68

Similarity between the lists:  0.44613847

Similarity between the lists:  0.795597


  
  """


In [24]:
# Calculate distance between words
print('Distance between "terrible" and "horrible": ', model1.wv.distance('terrible','horrible'))
print('Distance between "friendly" and "nice": ', model1.wv.distance('friendly','nice'))

Distance between "terrible" and "horrible":  0.11778444051742554
Distance between "friendly" and "nice":  0.4149606227874756


In [25]:
# Find words that do not match
print('Word that does not match with the rest: ',model1.wv.doesnt_match(['brilliant','service','fabulous']))

# Find the most similar words by using the default "cosine similarity" measure
cos_sim1 = model1.wv.most_similar(positive=['nice', 'friendly'], negative=['awful'])
most_similar_key, similarity = cos_sim1[0]  # look at the first match
print(f'The most similar word is "{most_similar_key}": {similarity:.4f}')

cos_sim2 = model1.wv.similar_by_word('terrible')
most_similar_key, similarity = cos_sim2[0]  # look at the first match
print(f'The most similar word is "{most_similar_key}": {similarity:.4f}')

Word that does not match with the rest:  service
The most similar word is "courteous": 0.6560
The most similar word is "horrible": 0.8822


  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


**WORD2VEC MODEL 2 CBOW**

**Set model parameteres**

In [26]:
model2 = gensim.models.Word2Vec(size= 100, window = 3, min_count = 1, sg = 0, workers = 5, hs = 0, negative = 5)

# Size of dimensionality of word vectors: 100
# Window, the maximum distance between the target word and its neighboring word: 3
# Minimum count, ignoring all words with total frequency lower than this, words must appear this many times to be in vocab: 1
# Sg, training algorithm: CBOW
# The number of worker threads used to train the model: 5
# 0: Negative sampling
# Number of negative samples: 5

**Build the vocabulary**

In [27]:
model2.build_vocab(corpus)

**Train the second Word2Vec model**

In [28]:
model2.train(sentences=corpus, total_examples=len(corpus), epochs=model2.iter)

  """Entry point for launching an IPython kernel.


(9267278, 10143600)

In [29]:
word_vectors2 = model2.wv
print('The vocabulary includes {} unique words.'.format(len(word_vectors2.vocab)))
vector2 = model2.wv['nice']
print('The length of a word vector according to the parameter "size": ',len(vector2))
print('Numpy vector of the word "nice":\n',vector2)

The vocabulary includes 48959 unique words.
The length of a word vector according to the parameter "size":  100
Numpy vector of the word "nice":
 [ 0.09360968  0.14551876  1.0759455   1.7566912   1.5321697   1.1960722
  0.2295063  -1.4187635  -1.4068594  -0.26937306 -1.1585968  -0.06155514
  0.87846786  0.6588273  -0.31286204 -0.97868735 -0.26431593  0.9983019
  0.04461127 -0.07919632 -1.4657954   0.97029173 -0.56872743  0.08821569
 -0.7368466   0.20135616  0.41127786  0.21696232 -0.18049097  0.04047824
  0.751256   -0.5013633  -0.24166001 -0.4304393   1.0530143  -0.8651481
  0.16809657 -0.03175985  0.39211163 -0.47118062  0.11397498 -1.3893498
 -0.39323118 -0.1871048   0.63553977  1.7483108   2.4457295  -0.7415914
 -0.35399443  0.42490858  0.61088824  0.3313907  -0.23141788 -0.824675
 -0.21933584 -0.6052369  -2.4118636   0.76665354 -0.6808813  -0.92777485
 -0.30381238 -1.3019224   0.69423383  0.10692751 -1.0556427   1.2760147
 -0.63577783 -0.7631183   0.8720977  -1.0272728  -0.5324697

**Find the most similar words**

In [30]:
print('Most similar words for "expensive"\n')
model2.wv.most_similar('expensive')

Most similar words for "expensive"



[('pricey', 0.9085597991943359),
 ('overpriced', 0.8676849603652954),
 ('inexpensive', 0.760445773601532),
 ('cheaper', 0.7571960687637329),
 ('pricy', 0.7550462484359741),
 ('cheap', 0.7473670244216919),
 ('prices', 0.714169979095459),
 ('option', 0.7082355618476868),
 ('reasonable', 0.6970846652984619),
 ('fair', 0.6960881948471069)]

In [31]:
print('Most similar words for "great"\n')
model2.wv.most_similar('great')

Most similar words for "great"



[('fantastic', 0.891322135925293),
 ('excellent', 0.8648792505264282),
 ('terrific', 0.8608647584915161),
 ('brilliant', 0.8455147743225098),
 ('wonderful', 0.8205458521842957),
 ('awesome', 0.8165668249130249),
 ('fabulous', 0.8154141902923584),
 ('good', 0.8147371411323547),
 ('amazing', 0.7870365381240845),
 ('superb', 0.786649227142334)]

In [32]:
print('Most similar words for "bad"\n')
model2.wv.most_similar('bad')

Most similar words for "bad"



[('terrible', 0.7914304733276367),
 ('horrible', 0.768926739692688),
 ('poor', 0.7583184242248535),
 ('awful', 0.7541587352752686),
 ('ok', 0.7479483485221863),
 ('okay', 0.7346817255020142),
 ('negative', 0.7130692005157471),
 ('worse', 0.7113240957260132),
 ('complain', 0.7010729312896729),
 ('thats', 0.6883893609046936)]

**Compare similarity of words**

In [33]:
# Similarity between two words
score_1 = model2.similarity('expensive', 'good')
print('Cosine similarity between "expensive" and "good" is: %.2f\n' % score_1)

score_2 = model2.similarity('nice', 'good')
print('Cosine similarity between "nice" and "good" is: %.2f\n' % score_2)

# Similarity between lists of words
print('Similarity between the lists: ',model2.wv.n_similarity(['beautiful','lovely'],['sucks','awful']))
print('\nSimilarity between the lists: ',model2.wv.n_similarity(['beautiful','lovely'],['nice','great']))

Cosine similarity between "expensive" and "good" is: 0.49

Cosine similarity between "nice" and "good" is: 0.60

Similarity between the lists:  0.2962033

Similarity between the lists:  0.7642074


  
  """


In [34]:
# Calculate distance between words
print('Distance between "terrible" and "horrible": ', model2.wv.distance('terrible','horrible'))
print('Distance between "friendly" and "nice": ', model2.wv.distance('friendly','nice'))

Distance between "terrible" and "horrible":  0.048496782779693604
Distance between "friendly" and "nice":  0.418826699256897


In [35]:
# Find words that do not match
print('Word that does not match with the rest: ',model2.wv.doesnt_match(['brilliant','service','fabulous']))

# Find the most similar words by using the default "cosine similarity" measure
cos_sim_1 = model2.wv.most_similar(positive=['nice', 'friendly'], negative=['awful'])
most_similar_key, similarity = cos_sim_1[0]  # look at the first match
print(f'The most similar word is "{most_similar_key}": {similarity:.4f}')

cos_sim_2 = model2.wv.similar_by_word('terrible')
most_similar_key, similarity = cos_sim_2[0]  # look at the first match
print(f'The most similar word is "{most_similar_key}": {similarity:.4f}')

Word that does not match with the rest:  service
The most similar word is "pleasant": 0.7097
The most similar word is "horrible": 0.9515


  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


**WORD2VEC MODEL 3 SKIP-GRAM**

**Set model parameters**

In [36]:
model3 = gensim.models.Word2Vec(size= 50, window = 3, min_count = 2, sg = 1, workers = 15, hs = 0, negative = 5)

# Size of dimensionality of word vectors: 50
# Window, the maximum distance between the target word and its neighboring word: 3
# Minimum count, ignoring all words with total frequency lower than this, words must appear this many times to be in vocab: 2
# Sg, training algorithm: Skip-Gram
# The number of worker threads used to train the model: 15
# 0: Negative sampling
# Number of negative samples: 5

In [37]:
model3.build_vocab(corpus)

**Train the third Word2Vec model**

In [38]:
model3.train(sentences=corpus, total_examples=len(corpus), epochs=model1.iter)

  """Entry point for launching an IPython kernel.


(9137497, 10143600)

In [39]:
word_vectors = model3.wv
print('The vocabulary includes {} unique words.'.format(len(word_vectors.vocab)))
vector = model3.wv['nice']
print('The length of a word vector according to the parameter "size": ',len(vector))
print('Numpy vector of the word "nice":\n',vector)

The vocabulary includes 25037 unique words.
The length of a word vector according to the parameter "size":  50
Numpy vector of the word "nice":
 [ 4.00392711e-03  1.05703533e-01 -9.91758332e-02  3.60172629e-01
 -1.18538998e-01  2.26613432e-01  4.45170790e-01 -1.37967721e-01
 -5.91202676e-01  4.62506525e-02 -1.62097692e-01  4.69654292e-01
  1.39992833e-01  2.08870620e-01  1.17432944e-01 -3.54794294e-01
 -5.33169746e-01  9.03924227e-01 -1.55803129e-01 -2.71646321e-01
 -1.62918955e-01  3.96452516e-01  2.25625187e-01 -3.63199413e-01
  1.82856366e-01 -2.70609468e-01  1.65584609e-01 -2.31976509e-01
  8.82378139e-04  2.22830296e-01  2.50813842e-01 -8.83651674e-02
 -1.39806315e-01 -3.84776264e-01  3.47818851e-01 -4.52600211e-01
  7.62868822e-02  1.65184349e-01  3.34493697e-01 -3.45065355e-01
 -3.53405625e-01 -5.52335739e-01 -2.55021542e-01  2.25873664e-02
  4.80243206e-01  3.48123640e-01  1.04790926e+00 -5.33670902e-01
 -4.34281342e-02  2.46509537e-01]


**Find the most similar words**

In [40]:
print('Most similar words for "expensive"\n')
model3.wv.most_similar('expensive')

Most similar words for "expensive"



[('pricey', 0.918716311454773),
 ('overpriced', 0.8920618295669556),
 ('pricy', 0.8858973383903503),
 ('costly', 0.8527936339378357),
 ('cheaper', 0.8526806831359863),
 ('inflated', 0.8352290987968445),
 ('pricier', 0.8297056555747986),
 ('competitive', 0.817486584186554),
 ('prices', 0.8169474005699158),
 ('inexpensive', 0.8158344030380249)]

In [41]:
print('Most similar words for "great"\n')
model3.wv.most_similar('great')

Most similar words for "great"



[('fantastic', 0.9265666604042053),
 ('excellent', 0.9168330430984497),
 ('terrific', 0.9165868759155273),
 ('brilliant', 0.905572772026062),
 ('excellant', 0.8913705945014954),
 ('good', 0.8794829249382019),
 ('geat', 0.8719421029090881),
 ('wonderful', 0.8678866028785706),
 ('tremendous', 0.8653309941291809),
 ('fabulous', 0.854750394821167)]

In [42]:
print('Most similar words for "bad"\n')
model3.wv.most_similar('bad')

Most similar words for "bad"



[('terrible', 0.8347867131233215),
 ('horrible', 0.8318676948547363),
 ('awful', 0.8252952694892883),
 ('sucks', 0.8163849711418152),
 ('horrific', 0.8122080564498901),
 ('thats', 0.8046402335166931),
 ('downer', 0.8035178780555725),
 ('kidding', 0.8033908605575562),
 ('lousy', 0.8010213971138),
 ('darn', 0.7956486344337463)]

**Compare similarity of words**

In [43]:
# Similarity between two words
score__1 = model3.similarity('expensive', 'good')
print('Cosine similarity between "expensive" and "good" is: %.2f\n' % score1)

score__2 = model3.similarity('nice', 'good')
print('Cosine similarity between "nice" and "good" is: %.2f\n' % score2)

# Similarity between lists of words
print('Similarity between the lists: ',model3.wv.n_similarity(['beautiful','lovely'],['sucks','awful']))
print('\nSimilarity between the lists: ',model3.wv.n_similarity(['beautiful','lovely'],['nice','great']))

Cosine similarity between "expensive" and "good" is: 0.50

Cosine similarity between "nice" and "good" is: 0.68

Similarity between the lists:  0.5071869

Similarity between the lists:  0.8593899


  
  """


In [44]:
# Calculate distance between words
print('Distance between "terrible" and "horrible": ', model3.wv.distance('terrible','horrible'))
print('Distance between "friendly" and "nice": ', model3.wv.distance('friendly','nice'))

Distance between "terrible" and "horrible":  0.06601947546005249
Distance between "friendly" and "nice":  0.2544809579849243


In [45]:
# Find words that do not match
print('Word that does not match with the rest: ',model3.wv.doesnt_match(['brilliant','service','fabulous']))

# Find the most similar words by using the default "cosine similarity" measure
cos_sim__1 = model3.wv.most_similar(positive=['nice', 'friendly'], negative=['awful'])
most_similar_key, similarity = cos_sim__1[0]  # look at the first match
print(f'The most similar word is "{most_similar_key}": {similarity:.4f}')

cos_sim__2 = model3.wv.similar_by_word('terrible')
most_similar_key, similarity = cos_sim__2[0]  # look at the first match
print(f'The most similar word is "{most_similar_key}": {similarity:.4f}')

Word that does not match with the rest:  service
The most similar word is "courteous": 0.7571
The most similar word is "horrible": 0.9340


  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


**COMPARING RESULTS OF THE 3 MODELS INCLUDING COMMENTS**

In [46]:
print('Most similar words for "expensive"\n')
print('Model 1')
print(model1.wv.most_similar('expensive'))

print('Model 2')
print(model2.wv.most_similar('expensive'))

print('Model 3')
print(model3.wv.most_similar('expensive'))

Most similar words for "expensive"

Model 1
[('pricey', 0.867865264415741), ('pricy', 0.8189133405685425), ('costly', 0.8106584548950195), ('overpriced', 0.8106163740158081), ('cheaper', 0.7656259536743164), ('pricier', 0.7643506526947021), ('costs', 0.7536859512329102), ('outrageous', 0.7531921863555908), ('inflated', 0.7527035474777222), ('priced', 0.7451817989349365)]
Model 2
[('pricey', 0.9085597991943359), ('overpriced', 0.8676849603652954), ('inexpensive', 0.760445773601532), ('cheaper', 0.7571960687637329), ('pricy', 0.7550462484359741), ('cheap', 0.7473670244216919), ('prices', 0.714169979095459), ('option', 0.7082355618476868), ('reasonable', 0.6970846652984619), ('fair', 0.6960881948471069)]
Model 3
[('pricey', 0.918716311454773), ('overpriced', 0.8920618295669556), ('pricy', 0.8858973383903503), ('costly', 0.8527936339378357), ('cheaper', 0.8526806831359863), ('inflated', 0.8352290987968445), ('pricier', 0.8297056555747986), ('competitive', 0.817486584186554), ('prices', 0.8

It seems that the first similar word for "expensive" in all models is the word "pricey", but it achieves the highest similarity score in model 3. It seems that the similar words for "expensive" are ordered differently with various similarity scores, while there are words that may be considered similar for one model, but not similar enough for another (for example, in model 2, the word "costly" is not included as most similar to the word "expensive" like the other models, the word "overpriced" achieved the second higher similarity score in models 2 and 3, but not in model 1). In model 3 the word "cheaper" achieves a very high similarity score, which does not seem absolutely correct.

In [47]:
print('Most similar words for "great"\n')
print('Model 1')
print(model1.wv.most_similar('great'))

print('Model 2')
print(model2.wv.most_similar('great'))

print('Model 3')
print(model3.wv.most_similar('great'))

# I think the word "good" might have achieved lower similarity score than words like "brilliant", "wonderful", "fabulous", "awesome" because it does not seem absolutely right according to vocabulary knowledge.

Most similar words for "great"

Model 1
[('terrific', 0.8998247385025024), ('fantastic', 0.8552649021148682), ('phenomenal', 0.8421623706817627), ('excellent', 0.8375718593597412), ('good', 0.8166176080703735), ('brilliant', 0.8117712736129761), ('excellant', 0.8039320707321167), ('awesome', 0.8012272119522095), ('wonderful', 0.7968956232070923), ('tremendous', 0.7966464757919312)]
Model 2
[('fantastic', 0.891322135925293), ('excellent', 0.8648792505264282), ('terrific', 0.8608647584915161), ('brilliant', 0.8455147743225098), ('wonderful', 0.8205458521842957), ('awesome', 0.8165668249130249), ('fabulous', 0.8154141902923584), ('good', 0.8147371411323547), ('amazing', 0.7870365381240845), ('superb', 0.786649227142334)]
Model 3
[('fantastic', 0.9265666604042053), ('excellent', 0.9168330430984497), ('terrific', 0.9165868759155273), ('brilliant', 0.905572772026062), ('excellant', 0.8913705945014954), ('good', 0.8794829249382019), ('geat', 0.8719421029090881), ('wonderful', 0.86788660287857

In model 1 the word "terrific" achieved the highest similarity score, while in the other two models the word "fantastic". Models 2 and 3 seem to have more similar results up to one point, whereas model 1 presents completely different results. I think the word "good" should have achieved a lower similarity score than words like "brilliant", "wonderful", "fabulous", "awesome" in all models, as it does not seem absolutely correct for someone who is aware of the meaning of the particular vocabulary.

In [52]:
print('Most similar words for "bad"\n')
print('Model 1')
print(model1.wv.most_similar('bad'))

print('Model 2')
print(model2.wv.most_similar('bad'))

print('Model 3')
print(model3.wv.most_similar('bad'))

Most similar words for "bad"

Model 1
[('horrible', 0.7663941383361816), ('terrible', 0.7481241226196289), ('sucks', 0.7375690937042236), ('dreadful', 0.7321875095367432), ('kidding', 0.7221542596817017), ('awful', 0.7106975317001343), ('lousy', 0.7034407258033752), ('darn', 0.698879599571228), ('critical', 0.697730302810669), ('complains', 0.6964166760444641)]
Model 2
[('terrible', 0.7914304733276367), ('horrible', 0.768926739692688), ('poor', 0.7583184242248535), ('awful', 0.7541587352752686), ('ok', 0.7479483485221863), ('okay', 0.7346817255020142), ('negative', 0.7130692005157471), ('worse', 0.7113240957260132), ('complain', 0.7010729312896729), ('thats', 0.6883893609046936)]
Model 3
[('terrible', 0.8347867131233215), ('horrible', 0.8318676948547363), ('awful', 0.8252952694892883), ('sucks', 0.8163849711418152), ('horrific', 0.8122080564498901), ('thats', 0.8046402335166931), ('downer', 0.8035178780555725), ('kidding', 0.8033908605575562), ('lousy', 0.8010213971138), ('darn', 0.795

It seems that model 1 scores higher the word "horrible", while models 2 and 3 score higher the word "terrible". Each model orders differently the score similarities of the word "bad".

In [49]:
print('\nModel 1\n')
# Similarity between two words for model 1
score1 = model1.similarity('expensive', 'good')
print('Cosine similarity between "expensive" and "good" is: %.2f\n' % score1)

score2 = model1.similarity('nice', 'good')
print('Cosine similarity between "nice" and "good" is: %.2f\n' % score2)

# Similarity between lists of words for model 1
print('Similarity between the pairs "beautiful-lovely" and "sucks-awful": ',model1.wv.n_similarity(['beautiful','lovely'],['sucks','awful']))
print('\nSimilarity between the pairs "beautiful-lovely" and "nice-great": ',model1.wv.n_similarity(['beautiful','lovely'],['nice','great']))

print('\nModel 2\n')
# Similarity between two words for model 2
score_1 = model2.similarity('expensive', 'good')
print('Cosine similarity between "expensive" and "good" is: %.2f\n' % score_1)

score_2 = model2.similarity('nice', 'good')
print('Cosine similarity between "nice" and "good" is: %.2f\n' % score_2)

# Similarity between lists of words for model 2
print('Similarity between the "beautiful-lovely" and "sucks-awful": ',model2.wv.n_similarity(['beautiful','lovely'],['sucks','awful']))
print('\nSimilarity between the pairs "beautiful-lovely" and "nice-great": ',model2.wv.n_similarity(['beautiful','lovely'],['nice','great']))

print('\nModel 3\n')
# Similarity between two words for model 3
score__1 = model3.similarity('expensive', 'good')
print('Cosine similarity between "expensive" and "good" is: %.2f\n' % score__1)

score__2 = model3.similarity('nice', 'good')
print('Cosine similarity between "nice" and "good" is: %.2f\n' % score__2)

# Similarity between lists of words for model 3
print('Similarity between the "beautiful-lovely" and "sucks-awful": ',model3.wv.n_similarity(['beautiful','lovely'],['sucks','awful']))
print('\nSimilarity between the pairs "beautiful-lovely" and "nice-great": ',model3.wv.n_similarity(['beautiful','lovely'],['nice','great']))

  This is separate from the ipykernel package so we can avoid doing imports until
  
  from ipykernel import kernelapp as app



Model 1

Cosine similarity between "expensive" and "good" is: 0.50

Cosine similarity between "nice" and "good" is: 0.68

Similarity between the pairs "beautiful-lovely" and "sucks-awful":  0.44613847

Similarity between the pairs "beautiful-lovely" and "nice-great":  0.795597

Model 2

Cosine similarity between "expensive" and "good" is: 0.49

Cosine similarity between "nice" and "good" is: 0.60

Similarity between the "beautiful-lovely" and "sucks-awful":  0.2962033

Similarity between the pairs "beautiful-lovely" and "nice-great":  0.7642074

Model 3

Cosine similarity between "expensive" and "good" is: 0.58

Cosine similarity between "nice" and "good" is: 0.74

Similarity between the "beautiful-lovely" and "sucks-awful":  0.5071869

Similarity between the pairs "beautiful-lovely" and "nice-great":  0.8593899




For the cosine similarity between the pairs "expensive-good", model 2 is more strict and achieved a lower similarity score, which seems more correct than in the other models. Model 3 proved that these words are more than 50% similar, which does not seem right.

For the cosine similarity between the pairs "nice-good", model 3 achieved the highest score of 74%, which seems correct, while model 2 the lowest (60%).

For the similarity between the pairs "beautiful-lovely" and "sucks-awful", model 2 achieved the lowest score (29.6%), which makes sense since these words are not very similar in meaning, whereas model 3 achieved a score of 50.7%.

For the similarity between the pairs "beautiful-lovely" and "nice-great", model 3 achieved the highest similarity score (86.5%) following model 1 with score 79.2%, which are considered good scores for these pairs. 

In [50]:
print('\nModel 1\n')
# Calculate distance between words
print('Distance between "terrible" and "horrible": ', model1.wv.distance('terrible','horrible'))
print('Distance between "friendly" and "nice": ', model1.wv.distance('friendly','nice'))

print('\nModel 2\n')
# Calculate distance between words
print('Distance between "terrible" and "horrible": ', model2.wv.distance('terrible','horrible'))
print('Distance between "friendly" and "nice": ', model2.wv.distance('friendly','nice'))

print('\nModel 3\n')
# Calculate distance between words
print('Distance between "terrible" and "horrible": ', model3.wv.distance('terrible','horrible'))
print('Distance between "friendly" and "nice": ', model3.wv.distance('friendly','nice'))


Model 1

Distance between "terrible" and "horrible":  0.11778444051742554
Distance between "friendly" and "nice":  0.4149606227874756

Model 2

Distance between "terrible" and "horrible":  0.048496782779693604
Distance between "friendly" and "nice":  0.418826699256897

Model 3

Distance between "terrible" and "horrible":  0.06601947546005249
Distance between "friendly" and "nice":  0.2544809579849243


Model 2 assigns a distance closer to zero which denotes greater similarity to the pairs "terrible-horrible",while model 3 the closer distance to the pair "friendly-nice". However, the dictance between the last pair could have been achieved less by the models.


In [51]:
print('\nModel 1\n')
# Find words that do not match for model 1
print('Word that does not match with the rest: ',model1.wv.doesnt_match(['brilliant','service','fabulous']))

# Find the most similar words by using the default "cosine similarity" measure
cos_sim1 = model1.wv.most_similar(positive=['nice', 'friendly'], negative=['awful'])
most_similar_key, similarity = cos_sim1[0]  # look at the first match
print(f'The most similar word is "{most_similar_key}": {similarity:.4f}')

cos_sim2 = model1.wv.similar_by_word('terrible')
most_similar_key, similarity = cos_sim2[0]  # look at the first match
print(f'The most similar word is "{most_similar_key}": {similarity:.4f}')

print('\nModel 2\n')
# Find words that do not match for model 2
print('Word that does not match with the rest: ',model2.wv.doesnt_match(['brilliant','service','fabulous']))

# Find the most similar words by using the default "cosine similarity" measure
cos_sim_1 = model2.wv.most_similar(positive=['nice', 'friendly'], negative=['awful'])
most_similar_key, similarity = cos_sim_1[0]  # look at the first match
print(f'The most similar word is "{most_similar_key}": {similarity:.4f}')

cos_sim_2 = model2.wv.similar_by_word('terrible')
most_similar_key, similarity = cos_sim_2[0]  # look at the first match
print(f'The most similar word is "{most_similar_key}": {similarity:.4f}')

print('\nModel 3\n')
# Find words that do not match for model 3
print('Word that does not match with the rest: ',model3.wv.doesnt_match(['brilliant','service','fabulous']))

# Find the most similar words by using the default "cosine similarity" measure
cos_sim_3 = model3.wv.most_similar(positive=['nice', 'friendly'], negative=['awful'])
most_similar_key, similarity = cos_sim_3[0]  # look at the first match
print(f'The most similar word is "{most_similar_key}": {similarity:.4f}')

cos_sim_4 = model3.wv.similar_by_word('terrible')
most_similar_key, similarity = cos_sim_4[0]  # look at the first match
print(f'The most similar word is "{most_similar_key}": {similarity:.4f}')


Model 1

Word that does not match with the rest:  service
The most similar word is "courteous": 0.6560
The most similar word is "horrible": 0.8822

Model 2

Word that does not match with the rest:  service
The most similar word is "pleasant": 0.7097
The most similar word is "horrible": 0.9515

Model 3

Word that does not match with the rest:  service
The most similar word is "courteous": 0.7571
The most similar word is "horrible": 0.9340


  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


All three models successfully managed to distinguish the word that does not match the set of vocabulary given. 

Given positive and negative words, one word was found that is similar to the positive words and opposite to the negative word. It seems that in models 1 and 3 the word "courteous" is more similar to the words "nice" and "friendly" and more opposite to the word "awful" with model 3 achieving greater score. However, model 2 came up with a different word as result. In model 2, the word "pleasant" was found with a score of 70.9%.

It seems that in all the three models the word "horrible" is more similar to the word "terrible", with model 2 achieving the highest similarity score.

**OVERVIEW: CHARACTERISTICS OF MODELS**

**Model 1**

Training algorithm: Skip-Gram

Size of dimensionality of word vectors: 100

Window, the maximum distance between the target word and its neighboring word: 3

Minimum count, ignoring all words with total frequency lower than this, words must appear this many times to be in vocab: 2

The number of worker threads used to train the model: 10

Negative sampling, Number of negative samples: 5

**Model 2**

Training algorithm: CBOW

Size of dimensionality of word vectors: 100

Window, the maximum distance between the target word and its neighboring word: 3

Minimum count, ignoring all words with total frequency lower than this, words must appear this many times to be in vocab: 1

The number of worker threads used to train the model: 5

Negative sampling, Number of negative samples: 5

**Model 3**

Training algorithm: Skip-Gram

Size of dimensionality of word vectors: 50

Window, the maximum distance between the target word and its neighboring word: 3

Minimum count, ignoring all words with total frequency lower than this, words must appear this many times to be in vocab: 2

The number of worker threads used to train the model: 15

Negative sampling, Number of negative samples: 5

Models 1 and 3 are Skip-Gram Word2Vec models, meaning that they take as input one word and return as output multiple words according to the window size (3). Model 2 is a CBOW Word2Vec model, meaning that it takes the context of each word as the input and tries to predict the word corresponding to the context.  Multiple words are used as input per window size, but it returns one word as output. Apart from the type of training model, I experimented with the size of vector dimensionality by changing it from 100 to 50, the minimum count was used between 1 and 2, as well as the number of worker threads that changed from 10 to 5 to 15. The models presented many interesting results with more differences observed between them than similarities, since a few parameters were changed during building each model. 