In [33]:
import pandas as pd
import re
from bs4 import BeautifulSoup

imdb_dataset_path = 'IMDB Dataset.csv'

# read the data from csv file to a dataframe
imdb_data = pd.read_csv(imdb_dataset_path, delimiter=',', quotechar='"')
imdb_data = imdb_data.dropna().reset_index(drop=True)

# extract the "review" column
reviews = imdb_data['review']

def cleaned_data(review):
    if pd.notna(review):
        # remove html tags
        review = BeautifulSoup(review, 'html.parser').get_text()
        # remove punctuation - convert to lowercase
        review = re.sub(r'[^a-zA-Z\s]', '', review).strip().lower()
        return review
    else:
        return ''

# apply the function to each review
cleaned_reviews = reviews.apply(cleaned_data)

#remove duplicates
cleaned_reviews = cleaned_reviews.dropna().drop_duplicates()

#check the reviews with head command
print(cleaned_reviews.head())

  review = BeautifulSoup(review, 'html.parser').get_text()


0    one of the other reviewers has mentioned that ...
1    a wonderful little production the filming tech...
2    i thought this was a wonderful way to spend ti...
3    basically theres a family where a little boy j...
4    petter matteis love in the time of money is a ...
Name: review, dtype: object


In [34]:
import gensim
import io

print('Tokenizing reviews...')

# Track the total number of tokens in the dataset.
num_tokens = 0
# empty list for reviews to train
reviews = []

# For each review...
for i, review in enumerate(cleaned_reviews):
    # Report progress.
    if ((i % 20000) == 0):
        print('  Read {:,} reviews.'.format(i))

    # Tokenize the review. This returns a list of words.
    parsed = gensim.utils.simple_preprocess(review)
    # Accumulate the total number of words in the dataset.
    num_tokens += len(parsed)
    # Add the review to the list.
    reviews.append(parsed)

print('DONE.')
print('')
print('{:>10,} reviews'.format(i + 1))
print('{:>10,} tokens'.format(num_tokens))
print('{:>10,} avg. tokens / review'.format(int(num_tokens / len(reviews))))
print('')

Tokenizing reviews...
  Read 0 reviews.
  Read 20,000 reviews.
  Read 40,000 reviews.
DONE.

    49,580 reviews
10,683,902 tokens
       215 avg. tokens / review



In [35]:
import logging

# Enable logging at the `INFO` level and set a custom format--the
logging.basicConfig(
    format='%(asctime)s : %(message)s', # Display just time and message.
    datefmt='%H:%M:%S', # Display time, but not the date.
    level=logging.INFO)

In [49]:
#set the parameters
model = gensim.models.Word2Vec(
    vector_size=100,  # Number of features in word vector
    window=5,        # Context window size (in each direction)
    min_count=5,      # Words must appear this many times to be in vocab.
    workers=3,       # Training thread count
    sg=0,             # 0: CBOW, 1: Skip-gram. Default is 0, CBOW
    hs=0,             # 0: Negative Sampling, 1: Hierarchical Softmax Default is 0, NS
    negative=5        # Number of negative samples Default is 5
)

#build the vocabulary using the tokenized reviews
model.build_vocab(reviews, progress_per=20000)

print('Training the model...')

#train the model
model.train(
    reviews,
    total_examples=len(reviews),
    epochs=10,        # How many training passes to take.
    report_delay=10.0 # Report progress every 10 seconds.
)


Training the model...


(80342084, 106839020)

In [43]:
# find 5 most similar words to a given word

word = "bed"
similar_words = model.wv.most_similar(word, topn=5)
print(f"Words similar to {word}: {similar_words}")


Words similar to bed: [('bedroom', 0.6440098285675049), ('bathtub', 0.6371393799781799), ('chair', 0.6043896675109863), ('sleeping', 0.6001318693161011), ('bathroom', 0.5994760394096375)]


In [38]:
word1 ='intelligent'
word2 = 'smart'

# compute similarity between two words
similarity_score = model.wv.similarity(word1 , word2)
print(f"Similarity between {word1} and {word2}: {similarity_score}")

Similarity between intelligent and smart: 0.6015122532844543
The word that does not match: beautiful


In [None]:
# find word that does not match in a list
not_matching = model.wv.doesnt_match(['ugly', 'bad', 'beautiful', 'disappointing'])
print(f"The word that does not match: {not_matching}")

In [50]:
# analogy difference

analogy_result = model.wv.most_similar(positive=['good', 'bad'], negative=['better'], topn=5)
print(f"Analogy difference: {analogy_result}")

Analogy difference: [('badthe', 0.6677297353744507), ('lame', 0.6527222990989685), ('awful', 0.6282227039337158), ('stupid', 0.6139384508132935), ('cool', 0.6059989929199219)]


In [41]:
#TRAINING OF MODEL 2
model_2 = gensim.models.Word2Vec(
    vector_size=150,
    window=10,
    min_count=2,
    workers=10,
    sg=1,      #Skip-gram.
    hs=1,       #Hierarchical Softmax
    negative=10
)

In [42]:
# Build the vocabulary using the tokenized reviews
model_2.build_vocab(reviews, progress_per=20000)

print('Training the model...')

model.train(
    reviews,
    total_examples=len(reviews),
    epochs=10,        # How many training passes to take.
    report_delay=10.0 # Report progress every 10 seconds.
)

print('  Done.')
print('')



Training the model...
  Done.



In [51]:
#EXPERIMENTS ON MODEL 2
word = "bed"
similar_words = model_2.wv.most_similar(word, topn=5)
print(f"Words similar to {word}: {similar_words}")

word1 ='intelligent'
word2 = 'smart'

similarity_score = model_2.wv.similarity(word1 , word2)
print(f"Similarity between {word1} and {word2}: {similarity_score}")

not_matching = model_2.wv.doesnt_match(['ugly', 'bad', 'beautiful', 'disappointing'])
print(f"The word that does not match: {not_matching}")

analogy_result = model_2.wv.most_similar(positive=['good', 'bad'], negative=['better'], topn=5)
print(f"Analogy difference: {analogy_result}")


Words similar to bed: [('roleit', 0.35866743326187134), ('maths', 0.3582352101802826), ('lizzy', 0.33760711550712585), ('pilgers', 0.33652395009994507), ('stowes', 0.3319588005542755)]
Similarity between intelligent and smart: 0.05478953570127487
The word that does not match: disappointing
Analogy difference: [('oneyear', 0.33537420630455017), ('motormouthed', 0.33413413166999817), ('mcfarlane', 0.33332687616348267), ('harlows', 0.3136191964149475), ('implore', 0.30584606528282166)]


In [47]:
#TRAINING OF MODEL 3
model_3 = gensim.models.Word2Vec(
    vector_size=200,
    window=8,
    min_count=10,
    workers=5,
    sg=1,    #  Skip-gram
    hs=0,    # negative Sampling
    negative=7
)

# Build the vocabulary using the tokenized reviews
model_3.build_vocab(reviews, progress_per=20000)

print('Training the third model...')
model_3.train(
    reviews,
    total_examples=len(reviews),
    epochs=15,        # Increase the number of training epochs for better convergence
    report_delay=10.0 # Report progress every 10 seconds
)
print('  Done.')
print('')


Training the third model...
  Done.



In [48]:
#EXPERIMENTS ON MODEL 3

word = "bed"
similar_words = model_3.wv.most_similar(word, topn=5)
print(f"Words similar to {word}: {similar_words}")

word1 ='intelligent'
word2 = 'smart'
similarity_score = model_3.wv.similarity(word1 , word2)
print(f"Similarity between {word1} and {word2}: {similarity_score}")

not_matching = model_3.wv.doesnt_match(['ugly', 'bad', 'beautiful', 'disappointing'])
print(f"The word that does not match: {not_matching}")

analogy_result = model_3.wv.most_similar(positive=['good', 'bad'], negative=['better'], topn=5)
print(f"Analogy difference: {analogy_result}")


Words similar to bed: [('sleep', 0.5334925055503845), ('sleeping', 0.5220689177513123), ('curled', 0.5137509703636169), ('sofa', 0.501558780670166), ('trance', 0.49824783205986023)]
Similarity between intelligent and smart: 0.5877595543861389
The word that does not match: beautiful
Analogy difference: [('awful', 0.5075327754020691), ('terrible', 0.5051887035369873), ('badthe', 0.47965189814567566), ('hilariousthe', 0.4481655955314636), ('cool', 0.4473189115524292)]
