<a href="https://colab.research.google.com/github/HofstraDoboli/TextMining/blob/main/movie_feedback_retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Implement Rocchio feedback algorithm**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import os
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import KMeans
from sklearn import metrics
import scipy.sparse


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd /content/drive/MyDrive/TextMining
%ls *.csv

/content/drive/MyDrive/TextMining
wiki_movie_plots_deduped.csv


In [None]:
dir_file = os.getcwd() # returns path to current directory
files_dir = os.listdir(dir_file)  # list of files in current directory

csv_files = [f for f in files_dir if f.endswith('csv')]
print(csv_files)
movie_file = csv_files[0]

movie_df = pd.read_csv(movie_file)
print(movie_df.columns)  # the columns
print(movie_df.shape)

['wiki_movie_plots_deduped.csv']
Index(['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast',
       'Genre', 'Wiki Page', 'Plot'],
      dtype='object')
(34886, 8)


In [None]:
movie_df.Genre.value_counts()[:40]

Unnamed: 0_level_0,count
Genre,Unnamed: 1_level_1
unknown,6083
drama,5964
comedy,4379
horror,1167
action,1098
thriller,966
romance,923
western,865
crime,568
adventure,526


In [None]:
new_movies2 = movie_df[movie_df['Release Year'] > 1900]
selected_movies = new_movies2[new_movies2.Genre.isin(['thriller', 'comedy', 'drama', 'science-fiction', 'sci-fi', 'adventure','western','biographic','crime','silent sports'])]

print(new_movies2.shape)

print(selected_movies.shape)

text_data = selected_movies['Title'] + ' ' + selected_movies['Plot']
text_data.head()

(34886, 8)
(13506, 8)


Unnamed: 0,0
6,The Great Train Robbery The film opens with tw...
7,The Suburbanite The film is about a family who...
14,How Brown Saw the Baseball Game Before heading...
15,Laughing Gas The plot is that of a black woman...
16,The Adventures of Dollie On a beautiful summer...


In [None]:
# genres
movie_df.Genre[movie_df.Genre == 'sci-fi'].value_counts()

Unnamed: 0_level_0,count
Genre,Unnamed: 1_level_1
sci-fi,221


In [None]:
import numpy as np
import spacy   # another tokenizer, lemmatizer (has --> be)
nlp = spacy.load('en_core_web_sm')
nlp.disable_pipes('parser', 'ner')

['parser', 'ner']

In [None]:
def nlp_processing(doc): # from indexing.ipyns
    tokens = nlp(doc)

    #print(type(tokens))
    # eliminates stop words  and non alpha num and converts all to lower case
    terms = [token.lemma_.lower() for token in tokens if token.is_alpha and (not token.is_stop)]

    return terms

In [None]:

# just term frequency in CountVectorizer
vect = CountVectorizer(tokenizer = nlp_processing, min_df = 2) # ? why 50

counter = vect.fit_transform(text_data)
 # matrix representation of each movie as a tf-idf vector
print(len(vect.get_feature_names_out())) # number of terms extracted (length of the vocabulary)

print(vect.get_feature_names_out()[:20])
print(vect.get_feature_names_out()[-20:])

print(counter.shape)
tf_all = counter.sum(axis = 0)
print(tf_all.shape)
ind_terms_sorted = tf_all.argsort()
print(ind_terms_sorted.shape)
print(tf_all[0,ind_terms_sorted[0,0:20]])
print(tf_all[0,ind_terms_sorted[0,-20:]])
print('rarest', vect.get_feature_names_out()[ind_terms_sorted[0,:20]])
print('most common', vect.get_feature_names_out()[ind_terms_sorted[0,-20:]])



34084
['aa' 'aadhi' 'aadi' 'aadmi' 'aadukalam' 'aaj' 'aakash' 'aaker' 'aakhri'
 'aalu' 'aamani' 'aami' 'aamir' 'aan' 'aana' 'aangan' 'aankhen' 'aapke'
 'aaron' 'aaronson']
['zurer' 'zurich' 'zurta' 'zutshi' 'zélie' 'zürich' 'à' 'álvarez' 'ángel'
 'édouard' 'éluard' 'émigré' 'émigrés' 'émile' 'étienne' 'évelyne' 'şahan'
 'šerbedžija' 'ഈ' 'കഥ']
(13506, 34084)
(1, 34084)
(1, 34084)
[[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]]
[[ 6428  6454  6758  6758  6948  7048  7082  7125  7443  7695  7731  7751
   8121  8359  8518  8575 11148 11425 11496 13029]]
rarest [['lanky' 'statistically' 'childlessness' 'statuesque' 'guilde' 'guilder'
  'childer' 'childcare' 'noisily' 'stays' 'stds' 'chiefly' 'chides'
  'nobuo' 'steakhouse' 'guineas' 'guirgis' 'chicano' 'gujarati' 'chicana']]
most common [['work' 'get' 'day' 'time' 'life' 'new' 'home' 'come' 'friend' 'try'
  'love' 'return' 'go' 'father' 'take' 'kill' 'tell' 'man' 'leave' 'find']]


In [None]:
# store the count vectorizer
import pickle
with open('count_vectorizer_movies_F24.pkl', 'wb') as f:
    pickle.dump(vect, f)

# save the counter
scipy.sparse.save_npz('counter_movies_F24.npz', counter)

In [None]:
# load the count vectorizer
import pickle
with open('count_vectorizer_movies_F24.pkl', 'rb') as f:
    vect = pickle.load(f)
# load the counter
counter = scipy.sparse.load_npz('counter_movies_F24.npz')

In [None]:
# compute df frequency from counter
np_counter = counter.toarray()
#print(counter.sum(axis = 0).min())
print(np_counter.shape)
np_counter[np_counter > 0] = 1 # set to 0 or 1 the counts so you can compute document frequency
print(np_counter.max())
doc_freq = np.sum(np_counter, axis = 0) # sum over the rows
print(doc_freq.shape)
#print(doc_freq[:20], max(doc_freq), min(doc_freq))
mat_doc_freq = np.reshape(doc_freq, (doc_freq.shape[0],1)) # reshape the doc_freq from a np array to a matrix [terms, 1]
print(doc_freq.shape, mat_doc_freq.shape)

# apply log function to the df - correcct
nr_docs = counter.shape[0]
print("Nr docs= ", nr_docs)

fct_doc_freq = lambda x: np.log(1+nr_docs/x) # apply the document frequency function
log_doc_freq = fct_doc_freq(mat_doc_freq)

print(mat_doc_freq[:5,0])
print(log_doc_freq[:5,0])


(13506, 34084)
1
(34084,)
(34084,) (34084, 1)
Nr docs=  13506
[ 9 10  2  2  7]
[7.31433088 7.20904435 8.8178902  8.8178902  7.56549731]


In [None]:
#inverted index


In [None]:
# tf_idf
# query = string
def tf_idf(query, vectorizer, term_freq, log_doc_freq, th_query_words_per_doc = 1):
  vect_query = vectorizer.transform(query) # shape (1,nr_terms)
  #vect_query_array = vect_query.toarray()
  #find non-zero items

  nz_terms_query = vect_query.nonzero()[1] # index non-zero query terms

  # find docs with at least x query terms

  docs_query = term_freq[:,nz_terms_query] # .sum(axis = 1) # all columns for the query terms
  docs_query[docs_query >= 1] = 1 # convert all term frequencies to 1
  #print(docs_query.shape)
  docs_query = docs_query.sum(axis = 1) # sum over the columns, find # num of query terms in each doc
  #print(docs_query.shape)
  #print(len(docs_query.nonzero()[0]))
  docs_query[docs_query < th_query_words_per_doc] = 0  # docs that have less that th_query_words_per_doc wods from query
  #print(len(docs_query.nonzero()[0]))
  docs_query =docs_query.nonzero()[0]
  #print(docs_query)

  print(nz_terms_query, docs_query)

  # compute similarity between query vector and doc_query tf-idf vectors
  similarity = dict(zip(docs_query, [0]*len(docs_query)))


  for doc in docs_query:
    for i_nz in nz_terms_query: # for each index of a non-zero element
      similarity[doc] += np.squeeze(vect_query[0,i_nz]* np.log(1+ term_freq[doc,i_nz]) *  log_doc_freq[i_nz]) # np.squeeze eliminates an extra dimension

  # sort similarity
  sorted_sim = dict(sorted(similarity.items(), key = lambda x: x[1], reverse = True))

  return sorted_sim, similarity, docs_query, nz_terms_query

In [None]:
# tf_idf
# query = string
def tf_idf2(vect_query, vectorizer, term_freq, log_doc_freq, th_query_words_per_doc = 1):
  #print(vect_query.shape)
  nz_terms_query = vect_query.nonzero()[1] # index non-zero query terms

  # find docs with at least x query terms

  docs_query = term_freq[:,nz_terms_query] # .sum(axis = 1) # all columns for the query terms
  docs_query[docs_query >= 1] = 1 # convert all term frequencies to 1
  #print(docs_query.shape)
  docs_query = docs_query.sum(axis = 1) # sum over the columns, find # num of query terms in each doc
  #print(docs_query.shape)
  #print(len(docs_query.nonzero()[0]))
  docs_query[docs_query < th_query_words_per_doc] = 0  # docs that have less that th_query_words_per_doc wods from query
  #print(len(docs_query.nonzero()[0]))
  docs_query =docs_query.nonzero()[0]
  #print(docs_query)

  print(nz_terms_query, docs_query)

  # compute similarity between query vector and doc_query tf-idf vectors
  similarity = dict(zip(docs_query, [0]*len(docs_query)))


  for doc in docs_query:
    for i_nz in nz_terms_query: # for each index of a non-zero element
      similarity[doc] += np.squeeze(vect_query[0,i_nz]* np.log(1+ term_freq[doc,i_nz]) *  log_doc_freq[i_nz]) # np.squeeze eliminates an extra dimension

  # sort similarity
  sorted_sim = dict(sorted(similarity.items(), key = lambda x: x[1], reverse = True))

  return sorted_sim, similarity, docs_query, nz_terms_query

In [None]:
# print text movies
import pprint
def print_movie(list_docs):
  for doc in list_docs:
    print('############################################')
    pprint.pp(selected_movies[['Release Year','Title', 'Genre']].iloc[doc])
    pprint.pp(selected_movies['Plot'].iloc[doc])

def print_title(list_docs):
  for doc in list_docs:
    print('############################################')
    pprint.pp(selected_movies[['Title']].iloc[doc])

In [None]:
q1 = ["Inspiring Pirate adventure"]

sorted_sim, similarity, docs_query, nz_terms_query = tf_idf(q1, vect, counter, log_doc_freq, 2)

q1_Docs = list(sorted_sim.keys())[:7]

q1_rel_docs = [2173, 11558, 660, 6348, 2091]

q1_un_rel_docs = [7793,11276]

print(f"\nQuery1 documents: {q1_Docs}")

print(f"\nQuery1 related documents: {q1_rel_docs}")

print(f"\nQuery1 unrelated documents: {q1_un_rel_docs}\n")

print(f"Top 3 Precision for \"Inspiring Pirate adventure\"")
print(f"Precision Value {3/3}\n")

print(f"Top 5 Precision for \"Inspiring Pirate adventure\"")
print(f"Precision Value {4/5}\n")

print(f"Top 7 Precision for \"Inspiring Pirate adventure\"")
print(f"Precision Value {5/7}\n")

print_movie(list(sorted_sim.keys())[:7])


[  382 15008 23028] [  299   660  2091  2173  3426  4490  4731  5537  5702  5709  6196  6348
  6785  7266  7305  7608  7793  8043  8248  8438  8516  9315 11276 11558]

Query1 documents: [2173, 11558, 660, 6348, 7793, 2091, 11276]

Query1 related documents: [2173, 11558, 660, 6348, 2091]

Query1 unrelated documents: [7793, 11276]

Top 3 Precision for "Inspiring Pirate adventure"
Precision Value 1.0

Top 5 Precision for "Inspiring Pirate adventure"
Precision Value 0.8

Top 7 Precision for "Inspiring Pirate adventure"
Precision Value 0.7142857142857143

############################################
Release Year            1945
Title           Captain Kidd
Genre              adventure
Name: 3980, dtype: object
('In 1699, pirate William Kidd (Charles Laughton) loots and destroys the '
 'London galleon The Twelve Apostles. He and three confederates bury the '
 'stolen treasure on a remote island. He then presents himself at the court of '
 'King William III (Henry Daniell) as an honest shipma

In [None]:
q2 = ["depressing ghost story"]

sorted_sim, similarity, docs_query, nz_terms_query = tf_idf(q2, vect, counter, log_doc_freq, 2)

q2_Docs = list(sorted_sim.keys())[:7]

q2_rel_docs = [1826, 11314, 8227, 11664, 11761, 12409]

q2_un_rel_docs = [4515]

print(f"\nQuery2 documents: {q2_Docs}")

print(f"\nQuery2 related documents: {q2_rel_docs}")

print(f"\nQuery2 unrelated documents: {q2_un_rel_docs}\n")

print(f"Top 3 Similarity for \"depressing ghost story\"")
print(f"Precision Value {3/3}\n")

print(f"Top 5 Similarity for \"depressing ghost story\"")
print(f"Precision Value {5/5}\n")

print(f"Top 7 Similarity for \"depressing ghost story\"")
print(f"Precision Value {6/7}\n")


print_movie(list(sorted_sim.keys())[:7])

[ 7717 11982 29127] [ 1513  1572  1826  2254  2406  2459  2853  3906  4515  4996  5987  5998
  6002  6108  6165  6372  6429  6569  6682  7049  7178  7350  7717  7769
  7941  8227  8355  8649  8689  8776  8778  8792  9401  9558 10126 10286
 10756 11074 11242 11314 11664 11679 11701 11744 11750 11761 11816 11879
 12017 12161 12232 12295 12357 12409 12546 12670 12780 12825 12915 13142
 13351]

Query2 documents: [1826, 11314, 8227, 11664, 11761, 4515, 12409]

Query2 related documents: [1826, 11314, 8227, 11664, 11761, 12409]

Query2 unrelated documents: [4515]

Top 3 Similarity for "depressing ghost story"
Precision Value 1.0

Top 5 Similarity for "depressing ghost story"
Precision Value 1.0

Top 7 Similarity for "depressing ghost story"
Precision Value 0.8571428571428571

############################################
Release Year                 1941
Title           The Smiling Ghost
Genre                      comedy
Name: 3190, dtype: object
('The elderly Mrs. Bentley (Helen Westley) and 

In [None]:
q3 = ["romantic comedy high school"]

sorted_sim, similarity, docs_query, nz_terms_query = tf_idf(q3, vect, counter, log_doc_freq, 2)

q3_Docs = list(sorted_sim.keys())[:7]

print(q3_Docs)

q3_rel_docs = [8593,7394,5482,6954]

q3_un_rel_docs = [7437,8389,6267]

print(f"\nQuery3 documents: {q3_Docs}")

print(f"\nQuery3 related documents: {q3_rel_docs}")

print(f"\nQuery3 unrelated documents: {q3_un_rel_docs}\n")

print(f"Top 3 Similarity for \"Based on a true story\"")
print(f"Precision Value {1/3}\n")

print(f"Top 5 Similarity for \"Based on a true story\"")
print(f"Precision Value {1/5}\n")

print(f"Top 7 Similarity for \"Based on a true story\"")
print(f"Precision Value {1/7}\n")

print_movie(list(sorted_sim.keys())[:7])

[ 5862 13667 25855 26735] [  463   541   593   642   721   892   930   937   966  1000  1006  1044
  1074  1121  1123  1132  1211  1214  1287  1332  1356  1364  1371  1485
  1554  1590  1612  1652  1653  1760  1844  1916  1986  2130  2136  2181
  2265  2270  2278  2282  2300  2310  2346  2360  2420  2576  2623  2666
  2672  2677  2688  2723  2731  2764  2797  2804  2820  2973  2976  3091
  3111  3145  3147  3257  3308  3416  3432  3463  3513  3540  3649  3683
  3711  3783  3792  3793  3858  3861  3904  3916  3987  3998  4014  4086
  4136  4145  4162  4184  4194  4201  4238  4318  4333  4338  4370  4372
  4453  4461  4478  4483  4486  4521  4527  4558  4626  4638  4674  4697
  4705  4812  4858  4875  4881  4899  4905  4984  4991  5031  5041  5046
  5063  5101  5138  5158  5164  5176  5185  5196  5211  5240  5241  5284
  5286  5307  5314  5331  5341  5366  5389  5390  5392  5400  5417  5423
  5432  5435  5446  5449  5462  5478  5479  5482  5495  5500  5515  5522
  5523  5528  5530  5531 

In [None]:
selected_movies[selected_movies['Title'].str.contains('Star wars')]

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot


In [None]:
# find modified query vector
# compute df frequency from counter
# orig_query = text
# rel_docs, irel_docs = list of docs indices
def mod_query(orig_query, vectorizer, term_freq, log_doc_freq, rel_docs, not_rel_docs, alpha, beta, gamma):
  vect_orig = vectorizer.transform(orig_query)

  # compute centroid for rel and not_rel docs
  list_rel     = [term_freq.getrow(i) for i in rel_docs]
  list_not_rel = [term_freq.getrow(i) for i in not_rel_docs]



  # Check if list_rel and list_not_rel are empty before applying vstack
  is_empty_rel = len(list_rel) == 0
  is_empty_not_rel = len(list_not_rel) == 0


  if is_empty_rel and is_empty_not_rel:

      return np.array([])  # or any other appropriate default value

  # apply tf function
  nr_docs = term_freq.shape[0]
  fct_tf = lambda x: np.log(x + 1)

  vect_mod = alpha * vect_orig


  if list_rel:
    vect_rel = scipy.sparse.vstack(list_rel).toarray()
    np_rel   = fct_tf(vect_rel)           #  apply tf function
    np_rel   = np_rel * log_doc_freq.T    #  apply idf function
    max_tf_idf_rel     = np.max(np_rel)
    th_rel     = 0.3 * max_tf_idf_rel
    np_rel[np_rel < th_rel] = 0
    centroid_rel     = np.mean(np_rel, axis = 0)
    vect_mod +=  beta * centroid_rel

  if list_not_rel:
    vect_not_rel = scipy.sparse.vstack(list_not_rel).toarray()
    np_not_rel   = fct_tf(vect_not_rel)         # apply tf function
    np_not_rel   = np_not_rel * log_doc_freq.T  # apply idf function
    max_tf_idf_not_rel = np.max(np_not_rel)
    th_not_rel = 0.3 * max_tf_idf_not_rel
    np_not_rel[np_not_rel < th_not_rel] = 0
    centroid_not_rel = np.mean(np_not_rel, axis = 0)
    vect_mod -= gamma * centroid_not_rel
  # apply a threshold for tf_idf values

  #print(max_tf_idf_rel, max_tf_idf_not_rel)
  return vect_mod

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse

def one_rocchio_experiment(vectorizer, term_freq, log_doc_freq, alpha, beta, gamma, query_vector, relevant_docs, non_relevant_docs):
  modified_query = mod_query(query_vector, vectorizer, term_freq, log_doc_freq, relevant_docs, non_relevant_docs, alpha, beta, gamma)

  sorted_sim, similarity, docs_query, nz_terms_query = tf_idf2(modified_query, vectorizer, term_freq, log_doc_freq)

  return sorted_sim, similarity, docs_query, nz_terms_query

## Hyperparameters

In [None]:
alpha = 1
beta_values = [0.1,0.3,0.5,0.7,1]
gamma_values = [0.1,0.3,0.5,0.7,1]

In [None]:
orig_query1 = ["Inspiring Pirate adventure"]

print(f"most precise epoch/epochs:\n")

print(f"epoch 1 with hyperparameters alpha: {1} beta: {0.1} gamma: {0.1}, and precision value of 3/3:{2/3} 5/5:{4/5} 7/7:{6/7}")

print(f"epoch 6 with hyperparameters alpha: {1} beta: {0.3} gamma: {0.1}, and precision value of 3/3:{2/3} 5/5:{4/5} 7/7:{6/7}")

print(f"epoch 17 with hyperparameters alpha: {1} beta: {0.5} gamma: {0.1}, and precision value of 3/3:{2/3} 5/5:{4/5} 7/7:{6/7}")

print(f"epoch 17 with hyperparameters alpha: {1} beta: {0.7} gamma: {0.3}, and precision value of 3/3:{2/3} 5/5:{4/5} 7/7:{6/7}")

print(f"epoch 21 with hyperparameters alpha: {1} beta: {1} gamma: {0.1}, and precision value of 3/3:{2/3} 5/5:{4/5} 7/7:{6/7}")

most precise epoch/epochs:

epoch 6 with hyperparameters alpha: 1 beta: 0.3 gamma: 0.1, and precision value of 3/3:0.6666666666666666 5/5:0.8 7/7:0.8571428571428571
epoch 15 with hyperparameters alpha: 1 beta: 0.5 gamma: 1, and precision value of 3/3:0.6666666666666666 5/5:0.8 7/7:0.8571428571428571
epoch 20 with hyperparameters alpha: 1 beta: 0.7 gamma: 0.1, and precision value of 3/3:0.6666666666666666 5/5:0.8 7/7:0.8571428571428571
epoch 25 with hyperparameters alpha: 1 beta: 1 gamma: 0.1, and precision value of 3/3:0.6666666666666666 5/5:0.8 7/7:0.8571428571428571


In [None]:
epoch = 0
for beta in beta_values:
    for gamma in gamma_values:
      sorted_sim, similarity, docs_query, nz_terms_query = one_rocchio_experiment(vect, counter, log_doc_freq, alpha, beta, gamma, orig_query1, q1_rel_docs, q1_un_rel_docs)
      epoch+=1
      print("Top 7")
      print(f"Hyper-Parameters: alpha:{alpha}, beta: {beta}, gamma: {gamma}, epoch: {epoch}")
      print_title(list(sorted_sim.keys())[:7])

[  382   476   495   662   883  1153  1301  1333  1334  1928  2058  2069
  2159  2576  2699  2732  3238  3511  4402  4420  4519  4557  5130  5371
  5904  5943  6751  7024  7454  8036  8521  8761  9495 10060 10834 11128
 11158 11273 12284 12315 12319 12833 12834 13013 13774 13931 14173 14515
 15008 15797 16006 16209 16436 16921 17089 17476 17871 17896 17920 18155
 18301 18683 18946 19254 19458 20005 20173 20279 20557 20716 20983 21233
 21409 21998 22146 22274 22404 22779 23028 23344 23739 24189 24208 24742
 24989 25393 25487 25626 26074 26266 26362 26540 26693 26822 27260 28427
 29009 29013 29561 29616 29834 30171 30562 30659 30668 30817 30848 30898
 30946 30991 31087 31174 31421 31642 32288 32392 32927 33166 33256 33649] [    4     7    17 ... 13501 13503 13504]
Top 7
Hyper-Parameters: alpha:1, beta: 0.1, gamma: 0.1, epoch: 1
############################################
Title    Project A Part II
Name: 23217, dtype: object
############################################
Title    Captain K

In [None]:
print(f"most precise epoch/epochs:\n")

print(f"epoch 1 with hyperparameters alpha: {1} beta: {0.1} gamma: {0.1}, and precision value of 3/3:{2/3} 5/5:{4/5} 7/7:{6/7}")

print(f"epoch 6 with hyperparameters alpha: {1} beta: {0.3} gamma: {0.1}, and precision value of 3/3:{2/3} 5/5:{4/5} 7/7:{6/7}")

print(f"epoch 17 with hyperparameters alpha: {1} beta: {0.5} gamma: {0.1}, and precision value of 3/3:{2/3} 5/5:{4/5} 7/7:{6/7}")

print(f"epoch 17 with hyperparameters alpha: {1} beta: {0.7} gamma: {0.3}, and precision value of 3/3:{2/3} 5/5:{4/5} 7/7:{6/7}")

print(f"epoch 21 with hyperparameters alpha: {1} beta: {1} gamma: {0.1}, and precision value of 3/3:{2/3} 5/5:{4/5} 7/7:{6/7}")

most precise epoch/epochs:

epoch 1 with hyperparameters alpha: 1 beta: 0.1 gamma: 0.1, and precision value of 3/3:0.6666666666666666 5/5:0.8 7/7:0.8571428571428571
epoch 6 with hyperparameters alpha: 1 beta: 0.3 gamma: 0.1, and precision value of 3/3:0.6666666666666666 5/5:0.8 7/7:0.8571428571428571
epoch 17 with hyperparameters alpha: 1 beta: 0.5 gamma: 0.1, and precision value of 3/3:0.6666666666666666 5/5:0.8 7/7:0.8571428571428571
epoch 17 with hyperparameters alpha: 1 beta: 0.7 gamma: 0.3, and precision value of 3/3:0.6666666666666666 5/5:0.8 7/7:0.8571428571428571
epoch 21 with hyperparameters alpha: 1 beta: 1 gamma: 0.1, and precision value of 3/3:0.6666666666666666 5/5:0.8 7/7:0.8571428571428571


In [None]:
orig_query2 = ["depressing ghost story"]

epoch = 0
for beta in beta_values:
    for gamma in gamma_values:
      sorted_sim, similarity, docs_query, nz_terms_query = one_rocchio_experiment(vect, counter, log_doc_freq, alpha, beta, gamma, orig_query2, q2_rel_docs, q2_un_rel_docs)
      epoch+=1
      print("Top 7")
      print(f"Hyper-Parameters: alpha:{alpha}, beta: {beta}, gamma: {gamma}, epoch: {epoch}")
      print_title(list(sorted_sim.keys())[:7])

[  139   801  2404  2406  2778  2845  3378  3489  4365  4980  6428  6608
  6609  7662  7717  8572  8638  8720  8729  9359 11292 11710 11951 11982
 12823 12888 13091 13862 14247 14501 15999 16114 16326 16398 16492 16644
 16716 16811 17562 17780 17813 17891 17922 18027 18113 18611 18636 18982
 19605 20386 20545 21035 21058 21404 21572 21729 22093 23736 24314 26426
 27399 27800 27801 27833 28654 28802 28810 29127 30290 30366 31426 31489
 31490 31761 33519] [    8    19    20 ... 13489 13502 13503]
Top 7
Hyper-Parameters: alpha:1, beta: 0.1, gamma: 0.1, epoch: 1
############################################
Title    Keyhole
Name: 22275, dtype: object
############################################
Title    Kimmy Dora and The Temple of Kiyeme
Name: 23794, dtype: object
############################################
Title    The Smiling Ghost
Name: 3190, dtype: object
############################################
Title    Ghost Town
Name: 15443, dtype: object
#######################################

In [262]:
print(f"most precise epoch/epochs:\n")

print(f"epoch 1 with hyperparameters alpha: {1} beta: {0.1} gamma: {0.1}, and precision value of 3/3:{2/3} 5/5:{4/5} 7/7:{6/7}")

most precise epoch/epochs:

epoch 1 with hyperparameters alpha: 1 beta: 0.1 gamma: 0.1, and precision value of 3/3:0.6666666666666666 5/5:0.8 7/7:0.8571428571428571


In [None]:
orig_query3 = ["romantic comedy high school"]

epoch = 0
for beta in beta_values:
    for gamma in gamma_values:
      sorted_sim, similarity, docs_query, nz_terms_query = one_rocchio_experiment(vect, counter, log_doc_freq, alpha, beta, gamma, orig_query3, q3_rel_docs, q3_un_rel_docs)
      epoch+=1
      print("Top 7")
      print(f"Hyper-Parameters: alpha:{alpha}, beta: {beta}, gamma: {gamma}, epoch: {epoch}")
      print_title(list(sorted_sim.keys())[:7])


[   24   863   986  1024  1191  1949  2233  2404  2449  2480  2504  2868
  3306  3788  4080  4104  4363  4656  5164  5467  5862  5895  6059  7161
  7196  7272  7598  8585  8612  8697  8749  8927  9045  9083  9487  9795
 10468 10553 11593 12136 12344 12565 12568 12800 12815 13478 13667 13873
 14092 15413 15704 15745 15779 15796 15877 15961 16346 16390 16564 17061
 17190 17279 17405 17883 18195 18462 18540 18936 19092 19636 20057 20323
 20568 20749 21488 21742 21844 21914 22045 22613 22737 22778 22819 23038
 23346 23361 23701 23834 24507 24717 25018 25549 25555 25752 25827 25855
 26360 26362 26488 26735 26751 26798 27014 27280 27419 27464 27823 27928
 29272 29886 30361 30642 30659 32563 32572 32682 33062 33102 33149 33272
 33583 33873] [    6     7    17 ... 13499 13502 13503]
Top 7
Hyper-Parameters: alpha:1, beta: 0.1, gamma: 0.1, epoch: 1
############################################
Title    Movie 43
Name: 16616, dtype: object
############################################
Title    High 

# Part 2 Expand your query with synonyms and thesaurus words. Try using https://pypi.org/project/wordhoard/ to find synonyms of words in the query (after you do nlp processing). If you have another idea for query expansion, please explain it and implement it.

## Use the same queries as before, expand them with additional words (e.g. synonyms, thesaurus) from your collection vocabulary.
## Apply the tf-idf algorithm to the expanded query. Show top 7 results for each query.
## Compute precision in top 3, top 5, and top 7.

In [267]:
!pip3 install wordhoard

Collecting wordhoard
  Downloading wordhoard-1.5.5-py3-none-any.whl.metadata (5.6 kB)
Collecting backoff>=2.2.1 (from wordhoard)
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Collecting cloudscraper>=1.2.71 (from wordhoard)
  Downloading cloudscraper-1.2.71-py2.py3-none-any.whl.metadata (19 kB)
Collecting deckar01-ratelimit>=3.0.2 (from wordhoard)
  Downloading deckar01_ratelimit-3.0.2-py3-none-any.whl.metadata (6.4 kB)
Collecting deepl>=1.18.0 (from wordhoard)
  Downloading deepl-1.19.1-py3-none-any.whl.metadata (27 kB)
Collecting lxml>=5.2.2 (from wordhoard)
  Downloading lxml-5.3.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Collecting requests-toolbelt>=1.0.0 (from wordhoard)
  Downloading requests_toolbelt-1.0.0-py2.py3-none-any.whl.metadata (14 kB)
Downloading wordhoard-1.5.5-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.3/364.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading backoff-2.2.1-p

In [282]:
from wordhoard import Synonyms

def findSynonyms(query):
  for word in query[0].split():
    synonym = Synonyms(search_string = word)
    synonym_result = synonym.find_synonyms()
    print(f"Synonyms for {word}\n")
    print(synonym_result)

In [293]:
from operator import ne
print(f"Query 1: {orig_query1}")
findSynonyms(orig_query1)

print(f"Query 2: {orig_query2}")
findSynonyms(orig_query2)

print(f"Query 3: {orig_query3}")
findSynonyms(orig_query3)

Query 1: ['Inspiring Pirate adventure']
Synonyms for Inspiring

['breathtaking', 'electric', 'electrifying', 'exciting', 'exhilarating', 'fascinating', 'galvanic', 'galvanizing', 'gripping', 'interesting', 'intoxicating', 'intriguing', 'mind-bending', 'mind-blowing', 'provocative', 'rousing', 'stimulating', 'thrilling']
Synonyms for Pirate

['buccaneer', 'corsair', 'freebooter', 'literary pirate', 'marauder', 'pirate ship', 'plagiariser', 'plagiarist', 'plagiarizer', 'plunderer', 'privateer', 'raider', 'robber', 'rover', 'sea robber', 'sea rover']
Synonyms for adventure

['dangerous undertaking', 'escapade', 'experience', 'exploit', 'feat', 'happening', 'ordeal', 'project', 'risky venture', 'scene', 'task', 'time', 'trip', 'undertaking']
Query 2: ['depressing ghost story']
Synonyms for depressing

['black', 'bleak', 'blue', 'cheer', 'cheerfulness', 'cheerless', 'chill', 'cloudy', 'cold', 'comfortless', 'dark', 'darkening', 'daunting', 'depressed', 'depressive', 'desolate', 'dire', 'dis

###HyperParameters

In [294]:
alpha = 1
beta= 0.3
gamma = 0.1

newQ1 = ["breathtaking pirate escapade"]
newQ2 = ["desolate phantom drama"]
newQ3 = ["glamorous comedy academy"]

In [295]:
sorted_sim, similarity, docs_query, nz_terms_query = one_rocchio_experiment(vect, counter, log_doc_freq, alpha, beta, gamma, newQ1, q1_rel_docs, q1_un_rel_docs)
print(f"Hyper-Parameters: alpha:{alpha}, beta: {beta}, gamma: {gamma}")
print_title(list(sorted_sim.keys())[:7])

print(f"Precision value of 3/3:{2/3} 5/5:{4/5} 7/7:{6/7}")

[  382   476   495   662   883  1153  1301  1333  1334  1928  2058  2069
  2159  2576  2699  2732  3238  3511  4402  4420  4519  4557  5130  5371
  5904  5943  6751  7024  7454  8036  8521  8761  9495  9845 10060 10834
 11128 11158 11273 12284 12315 12319 12833 12834 13013 13774 13931 14173
 14515 15797 16006 16209 16436 16921 17089 17476 17871 17896 17920 18155
 18301 18683 18946 19254 19458 20005 20173 20279 20557 20716 20983 21233
 21409 21998 22146 22274 22404 22779 23028 23344 23739 24189 24208 24742
 24989 25393 25487 25626 26074 26266 26362 26540 26693 26822 27260 28427
 29009 29013 29561 29616 29834 30171 30562 30659 30668 30817 30848 30898
 30946 30991 31087 31174 31421 31642 32288 32392 32927 33166 33256 33649] [    4     7    17 ... 13501 13503 13504]


INFO:wordhoard.utilities.cloudflare_bypass:The requested URL is protected by Cloudflare's DDoS mitigation service.
INFO:wordhoard.utilities.cloudflare_bypass:Requested URL: https://www.collinsdictionary.com/dictionary/english-thesaurus/high
INFO:wordhoard.utilities.cloudflare_bypass:Status Code: 403
INFO:wordhoard.utilities.cloudflare_bypass:The requested URL is protected by Cloudflare's DDoS mitigation service.
INFO:wordhoard.utilities.cloudflare_bypass:Requested URL: https://www.collinsdictionary.com/dictionary/english-thesaurus/depressing
INFO:wordhoard.utilities.cloudflare_bypass:The requested URL is protected by Cloudflare's DDoS mitigation service.
INFO:wordhoard.utilities.cloudflare_bypass:Status Code: 403
INFO:wordhoard.utilities.cloudflare_bypass:Requested URL: https://www.collinsdictionary.com/dictionary/english-thesaurus/ghost
INFO:wordhoard.utilities.cloudflare_bypass:Status Code: 403
INFO:wordhoard.utilities.cloudflare_bypass:The requested URL is protected by Cloudflare's 

Hyper-Parameters: alpha:1, beta: 0.3, gamma: 0.1
############################################
Title    Project A Part II
Name: 23217, dtype: object
############################################
Title    Captain Kidd
Name: 3980, dtype: object
############################################
Title    Hook
Name: 11646, dtype: object
############################################
Title    Captain Applejack
Name: 1080, dtype: object
############################################
Title    Abbott and Costello Meet Captain Kidd
Name: 5555, dtype: object
############################################
Title    Double Crossbones
Name: 5346, dtype: object
############################################
Title    Ambrose Applejohn's Adventure
Name: 457, dtype: object


INFO:wordhoard.utilities.cloudflare_bypass:Status Code: 403


In [298]:
sorted_sim, similarity, docs_query, nz_terms_query = one_rocchio_experiment(vect, counter, log_doc_freq, alpha, beta, gamma, newQ2, q2_rel_docs, q2_un_rel_docs)
print(f"Hyper-Parameters: alpha:{alpha}, beta: {beta}, gamma: {gamma}")
print_title(list(sorted_sim.keys())[:7])

print(f"Precision value of 3/3:{2/3} 5/5:{4/5} 7/7:{6/7}")

[  139   801  2404  2406  2778  2845  3378  3489  4365  4980  6428  6608
  6609  7662  7795  8572  8638  8720  8729  8768  9359 11292 11710 11951
 11982 12823 12888 13091 13862 14247 14501 15999 16114 16326 16398 16492
 16644 16716 16811 17562 17780 17813 17891 17922 18027 18113 18611 18636
 18982 19605 20386 20545 21035 21058 21404 21572 21729 22093 22820 23736
 24314 26426 27399 27800 27801 27833 28654 28802 28810 30290 30366 31426
 31489 31490 31761 33519] [    8    19    34 ... 13459 13465 13503]


INFO:wordhoard.utilities.cloudflare_bypass:The requested URL is protected by Cloudflare's DDoS mitigation service.
INFO:wordhoard.utilities.cloudflare_bypass:Requested URL: https://www.collinsdictionary.com/dictionary/english-thesaurus/school
INFO:wordhoard.utilities.cloudflare_bypass:Status Code: 403
INFO:wordhoard.utilities.cloudflare_bypass:The requested URL is protected by Cloudflare's DDoS mitigation service.
INFO:wordhoard.utilities.cloudflare_bypass:Requested URL: https://www.collinsdictionary.com/dictionary/english-thesaurus/ghost
INFO:wordhoard.utilities.cloudflare_bypass:Status Code: 403
INFO:wordhoard.utilities.cloudflare_bypass:The requested URL is protected by Cloudflare's DDoS mitigation service.
INFO:wordhoard.utilities.cloudflare_bypass:Requested URL: https://www.collinsdictionary.com/dictionary/english-thesaurus/story
INFO:wordhoard.utilities.cloudflare_bypass:Status Code: 403
INFO:wordhoard.utilities.cloudflare_bypass:The requested URL is protected by Cloudflare's DDo

Hyper-Parameters: alpha:1, beta: 0.3, gamma: 0.1
############################################
Title    Keyhole
Name: 22275, dtype: object
############################################
Title    Kimmy Dora and The Temple of Kiyeme
Name: 23794, dtype: object
############################################
Title    The Smiling Ghost
Name: 3190, dtype: object
############################################
Title    Ghost Town
Name: 15443, dtype: object
############################################
Title    Mummy
Name: 27544, dtype: object
############################################
Title    Goynar Baksho
Name: 24119, dtype: object
############################################
Title    Children
Name: 34492, dtype: object


INFO:wordhoard.utilities.cloudflare_bypass:The requested URL is protected by Cloudflare's DDoS mitigation service.
INFO:wordhoard.utilities.cloudflare_bypass:Requested URL: https://www.collinsdictionary.com/dictionary/english-thesaurus/Inspiring
INFO:wordhoard.utilities.cloudflare_bypass:Status Code: 403


In [302]:
sorted_sim, similarity, docs_query, nz_terms_query = one_rocchio_experiment(vect, counter, log_doc_freq, alpha, beta, gamma, newQ3, q3_rel_docs, q3_un_rel_docs)
print(f"Hyper-Parameters: alpha:{alpha}, beta: {beta}, gamma: {gamma}")
print_title(list(sorted_sim.keys())[:7])
print(f"Precision value of 3/3:{3/3} 5/5:{5/5} 7/7:{5/7}")

INFO:wordhoard.utilities.cloudflare_bypass:The requested URL is protected by Cloudflare's DDoS mitigation service.


[   24   147   863   986  1024  1191  1949  2233  2404  2449  2480  2504
  2868  3306  3788  4080  4104  4363  4656  5164  5467  5862  5895  6059
  7161  7196  7272  7598  8585  8612  8697  8749  8927  9045  9083  9487
  9795 10468 10553 11593 12115 12136 12344 12565 12568 12800 12815 13478
 13873 14092 15413 15704 15745 15779 15796 15877 15961 16346 16390 16564
 17061 17190 17279 17405 17883 18195 18462 18540 18936 19092 19636 20057
 20323 20568 20749 21488 21742 21844 21914 22045 22613 22737 22778 22819
 23038 23346 23361 23701 23834 24507 24717 25018 25549 25555 25752 25827
 26360 26362 26488 26735 26751 26798 27014 27280 27419 27464 27823 27928
 29272 29886 30361 30642 30659 32563 32572 32682 33062 33102 33149 33272
 33583 33873] [    7    17    18 ... 13499 13502 13503]


INFO:wordhoard.utilities.cloudflare_bypass:Requested URL: https://www.collinsdictionary.com/dictionary/english-thesaurus/school
INFO:wordhoard.utilities.cloudflare_bypass:Status Code: 403
INFO:wordhoard.utilities.cloudflare_bypass:The requested URL is protected by Cloudflare's DDoS mitigation service.
INFO:wordhoard.utilities.cloudflare_bypass:Requested URL: https://www.collinsdictionary.com/dictionary/english-thesaurus/Inspiring
INFO:wordhoard.utilities.cloudflare_bypass:Status Code: 403
INFO:wordhoard.utilities.cloudflare_bypass:The requested URL is protected by Cloudflare's DDoS mitigation service.
INFO:wordhoard.utilities.cloudflare_bypass:The requested URL is protected by Cloudflare's DDoS mitigation service.
INFO:wordhoard.utilities.cloudflare_bypass:Requested URL: https://www.collinsdictionary.com/dictionary/english-thesaurus/comedy
INFO:wordhoard.utilities.cloudflare_bypass:Status Code: 403
INFO:wordhoard.utilities.cloudflare_bypass:Requested URL: https://www.collinsdictionary.

Hyper-Parameters: alpha:1, beta: 0.1, gamma: 0.1
############################################
Title    Movie 43
Name: 16616, dtype: object
############################################
Title    High School High
Name: 12724, dtype: object
############################################
Title    Never Been Kissed
Name: 13524, dtype: object
############################################
Title    Baby It's You
Name: 10050, dtype: object
############################################
Title    Defying Gravity
Name: 12958, dtype: object
############################################
Title    Police Academy 3: Back in Training
Name: 10596, dtype: object
############################################
Title    Let's Be Cops
Name: 16840, dtype: object
Precision value of 3/3:1.0 5/5:1.0 7/7:0.7142857142857143


========== END OF QUERY MODIFICATION ==========

In [None]:
a= np.array([[1,2,3], [4,5,6]])
b = np.array([1,0.5,0.5])
print(a,b)
dp = a * b.T
print(dp)
print(b[:,None])

In [None]:
query = ["aliens spaceship"]
vect_query = vect.transform(query)
vect_query_array = vect_query.toarray()
print(vect_query_array.shape)
words_query = vect.inverse_transform(vect_query)
print(words_query)

In [None]:
# Corrected code
vect1 = CountVectorizer(stop_words="english", lowercase=True, min_df=10)
counter1 = vect1.fit_transform(selected_movies['Plot'])
print(len(vect1.get_feature_names_out()))

print(vect1.get_feature_names_out()[:100])
print(vect1.get_feature_names_out()[-20:])


In [None]:
transf  = TfidfTransformer(norm = None, sublinear_tf = True) # norm = None => no normalization just tf-idf vectors
# TfidfTransformer takes the CountVectorizer output and computes the tf-idf
tf_idf = transf.fit_transform(counter)

In [None]:
from sklearn.metrics import pairwise_distances
m = 'cosine' # 'l2', 'cosine'
print(np.mean(pairwise_distances(tf_idf, metric = m)))
print(np.max(pairwise_distances(tf_idf, metric = m)))
#print(np.min(pairwise_distances(tf_idf, metric = m)))


In [None]:
# use nltk kmeans clusterer
from nltk.cluster import KMeansClusterer, cosine_distance, euclidean_distance
k_clusters = 5
model_nltk = KMeansClusterer(k_clusters,distance = cosine_distance, repeats=10, conv_test=1e-03, initial_means=None,
                             normalise=False, svd_dimensions= None, rng=None, avoid_empty_clusters=False)

out_kmeans = model_nltk.cluster(tf_idf.toarray(), assign_clusters= True, trace = False) # returns a list with cluster indices

In [None]:
print(len(out_kmeans))
labels = np.array(out_kmeans)
centroids_nltk = np.array(model_nltk.means())
print(centroids_nltk.shape) # list of arrays

In [None]:
import math
def my_cos_dist(u, v):
    return  1 - (np.dot(u, v) / (math.sqrt(np.dot(u, u)) * math.sqrt(np.dot(v, v))))

d = my_cos_dist(vect_query_array[0,:],  centroids_nltk[0,:])
print(d)

In [None]:
dist2centroid = np.zeros(k_clusters)
print(dist2centroid.shape)
for i in range(k_clusters):
  dist2centroid[i] = my_cos_dist(vect_query_array[0,:], centroids_nltk[i,:]) # dist_q2centr[i] =
print(dist2centroid)
index_sorted = np.argsort(dist2centroid)
print(index_sorted)
closest_cluster = index_sorted[0]
print(closest_cluster)

In [None]:
# compute distances to movies inside the closest cluster
# index to the movies inside the closest cluster
index_movies = np.where(labels == closest_cluster)[0]
print(index_movies[:5])
print(labels[index_movies[:5]])
dist2movies = np.zeros(len(index_movies)) # a vectod of zeros
tf_idf_array = tf_idf.toarray() # convert to full matrix from sparse matrix
print(tf_idf_array.shape)

for i, ind in  enumerate(index_movies):
  dist2movies[i] = my_cos_dist(vect_query_array[0,:], tf_idf_array[ind,:])

arg_sort_movies = np.argsort(dist2movies, axis = 0)
print(dist2movies[arg_sort_movies[:5]])
closest_movies = arg_sort_movies[:5]
print(closest_movies)
print(dist2movies[closest_movies])
for m in closest_movies:
  index_movie = index_movies[m]
  print(index_movie)
  print(selected_movies.iloc[index_movie].Plot)

In [None]:
# compute distances to movies inside the closest cluster
# index to the movies inside the closest cluster

dist_all_movies = np.zeros(len(tf_idf_array))
for i in range(len(tf_idf_array)):
  dist_all_movies[i] = my_cos_dist(vect_query_array[0,:], tf_idf_array[i,:])

arg_sort_all_movies = np.argsort(dist_all_movies, axis = 0)
closest_all_movies = arg_sort_all_movies[:5]
print(closest_all_movies)
print(labels[closest_all_movies])
print(dist_all_movies[closest_all_movies])
for m in closest_all_movies:
  print(m)
  print(selected_movies.iloc[m].Plot)

In [None]:
print("\nSilhouette Coefficient: %0.3f" %metrics.silhouette_score(tf_idf, out_kmeans, metric = "cosine"))

In [None]:
print("Top terms per cluster:")
order_centroids = centroids_nltk.argsort()[:, ::-1]  # sort and reverse the weights for each term
terms = vect.get_feature_names_out()

for i in range(k_clusters):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:  # print first ten terms from the cluster
        print(' %s' % terms[ind]),
    print

In [None]:
from collections import Counter
# compute purity
# in each cluster - find # of dramas/ # commedies
# choose the max in each cluster = purity per cluster
# sum purity in each cluster/# number of movies
import numpy as np
genre = pd.Categorical(selected_movies.Genre)

#print(genre.shape, index_cluster1.shape, index_cluster2.shape)
purity = np.zeros((k_clusters,1))
for c in range(k_clusters):
    # extract labeks of each genre and count them
    index_cluster = labels == c;
    count_genre = Counter(genre[index_cluster])
    print(c, count_genre)
    purity[c] = max(count_genre.values())

total_purity = np.sum(purity)/len(genre)
print(total_purity)

In [None]:
# plot clusters
import seaborn as sb
import matplotlib.pyplot as plt

from sklearn.decomposition import TruncatedSVD
pca = TruncatedSVD(n_components = 2)
#print('explained variance')
pca.fit(tf_idf)
x2 = pca.transform(tf_idf)

print(x2.shape)
# add labels
data_x2 = pd.DataFrame(x2, columns = ['x','y'])
data_x2['label'] = labels
data_x2['orig_label'] = genre
data_x2.head()

sb.lmplot(data=data_x2, x='x', y='y', hue='orig_label',fit_reg=False, legend=True, legend_out=True)
sb.lmplot(data=data_x2, x='x', y='y', hue='label',fit_reg=False, legend=True, legend_out=True)

In [None]:
inertia = {}
for k_clusters in range(2,11):
  model = KMeans(n_clusters = k_clusters,n_init = 10, verbose = False) # repeats the algorithm for 10 initializations
# returns the best model
# model = MiniBatchKMeans(n_clusters=k_clusters, init='k-means++', max_iter=100, batch_size = 5000,
#                        n_init = 10, verbose = 0)

  model.fit(tf_idf)
  inertia[k_clusters] = model.inertia_
  print("\nSilhouette Coefficient: %0.3f" %metrics.silhouette_score(tf_idf, model.labels_, metric = "l2"))

In [None]:
inertia

In [None]:
# choose k_clusters
k_clusters = 3
model = KMeans(n_clusters = k_clusters,n_init = 50, verbose = False) # repeats the algorithm for 10 initializations
model.fit(tf_idf)
print("\nSilhouette Coefficient: %0.3f" %metrics.silhouette_score(tf_idf, model.labels_, metric = "l2"))

In [None]:
print("Top terms per cluster:")
print(model.cluster_centers_.shape)
order_centroids = model.cluster_centers_.argsort()[:, ::-1]  # sort and reverse the weights for each term
terms = vect.get_feature_names_out()

for i in range(k_clusters):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:  # print first ten terms from the cluster
        print(' %s' % terms[ind]),
    print

In [None]:
# compute homogeneity with genre labels

# get genre for the selected movies
genre = pd.Categorical(selected_movies.Genre)

print("Homogeneity: %0.3f"  % metrics.homogeneity_score(genre, model.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(genre, model.labels_))
print("V-measure: %0.3f"    % metrics.v_measure_score(genre, model.labels_))
print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(genre, model.labels_))

In [None]:
from collections import Counter
# compute purity
# in each cluster - find # of dramas/ # commedies
# choose the max in each cluster = purity per cluster
# sum purity in each cluster/# number of movies
import numpy as np

#print(genre.shape, index_cluster1.shape, index_cluster2.shape)
purity = np.zeros((k_clusters,1))
for c in range(k_clusters):
    # extract labeks of each genre and count them
    index_cluster = model.labels_ == c;
    count_genre = Counter(genre[index_cluster])
    print(c, count_genre)
    purity[c] = max(count_genre.values())

total_purity = np.sum(purity)/len(genre)
print(total_purity)

In [None]:
cluster = 4
index = model.labels_ == cluster
selected_movies[index]

In [None]:
# plot clusters
import seaborn as sb
import matplotlib.pyplot as plt

from sklearn.decomposition import TruncatedSVD
pca = TruncatedSVD(n_components = 2)
#print('explained variance')
pca.fit(tf_idf)
x2 = pca.transform(tf_idf)

In [None]:
print(x2.shape)
# add labels
data_x2 = pd.DataFrame(x2, columns = ['x','y'])
data_x2['label'] = model.labels_
data_x2['orig_label'] = genre
data_x2.head()

In [None]:
sb.lmplot(data=data_x2, x='x', y='y', hue='orig_label',fit_reg=False, legend=True, legend_out=True)

In [None]:
sb.lmplot(data=data_x2, x='x', y='y', hue='label',fit_reg=False, legend=True, legend_out=True)

In [None]:
# check some data points - ??

# find index for drama movies
index_drama =  selected_movies['Genre'].isin(['thriller'])
index_comedy = selected_movies['Genre'].isin(['western'])

print(sorted(Counter(model.labels_[index_drama]).items(),key =
             lambda kv:(kv[1], kv[0]), reverse =True))
print(sorted(Counter(model.labels_[index_comedy]).items(),key =
             lambda kv:(kv[1], kv[0]), reverse =True))


##SVD dimensionality reduction

In [None]:
# project Tfidf model onto singular value decomposition - LSI transform and then do clustering
from sklearn.decomposition import TruncatedSVD

dim = 1000 # 1300 dimensions explain 95% variance
svd = TruncatedSVD(n_components=dim, n_iter = 10)

lsi = svd.fit_transform(tf_idf)
explained_variance = svd.explained_variance_ratio_.sum()
print("Sum of explained variance ratio: %d%%" % (int(explained_variance * 100)))

In [None]:
lsi.shape


In [None]:
from sklearn.metrics import pairwise_distances
m = 'l1' # 'l2', 'cosine'
print(np.mean(pairwise_distances(tf_idf, metric = m)))
print(np.max(pairwise_distances(tf_idf, metric = m)))

print(np.mean(pairwise_distances(lsi, metric = m)))
print(np.max(pairwise_distances(lsi, metric = m)))
#print(np.min(pairwise_distances(tf_idf, metric = m)))

In [None]:
# check singular values
print(svd.singular_values_[:min(dim,20)])

In [None]:
k_clusters = 10
#model_lsi = MiniBatchKMeans(n_clusters=k_clusters, init='k-means++', max_iter=200, batch_size=5000,
#                       n_init = 10)
model_lsi = KMeans(n_clusters=k_clusters, init='k-means++', max_iter=300,
                        n_init = 10)
model_lsi.fit(lsi)
print("\nSilhouette Coefficient: %0.3f" %metrics.silhouette_score(tf_idf, model_lsi.labels_, metric = "euclidean"))

In [None]:
from collections import Counter
# compute purity
# in each cluster - find # of dramas/ # commedies
# choose the max in each cluster = purity per cluster
# sum purity in each cluster/# number of movies
import numpy as np

genre = pd.Categorical(selected_movies.Genre)


#print(genre.shape, index_cluster1.shape, index_cluster2.shape)
purity = np.zeros((k_clusters,1))
for c in range(k_clusters):
    # extract labeks of each genre and count them
    index_cluster = model_lsi.labels_ == c;
    count_genre = Counter(genre[index_cluster])
    print(c, count_genre)
    purity[c] = max(count_genre.values())

total_purity = np.sum(purity)/len(genre)
print(total_purity)

In [None]:
# plot clusters
import seaborn as sb
import matplotlib.pyplot as plt

from sklearn.decomposition import TruncatedSVD
pca = TruncatedSVD(n_components = 2)
#print('explained variance')
pca.fit(tf_idf)
x2 = pca.transform(tf_idf)

In [None]:
print(x2.shape)
# add labels
data_x2 = pd.DataFrame(x2, columns = ['x','y'])
data_x2['label'] = model_lsi.labels_
data_x2['orig_label'] = genre
data_x2.head()

In [None]:
sb.lmplot(data=data_x2, x='x', y='y', hue='orig_label',fit_reg=False, legend=True, legend_out=True)

In [None]:
sb.lmplot(data=data_x2, x='x', y='y', hue='label',fit_reg=False, legend=True, legend_out=True)

##Latent Dirichlet Allocation (LDA) for dimensionality reduction

In [None]:
# LDA with sklearn
from sklearn.decomposition import LatentDirichletAllocation

num_of_topics = 20 # more topics, better to separate the clusters.
lda_transf = LatentDirichletAllocation(
            n_components = num_of_topics, max_iter= 10,
            learning_method = 'online', batch_size = 128)   # 'online' - faster, uses subset of data
lda = lda_transf.fit_transform(counter)
print(lda.shape)
print(lda_transf.components_.shape)

In [None]:
# cluster based on LDA
k_clusters = 3
model_lda = KMeans(k_clusters)
#model_lda = MiniBatchKMeans(n_clusters=k_clusters, init='k-means++', max_iter=200, batch_size=1000,
#                       n_init = 10)
model_lda.fit(lda)
print("\nSilhouette Coefficient: %0.3f" %metrics.silhouette_score(tf_idf, model_lsi.labels_, metric = "euclidean"))

In [None]:
# compute homogeneity with genre labels

# get genre for the selected movies
genre = selected_movies.Genre.copy()
genre = pd.Categorical(genre)
# labels_genre = genre.codes
# print(type(labels_genre), labels_genre.shape)

print("Homogeneity: %0.3f"  % metrics.homogeneity_score(genre, model_lda.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(genre, model_lda.labels_))
print("V-measure: %0.3f"    % metrics.v_measure_score(genre, model_lda.labels_))
print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(genre, model_lda.labels_))

In [None]:
# print top topic indices per cluster
print("Top top words per cluster:")
terms = vect.get_feature_names_out()

# TO DO: order terms in a cluster by tf-idf not just tf
# find movies in each cluster, sum up their counts and print the top words
for k in range(k_clusters):
    index_movies_k = model_lda.labels_ == k
    count_terms_k = sum(counter[index_movies_k,:])   # sum per columns (overall frequency)
    order_terms = count_terms_k.toarray().argsort()[:,::-1].ravel()  # convert to a 1D array
    print('Cluster', k)

    for t in order_terms[:10]:
        print('\t', terms[t], count_terms_k[0,t])

#order_centroids = model_lda.cluster_centers_.argsort()[:, ::-1]  # sort and reverse
#for i in range(k_clusters):
#print("Cluster %d:" % i),
#    for ind in order_centroids[i, :5]:  # print first ten terms from the cluster
#.   print(ind, model_lda.cluster_centers_[i,ind])

In [None]:
from collections import Counter
# compute purity
# in each cluster - find # of dramas/ # commedies
# choose the max in each cluster = purity per cluster
# sum purity in each cluster/# number of movies
import numpy as np

#print(genre.shape, index_cluster1.shape, index_cluster2.shape)
purity = np.zeros((k_clusters,1))
for c in range(k_clusters):
    # extract labeks of each genre and count them
    index_cluster = model_lda.labels_ == c;
    count_genre = Counter(genre[index_cluster])
    print(c, count_genre)
    purity[c] = max(count_genre.values())

total_purity = np.sum(purity)/len(genre)
print(total_purity)

In [None]:
# plot clusters
import seaborn as sb
import matplotlib.pyplot as plt

from sklearn.decomposition import TruncatedSVD
pca = TruncatedSVD(n_components = 2)
#print('explained variance')
pca.fit(tf_idf)
x2 = pca.transform(tf_idf)

In [None]:
print(x2.shape)
# add labels
data_x2 = pd.DataFrame(x2, columns = ['x','y'])
data_x2['label'] = model_lda.labels_
data_x2['orig_label'] = genre
data_x2.head()

In [None]:
sb.lmplot(data=data_x2, x='x', y='y', hue='orig_label',fit_reg=False, legend=True, legend_out=True)

In [None]:
sb.lmplot(data=data_x2, x='x', y='y', hue='label',fit_reg=False, legend=True, legend_out=True)

In [None]:
# get largest distribution of words over topics
feature_names = vect.get_feature_names_out()  # Use the updated method
for topic_idx, topic in enumerate(lda_transf.components_):
    print("Topic %d:" % (topic_idx))
    words = []
    for i in topic.argsort()[:-11:-1]:
        words.append(feature_names[i])
    print(words)


In [None]:
# get distribution of each document over the 10 topics
print(lda[100,:])
print(selected_movies.iloc[100].Plot)
print(model_lda.labels_[100])

In [None]:
# get distribution of drama and comedy films over topics
import numpy as np

index_western =  selected_movies['Genre'].isin(['western'])
index_thriller  = selected_movies['Genre'].isin(['thriller'])
index_sci_fi  = selected_movies['Genre'].isin(['sci-fi'])

print(lda[index_western].mean(axis = 0))
print(lda[index_thriller].mean(axis = 0))
print(lda[index_sci_fi].mean(axis = 0))

In [None]:
# compute purity
# in each cluster - find # of dramas/ # commedies
# choose the max in each cluster = purity per cluster
# sum purity in each cluster/# number of movies
import numpy as np

#print(genre.shape, index_cluster1.shape, index_cluster2.shape)
purity = np.zeros((k_clusters,1))
for c in range(k_clusters):
    # extract labeks of each genre and count them
    index_cluster = model_lda.labels_ == c;
    count_genre = Counter(genre[index_cluster])
    print(c, count_genre)
    purity[c] = max(count_genre.values())

total_purity = np.sum(purity)/len(genre)
print(total_purity)

In [None]:
# print top topic indices per cluster
print("Top topic indices per cluster:")
order_centroids = model_lda.cluster_centers_.argsort()[:, ::-1]  # sort and reverse

for i in range(k_clusters):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :5]:  # print first ten terms from the cluster
        print(ind, model_lda.cluster_centers_[i,ind])


In [None]:
# find the closest movie to a another movie using tf-idf, lda and lsi embeddings

def similar_movie(movie_id, all_embed):

    nr_movies   = all_embed.shape[0]  # number of rows = nr of movies
    movie_embed = all_embed[movie_id,:]


    dist = all_embed.dot(movie_embed.transpose())
    dist[movie_id] = 0;
    print('Most similar movie:', dist.max(), dist.argmax())

    return dist.argmax()

def print_movie(movie_id, all_movies):
    index_title = selected_movies.columns.get_loc('Title')
    index_plot  = selected_movies.columns.get_loc('Plot')
    print(all_movies.iloc[movie_id, index_title],'\n')
    print('\t', all_movies.iloc[movie_id,index_plot])

movie_id = 7594 # 17663 # Ninja terminator 8244 # Indiana Jones # 7595 - Star wars # 2000
print('Original movie')
print_movie(movie_id, selected_movies)
print('\ntf-idf most similar')
similar_tf_idf = similar_movie(movie_id,tf_idf)
print_movie(similar_tf_idf, selected_movies)

print('\nlsi most similar')
similar_lsi    = similar_movie(movie_id,lsi)
print_movie(similar_lsi, selected_movies)

print('\nlda most similar')
similar_lda    = similar_movie(movie_id, lda)
print_movie(similar_lda, selected_movies)


print(selected_movies(100))

In [None]:
# find the index of the star war movie
titles = selected_movies['Title']
index_found = [i for i,t in enumerate(titles.str.find('Star W')) if t != -1]
print(index_found)
print(selected_movies.iloc[index_found[0]])

In [None]:
# show movies in the same cluster