In [1]:
# Add packages

# Import our regular old heroes
import numpy as np
import pandas as pd
import scipy as sp # <-- The sister of Numpy, used in our code for numerical efficientcy.
import matplotlib.pyplot as plt
import seaborn as sns
import random

# Entity featurization and similarity computation
import re
import nltk
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Libraries used during sorting procedures.
import operator # <-- Convienient item retrieval during iteration
import heapq # <-- Efficient sorting of large lists

# Imported for our sanity
import warnings
warnings.filterwarnings('ignore')

In [2]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [3]:

#!unzip "drive/My Drive/train.csv.zip" -d "drive/My Drive"
#!unzip "drive/My Drive/test.csv.zip" -d "drive/My Drive"
#!unzip "drive/My Drive/tags.csv.zip" -d "drive/My Drive"
#!unzip "drive/My Drive/movies.csv.zip" -d "drive/My Drive"
#!unzip "drive/My Drive/links.csv.zip" -d "drive/My Drive"
#!unzip "drive/My Drive/imdb_data.csv.zip" -d "drive/My Drive"
#!unzip "drive/My Drive/genome_scores.csv.zip" -d "drive/My Drive"


In [4]:
# load datasets
df_train = pd.read_csv('drive/My Drive/train.csv')
df_test = pd.read_csv('drive/My Drive/test.csv')
df_movies = pd.read_csv('drive/My Drive/movies.csv')
df_imdb_data = pd.read_csv('drive/My Drive/imdb_data.csv')
df_genome_tags = pd.read_csv('drive/My Drive/genome_tags.csv')
df_genome_scores = pd.read_csv('drive/My Drive/genome_scores.csv')
df_tags = pd.read_csv('drive/My Drive/tags.csv')
df_links = pd.read_csv('drive/My Drive/links.csv')


In [5]:
# get sample of train data due to large size

N_movies  = 10000
df_movies = df_movies.iloc[range(0,N_movies),:]

In [6]:
# create dictionary of counts for each table

countdict = {
  "                      Train_Count": df_train.count(),
  "         Movies_Count": df_movies.count(),
  "         Imdb_Count": df_imdb_data.count(),
  "         Genome_Tags_Count": df_genome_tags.count(),
  "         Genome_Scores_Count": df_genome_scores.count(),
  "         Test_Count": df_test.count(),
  "         Tags_Count": df_tags.count(),
  "         Links_Count": df_links.count()
}
print(countdict)

{'                      Train_Count': userId       10000038
movieId      10000038
rating       10000038
timestamp    10000038
dtype: int64, '         Movies_Count': movieId    10000
title      10000
genres     10000
dtype: int64, '         Imdb_Count': movieId          27278
title_cast       17210
director         17404
runtime          15189
budget            7906
plot_keywords    16200
dtype: int64, '         Genome_Tags_Count': tagId    1128
tag      1128
dtype: int64, '         Genome_Scores_Count': movieId      15584448
tagId        15584448
relevance    15584448
dtype: int64, '         Test_Count': userId     5000019
movieId    5000019
dtype: int64, '         Tags_Count': userId       1093360
movieId      1093360
tag          1093344
timestamp    1093360
dtype: int64, '         Links_Count': movieId    62423
imdbId     62423
tmdbId     62316
dtype: int64}


In [7]:
# View heads of dataframe
df_train.head(), df_test.head(),df_movies.head()

(   userId  movieId  rating   timestamp
 0    5163    57669     4.0  1518349992
 1  106343        5     4.5  1206238739
 2  146790     5459     5.0  1076215539
 3  106362    32296     2.0  1423042565
 4    9041      366     3.0   833375837,
    userId  movieId
 0       1     2011
 1       1     4144
 2       1     5767
 3       1     6711
 4       1     7318,
    movieId                               title  \
 0        1                    Toy Story (1995)   
 1        2                      Jumanji (1995)   
 2        3             Grumpier Old Men (1995)   
 3        4            Waiting to Exhale (1995)   
 4        5  Father of the Bride Part II (1995)   
 
                                         genres  
 0  Adventure|Animation|Children|Comedy|Fantasy  
 1                   Adventure|Children|Fantasy  
 2                               Comedy|Romance  
 3                         Comedy|Drama|Romance  
 4                                       Comedy  )

In [8]:
# View heads of dataframe
df_imdb_data.head(), df_genome_tags.head(), df_genome_scores.head()

(   movieId                                         title_cast  \
 0        1  Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...   
 1        2  Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...   
 2        3  Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...   
 3        4  Whitney Houston|Angela Bassett|Loretta Devine|...   
 4        5  Steve Martin|Diane Keaton|Martin Short|Kimberl...   
 
               director  runtime       budget  \
 0        John Lasseter     81.0  $30,000,000   
 1   Jonathan Hensleigh    104.0  $65,000,000   
 2  Mark Steven Johnson    101.0  $25,000,000   
 3       Terry McMillan    124.0  $16,000,000   
 4       Albert Hackett    106.0  $30,000,000   
 
                                        plot_keywords  
 0                   toy|rivalry|cowboy|cgi animation  
 1                   board game|adventurer|fight|game  
 2                         boat|lake|neighbor|rivalry  
 3  black american|husband wife relationship|betra...  
 4                    fath

In [9]:
# View heads of dataframe
df_test.head(), df_tags.head(), df_links.head()

(   userId  movieId
 0       1     2011
 1       1     4144
 2       1     5767
 3       1     6711
 4       1     7318,
    userId  movieId               tag   timestamp
 0       3      260           classic  1439472355
 1       3      260            sci-fi  1439472256
 2       4     1732       dark comedy  1573943598
 3       4     1732    great dialogue  1573943604
 4       4     7569  so bad it's good  1573943455,
    movieId  imdbId   tmdbId
 0        1  114709    862.0
 1        2  113497   8844.0
 2        3  113228  15602.0
 3        4  114885  31357.0
 4        5  113041  11862.0)

In [10]:
# get a subset of the train data
# clean data and create feature space
# dimensionality reduction
# create cosine similarity
# do the predictions
# calculate RMSE


In [11]:
# combine movie data
df_movies_content = pd.merge(df_movies, df_imdb_data, on=['movieId'], how='left')
df_movies_content.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   movieId        10000 non-null  int64  
 1   title          10000 non-null  object 
 2   genres         10000 non-null  object 
 3   title_cast     5241 non-null   object 
 4   director       5247 non-null   object 
 5   runtime        4866 non-null   float64
 6   budget         2942 non-null   object 
 7   plot_keywords  5176 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 703.1+ KB


In [12]:
df_movies_content.head(100)

Unnamed: 0,movieId,title,genres,title_cast,director,runtime,budget,plot_keywords
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,81.0,"$30,000,000",toy|rivalry|cowboy|cgi animation
1,2,Jumanji (1995),Adventure|Children|Fantasy,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,104.0,"$65,000,000",board game|adventurer|fight|game
2,3,Grumpier Old Men (1995),Comedy|Romance,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Mark Steven Johnson,101.0,"$25,000,000",boat|lake|neighbor|rivalry
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Whitney Houston|Angela Bassett|Loretta Devine|...,Terry McMillan,124.0,"$16,000,000",black american|husband wife relationship|betra...
4,5,Father of the Bride Part II (1995),Comedy,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,106.0,"$30,000,000",fatherhood|doberman|dog|mansion
...,...,...,...,...,...,...,...,...
95,97,"Hate (Haine, La) (1995)",Crime|Drama,Vincent Cassel|Hubert Koundé|Saïd Taghmaoui|Ab...,Mathieu Kassovitz,98.0,"EUR2,590,000",police|arab|gun|ghetto
96,98,Shopping (1994),Action|Thriller,Sadie Frost|Jude Law|Sean Pertwee|Fraser James...,Paul W.S. Anderson,105.0,,shopping|shop|shopping mall|stealing
97,99,Heidi Fleiss: Hollywood Madam (1995),Documentary,Nick Broomfield|Nina Xining Zuo|Madam Alex|Cor...,Nick Broomfield,106.0,,narration|prostitution
98,100,City Hall (1996),Drama|Thriller,Al Pacino|John Cusack|Bridget Fonda|Danny Aiel...,Ken Lipper,111.0,"$40,000,000",mayor|dealer|drug dealer|deputy mayor


In [13]:
nltk.download(['punkt','stopwords'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
#! unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [15]:
# initialize

tokenizer = nltk.RegexpTokenizer(r"\w+")
lemmatizer = WordNetLemmatizer()
stopwords = stopwords.words('english')


In [16]:
# Tokenize, Lemmatize, Stopwords

def text_to_token(s):
    tokens = tokenizer.tokenize(s)
    lower_cased_tokens = [t.lower() for t in tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(t) for t in lower_cased_tokens]
    useful_tokens = [t for t in lemmatized_tokens if t not in stopwords]
    return useful_tokens

In [17]:
def clean_text_two(text):
    #text = str(text)
    text = re.sub('[\d-]', '', text) #remove any numbers
    text = re.sub(r'RT', '', text) #remove the retweets
    return text

In [18]:
def clean_director(text):
    #lower case
    tokens = tokenizer.tokenize(text)
    lower_cased_tokens = [t.lower() for t in tokens]
    #join on space
    return ''.join(lower_cased_tokens)

In [19]:
def clean_genres(text):

    if text == '(no genres listed)':
        clean_text = ''
    else:
        text = text.split('|')
        clean_text = [t.lower() for t in text]
        clean_text = ' '.join(clean_text)

    return clean_text

In [20]:
def clean_title_cast(text):

    text = text.split('|')
    clean_text = [t.lower() for t in text]
    clean_text = [i.replace(" ", "") for i in clean_text]
    clean_text = ' '.join(clean_text)

    return clean_text

In [21]:
def remove_punctuation(text):
    return ''.join([l for l in text if l not in string.punctuation])

In [22]:
# clean title
df_movies_content['title'] = df_movies_content['title'].fillna('')
df_movies_content['title'] = df_movies_content['title'].apply(remove_punctuation)
df_movies_content['title'] = df_movies_content['title'].apply(text_to_token)
df_movies_content['title'] = df_movies_content['title'].apply(lambda x: " ".join(x))

In [23]:
# clean director

df_movies_content['director'] = df_movies_content['director'].fillna('')
df_movies_content['director'] = df_movies_content['director'].apply(clean_director)


In [24]:
# clean genre

df_movies_content['genres'] = df_movies_content['genres'].fillna('')
df_movies_content['genres'] = df_movies_content['genres'].apply(clean_genres)


In [25]:
# clean keywords
df_movies_content['plot_keywords'] = df_movies_content['plot_keywords'].fillna('')
df_movies_content['plot_keywords'] = df_movies_content['plot_keywords'].apply(clean_genres)
df_movies_content['plot_keywords'] = df_movies_content['plot_keywords'].apply(text_to_token)
df_movies_content['plot_keywords'] = df_movies_content['plot_keywords'].apply(lambda x: " ".join(x))

In [26]:
# clean title_cast
df_movies_content['title_cast'] = df_movies_content['title_cast'].fillna('')
df_movies_content['title_cast'] = df_movies_content['title_cast'].apply(clean_title_cast)

In [27]:
df_movies_content.tail()

Unnamed: 0,movieId,title,genres,title_cast,director,runtime,budget,plot_keywords
9995,33750,innocent voice vox inocentes 2004,drama war,carlospadilla leonorvarela gustavomuñoz joséma...,luismandoki,120.0,,1980s el salvador boy young love
9996,33755,godzilla v biollante gojira v biorante 1989,action sci-fi,,,,,
9997,33760,anna king siam 1946,drama romance,,,,,
9998,33767,good neighbor sam 1964,comedy,,,,,
9999,33779,eddie izzard dress kill 1999,comedy,eddieizzard,lawrencejordan,,,stand special stand stand comedian 2nd amendment


In [28]:
df_movies_content['movie_soup'] = (pd.Series(df_movies_content[['title','genres','title_cast','director','plot_keywords']]
                                                .fillna('').values.tolist()).str.join(' '))

# indexes to map between movie titles and indexes of the movies dataframe
#titles = df_movies_content['title']
indices = pd.Series(df_movies_content.index, index=df_movies_content['movieId'])

In [29]:
df_movies_content.head()

Unnamed: 0,movieId,title,genres,title_cast,director,runtime,budget,plot_keywords,movie_soup
0,1,toy story 1995,adventure animation children comedy fantasy,tomhanks timallen donrickles jimvarney wallace...,johnlasseter,81.0,"$30,000,000",toy rivalry cowboy cgi animation,toy story 1995 adventure animation children co...
1,2,jumanji 1995,adventure children fantasy,robinwilliams jonathanhyde kirstendunst bradle...,jonathanhensleigh,104.0,"$65,000,000",board game adventurer fight game,jumanji 1995 adventure children fantasy robinw...
2,3,grumpier old men 1995,comedy romance,waltermatthau jacklemmon sophialoren ann-margr...,markstevenjohnson,101.0,"$25,000,000",boat lake neighbor rivalry,grumpier old men 1995 comedy romance waltermat...
3,4,waiting exhale 1995,comedy drama romance,whitneyhouston angelabassett lorettadevine lel...,terrymcmillan,124.0,"$16,000,000",black american husband wife relationship betra...,waiting exhale 1995 comedy drama romance whitn...
4,5,father bride part ii 1995,comedy,stevemartin dianekeaton martinshort kimberlywi...,alberthackett,106.0,"$30,000,000",fatherhood doberman dog mansion,father bride part ii 1995 comedy stevemartin d...


In [30]:
indices.count()

10000

In [31]:
#tf = TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=0, stop_words='english')

from sklearn.feature_extraction.text import CountVectorizer
tf = CountVectorizer(ngram_range = (1,2))

# Produce a feature matrix, where each row corresponds to a movie, with vectorized features as columns
tf_movie_soup_matrix = tf.fit_transform(df_movies_content['movie_soup'])

In [32]:
print (tf_movie_soup_matrix.shape)

(10000, 187826)


In [33]:
# add other numerical features here

In [34]:
#PCA
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix

svd = TruncatedSVD(n_components = 10)

svd_trunc_tf_movie_soup_matrix = csr_matrix(svd.fit_transform(tf_movie_soup_matrix))

In [35]:
print(svd_trunc_tf_movie_soup_matrix.shape)

(10000, 10)


In [36]:
cosine_sim_movie_soup = cosine_similarity(svd_trunc_tf_movie_soup_matrix, svd_trunc_tf_movie_soup_matrix)

print (cosine_sim_movie_soup.shape)

(10000, 10000)


In [37]:
# Sort train data and limit > 33779
df_train = df_train.sort_values('movieId')
df_train.head()

Unnamed: 0,userId,movieId,rating,timestamp
7064283,120877,1,3.0,1135874095
3761551,113986,1,4.0,975218175
9733691,5152,1,4.0,853732793
8766289,93168,1,4.0,1273344738
341554,108001,1,3.0,945974900


In [38]:
# drop rows with movie id > 101
df_train_cropped = df_train[df_train['movieId'] < 10001]
df_train_cropped.tail()

Unnamed: 0,userId,movieId,rating,timestamp
3560774,85844,9019,3.0,1123845993
368958,20055,9019,2.0,1169103924
4802835,2177,9019,2.0,1209766894
2738864,39905,9019,4.0,1101799135
4818801,24412,9019,4.0,1146603902


In [39]:
# reindex tables

df_users_info = df_train_cropped
df_users_info['userId_movieId'] = df_users_info['userId'].apply(str) + "_" + df_users_info['movieId'].apply(str)

df_users_info = df_users_info.drop(['timestamp'], axis = 1)
df_users_info.tail()


Unnamed: 0,userId,movieId,rating,userId_movieId
3560774,85844,9019,3.0,85844_9019
368958,20055,9019,2.0,20055_9019
4802835,2177,9019,2.0,2177_9019
2738864,39905,9019,4.0,39905_9019
4818801,24412,9019,4.0,24412_9019


In [40]:
def content_generate_rating_estimate(Comb_Id):

    k=2
    threshold=0.9

    user = int(Comb_Id.split('_')[0])
    movie_id = int(Comb_Id.split('_')[1])

    # Convert the movie id to a numeric index for our similarity matrix
    b_idx = indices[movie_id]
    neighbors = [] # <-- Stores our collection of similarity values

    # Gather the similarity ratings between each movie the user has rated and the reference movie
    for index, row in df_users_info[df_users_info['userId']==user].iterrows():
        sim = cosine_sim_movie_soup[b_idx-1, indices[row['movieId']]-1]
        neighbors.append((sim, row['rating']))

    # Select the top-N values from our collection
    k_neighbors = heapq.nlargest(k, neighbors, key=lambda t: t[0])

    # Compute the weighted average using similarity scores and
    # user item ratings.
    simTotal, weightedSum = 0, 0
    for (simScore, rating) in k_neighbors:

        # Ensure that similarity ratings are above a given threshold
        if (simScore > threshold):
            simTotal += simScore
            weightedSum += simScore * rating
    try:
        predictedRating = weightedSum / simTotal
    except ZeroDivisionError:
        # Cold-start problem - No ratings given by user.
        # We use the average rating for the reference item as a proxy in this case
        predictedRating = np.mean(df_users_info[df_users_info['movieId']==movie_id]['rating'])
    return predictedRating

In [41]:
df_users_info['predictedRating'] = df_users_info['userId_movieId'].apply(content_generate_rating_estimate)

KeyboardInterrupt: ignored

In [42]:
df_users_info.head()

Unnamed: 0,userId,movieId,rating,userId_movieId
7064283,120877,1,3.0,120877_1
3761551,113986,1,4.0,113986_1
9733691,5152,1,4.0,5152_1
8766289,93168,1,4.0,93168_1
341554,108001,1,3.0,108001_1


In [None]:
from sklearn.metrics import mean_squared_error

#convert column to numpy array
actual_r = df_users_info['rating'].to_numpy()
predicted_r = df_users_info['predictedRating'].to_numpy()

rmse = np.sqrt(mean_squared_error(actual_r, predicted_r))
rmse

In [None]:
# run predictions on test data

In [None]:
# create submission csv
