In [1]:
import pandas as pd
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel


In [2]:
# Widen pandas' console output so wide frames are not wrapped, then load the
# cleaned movie table from the project's parquet file.
pd.options.display.width = 1000
df = pd.read_parquet('data/movies_clean.pq')


In [3]:
# Shrink the dataset: keep only movies released after 1950.
# Titles are cast to str first, so every title is a string from here on.
df['title'] = df['title'].astype(str)
df = df.loc[df['release_year'].gt(1950)]

In [4]:
def clean_text(text):
    '''Normalise a piece of text for the content column.

    The value is converted to ``str``, lower-cased, stripped of every
    occurrence of the substring 'collection' (also inside longer words),
    stripped of all ASCII punctuation, and has its line breaks turned into
    spaces.

    Parameters
    ----------
    text : any
        Value to clean; non-strings are passed through ``str()`` first.

    Returns
    -------
    str
        The cleaned text.
    '''
    text = str(text).lower().replace('collection', '')
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # A line break separates two words: turn it into a space instead of
    # deleting it, otherwise the neighbouring words fuse into one bogus token.
    return text.replace('\n', ' ')

# Build the raw text that feeds the vectorizer: title, overview, genres,
# collection name and director joined with single spaces.

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that is chained assignment and is not guaranteed to write
# through to df.
df['belongs_to_collection'] = df['belongs_to_collection'].fillna('')

# Series.replace() only swaps values that are *entirely* equal to the given
# string, so it never removed these words from inside a collection name.
# .str.replace() works on substrings; case=False because the names have not
# been lower-cased yet at this point.
collection_name = df['belongs_to_collection'].str.replace(
    r'\b(?:collection|series|trilogy)\b', '', case=False, regex=True)

# A missing value in any part would turn the whole concatenation into NaN
# (and then into the literal text "nan"), so empty strings stand in for them.
# NOTE(review): assumes overview, genres and director hold strings — confirm.
df['content'] = (
    df['title'] + " " + df['overview'].fillna('') + " " + df['genres'].fillna('')
    + ' ' + collection_name + ' ' + df['director'].fillna('')
)

# Create the content column for model input
df['content'] = df['content'].apply(clean_text)

In [5]:
# Preview the cleaned text column.
df.loc[:, 'content']

0        toy story led by woody andys toys live happily...
1        jumanji when siblings judy and peter discover ...
2        grumpier old men a family wedding reignites th...
3        waiting to exhale cheated on mistreated and st...
4        father of the bride part ii just when george b...
                               ...                        
45423    caged heat 3000 its the year 3000 ad the world...
45424    robin hood yet another version of the classic ...
45425    century of birthing an artist struggles to fin...
45426    betrayal when one of her hits goes wrong a pro...
45428    queerama 50 years after decriminalisation of h...
Name: content, Length: 38804, dtype: object

In [6]:
# Partition the movies into two frames by production country.
is_us_production = df['production_countries'].str.contains('United States of America')

# Movies whose production countries include the United States
df1 = df.loc[is_us_production]
print(df1.shape)

# Every other movie
df2 = df.loc[~is_us_production]
print(df2.shape)

# The combined frame is not used past this point; drop the name.
del df

(16943, 20)
(21861, 20)


In [7]:
# Give every df1 row a 0-based positional id; the id is what links a title
# to its recommendations.
pd.set_option('mode.chained_assignment', None)  # disable warning
df1["ids"] = list(range(len(df1)))
df1.tail(3)

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,...,runtime,status,title,vote_average,vote_count,release_year,return,director,content,ids
45423,,0.0,['Science Fiction'],222848,en,It's the year 3000 AD. The world's most danger...,0.661558,['Concorde-New Horizons'],['United States of America'],1995-01-01,...,85.0,Released,Caged Heat 3000,3.5,1.0,1995,0.0,Aaron Osborne,caged heat 3000 its the year 3000 ad the world...,16940
45424,,0.0,"['Drama', 'Action', 'Romance']",30840,en,"Yet another version of the classic epic, with ...",5.683753,"['Westdeutscher Rundfunk (WDR)', 'Working Titl...","['Canada', 'Germany', 'United Kingdom', 'Unite...",1991-05-13,...,104.0,Released,Robin Hood,5.7,26.0,1991,0.0,John Irvin,robin hood yet another version of the classic ...,16941
45426,,0.0,"['Action', 'Drama', 'Thriller']",67758,en,"When one of her hits goes wrong, a professiona...",0.903007,['American World Pictures'],['United States of America'],2003-08-01,...,90.0,Released,Betrayal,3.8,6.0,2003,0.0,Mark L. Lester,betrayal when one of her hits goes wrong a pro...,16942


TF-IDF vectorizer Model

In [8]:
# Fit the TF-IDF model on the df1 content: tokens of three or more word
# characters, English stop words removed, unigrams and bigrams.
content = df1['content']
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    token_pattern=r"\b\w{3,}\b",
)
tfidf_matrix = tfidf.fit_transform(content)
# Pairwise dot products between every pair of movie vectors.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

COUNT Vectorizer NOT USED (TfidfVectorizer gives better results)

In [9]:
# # Count Vectorizer

# content = q.dropna()
# count_vectorizer = CountVectorizer(token_pattern = r"\b\w{3,}\b", stop_words='english',  ngram_range=(1,2))
# count_matrix = count_vectorizer.fit_transform(content)
# cosine_similarities = None
# cosine_similarities = linear_kernel(count_matrix, count_matrix)



In [10]:
# Build the recommendations column: for every df1 movie, the titles of the
# five most similar *other* df1 movies, most similar first.

# Row position -> title. df1.ids is the 0-based row position, so a plain list
# replaces the per-lookup DataFrame filter (a full scan of df1 for each title).
titles = df1['title'].tolist()

all_recomendations = []

for i in df1.ids:
    scores = cosine_similarities[i]
    # Six best candidates, best first; the stable sort keeps equal scores in
    # row order. Six are enough because at most one of them is the movie itself.
    candidates = (-scores).argsort(kind='stable')[:6]
    # Drop the movie by id instead of assuming it sorts first: an earlier row
    # with identical content ties with it and wins the tie, and a row with no
    # tokens scores 0 against everything.
    recomendations = [titles[j] for j in candidates if j != i][:5]
    all_recomendations.append(recomendations)


In [11]:
# Attach the per-movie recommendation lists to df1 as a new column.
df1['recomendations'] = pd.Series(all_recomendations, index=df1.index)

In [12]:
### Same for df2

# Create ids column to link title with recommendation (0-based row position)
df2["ids"] = list(range(df2.shape[0]))

# Create the TF-IDF model: tokens of three or more word characters, English
# stop words removed, unigrams and bigrams.
tfidf = TfidfVectorizer(token_pattern=r"\b\w{3,}\b", stop_words='english', ngram_range=(1, 2))
content = df2.content
tfidf_matrix = tfidf.fit_transform(content)
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

# Row position -> title, taken from df2. The similarity matrix was built from
# df2, so its indices must be resolved against df2; resolving them against
# df1 returns unrelated titles (or nothing for ids past the end of df1).
titles = df2['title'].tolist()

# Create list for recommendations column
all_recomendations = []

for i in df2.ids:
    scores = cosine_similarities[i]
    # Six best candidates, best first; the stable sort keeps equal scores in
    # row order. Six are enough because at most one of them is the movie itself.
    candidates = (-scores).argsort(kind='stable')[:6]
    # Drop the movie by id instead of assuming it sorts first: an earlier row
    # with identical content ties with it and wins the tie, and a row with no
    # tokens scores 0 against everything.
    recomendations = [titles[j] for j in candidates if j != i][:5]
    all_recomendations.append(recomendations)

# Create recommendation column
df2['recomendations'] = all_recomendations

In [14]:
# Recommendations for the row labelled 0 (Toy Story)
df1.loc[0, 'recomendations']

['Toy Story 2',
 'Toy Story 3',
 'Toy Story of Terror!',
 'Toy Story That Time Forgot',
 'Small Fry']

In [None]:
# Recommendations for the df2 row whose index label is 50.
# NOTE(review): this is a label lookup, not a positional one — it raises
# KeyError if the movie labelled 50 is not in df2.
df2.loc[50, 'recomendations']


In [24]:
# Stack df1 on top of df2 and persist the combined frame, recommendations included.
# NOTE(review): 'ids' restarts at 0 in each half, so it is not unique in the saved file.
pd.concat(objs=[df1, df2]).to_parquet(path='data/movies_acotado.pq')
