In [1]:
import pandas as pd
import numpy as np
import re
import ast
import string
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel


In [2]:
df = pd.read_parquet('data/movies_clean.pq')
df.sample(5)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45429 entries, 0 to 45428
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   belongs_to_collection  4495 non-null   object 
 1   budget                 45429 non-null  float64
 2   genres                 45429 non-null  object 
 3   id                     45429 non-null  int64  
 4   original_language      45418 non-null  object 
 5   overview               45429 non-null  object 
 6   popularity             45429 non-null  object 
 7   production_companies   45429 non-null  object 
 8   production_countries   45429 non-null  object 
 9   release_date           45429 non-null  object 
 10  revenue                45429 non-null  float64
 11  runtime                45183 non-null  float64
 12  status                 45349 non-null  object 
 13  title                  45429 non-null  object 
 14  vote_average           45429 non-null  float64
 15  vo

In [3]:

# df['production_countries'].info()
#df['United States of America' in df['production_countries'].cont]
#'United States of America' in df['production_countries'][0]

df.title = df.title.astype(str)
df = df [df['release_year']>1950]

df = df [df['production_countries'].str.contains('United States of America')]

print(df.shape)
df  = df.drop_duplicates(subset='title', keep='last')
df.shape



(18410, 19)


(17724, 19)

In [4]:
def clean_text(text):
    '''Make text lowercase,remove punctuation
    .'''
    text = str(text).lower().replace('collection','')
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    return text

df['belongs_to_collection'].fillna('', inplace=True)
df['content'] = df['title'] + " " + df['overview'] + " " + df['genres'] + ' ' +\
                df['belongs_to_collection'] + ' ' + df['director']


df['content'] = df['content'].apply(lambda x:clean_text(x))


In [5]:
df['content']

0        toy story led by woody andys toys live happily...
1        jumanji when siblings judy and peter discover ...
2        grumpier old men a family wedding reignites th...
3        waiting to exhale cheated on mistreated and st...
4        father of the bride part ii just when george b...
                               ...                        
45418    the morning after the morning after is a featu...
45422    the burkittsville 7 a film archivist revisits ...
45423    caged heat 3000 its the year 3000 ad the world...
45424    robin hood yet another version of the classic ...
45426    betrayal when one of her hits goes wrong a pro...
Name: content, Length: 17724, dtype: object

In [6]:
df["ids"]=[i for i in range(0,df.shape[0])]
df.tail(3)

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,...,runtime,status,title,vote_average,vote_count,release_year,return,director,content,ids
45423,,0.0,['Science Fiction'],222848,en,It's the year 3000 AD. The world's most danger...,0.661558,['Concorde-New Horizons'],['United States of America'],1995-01-01,...,85.0,Released,Caged Heat 3000,3.5,1.0,1995,0.0,Aaron Osborne,caged heat 3000 its the year 3000 ad the world...,17721
45424,,0.0,"['Drama', 'Action', 'Romance']",30840,en,"Yet another version of the classic epic, with ...",5.683753,"['Westdeutscher Rundfunk (WDR)', 'Working Titl...","['Canada', 'Germany', 'United Kingdom', 'Unite...",1991-05-13,...,104.0,Released,Robin Hood,5.7,26.0,1991,0.0,John Irvin,robin hood yet another version of the classic ...,17722
45426,,0.0,"['Action', 'Drama', 'Thriller']",67758,en,"When one of her hits goes wrong, a professiona...",0.903007,['American World Pictures'],['United States of America'],2003-08-01,...,90.0,Released,Betrayal,3.8,6.0,2003,0.0,Mark L. Lester,betrayal when one of her hits goes wrong a pro...,17723


In [7]:
#create the TF-IDF model
# MAX_DF     = 0.95
# MIN_DF     = 1#2
tfidf = TfidfVectorizer(token_pattern = r"\b\w{3,}\b", stop_words='english',  ngram_range=(1,2))

content = df.content#.dropna()

tfidf_matrix = tfidf.fit_transform(content)


cosine_similarities = None
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)


In [8]:
df.shape


(17724, 21)

In [9]:
# # Count Vectorizer

# content = q.dropna()
# count_vectorizer = CountVectorizer(token_pattern = r"\b\w{3,}\b", stop_words='english',  ngram_range=(1,2))
# count_matrix = count_vectorizer.fit_transform(content)
# cosine_similarities = None
# cosine_similarities = linear_kernel(count_matrix, count_matrix)



In [10]:
all_recomendations  = []

for i in df.ids:
    cosine_similarity_scores = list(enumerate(cosine_similarities[i]))
    cosine_similarity_scores = sorted(cosine_similarity_scores, key=lambda x: x[1], reverse=True)
    recomendations = []
    for s in cosine_similarity_scores[1:6]:
        title = df[df.ids==s[0]]['title'].to_string(index=False)
        recomendations.append(title)

    all_recomendations.append(recomendations)
     

    





In [11]:
df['recomendations'] = all_recomendations

In [12]:
df.to_parquet('data/movies_acotado.pq')

In [13]:
df.recomendations[0]

['Toy Story 2',
 'Toy Story 3',
 'Toy Story of Terror!',
 'Toy Story That Time Forgot',
 'Small Fry']

In [14]:
i = 0
cosine_similarity_scores = list(enumerate(cosine_similarities[i]))
cosine_similarity_scores = sorted(cosine_similarity_scores, key=lambda x: x[1], reverse=True)
recomendations = []
for s in cosine_similarity_scores[1:6]:
    recomendations.append(df[df.ids==s[0]]['title'].to_string(index=False))
    print (df[df.ids==s[0]]['title'].to_string(index=False))

print (recomendations)



Toy Story 2
Toy Story 3
Toy Story of Terror!
Toy Story That Time Forgot
Small Fry
['Toy Story 2', 'Toy Story 3', 'Toy Story of Terror!', 'Toy Story That Time Forgot', 'Small Fry']


In [15]:

cosine_similarity_scores = list(enumerate(cosine_similarities[i]))
cosine_similarity_scores = sorted(cosine_similarity_scores, key=lambda x: x[1], reverse=True)
cosine_similarity_scores 
df.loc[df.ids==0]['title'].to_string(index=False)


'Toy Story'

In [16]:
#del df
df = pd.read_parquet('data/movies_clean.pq')
# df.sample(3)
# ast.literal_eval(df.recomendations[0])
peli = 'Toy Story'
peli = df[df.title == peli]
list(peli.iloc[0].recomendations)
# a  = print(peli.recomendations.to_string(index=False))
# print (a)

# ast.literal_eval(peli.recomendations.to_string(index=False))

#recomendations = list(peli.recomendations)

AttributeError: 'Series' object has no attribute 'recomendations'

In [17]:
productora = 'Tristar Pictures'
df[df.production_companies.str.contains(productora, case=False)]

#df[df['ids'].str.contains("ball")]
# df.production_companies[0]

# df.production_companies.info()

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,status,title,vote_average,vote_count,release_year,return,director
1,,65000000.0,"['Adventure', 'Fantasy', 'Family']",8844,en,When siblings Judy and Peter discover an encha...,17.015539,"['TriStar Pictures', 'Teitler Film', 'Intersco...",['United States of America'],1995-12-15,262797249.0,104.0,Released,Jumanji,6.9,2413.0,1995,4.04,Joe Johnston
90,,47000000.0,"['Drama', 'Horror', 'Thriller', 'Romance']",9095,en,A housemaid falls in love with Dr. Jekyll and ...,12.866139,"['TriStar Pictures', 'NFH Productions']",['United States of America'],1996-02-23,12379402.0,104.0,Released,Mary Reilly,5.7,77.0,1996,0.26,Stephen Frears
116,,0.0,"['Comedy', 'Romance']",10324,en,Joe and Lucy are roommates and best friends. L...,2.891918,"['TriStar Pictures', 'Motion Picture Corporati...",['United States of America'],1996-03-08,0.0,92.0,Released,If Lucy Fell,5.4,11.0,1996,0.00,Eric Schaeffer
118,,0.0,"['Action', 'Adventure', 'Comedy', 'Drama', 'Fa...",55731,en,"A bunch of high school misfits in Hawaii, intr...",1.317878,['TriStar Pictures'],['United States of America'],1996-03-22,0.0,100.0,Released,Race the Sun,5.2,21.0,1996,0.00,Charles T. Kanganis
161,,27000000.0,"['Drama', 'Mystery', 'Thriller']",8512,en,"In late 1940s Los Angeles, Easy Rawlins is an ...",4.646132,"['TriStar Pictures', 'Mundy Lane Entertainment']",['United States of America'],1995-09-15,16140822.0,102.0,Released,Devil in a Blue Dress,6.3,88.0,1995,0.60,Carl Franklin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42164,Trainspotting Collection,18000000.0,"['Crime', 'Drama']",180863,en,"After 20 years abroad, Mark Renton returns to ...",17.645877,"['DNA Films', 'TriStar Pictures', 'Cloud Eight...",['United Kingdom'],2017-01-27,41412709.0,117.0,Released,T2 Trainspotting,7.1,795.0,2017,2.30,Danny Boyle
42960,,0.0,"['Drama', 'Family']",13841,en,A BMX racer who lives in a small town with his...,5.624953,['TriStar Pictures'],['Canada'],1986-03-21,2015882.0,91.0,Released,Rad,7.1,27.0,1986,0.00,Hal Needham
42996,,0.0,"['Action', 'Drama']",15982,en,"Average Texas teen, Billie Jean Davy, is caugh...",2.727843,"['TriStar Pictures', 'Delphi III Productions',...",['United States of America'],1985-07-19,3099497.0,96.0,Released,The Legend of Billie Jean,6.6,44.0,1985,0.00,Matthew Robbins
43624,,34000000.0,"['Action', 'Crime']",339403,en,After being coerced into working for a crime b...,228.032744,"['Big Talk Productions', 'TriStar Pictures', '...","['United Kingdom', 'United States of America']",2017-06-28,224511319.0,113.0,Released,Baby Driver,7.2,2083.0,2017,6.60,Edgar Wright
