In [39]:
import pandas as pd
import numpy as np
import re
import string
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel


In [40]:
df = pd.read_parquet('data/movies.pq')
df.sample(5)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45429 entries, 0 to 45428
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   belongs_to_collection  4495 non-null   object 
 1   budget                 45429 non-null  float64
 2   genres                 45429 non-null  object 
 3   id                     45429 non-null  int64  
 4   original_language      45418 non-null  object 
 5   overview               45429 non-null  object 
 6   popularity             45429 non-null  object 
 7   production_companies   45429 non-null  object 
 8   production_countries   45429 non-null  object 
 9   release_date           45429 non-null  object 
 10  revenue                45429 non-null  float64
 11  runtime                45183 non-null  float64
 12  spoken_languages       45429 non-null  object 
 13  status                 45349 non-null  object 
 14  tagline                20416 non-null  object 
 15  ti

In [44]:

df['production_countries'].info()
df['United States of America' in df['production_countries']].shape

<class 'pandas.core.series.Series'>
Int64Index: 45429 entries, 0 to 45428
Series name: production_countries
Non-Null Count  Dtype 
--------------  ----- 
45429 non-null  object
dtypes: object(1)
memory usage: 709.8+ KB


KeyError: False

In [13]:
def clean_text(text):
    '''Make text lowercase,remove punctuation
    .'''
    text = str(text).lower().replace('collection','')
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    return text

df['belongs_to_collection'].fillna('', inplace=True)
q = df['title'] + " " + df['overview'] + " " + df['genres'] + ' ' + df['belongs_to_collection']

#q = df['title'] +  " " + df['genres'] + ' ' + df['belongs_to_collection']
q  = q.apply(lambda x:clean_text(x))


In [14]:
q

0        toy story led by woody andys toys live happily...
1        jumanji when siblings judy and peter discover ...
2        grumpier old men a family wedding reignites th...
3        waiting to exhale cheated on mistreated and st...
4        father of the bride part ii just when george b...
                               ...                        
45424    robin hood yet another version of the classic ...
45425    century of birthing an artist struggles to fin...
45426    betrayal when one of her hits goes wrong a pro...
45427    satan triumphant in a small town live two brot...
45428    queerama 50 years after decriminalisation of h...
Length: 45429, dtype: object

In [15]:
q[11882]

'ten canoes a story within a story within a story in australias northern territory an aboriginal narrator tells a story about his ancestors on a goose hunt a youngster on the hunt is being tempted to adultery with his elder brothers wife so an elder tells him a story from the mythical past about how evil can slip in and cause havoc unless prevented by virtue according to customary tribal law adventure comedy drama '

In [16]:
#create the TF-IDF model
# MAX_DF     = 0.95
# MIN_DF     = 1#2
tfidf = TfidfVectorizer(token_pattern = r"\b\w{3,}\b", stop_words='english',  ngram_range=(1,2))

content = q.dropna()[:15000]

tfidf_matrix = tfidf.fit_transform(content)


cosine_similarities = None
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)


In [None]:
# Count Vectorizer

content = q.dropna()[:15000]
count_vectorizer = CountVectorizer(token_pattern = r"\b\w{3,}\b", stop_words='english',  ngram_range=(1,2))
q = q.dropna()
count_matrix = count_vectorizer.fit_transform(content)
cosine_similarities = None
cosine_similarities = linear_kernel(count_matrix, count_matrix)



In [17]:
id_movie = 0
cosine_similarity_scores = list(enumerate(cosine_similarities[id_movie]))
cosine_similarity_scores = sorted(cosine_similarity_scores, key=lambda x: x[1], reverse=True)
cosine_similarity_scores

print (df.iloc[id_movie].title)
for i in range(1, 6):
    idx = cosine_similarity_scores[i]
    print (idx, df.iloc[idx[0]].title)    


Toy Story
(3005, 0.34359414470660926) Toy Story 2
(1078, 0.06640693209089227) Rebel Without a Cause
(485, 0.06563615813973897) Malice
(10968, 0.05415495403008784) The Wild
(11433, 0.0537268354115776) For Your Consideration


In [None]:
# Importamos el lemmatizar de NLTK, y creamos el objeto
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()


In [None]:
import nltk

# Esto sirve para configurar NLTK. La primera vez puede tardar un poco
nltk.download('punkt')
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')


pp =nltk.word_tokenize(df.overview[0])
print (len(pp))
pp = [word for word in pp if word not in stopwords]
print (len(pp))
#df.overview.apply(lambda p : nltk.word_tokenize(p))



59
43


[nltk_data] Downloading package punkt to /home/ozzy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ozzy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Importamos la función que nos permite Stemmizar de nltk y definimos el stemmer
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()



# # Recorremos todos los titulos y le vamos aplicando la Normalizacion y luega el Stemming a cada uno
# for row in df:
#     titular = row.overview
#     # Vamos a reemplzar los caracteres que no sean leras por espacios
#     titular=re.sub("[^a-zA-Z]"," ",str(titular))
#     # Pasamos todo a minúsculas
#     titular=titular.lower()
#     # Tokenizamos para separar las palabras del titular
#     titular=nltk.word_tokenize(titular)
#     # Eliminamos las palabras de menos de 3 letras
#     titular = [palabra for palabra in titular if len(palabra)>3]
#     # Sacamos las Stopwords
#     titular = [palabra for palabra in titular if not palabra in stopwords]
    
#     ## Hasta acá Normalizamos, ahora a stemmizar
    
#     # Aplicamos la funcion para buscar la raiz de las palabras
#     titular=[stemmer.stem(palabra) for palabra in titular]
#     # Por ultimo volvemos a unir el titular
#     titular=" ".join(titular)
    
#     # Vamos armando una lista con todos los titulares
#     titular_list.append(titular)
#     #dataset["titular_normalizado"] = titular_list




def content(row):

    titular = row.overview
    # Vamos a reemplzar los caracteres que no sean leras por espacios
    titular=re.sub("[^a-zA-Z]"," ",str(titular))
    # Pasamos todo a minúsculas
    titular=titular.lower()
    # Tokenizamos para separar las palabras del titular
    titular=nltk.word_tokenize(titular)
    # Eliminamos las palabras de menos de 3 letras
    titular = [palabra for palabra in titular if len(palabra)>3]
    # Sacamos las Stopwords
    titular = [palabra for palabra in titular if not palabra in stopwords]
    
    ## Hasta acá Normalizamos, ahora a stemmizar
    
    # Aplicamos la funcion para buscar la raiz de las palabras
    titular=[stemmer.stem(palabra) for palabra in titular]
    # Por ultimo volvemos a unir el titular
    titular= row.title.lower() + " " + " ".join(titular)

    return titular

df['content'] = df.apply(content, axis=1)

In [None]:
df.title.describe()

count        45451
unique       42195
top       Blackout
freq            13
Name: title, dtype: object

In [None]:
from datetime import datetime
#create the TF-IDF model
MAX_DF     = 0.95
MIN_DF     = 1
tfidf = TfidfVectorizer(#max_df=MAX_DF, min_df=MIN_DF, \
                        #max_features=10000,\
                        ngram_range=(1,2),
                        token_pattern = r"\b\w{5,}\b")

content = df['content'].dropna()[:20000]
tfidf_matrix = tfidf.fit_transform(content)

#tfidf_matrix.shape()



In [None]:
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
import numpy as np
import pyarrow as pa

np_arr = np.array([1.3, 4.22, -5], dtype=np.float32)
pa_table = pa.table({"data": cosine_similarities})
pa.parquet.write_table(pa_table, "data/cosine_similarities.parquet")


In [None]:
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq

matrix = cosine_similarities
arrays = [
    pa.array(col)  # Create one arrow array per column
    for col in matrix
]

table = pa.Table.from_arrays(
    arrays,
    names=[str(i) for i in range(len(arrays))] # give names to each columns
)
# Save it:
pq.write_table(table, 'data/cosine_similarities.pq')

# # Read it back as numpy:
# table_from_parquet = pq.read_table('table.pq')
# matrix_from_parquet = table_from_parquet.to_pandas().T.to_numpy()

In [None]:
t = datetime.now()
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq


# Read it back as numpy:
table_from_parquet = pq.read_table('data/cosine_similarities.pq')
cosine_similarities = table_from_parquet.to_pandas().T.to_numpy()

#cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
#print (type (cosine_similarities))
print ('Tiempo : ', round((datetime.now()-t).total_seconds(), 2) )


[(46, 1.0000000000000004), (1672, 0.11158651823928598), (21, 0.11087828258439557), (476, 0.1063867805038503), (14129, 0.0998881423463085)]
Se7en 1.0000000000000004
Fallen 0.11158651823928598
Copycat 0.11087828258439557
Kalifornia 0.1063867805038503
The Cell 2 0.0998881423463085
Murder by Numbers 0.09956969395686152
Tiempo :  5.87


In [None]:
t = datetime.now()
cosine_similarity_scores = list(enumerate(cosine_similarities[0]))
cosine_similarity_scores = sorted(cosine_similarity_scores, key=lambda x: x[1], reverse=True)
print (cosine_similarity_scores[:5])

for i in cosine_similarity_scores[:6]:
    print (df.loc[i[0]].title, i[1])

print ('Tiempo : ', round((datetime.now()-t).total_seconds(), 2) )
# print (len(df.loc[123].overview))

[(0, 1.0000000000000002), (3006, 0.1846449436245275), (15406, 0.17917450140839977), (1938, 0.10480073050179864), (1039, 0.0860263851872323)]
Toy Story 1.0000000000000002
Toy Story 2 0.1846449436245275
Toy Story 3 0.17917450140839977
Condorman 0.10480073050179864
The Sunchaser 0.0860263851872323
Bound for Glory 0.07745464529937668
Tiempo :  0.02


In [None]:
df.title.loc(df.title=='The Terminator')

TypeError: unhashable type: 'Series'