# **Loading Data**

In [137]:
import pandas as pd

In [138]:
# Load the raw movie table from the Excel file. The commented-out line is the
# alternative CSV source; that same CSV is read again further down as `orig`
# when the display metadata is built.
# df = pd.read_csv("./2022 Movies Cleaned.csv")
df = pd.read_excel("data.xls")

In [139]:
# Preview the first rows of the raw table.
df.head()

Unnamed: 0,Title,rank,Year,certificate,Time,genre,inlineblock,Score,textmuted,director,actor1,actor2,actor3,actor4
0,Avatar: The Way of Water,1,-2022,PG-13,192 min,"\nAction, Adventure, Fantasy",7.8,\n67 \n Metascore\n,\nJake Sully lives with his newfound family fo...,James Cameron,Sam Worthington,Zoe Saldana,Sigourney Weaver,Stephen Lang
1,The Menu,2,-2022,R,107 min,"\nHorror, Thriller",7.2,\n71 \n Metascore\n,\nA young couple travels to a remote island to...,Mark Mylod,Ralph Fiennes,Anya Taylor-Joy,Nicholas Hoult,Hong Chau
2,Babylon,3,(I) (2022),R,189 min,"\nComedy, Drama, History",7.5,\n60 \n Metascore\n,\nA tale of outsized ambition and outrageous e...,Damien Chazelle,Brad Pitt,Margot Robbie,Jean Smart,Olivia Wilde
3,Everything Everywhere All at Once,4,-2022,R,139 min,"\nAction, Adventure, Comedy",8.0,\n81 \n Metascore\n,\nA middle-aged Chinese immigrant is swept up ...,Dan Kwan,Daniel Scheinert,Michelle Yeoh,Stephanie Hsu,Jamie Lee Curtis
4,M3GAN,5,-2022,PG-13,102 min,"\nHorror, Sci-Fi, Thriller",6.4,\n72 \n Metascore\n,\nA robotics engineer at a toy company builds ...,Gerard Johnstone,Allison Williams,Violet McGraw,Ronny Chieng,Amie Donald


# **Preprocessing**

### Basic Cleaning

In [140]:
# Drop fully duplicated rows. The surviving rows keep their original index
# labels, so labels and positions no longer coincide after this step.
df = df.drop_duplicates()

In [141]:
# Report the number of missing values per column, then replace every missing
# value with an empty string.
missing_per_column = df.isna().sum()
print(missing_per_column)
df = df.fillna("")

Title             0
rank             16
Year             39
certificate    8995
Time           2903
genre           391
inlineblock    3733
Score          9556
textmuted        35
director        127
actor1          333
actor2          526
actor3          822
actor4         1266
dtype: int64


In [142]:
# Cast every column to str: the cleaning helpers below call string methods
# (`split`, `lower`) on each cell value.
df = df.astype(str)

### Genres

In [143]:
# Whitelist of genre labels, spelled exactly as they appear in the raw data.
# genres_clean() keeps only the labels listed here.
unique_genres = [
    "Action",
    "Adventure",
    "Fantasy",
    "Horror",
    "Thriller",
    "Comedy",
    "Drama",
    "History",
    "Sci-Fi",
    "Animation",
    "Crime",
    "Mystery",
    "Music",
    "Romance",
    "War",
    "Biography",
    "Family",
    "Sport",
    "Musical",
    "Western",
    "Reality-TV",
    "News",
    "Game-Show",
    "Talk-Show",
]

In [144]:
def genres_clean(row):
    """Turn a comma-separated genre string into space-separated lowercase tokens.

    Each comma-separated piece is stripped of surrounding whitespace and kept
    only if it is listed in `unique_genres`. The kept labels are joined with
    single spaces, lowercased, and hyphens are removed ("Sci-Fi" -> "scifi").
    """
    kept = []
    for piece in row.split(","):
        label = piece.strip()
        if label in unique_genres:
            kept.append(label)
    return " ".join(kept).lower().replace("-", "")

In [145]:
# Spot-check the cleaner on the row whose index label is 30.
genres_clean(df.genre[30])

'action crime drama'

In [146]:
# Inspect the row at position 1714. In the saved output its values are shifted
# into the wrong columns (URLs under rank/Year, "-2022" under genre,
# "126 min" under Score), i.e. the source data contains malformed rows.
df.iloc[1714]

Title                                         #69 Samskar Colony
rank           https://www.imdb.com/title/tt18673736/?ref_=ad...
Year           https://m.media-amazon.com/images/S/sash/4Fyxw...
certificate    https://www.imdb.com/title/tt18673736/?ref_=ad...
Time                                                        1715
genre                                                      -2022
inlineblock                                                     
Score                                                    126 min
textmuted                               \nRomance            """
director       \nKoushik, a teenage boy, moves to the city wi...
actor1         \n    Director:\nSuneel Kumar Reddy\n         ...
actor2         https://www.imdb.com/name/nm6190049/?ref_=adv_...
actor3                                        Suneel Kumar Reddy
actor4         https://www.imdb.com/name/nm2496992/?ref_=adv_...
Name: 1714, dtype: object

In [147]:
# Collect the positions of rows whose genre value makes genres_clean() fail,
# so they can be inspected in the next cell.
js = []
for position, raw_genre in enumerate(df.genre):
    try:
        genres_clean(raw_genre)
    except Exception:
        # Deliberately not a bare `except:`: that would also swallow
        # KeyboardInterrupt/SystemExit and make the scan impossible to stop.
        js.append(position)

In [148]:
# Show the rows (by position) that failed genre cleaning; the saved output is
# empty, i.e. no row failed.
df.iloc[js]

Unnamed: 0,Title,rank,Year,certificate,Time,genre,inlineblock,Score,textmuted,director,actor1,actor2,actor3,actor4


In [149]:
# Replace the raw genre strings with their cleaned form.
# NOTE: this cell is not safe to run twice. The cleaned values are lowercase
# and space-separated, so a second pass matches nothing in `unique_genres`
# and turns every genre into an empty string.
df.genre = df.genre.apply(genres_clean)

In [150]:
# Preview the table after the genre column has been cleaned.
df.head()

Unnamed: 0,Title,rank,Year,certificate,Time,genre,inlineblock,Score,textmuted,director,actor1,actor2,actor3,actor4
0,Avatar: The Way of Water,1,-2022,PG-13,192 min,action adventure fantasy,7.8,\n67 \n Metascore\n,\nJake Sully lives with his newfound family fo...,James Cameron,Sam Worthington,Zoe Saldana,Sigourney Weaver,Stephen Lang
1,The Menu,2,-2022,R,107 min,horror thriller,7.2,\n71 \n Metascore\n,\nA young couple travels to a remote island to...,Mark Mylod,Ralph Fiennes,Anya Taylor-Joy,Nicholas Hoult,Hong Chau
2,Babylon,3,(I) (2022),R,189 min,comedy drama history,7.5,\n60 \n Metascore\n,\nA tale of outsized ambition and outrageous e...,Damien Chazelle,Brad Pitt,Margot Robbie,Jean Smart,Olivia Wilde
3,Everything Everywhere All at Once,4,-2022,R,139 min,action adventure comedy,8.0,\n81 \n Metascore\n,\nA middle-aged Chinese immigrant is swept up ...,Dan Kwan,Daniel Scheinert,Michelle Yeoh,Stephanie Hsu,Jamie Lee Curtis
4,M3GAN,5,-2022,PG-13,102 min,horror scifi thriller,6.4,\n72 \n Metascore\n,\nA robotics engineer at a toy company builds ...,Gerard Johnstone,Allison Williams,Violet McGraw,Ronny Chieng,Amie Donald


### Cast

In [151]:
def clean_cast(row):
    """Reduce a person's name to a single lowercase a-z token.

    The text is lowercased and every character outside a-z (spaces, hyphens,
    digits, accented letters, ...) is dropped, so "Anya Taylor-Joy" becomes
    "anyataylorjoy".
    """
    allowed = "abcdefghijklmnopqrstuvwxyz"
    return "".join(letter for letter in row.lower() if letter in allowed)

# Normalise every people column to the single-token form produced by clean_cast.
for people_column in ("director", "actor1", "actor2", "actor3", "actor4"):
    df[people_column] = df[people_column].apply(clean_cast)

In [152]:
# Join the five cleaned name columns into one space-separated string per row
# (director first, then the four actors).
cast = df.director + " " + df.actor1 + " " + df.actor2 + " " + df.actor3 + " " + df.actor4

In [153]:
# Store the joined names as a new column.
df['cast'] = cast

In [154]:
# Preview the table after the cast column has been added.
df.head()

Unnamed: 0,Title,rank,Year,certificate,Time,genre,inlineblock,Score,textmuted,director,actor1,actor2,actor3,actor4,cast
0,Avatar: The Way of Water,1,-2022,PG-13,192 min,action adventure fantasy,7.8,\n67 \n Metascore\n,\nJake Sully lives with his newfound family fo...,jamescameron,samworthington,zoesaldana,sigourneyweaver,stephenlang,jamescameron samworthington zoesaldana sigourn...
1,The Menu,2,-2022,R,107 min,horror thriller,7.2,\n71 \n Metascore\n,\nA young couple travels to a remote island to...,markmylod,ralphfiennes,anyataylorjoy,nicholashoult,hongchau,markmylod ralphfiennes anyataylorjoy nicholash...
2,Babylon,3,(I) (2022),R,189 min,comedy drama history,7.5,\n60 \n Metascore\n,\nA tale of outsized ambition and outrageous e...,damienchazelle,bradpitt,margotrobbie,jeansmart,oliviawilde,damienchazelle bradpitt margotrobbie jeansmart...
3,Everything Everywhere All at Once,4,-2022,R,139 min,action adventure comedy,8.0,\n81 \n Metascore\n,\nA middle-aged Chinese immigrant is swept up ...,dankwan,danielscheinert,michelleyeoh,stephaniehsu,jamieleecurtis,dankwan danielscheinert michelleyeoh stephanie...
4,M3GAN,5,-2022,PG-13,102 min,horror scifi thriller,6.4,\n72 \n Metascore\n,\nA robotics engineer at a toy company builds ...,gerardjohnstone,allisonwilliams,violetmcgraw,ronnychieng,amiedonald,gerardjohnstone allisonwilliams violetmcgraw r...


### Context

In [155]:
# Look at the raw summary column. The saved output shows leading newlines and
# placeholder entries such as "Add a Plot".
df.textmuted

0        \nJake Sully lives with his newfound family fo...
1        \nA young couple travels to a remote island to...
2        \nA tale of outsized ambition and outrageous e...
3        \nA middle-aged Chinese immigrant is swept up ...
4        \nA robotics engineer at a toy company builds ...
                               ...                        
10041    \nDon't miss any of the action, strategy, and ...
10042                               \n        Add a Plot\n
10043                               \n        Add a Plot\n
10044    \nDeep in the work of resolving the phobias an...
10045                               \n        Add a Plot\n
Name: textmuted, Length: 10033, dtype: object

In [156]:
import re
from textblob import TextBlob
from nltk.corpus import stopwords
import string

def clean_context(row, spell, stopword):
    """Normalise a plot summary into space-separated lowercase a-z words.

    Steps, in order: lowercase, strip HTML tags, strip URLs, strip ASCII
    punctuation, optionally spell-correct with TextBlob, drop every character
    that is not a-z from each word, optionally drop English stop words.

    Args:
        row: Raw text of one summary.
        spell: If truthy, run TextBlob spell correction on the text.
        stopword: If truthy, remove words found in NLTK's English stop-word
            list.

    Returns:
        The surviving words joined by single spaces. Words that become empty
        after filtering (for example "2022") are omitted instead of leaving
        blank tokens in the result.
    """
    alpha = "abcdefghijklmnopqrstuvwxyz"

    # lowercasing
    row = row.lower()

    # removing HTML tags
    row = re.sub('<.*?>', r'', row)

    # Removing URLS
    row = re.sub(r'https?://\S+|www\.\S+', r'', row)

    # Remove Punctuations
    row = row.translate(str.maketrans("", "", string.punctuation))

    # Spell Correction
    if spell:
        row = TextBlob(row).correct().string

    # Fetch the stop-word list once per call instead of once per word, and use
    # a set so each membership test is constant time.
    stop_set = frozenset(stopwords.words("english")) if stopword else frozenset()

    new_words = []
    # Lowercase again in case spell correction changed the casing.
    for word in row.lower().split():
        new_word = "".join(char for char in word if char in alpha)
        if not new_word:
            # Nothing but digits/symbols was left of this word.
            continue
        if new_word not in stop_set:
            new_words.append(new_word)
    return " ".join(new_words)


import json
import os

# Cleaning every summary is slow, so the result is cached on disk. When the
# cache file is missing it is rebuilt (previously this code was commented out
# and a fresh run failed with FileNotFoundError).
# NOTE: the cache is positional. Delete it whenever the input rows or
# clean_context() change, otherwise stale text is attached to the rows.
CONTEXT_CACHE = "context.json"

if os.path.exists(CONTEXT_CACHE):
    with open(CONTEXT_CACHE, 'rb') as f:
        context = json.load(f)
else:
    from tqdm import tqdm

    context = [
        clean_context(row, spell=False, stopword=True)
        for row in tqdm(df.textmuted)
    ]
    with open(CONTEXT_CACHE, 'w') as f:
        json.dump(context, f)

In [157]:
# Attach the cleaned summaries by position; `context` must contain exactly one
# entry per row of df.
df['context'] = context

### Combining Columns

In [158]:
# Preview the table after the context column has been added.
df.head()

Unnamed: 0,Title,rank,Year,certificate,Time,genre,inlineblock,Score,textmuted,director,actor1,actor2,actor3,actor4,cast,context
0,Avatar: The Way of Water,1,-2022,PG-13,192 min,action adventure fantasy,7.8,\n67 \n Metascore\n,\nJake Sully lives with his newfound family fo...,jamescameron,samworthington,zoesaldana,sigourneyweaver,stephenlang,jamescameron samworthington zoesaldana sigourn...,jake sully lives newfound family formed extras...
1,The Menu,2,-2022,R,107 min,horror thriller,7.2,\n71 \n Metascore\n,\nA young couple travels to a remote island to...,markmylod,ralphfiennes,anyataylorjoy,nicholashoult,hongchau,markmylod ralphfiennes anyataylorjoy nicholash...,young couple travels remote island eat exclusi...
2,Babylon,3,(I) (2022),R,189 min,comedy drama history,7.5,\n60 \n Metascore\n,\nA tale of outsized ambition and outrageous e...,damienchazelle,bradpitt,margotrobbie,jeansmart,oliviawilde,damienchazelle bradpitt margotrobbie jeansmart...,tale outsized ambition outrageous excess trace...
3,Everything Everywhere All at Once,4,-2022,R,139 min,action adventure comedy,8.0,\n81 \n Metascore\n,\nA middle-aged Chinese immigrant is swept up ...,dankwan,danielscheinert,michelleyeoh,stephaniehsu,jamieleecurtis,dankwan danielscheinert michelleyeoh stephanie...,middleaged chinese immigrant swept insane adve...
4,M3GAN,5,-2022,PG-13,102 min,horror scifi thriller,6.4,\n72 \n Metascore\n,\nA robotics engineer at a toy company builds ...,gerardjohnstone,allisonwilliams,violetmcgraw,ronnychieng,amiedonald,gerardjohnstone allisonwilliams violetmcgraw r...,robotics engineer toy company builds lifelike ...


' tomcruse meganfox tomcruse meganfox'

In [183]:
# Text columns that are merged into the single string fed to the vectorizers.
cols = ['genre', 'cast', 'context']

# Repeat count per column: a weight of 2 makes that column's text appear twice
# in the combined string. Use non-negative integers (strings are repeated with
# the `*` operator).
WEIGHTS = {
    'genre': 1,
    'cast': 1,
    'context': 1,
}
# The constant was originally misspelled; keep the old name bound to the same
# dict so any remaining reference to it still works.
WEGIHTS = WEIGHTS

# Start from an empty string so re-running this cell rebuilds the column
# instead of appending to the previous result.
df['combined'] = ""

for col in cols:
    # Each repetition is prefixed with a space, so the result starts with " ".
    df['combined'] += (" " + df[col]) * WEIGHTS[col]

In [184]:
# Plain Python lists, aligned by position with the rows of df: position k in
# `titles` and `content` refers to the same movie.
titles = df["Title"].tolist()
content = df["combined"].tolist()

# **Vectorizing**

In [185]:
# Vocabulary size cap handed to every vectorizer below; None means no cap.
MAX_FEATURES = None

In [186]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [187]:
# Registry of vectorizers keyed by a short name; it is filled in the next cell.
vectorizers = {}

In [212]:
# Settings shared by the two count-based vectorizers.
count_options = dict(
    lowercase=True,
    # stop_words='english',  # removing irrelevant common words
    max_features=MAX_FEATURES,  # vocabulary size cap (None = no cap)
    binary=False,
)

vectorizers.update({
    # Bag of words: single-word counts.
    'bow': CountVectorizer(ngram_range=(1, 1), **count_options),
    # N-gram: counts of 2- and 3-word sequences.
    'ngram': CountVectorizer(ngram_range=(2, 3), **count_options),
    # TF-IDF weighted single words.
    'tfidf': TfidfVectorizer(max_features=MAX_FEATURES),
})

In [213]:
from sklearn.metrics.pairwise import cosine_similarity

# For every registered vectorizer: fit it on the combined text, print the shape
# of the resulting document-term matrix, and keep the pairwise cosine
# similarities between all rows under the same key.
# NOTE(review): each similarity matrix is presumably dense with one row and one
# column per movie; with ~10k movies and three vectorizers that is a large
# amount of memory — confirm this fits the target machine.
sims = {}
for key, vectorizer in vectorizers.items():
    print(key, end=" ")
    transformed = vectorizer.fit_transform(content)
    print(transformed.shape)
    sims[key] = cosine_similarity(transformed)

bow (10033, 61253)
tfidf (10033, 61253)
ngram (10033, 349214)


# **Testing**

### Movie From Movie

In [214]:
import random

# Show 100 randomly drawn titles to pick test queries from. random.choices
# samples with replacement and no seed is set, so the output differs per run
# and may contain repeats.
"     ".join(random.choices((titles), k=100))

"Adam Heatherly's Frankenstein!     Limit     Coffee with Kadhal     Maia     Bem-vinda a Quixeramobim     The Takeover     F*ck Love Too     Cirkus Maximum     TÍU     Parahuna 2     Deadly Parasite     \ufeffGila Gusti     Unlocked     Dear Friend     Contigo Voy     By Deception     Unplayed Lullaby     Leon - Glaub nicht alles was du siehst     Three of Us     Love Suddenly     Chiqui     69 Parts     Spare Keys     Selfish     #DoYouThinkIAmSexy     Saturday Night     Greenhouse     The Consultant     Note Bandi     Habangbuhay     Scorched Earth     Moonfall     No Better Love     Hollow's Ridge     Bones and All     Amor y matemáticas     Bajre Da Sitta     We Haven't Lost Our Way     Trip to Datana     Goodbye     Jikirag     The Nannies     Hometown     Sosefina     Aftersun     The Evil John Krackers     Kazi     Rowdy Boys     Batomen     Women of Theatre, New York     The Obscured     Varalaru Mukkiyam     Deep Astronomy and the Romantic Sciences     Vikram     Revealer    

In [220]:
# title = random.choice(titles)
title = 'Black Panther: Wakanda Forever'
recs = 10
print(title)

# i = data.Title[data.Title == title].index[0]
# Position of the first movie with this title; it indexes the rows of `sims`.
i = titles.index(title)


def _top_positions(scores, limit):
    """Return the positions of the `limit` highest scores, best first."""
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    return [position for position, _score in ranked][:limit]


# recs + 1 entries are requested because the ranking also contains the query
# movie itself.
recs_titles_count = _top_positions(sims['bow'][i], recs + 1)
recs_titles_gram = _top_positions(sims['ngram'][i], recs + 1)
recs_titles_tfidf = _top_positions(sims['tfidf'][i], recs + 1)

Black Panther: Wakanda Forever


In [221]:
# Title and genre of the recommended rows for each vectorizer (bow, n-gram,
# tf-idf), stacked into one table for side-by-side reading.
shown_columns = ['Title', 'genre']
df1, df2, df3 = (
    df.iloc[positions][shown_columns]
    for positions in (recs_titles_count, recs_titles_gram, recs_titles_tfidf)
)
pd.concat([df1, df2, df3])

Unnamed: 0,Title,genre
28,Black Panther: Wakanda Forever,action adventure drama
7851,Wolfoo the Adventurer 2,action adventure drama
2127,Satria Dewa: Gatotkaca,action adventure drama
6385,Dear Death,drama
4715,#Homecoming,
7698,Escape the City,action adventure drama
7647,Singh vs Kaur 2,action adventure drama
751,Bubble,animation action adventure
6771,Promise Chronicles - Manifestation,action adventure drama
5601,Cybernetic Genesis - La guerra tra i due mondi,action fantasy scifi


# **Saving**

In [266]:
import json

In [267]:
# Load two previously saved JSON bundles for inspection.
# NOTE(review): "data.json" is the same file this notebook writes near the end,
# so on a fresh top-to-bottom run this reads whatever an earlier session left
# behind (or fails if the file does not exist yet).
with open("data.json", 'rb') as f:
    books = json.load(f)

# NOTE(review): `movies` is not used anywhere else in the visible notebook.
with open("movies.json", 'rb') as f:
    movies = json.load(f)

# Top-level keys of the saved bundle.
books.keys()

dict_keys(['imgs', 'infos', 'recs'])

In [268]:
# Inspect the first metadata record of the saved bundle; it shows the record
# layout (Genre, Director, Cast, Context) reproduced below.
books['infos'][0]

{'Genre': 'Action Adventure Fantasy',
 'Director': 'James Cameron',
 'Cast': 'Sam Worthington, Zoe Saldana, Sigourney Weaver, Stephen Lang',
 'Context': "Jake Sully lives with his newfound family formed on the extrasolar moon Pandora. Once a familiar threat returns to finish what was previously started, Jake must work with Neytiri and the army of the Na'vi race to protect their home."}

In [222]:
# Reload the data with the same basic cleaning as df (drop duplicates, blank
# out missing values, cast to str) to get human-readable values for display.
# NOTE(review): df is read from "data.xls" while this reads the CSV; the rows
# must match by position for the exported metadata to line up — confirm.
orig = (
    pd.read_csv("./2022 Movies Cleaned.csv")
    .drop_duplicates()
    .fillna("")
    .astype(str)
)

In [223]:
# Normalise the genre column with the same helper that was applied to df.
orig.genre = orig.genre.apply(genres_clean)

In [224]:
# Trim leading/trailing whitespace (the raw summaries start with a newline).
orig["textmuted"] = orig["textmuted"].str.strip()

In [225]:
# Pack the four actor columns into one "_"-separated string per row.
orig['Actors'] = orig['actor1'].str.cat(
    [orig['actor2'], orig['actor3'], orig['actor4']],
    sep="_",
)

In [226]:
# Keep only the four columns used for the info records. The loop that builds
# `infos` unpacks each row in exactly this column order.
orig = orig[['genre', 'director', 'Actors', 'textmuted']]

In [227]:
# Preview the display frame.
orig.head()

Unnamed: 0,genre,director,Actors,textmuted
0,action adventure fantasy,James Cameron,Sam Worthington_Zoe Saldana_Sigourney Weaver_S...,Jake Sully lives with his newfound family form...
1,horror thriller,Mark Mylod,Ralph Fiennes_Anya Taylor-Joy_Nicholas Hoult_H...,A young couple travels to a remote island to e...
2,comedy drama history,Damien Chazelle,Brad Pitt_Margot Robbie_Jean Smart_Olivia Wilde,A tale of outsized ambition and outrageous exc...
3,action adventure comedy,Dan Kwan,Daniel Scheinert_Michelle Yeoh_Stephanie Hsu_J...,A middle-aged Chinese immigrant is swept up in...
4,horror scifi thriller,Gerard Johnstone,Allison Williams_Violet McGraw_Ronny Chieng_Am...,A robotics engineer at a toy company builds a ...


In [228]:
# Build one display record per movie from the human-readable frame.
infos = []
# Select the columns by name so the unpacking below does not depend on the
# column order left behind by an earlier cell.
info_columns = ['genre', 'director', 'Actors', 'textmuted']
for genre_text, director_name, actors_text, summary in orig[info_columns].itertuples(index=False, name=None):
    # Loop variables are deliberately not called `context`: that name holds the
    # cleaned-summary list assigned to df['context'] earlier in the notebook.

    # "action scifi" -> "Action Scifi"
    genre_label = " ".join(word.capitalize() for word in genre_text.split())

    # Missing actors were filled with "", so skip empty slots instead of
    # emitting "Name, , , ".
    cast_label = ", ".join(name for name in actors_text.split("_") if name.strip())

    # Collapse runs of whitespace/newlines in the summary to single spaces.
    summary_label = " ".join(summary.split())

    infos.append({
        "Genre": genre_label,
        "Director": director_name,
        "Cast": cast_label,
        "Context": summary_label,
    })

In [229]:
from tqdm import tqdm

In [230]:
# Use the TF-IDF similarities as the basis for the exported recommendations.
sim = sims['tfidf']

In [231]:
# Express similarities as percentages with two decimals. The value is derived
# from the untouched matrix in `sims` rather than from `sim` itself, so
# re-running this cell does not multiply by 100 a second time. Keep the key in
# sync with the cell above that selects `sim`.
sim = (sims['tfidf'] * 100).round(2)

In [239]:
# Number of recommendations stored per movie.
TOP_K = 24

# For every movie keep the TOP_K most similar other movies as
# (position, score) pairs, best first.
recs = []
for movie_pos, row in enumerate(tqdm(sim)):
    # Exclude the movie itself by position. Dropping "the first sorted entry"
    # is wrong whenever another movie ties with it (identical text, or an
    # all-zero row), because the movie itself then stays in its own list.
    candidates = [
        (pos, score) for pos, score in enumerate(row) if pos != movie_pos
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    recs.append(candidates[:TOP_K])

100%|██████████| 10033/10033 [01:00<00:00, 165.66it/s]


In [240]:
import numpy as np
# Sanity check: the nested list converts to a regular array of shape
# (movies, recommendations per movie, 2) where the last axis is (position, score).
np.array(recs).shape

(10033, 24, 2)

In [241]:
# Bundle everything the consumer needs into one JSON-serialisable dict.
# NOTE(review): `titles` and `recs` come from df while `infos` comes from
# `orig`; all three must have one entry per movie in the same order — confirm.
movies_2022 = {
    'titles': titles,
    'imgs': None,
    'infos': infos,
    'recs': recs,
}

In [242]:
# Write the bundle to disk, replacing any existing data.json (the same file an
# earlier cell in this section reads back for inspection).
with open("data.json", 'w') as f:
    json.dump(movies_2022, f)

In [237]:
# Also save the title list on its own, replacing any existing titles.json.
with open("titles.json", 'w') as f:
    json.dump(titles, f)