In [19]:
import numpy as np
import pandas as pd
import re
import nltk


cols = ['title', 'overview', 'genres', 'cast', 'director']
movies = pd.read_csv("movies.csv", usecols=cols)
# print(movies.head(10))


movies

Unnamed: 0,title,overview,genres,cast,director
0,Ariel,A Finnish man goes to the city to find a job a...,"Comedy, Drama, Romance, Crime","Eetu Hilkamo, Turo Pajala, Jorma Markkula, Han...",Aki Kaurismäki
1,Shadows in Paradise,"Nikander, a rubbish collector and would-be ent...","Comedy, Drama, Romance","Haije Alanoja, Aki Kaurismäki, Jukka-Pekka Pal...",Aki Kaurismäki
2,Four Rooms,It's Ted the Bellhop's first night on the job....,"Comedy, Crime","Antonio Banderas, Sammi Davis, Kimberly Blair,...","Allison Anders, Quentin Tarantino, Robert Rodr..."
3,Judgment Night,"Four young friends, while taking a shortcut en...","Action, Crime, Thriller","Stephen Dorff, Everlast, Will Zahrn, Emilio Es...",Stephen Hopkins
4,Life in Loops (A Megacities RMX),Timo Novotny labels his new project an experim...,Documentary,,Timo Novotny
...,...,...,...,...,...
1076738,Close of Play,,,,
1076739,The Trip of The Rings - The Two Blunts,The Trip of The Rings - The Two Blunts,,,
1076740,L'Anniversaire,The first film to be made in Scotland by a com...,,,
1076741,,,,"Johannes Allmayer, Jules Waringo, Vicky Krieps...",Florian Gallenberger


In [20]:
## Process Data


## Clean up overviews
movies['clean_overview'] = movies['overview'].fillna('')

# Clean up text: Convert to lowercase, remove non-alphanumeric characters, and trim spaces
movies['clean_overview'] = movies['clean_overview'].apply(
    lambda x: re.sub(r'[^a-z0-9]', ' ', x.lower()) if isinstance(x, str) else ''
)
movies['clean_overview'] = movies['clean_overview'].apply(
    lambda x: re.sub(r'\s+', ' ', x).strip()
)

print(movies['clean_overview'].head())

0    a finnish man goes to the city to find a job a...
1    nikander a rubbish collector and would be entr...
2    it s ted the bellhop s first night on the job ...
3    four young friends while taking a shortcut en ...
4    timo novotny labels his new project an experim...
Name: clean_overview, dtype: object


In [21]:
# Tokenize Sentence

movies['clean_overview'] = movies['clean_overview'].apply(
    lambda x: ' '.join(x) if isinstance(x, list) else x
)

# Tokenize sentences
movies['clean_overview'] = movies['clean_overview'].apply(nltk.word_tokenize)
movies

Unnamed: 0,title,overview,genres,cast,director,clean_overview
0,Ariel,A Finnish man goes to the city to find a job a...,"Comedy, Drama, Romance, Crime","Eetu Hilkamo, Turo Pajala, Jorma Markkula, Han...",Aki Kaurismäki,"[a, finnish, man, goes, to, the, city, to, fin..."
1,Shadows in Paradise,"Nikander, a rubbish collector and would-be ent...","Comedy, Drama, Romance","Haije Alanoja, Aki Kaurismäki, Jukka-Pekka Pal...",Aki Kaurismäki,"[nikander, a, rubbish, collector, and, would, ..."
2,Four Rooms,It's Ted the Bellhop's first night on the job....,"Comedy, Crime","Antonio Banderas, Sammi Davis, Kimberly Blair,...","Allison Anders, Quentin Tarantino, Robert Rodr...","[it, s, ted, the, bellhop, s, first, night, on..."
3,Judgment Night,"Four young friends, while taking a shortcut en...","Action, Crime, Thriller","Stephen Dorff, Everlast, Will Zahrn, Emilio Es...",Stephen Hopkins,"[four, young, friends, while, taking, a, short..."
4,Life in Loops (A Megacities RMX),Timo Novotny labels his new project an experim...,Documentary,,Timo Novotny,"[timo, novotny, labels, his, new, project, an,..."
...,...,...,...,...,...,...
1076738,Close of Play,,,,,[]
1076739,The Trip of The Rings - The Two Blunts,The Trip of The Rings - The Two Blunts,,,,"[the, trip, of, the, rings, the, two, blunts]"
1076740,L'Anniversaire,The first film to be made in Scotland by a com...,,,,"[the, first, film, to, be, made, in, scotland,..."
1076741,,,,"Johannes Allmayer, Jules Waringo, Vicky Krieps...",Florian Gallenberger,[]


In [22]:
# Remove Stop Words

# Ensure no extra appends
stop_words = nltk.corpus.stopwords.words('english')
overview = []

for sentence in movies['clean_overview']:
    temp = []
    for word in sentence:
        if word not in stop_words or len(word) >= 3:
            temp.append(word)
    overview.append(temp)  


In [23]:
movies['clean_overview'] = overview
overview

[['finnish',
  'man',
  'goes',
  'the',
  'city',
  'find',
  'job',
  'after',
  'the',
  'mine',
  'where',
  'worked',
  'closed',
  'and',
  'his',
  'father',
  'commits',
  'suicide'],
 ['nikander',
  'rubbish',
  'collector',
  'and',
  'would',
  'entrepreneur',
  'finds',
  'his',
  'plans',
  'for',
  'success',
  'dashed',
  'when',
  'his',
  'business',
  'associate',
  'dies',
  'one',
  'evening',
  'meets',
  'ilona',
  'down',
  'her',
  'luck',
  'cashier',
  'local',
  'supermarket',
  'falteringly',
  'bond',
  'begins',
  'develop',
  'between',
  'them'],
 ['ted',
  'the',
  'bellhop',
  'first',
  'night',
  'the',
  'job',
  'and',
  'the',
  'hotel',
  'very',
  'unusual',
  'guests',
  'are',
  'about',
  'place',
  'him',
  'some',
  'outrageous',
  'predicaments',
  'seems',
  'that',
  'this',
  'evening',
  'room',
  'service',
  'serving',
  'one',
  'unbelievable',
  'happening',
  'after',
  'another'],
 ['four',
  'young',
  'friends',
  'while',
  't

In [24]:
movies['Genres'] = movies['genres'].apply(
    lambda x: [item.strip() for item in x] if isinstance(x, list) else x.split(',') if isinstance(x, str) else []
)

movies['Actors'] = movies['cast'].apply(
    lambda x: [actor.strip() for actor in x.split(',')[:4]] if isinstance(x, str) else x if isinstance(x, list) else []
)

movies['Director'] = movies['director'].apply(
    lambda x: [director.strip() for director in x.split(',')] if isinstance(x, str) else x if isinstance(x, list) else []
)

movies


Unnamed: 0,title,overview,genres,cast,director,clean_overview,Genres,Actors,Director
0,Ariel,A Finnish man goes to the city to find a job a...,"Comedy, Drama, Romance, Crime","Eetu Hilkamo, Turo Pajala, Jorma Markkula, Han...",Aki Kaurismäki,"[finnish, man, goes, the, city, find, job, aft...","[Comedy, Drama, Romance, Crime]","[Eetu Hilkamo, Turo Pajala, Jorma Markkula, Ha...",[Aki Kaurismäki]
1,Shadows in Paradise,"Nikander, a rubbish collector and would-be ent...","Comedy, Drama, Romance","Haije Alanoja, Aki Kaurismäki, Jukka-Pekka Pal...",Aki Kaurismäki,"[nikander, rubbish, collector, and, would, ent...","[Comedy, Drama, Romance]","[Haije Alanoja, Aki Kaurismäki, Jukka-Pekka Pa...",[Aki Kaurismäki]
2,Four Rooms,It's Ted the Bellhop's first night on the job....,"Comedy, Crime","Antonio Banderas, Sammi Davis, Kimberly Blair,...","Allison Anders, Quentin Tarantino, Robert Rodr...","[ted, the, bellhop, first, night, the, job, an...","[Comedy, Crime]","[Antonio Banderas, Sammi Davis, Kimberly Blair...","[Allison Anders, Quentin Tarantino, Robert Rod..."
3,Judgment Night,"Four young friends, while taking a shortcut en...","Action, Crime, Thriller","Stephen Dorff, Everlast, Will Zahrn, Emilio Es...",Stephen Hopkins,"[four, young, friends, while, taking, shortcut...","[Action, Crime, Thriller]","[Stephen Dorff, Everlast, Will Zahrn, Emilio E...",[Stephen Hopkins]
4,Life in Loops (A Megacities RMX),Timo Novotny labels his new project an experim...,Documentary,,Timo Novotny,"[timo, novotny, labels, his, new, project, exp...",[Documentary],[],[Timo Novotny]
...,...,...,...,...,...,...,...,...,...
1076738,Close of Play,,,,,[],[],[],[]
1076739,The Trip of The Rings - The Two Blunts,The Trip of The Rings - The Two Blunts,,,,"[the, trip, the, rings, the, two, blunts]",[],[],[]
1076740,L'Anniversaire,The first film to be made in Scotland by a com...,,,,"[the, first, film, made, scotland, company, fr...",[],[],[]
1076741,,,,"Johannes Allmayer, Jules Waringo, Vicky Krieps...",Florian Gallenberger,[],[],"[Johannes Allmayer, Jules Waringo, Vicky Kriep...",[Florian Gallenberger]


In [25]:
def clean(sentence):
    # If sentence is not a list or string, return an empty list
    if not isinstance(sentence, (list, str)):
        return []
    # If it's a string, split it into words (or wrap it in a list if that's what you need)
    if isinstance(sentence, str):
        sentence = sentence.split(',')
    temp = []
    for word in sentence:
        temp.append(word.lower().replace(' ', ''))
    return temp

In [26]:
movies['Genres'] = [clean(x) for x in movies['Genres']]
movies['cast'] = [clean(x) for x in movies['cast']]
movies['Director'] = [clean(x) for x in movies['Director']]
movies

Unnamed: 0,title,overview,genres,cast,director,clean_overview,Genres,Actors,Director
0,Ariel,A Finnish man goes to the city to find a job a...,"Comedy, Drama, Romance, Crime","[eetuhilkamo, turopajala, jormamarkkula, hannu...",Aki Kaurismäki,"[finnish, man, goes, the, city, find, job, aft...","[comedy, drama, romance, crime]","[Eetu Hilkamo, Turo Pajala, Jorma Markkula, Ha...",[akikaurismäki]
1,Shadows in Paradise,"Nikander, a rubbish collector and would-be ent...","Comedy, Drama, Romance","[haijealanoja, akikaurismäki, jukka-pekkapalo,...",Aki Kaurismäki,"[nikander, rubbish, collector, and, would, ent...","[comedy, drama, romance]","[Haije Alanoja, Aki Kaurismäki, Jukka-Pekka Pa...",[akikaurismäki]
2,Four Rooms,It's Ted the Bellhop's first night on the job....,"Comedy, Crime","[antoniobanderas, sammidavis, kimberlyblair, p...","Allison Anders, Quentin Tarantino, Robert Rodr...","[ted, the, bellhop, first, night, the, job, an...","[comedy, crime]","[Antonio Banderas, Sammi Davis, Kimberly Blair...","[allisonanders, quentintarantino, robertrodrig..."
3,Judgment Night,"Four young friends, while taking a shortcut en...","Action, Crime, Thriller","[stephendorff, everlast, willzahrn, emilioeste...",Stephen Hopkins,"[four, young, friends, while, taking, shortcut...","[action, crime, thriller]","[Stephen Dorff, Everlast, Will Zahrn, Emilio E...",[stephenhopkins]
4,Life in Loops (A Megacities RMX),Timo Novotny labels his new project an experim...,Documentary,[],Timo Novotny,"[timo, novotny, labels, his, new, project, exp...",[documentary],[],[timonovotny]
...,...,...,...,...,...,...,...,...,...
1076738,Close of Play,,,[],,[],[],[],[]
1076739,The Trip of The Rings - The Two Blunts,The Trip of The Rings - The Two Blunts,,[],,"[the, trip, the, rings, the, two, blunts]",[],[],[]
1076740,L'Anniversaire,The first film to be made in Scotland by a com...,,[],,"[the, first, film, made, scotland, company, fr...",[],[],[]
1076741,,,,"[johannesallmayer, juleswaringo, vickykrieps, ...",Florian Gallenberger,[],[],"[Johannes Allmayer, Jules Waringo, Vicky Kriep...",[floriangallenberger]


In [27]:
# Combining all the data into a single list
columns = ['clean_overview','Genres','Actors','Director']

l = []

for i in range(len(movies)):
    words =''
    for col in columns:
        words += ' '.join(movies[col][i]) + ' '
    l.append(words)
l

['finnish man goes the city find job after the mine where worked closed and his father commits suicide comedy drama romance crime Eetu Hilkamo Turo Pajala Jorma Markkula Hannu Kivisalo akikaurismäki ',
 'nikander rubbish collector and would entrepreneur finds his plans for success dashed when his business associate dies one evening meets ilona down her luck cashier local supermarket falteringly bond begins develop between them comedy drama romance Haije Alanoja Aki Kaurismäki Jukka-Pekka Palo Eskil Mansikka akikaurismäki ',
 'ted the bellhop first night the job and the hotel very unusual guests are about place him some outrageous predicaments seems that this evening room service serving one unbelievable happening after another comedy crime Antonio Banderas Sammi Davis Kimberly Blair Paul Skemp allisonanders quentintarantino robertrodriguez alexandrerockwell ',
 'four young friends while taking shortcut en route local boxing match witness brutal murder which leaves them running for thei

In [28]:
movies['clean_overview'] = l
movies = movies[['title','clean_overview']]
movies.head()

Unnamed: 0,title,clean_overview
0,Ariel,finnish man goes the city find job after the m...
1,Shadows in Paradise,nikander rubbish collector and would entrepren...
2,Four Rooms,ted the bellhop first night the job and the ho...
3,Judgment Night,four young friends while taking shortcut en ro...
4,Life in Loops (A Megacities RMX),timo novotny labels his new project experiment...


In [30]:
##  Feature Extraction

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
features = tfidf.fit_transform(movies['clean_overview'])

In [None]:
# Similarity Matrix using cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(features,features)
print(similarity)

In [31]:
## Movie Recommendation

index = pd.Series(movies['title'])
index.head(5)

0                               Ariel
1                 Shadows in Paradise
2                          Four Rooms
3                      Judgment Night
4    Life in Loops (A Megacities RMX)
Name: title, dtype: object

In [36]:
## Recommendation Function

def recommend_movies(title):
    title = title.lower() 
    matching_indices = index[index.str.lower() == title].index  # Match the title in lowercase

    if len(matching_indices) == 0:
        print(f"Movie '{title}' not found in the dataset.")
        return []

    idx = matching_indices[0]
    print(f"Found movie at index {idx}.")

    # Compute similarity scores
    score = pd.Series(similarity[idx]).sort_values(ascending=False)

    # Get top 10 recommended movie indices (excluding the input movie itself)
    top10_indices = score.index[1:11]  # Exclude the first index (the input movie itself)

    # Retrieve movie titles for recommendations
    recommended_movies = []
    for i in top10_indices:
        recommended_movies.append(movies['Original Title'][i])

    return recommended_movies

In [None]:
recommend_movies('red one')


NameError: name 'index' is not defined