In [36]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import process

In [37]:
sampled_movies = pd.read_csv('dataset.csv')

In [38]:
sampled_movies = sampled_movies.sort_values(by='vote_average', ascending=False)

In [39]:
sampled_movies = sampled_movies.head(500)

In [40]:
sampled_movies.to_csv('sampled_movies_dataset.csv', index=False)

In [41]:
movies = pd.read_csv('sampled_movies_dataset.csv')

### Exploratory data analysis

In [42]:
movies.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
2,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [43]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 500 non-null    int64  
 1   title              500 non-null    object 
 2   genre              500 non-null    object 
 3   original_language  500 non-null    object 
 4   overview           500 non-null    object 
 5   popularity         500 non-null    float64
 6   release_date       500 non-null    object 
 7   vote_average       500 non-null    float64
 8   vote_count         500 non-null    int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 35.3+ KB


In [44]:
movies.describe()

Unnamed: 0,id,popularity,vote_average,vote_count
count,500.0,500.0,500.0,500.0
mean,189575.456,79.103228,8.105,4189.292
std,250136.372532,424.77371,0.182926,5760.006076
min,11.0,0.6,7.9,200.0
25%,899.5,11.1705,8.0,471.0
50%,21137.5,20.2565,8.1,1348.0
75%,400688.0,48.94,8.2,5859.0
max,920394.0,7567.017,8.7,31917.0


In [45]:
movies.isnull().sum()

id                   0
title                0
genre                0
original_language    0
overview             0
popularity           0
release_date         0
vote_average         0
vote_count           0
dtype: int64

In [46]:
movies = movies.fillna(" ")

### Feature selection

In [47]:
movies.columns

Index(['id', 'title', 'genre', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count'],
      dtype='object')

In [48]:
movies = movies[['id', 'title', 'genre', 'overview']]

In [49]:
movies

Unnamed: 0,id,title,genre,overview
0,278,The Shawshank Redemption,"Drama,Crime",Framed in the 1940s for the double murder of h...
1,238,The Godfather,"Drama,Crime","Spanning the years 1945 to 1955, a chronicle o..."
2,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance","Raj is a rich, carefree, happy-go-lucky second..."
3,424,Schindler's List,"Drama,History,War",The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,"Drama,Crime",In the continuing saga of the Corleone crime f...
...,...,...,...,...
495,552532,Charm City Kings,Drama,Mouse desperately wants to join The Midnight C...
496,316029,The Greatest Showman,Drama,"The story of American showman P.T. Barnum, fou..."
497,9323,Ghost in the Shell,"Action,Animation,Science Fiction","In the year 2029, the barriers of our world ha..."
498,223,Rebecca,"Mystery,Drama,Thriller,Romance",Story of a young woman who marries a fascinati...


In [50]:
movies['tags'] = movies['overview'] + movies['genre']

In [51]:
movies

Unnamed: 0,id,title,genre,overview,tags
0,278,The Shawshank Redemption,"Drama,Crime",Framed in the 1940s for the double murder of h...,Framed in the 1940s for the double murder of h...
1,238,The Godfather,"Drama,Crime","Spanning the years 1945 to 1955, a chronicle o...","Spanning the years 1945 to 1955, a chronicle o..."
2,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance","Raj is a rich, carefree, happy-go-lucky second...","Raj is a rich, carefree, happy-go-lucky second..."
3,424,Schindler's List,"Drama,History,War",The true story of how businessman Oskar Schind...,The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,"Drama,Crime",In the continuing saga of the Corleone crime f...,In the continuing saga of the Corleone crime f...
...,...,...,...,...,...
495,552532,Charm City Kings,Drama,Mouse desperately wants to join The Midnight C...,Mouse desperately wants to join The Midnight C...
496,316029,The Greatest Showman,Drama,"The story of American showman P.T. Barnum, fou...","The story of American showman P.T. Barnum, fou..."
497,9323,Ghost in the Shell,"Action,Animation,Science Fiction","In the year 2029, the barriers of our world ha...","In the year 2029, the barriers of our world ha..."
498,223,Rebecca,"Mystery,Drama,Thriller,Romance",Story of a young woman who marries a fascinati...,Story of a young woman who marries a fascinati...


In [52]:
new_data = movies.drop(columns=['genre', 'overview'])

In [53]:
from sklearn.feature_extraction.text import CountVectorizer

In [54]:
cv = CountVectorizer(max_features = 10000, stop_words = 'english')

In [55]:
cv

In [56]:
vector = cv.fit_transform(new_data['tags'].values.astype('U')).toarray()

In [57]:
vector.shape

(500, 5651)

In [58]:
from sklearn.metrics.pairwise import cosine_similarity

In [59]:
similarity = cosine_similarity(vector)

In [60]:
new_data[new_data['title']=="The Godfather"].index[0]

1

In [61]:
distance = sorted(list(enumerate(similarity[2])), reverse = True, key = lambda vector:vector[1])
for i in distance[0:5]:
    print(new_data.iloc[i[0]].title)

Dilwale Dulhania Le Jayenge
The Cameraman
Bajrangi Bhaijaan
City Lights
The Shop Around the Corner


In [62]:
def recommend(movie_title):
    # Use fuzzywuzzy to find the closest match
    closest_match = process.extractOne(movie_title, new_data['title'].values)
    
    if closest_match is None:
        print(f"Movie '{movie_title}' not found in the dataset.")
        return
    
    closest_title = closest_match[0]
    index = new_data[new_data['title'] == closest_title].index[0]
    distances = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda x: x[1])
    
    print(f"Movies similar to '{closest_title}':")
    for i in distances[1:6]:  # Starting from 1 to exclude the movie itself
        print(new_data.iloc[i[0]].title)

In [63]:
recommend("iron mann")

Movies similar to 'Big Deal on Madonna Street':
Lock, Stock and Two Smoking Barrels
Tel chi el telùn
Le Trou
Perfect Strangers
Some Like It Hot


In [64]:
import pickle

In [65]:
pickle.dump(new_data, open('movies_list.pkl', 'wb'))

In [66]:
pickle.dump(similarity, open('similarity.pkl', 'wb'))

In [67]:
pickle.load(open('movies_list.pkl', 'rb'))

Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...
1,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o..."
2,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second..."
3,424,Schindler's List,The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,In the continuing saga of the Corleone crime f...
...,...,...,...
495,552532,Charm City Kings,Mouse desperately wants to join The Midnight C...
496,316029,The Greatest Showman,"The story of American showman P.T. Barnum, fou..."
497,9323,Ghost in the Shell,"In the year 2029, the barriers of our world ha..."
498,223,Rebecca,Story of a young woman who marries a fascinati...


In [68]:
!pip install streamlit

