# **NLP PROJECT USING MOVIES**

In [66]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [67]:
# Load the TMDb top-movies table directly from the CSV hosted on GitHub.
df = pd.read_csv(
    "https://raw.githubusercontent.com/Sankha1998/TMDb_Top_Movies/master/TMDb_updated.CSV"
)

### **EXPLORATORY DATA ANALYSIS (EDA) ON THE DATASET**

In [68]:
# Report the dataset dimensions; the values are interpolated into the f-string
# (the original used an f-prefix with no placeholders).
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns in the dataset")
# Per-column dtypes and non-null counts (printed to stdout).
df.info()
# Jupyter only renders the value of a cell's *last* expression, so the preview
# must come last; placed mid-cell its result was silently discarded.
df.head()

There are 10000 rows and 6 columns in the dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         10000 non-null  int64  
 1   title              10000 non-null  object 
 2   overview           9970 non-null   object 
 3   original_language  10000 non-null  object 
 4   vote_count         10000 non-null  int64  
 5   vote_average       10000 non-null  float64
dtypes: float64(1), int64(2), object(3)
memory usage: 468.9+ KB


In [69]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

Unnamed: 0            0
title                 0
overview             30
original_language     0
vote_count            0
vote_average          0
dtype: int64

In [70]:
# Remove the leftover CSV index column and every row that has a missing value.
# Written as a non-inplace chain with errors='ignore' so the cell can be re-run
# safely: the in-place version raised KeyError on a second run because
# 'Unnamed: 0' had already been dropped.
df = df.drop(columns=['Unnamed: 0'], errors='ignore').dropna()

In [71]:
# We are building a text-based system that recommends movies similar to a
# favourite movie. To do this, the `overview` column is converted into a matrix
# of TF-IDF (term frequency-inverse document frequency) features.
#
# Define the TF-IDF vectorizer: remove all English stop words and ignore terms
# that occur in more than 80% of the overviews (max_df=0.8).
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8)
# Learn the vocabulary from the overviews and return the sparse document-term matrix.
vect_df = vectorizer.fit_transform(df['overview'])
# (number of movies, number of vocabulary terms)
vect_df.shape

(9970, 28709)

In [72]:
# Convert the vectorized data into a DataFrame with the vocabulary terms as
# columns and the movie titles as the index.
# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out()` when available and fall back on older releases.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
# NOTE: toarray() materialises the full dense matrix (rows x vocabulary size).
tfidf_df = pd.DataFrame(vect_df.toarray(), columns=feature_names)
# Label each row with its movie title (row order matches df, which was vectorized above).
tfidf_df.index = df['title']
tfidf_df.head()

Unnamed: 0_level_0,00,000,006,007,01,05pm,10,100,1000,100th,101,1021,108,10pm,10th,11,1111,1127,1138,114,117,118,1183,118th,119,11th,12,120,1200,1215,125,1250,125th,128,12th,13,1300,133,1344,1345,...,zoom,zooni,zoos,zoosters,zootopia,zorg,zorin,zoro,zorro,zoë,zucchini,zuckerberg,zuckerman,zugor,zula,zulu,zulus,zunaira,zune,zurg,zuri,zuru,zuzu,zzzax,álava,álex,ángela,åhr,åmål,æon,échard,éclair,édouard,émigré,état,étienne,öztürk,žižek,βwzvz,運轉手之戀
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
Ad Astra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bloodshot,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bad Boys for Life,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ant-Man,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Percy Jackson: Sea of Monsters,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [73]:
# Measure the similarity between every pair of movies using cosine similarity.
# `vect_df` is the sparse matrix that `tfidf_df` was built from (same values,
# same row order), so passing it here yields the same scores while avoiding the
# extra dense copies cosine_similarity makes when normalising a dense input.
cos_array = cosine_similarity(vect_df)
# Turn the similarity array into a DataFrame labelled by movie title on both axes.
cos_df = pd.DataFrame(cos_array, index=tfidf_df.index, columns=tfidf_df.index)
cos_df.head()

title,Ad Astra,Bloodshot,Bad Boys for Life,Ant-Man,Percy Jackson: Sea of Monsters,Birds of Prey (and the Fantabulous Emancipation of One Harley Quinn),Live Free or Die Hard,Cold Blood,Underwater,The Platform,Jumanji: The Next Level,The Twilight Saga: Eclipse,Sonic the Hedgehog,Star Wars: The Rise of Skywalker,Onward,Emma.,Pocahontas II: Journey to a New World,Lara Croft: Tomb Raider - The Cradle of Life,The Invisible Man,Blood Father,A Rainy Day in New York,Joker,Miracle in Cell No. 7,The Hunt,Transformers: The Last Knight,Parasite,F#*@BOIS,Pretty Little Stalker,Dolittle,Frozen II,Harry Potter and the Deathly Hallows: Part 2,Cars,Teen Titans: The Judas Contract,Digimon Adventure: Last Evolution Kizuna,Contagion,30 Days of Night: Dark Days,The Traitor,Trolls World Tour,Harry Potter and the Philosopher's Stone,1917,...,The Enemy Below,The Incredible Shrinking Man,I Am a Fugitive from a Chain Gang,A Better Tomorrow,On a Magical Night,Killshot,Legally Blonde 3,Belle and Sebastian,Assassination Games,Ballad of a Soldier,The Male Gaze: The Heat of the Night,Victor Crowley,Into the Abyss,The Chase,Foxtrot,Sundays at Tiffany's,Crown Vic,Irumbu Kottai Murattu Singam,14 Blades,I Am Fear,Lola,Three Kingdoms: Resurrection of the Dragon,Bliss Point,Dead Birds,End Game,The Diabolical,Cool Kids Don't Cry,About Alex,The Freshman,Beyond The Sky,Attack on Titan,Pokémon: The Rise of Darkrai,Eagle vs Shark,High Flying Bird,Zapped!,Cargo,The Good Night,The World Is Yours,The Grand Seduction,Woochi: The Demon Slayer
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
Ad Astra,1.0,0.005631,0.0,0.0,0.0,0.0,0.011681,0.0,0.047379,0.041855,0.035766,0.015106,0.030336,0.0,0.0,0.0,0.015729,0.014768,0.007483,0.0,0.0,0.008912,0.0,0.0,0.040507,0.0,0.0,0.0,0.012319,0.031173,0.020469,0.004946,0.021413,0.041259,0.0,0.0,0.005967,0.009684,0.022791,0.0,...,0.0,0.0,0.0,0.0,0.01774,0.0,0.0,0.0,0.0,0.01718,0.0,0.0,0.01866,0.0,0.0,0.019239,0.004563,0.0,0.015536,0.0,0.0,0.016937,0.011257,0.0,0.008371,0.0,0.009715,0.016711,0.007059,0.0,0.013387,0.045532,0.0,0.0,0.0,0.022731,0.00987,0.009638,0.00796,0.0
Bloodshot,0.005631,1.0,0.01311,0.006284,0.0,0.0,0.0,0.010029,0.0,0.0,0.021444,0.019221,0.010907,0.0,0.025646,0.0,0.0,0.0,0.00728,0.0,0.0,0.00867,0.0,0.025922,0.0,0.0,0.0,0.0,0.015599,0.0,0.0,0.004811,0.0,0.021219,0.0,0.0,0.005804,0.0,0.005404,0.068024,...,0.0,0.017252,0.0,0.0,0.017257,0.008702,0.0,0.020577,0.030002,0.0,0.0,0.067938,0.0,0.0,0.0,0.0,0.046839,0.0,0.013952,0.0,0.015297,0.0,0.0,0.038168,0.027492,0.0,0.00945,0.0,0.005656,0.016227,0.0,0.0,0.010645,0.073686,0.0,0.0,0.0,0.009376,0.0,0.0
Bad Boys for Life,0.0,0.01311,1.0,0.039164,0.033873,0.022098,0.0,0.0,0.0,0.0,0.0,0.013054,0.02431,0.0,0.0,0.0,0.0,0.019336,0.0,0.033137,0.012127,0.0,0.0,0.025114,0.0,0.0,0.0,0.0,0.011201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015291,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.087155,0.0,0.0,0.0,0.0,0.0,0.0,0.005834,0.0,0.0,0.010851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005205,0.008207,0.0,0.0,0.004471,0.0,0.0,0.0,0.0,0.029544,0.0,0.0
Ant-Man,0.0,0.006284,0.039164,1.0,0.0,0.013115,0.0,0.022826,0.0,0.0,0.009433,0.0,0.020249,0.0,0.011282,0.0,0.0,0.0,0.0,0.028931,0.009552,0.0,0.0,0.035128,0.0,0.0,0.023861,0.025176,0.013255,0.0,0.0,0.0,0.0,0.005063,0.0,0.0,0.0,0.0,0.008169,0.009411,...,0.0,0.109153,0.03241,0.0,0.0,0.007599,0.0,0.006442,0.0,0.012243,0.0,0.0,0.0,0.0,0.0,0.004595,0.0,0.0,0.007354,0.007256,0.013358,0.0,0.007219,0.0,0.011226,0.0,0.0,0.020986,0.009038,0.018974,0.0,0.015955,0.003521,0.033932,0.0,0.0,0.0,0.0,0.065646,0.025341
Percy Jackson: Sea of Monsters,0.0,0.0,0.033873,0.0,1.0,0.0,0.023807,0.0,0.0,0.0,0.0,0.021512,0.032937,0.028183,0.046572,0.025002,0.019877,0.033229,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095351,0.048077,0.063986,0.0,0.0,0.0,0.0,0.0,0.0,0.031502,0.0,0.029908,...,0.027605,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010734,0.0,0.0,0.0,0.011262,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [74]:
# Function for getting the movies most similar to a given input movie.
def recommend(title, cos_df=cos_df, n=4):
    """Print the `n` movies whose overviews are most similar to `title`.

    Parameters
    ----------
    title : str
        Movie title to look up; it is converted with ``str()`` and must be a
        label of ``cos_df``'s index.
    cos_df : pandas.DataFrame
        Square similarity table indexed and columned by movie title.
    n : int, default 4
        Number of recommendations to print.

    Raises
    ------
    KeyError
        If `title` is not a label of ``cos_df``'s index.
    """
    title = str(title)
    if title not in cos_df.index:
        raise KeyError(f"{title!r} was not found in the similarity table")

    scores = cos_df.loc[title]
    # Titles are not guaranteed to be unique. With a duplicated label `.loc`
    # returns a DataFrame (one row per match) and a bare sort_values() would
    # raise TypeError, so fall back to the first matching row.
    if scores.ndim == 2:
        scores = scores.iloc[0]

    # Exclude the query movie by label instead of assuming it sorts first:
    # another movie can tie at the top score, which made the original
    # `head()[1:]` slice unreliable.
    scores = scores.drop(labels=title, errors="ignore")
    best_movies = scores.sort_values(ascending=False).head(n).index

    print("If you like", title)
    print("\nYou'll like:", *best_movies, sep="\n")

In [75]:
# Example: ask for recommendations based on the movie "Joker".
recommend(title="Joker")

If you like Joker

You'll like:
Batman Beyond: Return of the Joker
Batman
Batman: Gotham by Gaslight
Get Rich or Die Tryin'
