In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive/ so the dataset stored under MyDrive
# can be read like a local file.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import pandas as pd

In [None]:
# Load the movie metadata from the mounted Drive. The file is a .zip archive;
# this presumably relies on pandas inferring the compression from the
# extension -- TODO confirm the archive holds a single CSV.
data = pd.read_csv('/content/drive/MyDrive/5000 imdb.zip')

In [None]:
!pip install gradio==3.45.0



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import gradio as gr

In [None]:
# (rows, columns) of the dataset as loaded.
data.shape

(10000, 11)

In [None]:
# Preview the first rows to see which columns are available.
data.head()

Unnamed: 0.1,Unnamed: 0,adult,genre_ids,original_language,original_title,overview,popularity,release_date,title,vote_average,vote_count
0,0,False,"[18, 80]",en,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",114.774,1972-03-14,The Godfather,8.7,17855
1,1,False,"[18, 80]",en,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,90.925,1994-09-23,The Shawshank Redemption,8.7,23711
2,2,False,"[35, 14]",es,Cuando Sea Joven,70-year-old Malena gets a second chance at lif...,29.101,2022-09-14,Cuando Sea Joven,8.6,214
3,3,False,"[18, 80]",en,The Godfather Part II,In the continuing saga of the Corleone crime f...,54.944,1974-12-20,The Godfather Part II,8.6,10801
4,4,False,"[18, 36, 10752]",en,Schindler's List,The true story of how businessman Oskar Schind...,55.735,1993-12-15,Schindler's List,8.6,14026


# **NON ENGLISH MOVIES**

In [None]:
# Count the movies whose original language is anything other than English.
is_non_english = data['original_language'].ne('en')
non_english_count = len(data.loc[is_non_english])
print(non_english_count)

2389


In [None]:
# Distinct original-language codes, followed by how many there are.
languages = pd.unique(data['original_language'])
print(languages, len(languages), sep = '\n')

['en' 'es' 'hi' 'ja' 'ko' 'it' 'pt' 'zh' 'ru' 'fr' 'tr' 'sv' 'hu' 'ar'
 'de' 'cn' 'da' 'pl' 'bn' 'nl' 'fa' 'th' 'te' 'sr' 'sh' 'et' 'xx' 'id'
 'cs' 'no' 'uk' 'ro' 'gl' 'el' 'bs' 'hy' 'fi' 'is' 'ml' 'la' 'tn' 'eu'
 'nb' 'he' 'km' 'mk']
46


In [None]:
# Keep only the columns the rest of the notebook uses.
selected_columns = ['title', 'overview', 'popularity', 'original_language']
data = data[selected_columns]
data.head()

Unnamed: 0,title,overview,popularity,original_language
0,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",114.774,en
1,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,90.925,en
2,Cuando Sea Joven,70-year-old Malena gets a second chance at lif...,29.101,es
3,The Godfather Part II,In the continuing saga of the Corleone crime f...,54.944,en
4,Schindler's List,The true story of how businessman Oskar Schind...,55.735,en


# **Missing Values**

In [None]:
# Number of missing values in each column.
data.isna().sum()

title                0
overview             1
popularity           0
original_language    0
dtype: int64

In [None]:
# Drop rows without an overview, then renumber the rows so that index labels
# equal row positions again. The similarity matrix built later is positional,
# so a gap in the labels would make label-based lookups point at the wrong row.
data = data.dropna(subset = ['overview']).reset_index(drop = True)

In [None]:
# Re-check the per-column missing-value counts after the drop.
data.isna().sum()

title                0
overview             0
popularity           0
original_language    0
dtype: int64

In [None]:
# Confirm the remaining (rows, columns) after removing rows with no overview.
data.shape

(9999, 4)

# **Combining Features**

In [None]:
# Build the text that is fed to TF-IDF: the overview followed by the
# popularity value rendered as text.
# - assign() returns a new frame, which avoids pandas' SettingWithCopyWarning
#   from writing a column into a frame derived by filtering.
# - The explicit space keeps the number from fusing with the overview's last
#   word into a single token.
data = data.assign(
    combined_features = data['overview'] + ' ' + data['popularity'].astype('str')
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['combined_features'] = data['overview'] + data['popularity'].astype('str')


In [None]:
# Preview the frame, now including the combined_features column.
data.head()

Unnamed: 0,title,overview,popularity,original_language,combined_features
0,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",114.774,en,"Spanning the years 1945 to 1955, a chronicle o..."
1,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,90.925,en,Framed in the 1940s for the double murder of h...
2,Cuando Sea Joven,70-year-old Malena gets a second chance at lif...,29.101,es,70-year-old Malena gets a second chance at lif...
3,The Godfather Part II,In the continuing saga of the Corleone crime f...,54.944,en,In the continuing saga of the Corleone crime f...
4,Schindler's List,The true story of how businessman Oskar Schind...,55.735,en,The true story of how businessman Oskar Schind...


# **TF-IDF VECTORIZER**

In [None]:
# TF-IDF vectorizer configured to ignore common English stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words = 'english') # stop words

In [None]:
# Fit the vectorizer on combined_features and produce one TF-IDF row per movie,
# in the same order as the rows of `data`.
tfidf_matrix = tfidf_vectorizer.fit_transform(data['combined_features'])

# **Cosine Similarity Matrix**

In [None]:
# Pairwise dot products between every pair of movie vectors. With a single
# argument linear_kernel compares the matrix against itself.
# NOTE(review): treating this as cosine similarity assumes the TF-IDF rows are
# L2-normalised (the vectorizer's default) -- confirm if the settings change.
cosine_sim = linear_kernel(tfidf_matrix)

In [None]:
# Number of rows in the similarity matrix (one per movie).
print(cosine_sim.shape[0])

9999


In [None]:
# Show the similarity matrix; the printed form is abbreviated for large arrays.
print(cosine_sim)

[[1.         0.00452209 0.00617811 ... 0.01586027 0.01068482 0.        ]
 [0.00452209 1.         0.00556688 ... 0.         0.00962773 0.        ]
 [0.00617811 0.00556688 1.         ... 0.         0.01315345 0.02794116]
 ...
 [0.01586027 0.         0.         ... 1.         0.         0.        ]
 [0.01068482 0.00962773 0.01315345 ... 0.         1.         0.        ]
 [0.         0.         0.02794116 ... 0.         0.         1.        ]]


# **Finding Recommendations**

In [None]:
def get_recommendations(title, cosine_sim = cosine_sim):
  """Return the titles of the 10 movies most similar to ``title``.

  Args:
    title: Exact movie title to look up in ``data['title']``. If several
      rows share the title, the first one is used.
    cosine_sim: Square pairwise-similarity matrix whose row/column order
      matches the row order of ``data``.

  Returns:
    A pandas Series with up to 10 titles, most similar first. The queried
    row itself is never included.

  Raises:
    IndexError: If ``title`` does not occur in ``data['title']``.
  """
  titles = data['title']

  # Use the row's *position*, not its index label: cosine_sim is positional,
  # and rows removed from `data` can leave gaps in the labels, which would
  # make a label-based lookup read the wrong row (or run past the end).
  matches = (titles == title).to_numpy().nonzero()[0]
  if len(matches) == 0:
    raise IndexError(f'Movie title {title!r} not found in the dataset')
  pos = int(matches[0])

  scores = cosine_sim[pos]

  # Rank every other movie by similarity, highest first. The queried row is
  # excluded explicitly instead of dropping the first sorted entry, because
  # another movie with an equal score could sort ahead of it.
  ranked = sorted(
      (i for i in range(len(scores)) if i != pos),
      key = lambda i: scores[i],
      reverse = True,
  )

  # Top 10 positions -> titles.
  return titles.iloc[ranked[:10]]

In [None]:
# Smoke-test the recommender on a single known title and print the result.
movie_title = 'The Shawshank Redemption'
recommendations = get_recommendations(movie_title)
print(recommendations)

6374                   In Hell
206               Sherlock Jr.
4943               Escape Plan
3305                  Brubaker
6782    The 40 Year Old Virgin
4745                Demolition
4474                  One Shot
835                 The Chorus
762             Cool Hand Luke
7254                 No Escape
Name: title, dtype: object


# **Creating Interface**

In [None]:
# All movie titles as a plain Python list, used as the dropdown choices.
movies = data['title'].tolist()

In [None]:
def recommend_movies(movies_name):
  """Callback for the interface: fetch recommendations for one movie.

  Args:
    movies_name: Title of the movie to find recommendations for.

  Returns:
    Whatever ``get_recommendations`` returns for that title.
  """
  return get_recommendations(movies_name)

Create an output textbox for displaying the recommendations.

In [None]:
# Output component for the recommended titles. The caption is passed via
# `label` (singular), the keyword the Textbox component recognises.
output_text = gr.Textbox(label = 'Recommended Movies')

  output_text = gr.Textbox(labels ='Recommended Movies')


In [None]:
# Dropdown from which the user picks the movie to get recommendations for.
input_dropdown = gr.Dropdown(
    choices = movies,
    label = 'Select a movie',
)

In [None]:
# Wire the callback, input and output into a Gradio interface and start it
# with a public share link.
demo = gr.Interface(
    fn = recommend_movies,
    inputs = input_dropdown,
    outputs = output_text,
    title = 'Diya: RS',
)
demo.launch(share = True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://f0c22183e41411aa31.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


