# Import Libraries

In [1]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pickle

# Data Reading

In [8]:
movie_info=pd.read_csv('movie_list.csv')

In [9]:
movie_info

Unnamed: 0,Title,IMDB id,Year,Rating,Genre,Top_cast
0,3 Idiots,tt1187043,2009.0,8.4,"Comedy, Drama","Aamir Khan, Madhavan, Sharman Joshi"
1,Like Stars on Earth,tt0986264,2007.0,8.3,"Drama, Family","Darsheel Safary, Aamir Khan, Tanay Chheda"
2,PK,tt2338151,2014.0,8.1,"Comedy, Drama, Sci-Fi","Aamir Khan, Anushka Sharma, Saurabh Shukla"
3,Dangal,tt5074352,2016.0,8.3,"Action, Biography, Drama, Sport","Aamir Khan, Fatima Sana Shaikh, Sanya Malhotra"
4,Rang De Basanti,tt0405508,2006.0,8.1,"Comedy, Crime, Drama","Aamir Khan, Siddharth, Sharman Joshi"
...,...,...,...,...,...,...
3947,Mehmaan,,,,,
3948,Sorry Daddy,,,,,
3949,Raadha Aur Seeta,,,,,
3950,Dafaa 302: Indian Penal Code Section 302,,,,,


# Data Understanding

In [22]:
movie_info.columns

Index(['Title', 'IMDB id', 'Year', 'Rating', 'Genre', 'Top_cast'], dtype='object')

In [23]:
movie_info.shape

(3952, 6)

In [25]:
movie_info.isna().sum()

Title          0
IMDB id     2853
Year           0
Rating         0
Genre          0
Top_cast    2853
dtype: int64

In [27]:
movie_info.dtypes

Title        object
IMDB id      object
Year        float64
Rating      float64
Genre        object
Top_cast     object
dtype: object

# Data Preprocessing

 * Data Cleaning
 * Datatype Conversation
 * Data Transformation

### Data Cleaning

In [30]:
movie_info.isnull().sum()

Title          0
IMDB id     2853
Year           0
Rating         0
Genre          0
Top_cast    2853
dtype: int64

In [32]:
movie_info.dropna(inplace=True)

In [34]:
movie_info

Unnamed: 0,Title,IMDB id,Year,Rating,Genre,Top_cast
0,3 Idiots,tt1187043,2009.0,8.4,"Comedy, Drama","Aamir Khan, Madhavan, Sharman Joshi"
1,Like Stars on Earth,tt0986264,2007.0,8.3,"Drama, Family","Darsheel Safary, Aamir Khan, Tanay Chheda"
2,PK,tt2338151,2014.0,8.1,"Comedy, Drama, Sci-Fi","Aamir Khan, Anushka Sharma, Saurabh Shukla"
3,Dangal,tt5074352,2016.0,8.3,"Action, Biography, Drama, Sport","Aamir Khan, Fatima Sana Shaikh, Sanya Malhotra"
4,Rang De Basanti,tt0405508,2006.0,8.1,"Comedy, Crime, Drama","Aamir Khan, Siddharth, Sharman Joshi"
...,...,...,...,...,...,...
1094,Yamla Pagla Deewana Phir Se...,tt7609114,2018.0,4.4,"Action, Comedy, Drama","['DharmendraJeet Parmar', 'Sunny DeolPuran', '..."
1095,Saare Jahaan Se Mehnga...,tt2857500,2013.0,7.0,"Comedy, Drama","['Annapoorna', 'V.M. Badola', 'Rampal Barsewal']"
1096,Satya 2,tt3059106,2013.0,5.8,"Action, Crime, Drama","['Puneet Singh ', 'SharwanandSatya ', 'Anaika ..."
1097,Deewana Mujh Sa Nahin,tt0101742,1990.0,5.0,"Romance, Comedy, Drama","['Aamir KhanAjay Sharma', 'Madhuri DixitAnita'..."


In [36]:
movie_info.isna().sum()

Title       0
IMDB id     0
Year        0
Rating      0
Genre       0
Top_cast    0
dtype: int64

### Datatype Conversation

In [38]:
movie_info.dtypes

Title        object
IMDB id      object
Year        float64
Rating      float64
Genre        object
Top_cast     object
dtype: object

In [40]:
movie_info["Year"]=movie_info["Year"].astype('int')

In [41]:
movie_info.dtypes

Title        object
IMDB id      object
Year          int32
Rating      float64
Genre        object
Top_cast     object
dtype: object

### No need for Data Transformation

In [42]:
movie_info.head(10)

Unnamed: 0,Title,IMDB id,Year,Rating,Genre,Top_cast
0,3 Idiots,tt1187043,2009,8.4,"Comedy, Drama","Aamir Khan, Madhavan, Sharman Joshi"
1,Like Stars on Earth,tt0986264,2007,8.3,"Drama, Family","Darsheel Safary, Aamir Khan, Tanay Chheda"
2,PK,tt2338151,2014,8.1,"Comedy, Drama, Sci-Fi","Aamir Khan, Anushka Sharma, Saurabh Shukla"
3,Dangal,tt5074352,2016,8.3,"Action, Biography, Drama, Sport","Aamir Khan, Fatima Sana Shaikh, Sanya Malhotra"
4,Rang De Basanti,tt0405508,2006,8.1,"Comedy, Crime, Drama","Aamir Khan, Siddharth, Sharman Joshi"
5,Lagaan: Once Upon a Time in India,tt0169102,2001,8.1,"Drama, Musical, Sport","Aamir Khan, Gracy Singh, Rachel Shelley"
6,My Name Is Khan,tt1188996,2010,7.9,"Adventure, Drama, Romance","Shah Rukh Khan, Kajol, Katie Amanda Keane"
7,Swades,tt0367110,2004,8.2,"Drama, Musical","Shah Rukh Khan, Gayatri Joshi, Kishori Ballal"
8,Gangs of Wasseypur,tt1954470,2012,8.2,"Action, Comedy, Crime, Drama, Thriller","Manoj Bajpayee, Nawazuddin Siddiqui, Tigmanshu"
9,Gangs of Wasseypur,tt1954470,2012,8.2,"Action, Comedy, Crime, Drama, Thriller","Manoj Bajpayee, Nawazuddin Siddiqui, Tigmanshu"


# Model Building

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise  import linear_kernel

In [44]:
tfid=TfidfVectorizer()

In [46]:
vectors = tfid.fit_transform(movie_info['Genre'])

In [47]:
vectors[0].toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.8297695 ,
        0.        , 0.        , 0.        , 0.        , 0.55810624,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ]])

In [48]:
sim_distance = linear_kernel(vectors[0],vectors)
sim_distance

array([[1.        , 0.16488675, 0.30352976, ..., 0.20158839, 0.74799403,
        0.8297695 ]])

In [49]:
sim_score_df=pd.DataFrame(sim_distance).T
sim_score_df.columns=['Scores']

In [50]:
sim_score_df

Unnamed: 0,Scores
0,1.000000
1,0.164887
2,0.303530
3,0.105094
4,0.666622
...,...
1094,0.740096
1095,1.000000
1096,0.201588
1097,0.747994


In [51]:
index =pd.Series(data=movie_info.index,
    index=movie_info["Title"])

In [52]:
def recommend(name,n):
    sim_distance=linear_kernel(vectors[index[name]],vectors)
    sim_score_df=pd.DataFrame(sim_distance).T
    sim_score_df.columns=['Scores']
    sim_score_df=sim_score_df.sort_values("Scores", ascending=False)

    list=[]

    for i in range(0,n+1):
        result={'Title': movie_info["Title"][sim_score_df.index[i]],
                'Genre': movie_info["Genre"][sim_score_df.index[i]],
                'Rating': movie_info["Rating"][sim_score_df.index[i]]


        }

        list.append(result)

    return list

In [53]:
Recommendation_df = pd.DataFrame(recommend('Dangal',5))
Recommendation_df

Unnamed: 0,Title,Genre,Rating
0,Mary Kom,"Action, Biography, Drama, Sport",6.8
1,Dangal,"Action, Biography, Drama, Sport",8.3
2,Azhar,"Biography, Drama, Sport",5.7
3,M.S. Dhoni: The Untold Story,"Biography, Drama, Sport",8.0
4,Bhaag Milkha Bhaag,"Biography, Drama, Sport",8.2
5,Paan Singh Tomar,"Action, Biography, Crime, Sport, Thriller",8.2


In [13]:
movie_info['Year'].value_counts()

Year
2017.0    57
2013.0    54
2015.0    53
2016.0    50
2005.0    47
          ..
1978.0     1
1970.0     1
1971.0     1
1968.0     1
1966.0     1
Name: count, Length: 68, dtype: int64

In [14]:
movie_info['Year'].fillna(movie_info['Year'].mode()[0],inplace=True)

In [15]:
movie_info.isna().sum()

Title          0
IMDB id     2853
Year           0
Rating      2858
Genre       2853
Top_cast    2853
dtype: int64

In [18]:
movie_info['Rating'].fillna(movie_info['Rating'].mean(),inplace=True)

In [19]:
movie_info.isna().sum()


Title          0
IMDB id     2853
Year           0
Rating         0
Genre       2853
Top_cast    2853
dtype: int64

In [20]:
movie_info['Genre'].fillna(movie_info['Genre'].mode()[0],inplace=True)


In [21]:
movie_info['Genre'].value_counts()

Genre
Comedy, Drama, Romance                      2936
Drama, Romance                                66
Action, Crime, Drama                          63
Drama                                         44
Comedy, Drama                                 35
                                            ... 
Biography, Drama, History, War                 1
Action, Crime, Drama, History, Thriller        1
 Action, Thriller                              1
Action, Comedy, Drama, Fantasy, Horror         1
Romance, Comedy, Drama                         1
Name: count, Length: 315, dtype: int64

In [14]:
movie_info.isna().sum()

Title        0
IMDB id      0
Year         0
Rating       0
Genre        0
Top_cast    14
dtype: int64

In [15]:
movie_info['Top_cast'].value_counts()

Top_cast
Akshay KumarRaju, Suniel ShettyGhanshyam (Shyam)(as Sunil Shetty), Paresh RawalBaburao Ganpatrao Apte      3
Joseph Gordon-LevittJon, Scarlett JohanssonBarbara, Julianne MooreEsther                                   3
Shah Rukh KhanRahul Raichand, KajolAnjali Sharma, Amitabh BachchanYashvardhan Raichand                     3
Harshvardhan RaneInder Parihar, Mawra HocaneSaraswati 'Saru' Parthasarthy, Vijay RaazMustakeen Bhai        3
Simon PeggNandor Fodor, Minnie DriverAnne, Christopher LloydDr. Harry Price                                3
                                                                                                          ..
Salman KhanSuraj Dhanrajgir, Twinkle KhannaKomal Sinha, Johny LeverMahesh(as Johnny Lever)                 1
Riteish DeshmukhChampak Chandrakant Chiplunkar, Vivek OberoiAmjad Khan, Rhea ChakrabortyGayatri Ganguly    1
Disha Patani, Tiger Shroff, Vivek Oberoi                                                                   1
Kunal Kemm

In [16]:
movie_info['Top_cast'].fillna(movie_info['Top_cast'].mode()[0],inplace=True)

In [17]:
movie_info.isna().sum()

Title       0
IMDB id     0
Year        0
Rating      0
Genre       0
Top_cast    0
dtype: int64

In [18]:
movie_info.to_csv('final_data.csv')

PermissionError: [Errno 13] Permission denied: 'final_data.csv'

In [None]:
movie_info.dtypes

In [None]:
movie_info['Year']=movie_info['Year'].str.replace('â€“','-')

In [None]:
movie_info['Year']

In [None]:
#TfidVectorizer is NLP Concept
tfid=TfidfVectorizer()

In [None]:
vectors=tfid.fit_transform(movie_info['Genre'])

In [None]:
vectors[0].toarray()

In [None]:
similartiy_distance=linear_kernel(vectors[0],vectors)

In [None]:
similartiy_distance

In [None]:
similarity_scores=pd.DataFrame(similartiy_distance).T
similarity_scores.columns=['Scores']

In [None]:
similarity_scores

In [None]:
index=pd.Series(data=movie_info.index,index=movie_info['Title'])
index

In [None]:
def Movie_names(name,n):
    print(index[name])
    similartiy_distance=linear_kernel(vectors[index[name]],vectors)
    similarity_scores=pd.DataFrame(similartiy_distance).T
    similarity_scores.columns=['Scores']
    similarity_scores=similarity_scores.sort_values("Scores",ascending=False)
    empty_list=[]
    for i in range(0,n+1):
        result={'Title':movie_info['Title'][similarity_scores.index[i]],
            'Genre':movie_info['Genre'][similarity_scores.index[i]],
            'Rating': movie_info['Rating'][similarity_scores.index[i]]
            }
        empty_list.append(result)
    return empty_list

In [None]:
final_output=pd.DataFrame(Movie_names('PK',3))

In [19]:
final_output

NameError: name 'final_output' is not defined

In [None]:
with open('movie_info.pkl','wb')as f:
    pickle.dump(Movie_names,f)
print("saved successfully")