# Movie Recommendation System using Machine Learning

# Aim
# To develop a machine learning-based recommendation system that suggests similar Tamil movies for a chosen title, based on movie features such as genre, director, actor, and content rating.

In [1]:
# Step 1: Import all necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


##### Imports Pandas and NumPy for handling and analyzing movie data efficiently.
##### Uses TF-IDF Vectorizer to convert text features like genre, actor, and director into numerical form.
##### Applies Cosine Similarity to measure how similar two movies are based on their feature vectors.


In [2]:
# Read the Tamil movies CSV (path is relative to the notebook's working directory)
DATASET_PATH = "Tamil_movies_dataset.csv"
df = pd.read_csv(DATASET_PATH)

In [4]:
# Preview the first five rows to sanity-check the load
df.head(5)

Unnamed: 0,MovieName,Genre,Rating,Director,Actor,PeopleVote,Year,Hero_Rating,movie_rating,content_rating
0,Mouna Guru,Action,7.7,Santha Kumar,Arulnithi,746,2011,8,8,7.9
1,7 Aum Arivu,Action,6.2,A.R. Murugadoss,Suriya,9479,2011,9,9,8.066667
2,Vaagai Sooda Vaa,Comedy,8.0,A. Sarkunam,Vimal,14522,2011,8,7,7.666667
3,Mankatha,Action,7.6,Venkat Prabhu,Ajith Kumar,12276,2011,6,8,7.2
4,Kanchana: Muni 2,Comedy,6.5,Lawrence Raghavendra,Lawrence Raghavendra,1044,2011,8,9,7.833333


In [5]:
# Preview the last five rows as well, to confirm the file was read to the end
df.tail(5)

Unnamed: 0,MovieName,Genre,Rating,Director,Actor,PeopleVote,Year,Hero_Rating,movie_rating,content_rating
324,Dhilluku Dhuddu 2,Comedy,5.3,Rambala,Santhanam,497,2019,7,9,7.1
325,Dev,Action,4.8,Rajath Ravishankar,Karthi,724,2019,5,8,5.933333
326,Charlie Chaplin 2,Comedy,3.8,Sakthi Chidambaram,Prabhu Deva,215,2019,4,7,4.933333
327,Petta,Action,7.3,Karthik Subbaraj,Rajinikanth,7545,2019,8,8,7.766667
328,Viswasam,Action,6.7,Siva,Ajith Kumar,5907,2019,8,9,7.9


In [6]:
# List the column labels so the available features are visible
column_labels = df.columns
print(column_labels)

Index(['MovieName', 'Genre', 'Rating', 'Director', 'Actor', 'PeopleVote',
       'Year', 'Hero_Rating', 'movie_rating', 'content_rating'],
      dtype='object')


Displays all the column names in the dataset to understand its structure and available features.

In [8]:
# Blank out NaN in every column that feeds the text feature, because string
# concatenation with NaN would make the whole combined value NaN.
# NOTE(review): content_rating looks numeric in the preview above; filling it
# with '' leaves a mixed str/float column in df — confirm this is intended.
text_columns = ['Genre', 'Director', 'Actor', 'content_rating']
for name in text_columns:
    df[name] = df[name].fillna('')

# Build one space-separated text value per movie:
# "<Genre> <Director> <Actor> <content_rating>"
parts = [
    df['Genre'],
    df['Director'],
    df['Actor'],
    df['content_rating'].astype(str),  # cast so it can be concatenated as text
]
combined = parts[0]
for part in parts[1:]:
    combined = combined + ' ' + part
df['combined_features'] = combined


##### Missing values in important columns are replaced with empty strings to avoid errors during processing.

##### Key movie details like genre, director, actor, and content rating are merged into a single text column.

##### This combined feature helps the model compare movies more effectively based on their overall content.

In [9]:
# Turn each movie's combined text into a TF-IDF weighted term vector,
# dropping common English stop words.
# NOTE(review): the vectorizer's default token pattern keeps only tokens of
# two or more alphanumeric characters, so a rating such as "7.9" yields no
# tokens at all — confirm content_rating really contributes to the similarity.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_features'])

# Pairwise cosine similarity of every movie against every other movie
# (the second argument defaults to the first when omitted).
cosine_sim = cosine_similarity(tfidf_matrix)


##### The TF-IDF vectorizer converts the combined text data into numerical vectors, ignoring common English words.

##### Each movie is represented as a vector based on its textual features like genre, director, and actor.

##### Cosine similarity is then calculated to measure how closely related each pair of movies is.

In [10]:
# Map each movie title to its row label in df for quick lookup.
# Series.drop_duplicates() compares the *values* (the row labels, which are
# already unique), so it never removed repeated titles. De-duplicate on the
# index labels instead, keeping the first occurrence, so that looking up a
# title always yields a single scalar rather than a Series.
indices = pd.Series(df.index, index=df['MovieName'])
indices = indices[~indices.index.duplicated(keep='first')]

def recommend(movie_name, top_n=5):
    """Return the titles of the movies most similar to ``movie_name``.

    Similarity scores are read from the module-level ``cosine_sim`` matrix;
    the title is resolved through the module-level ``indices`` Series and the
    result titles are taken from ``df['MovieName']``.

    Parameters
    ----------
    movie_name : str
        Title to look up in ``indices``.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list of str
        Up to ``top_n`` titles, most similar first. If the title is not
        present, the single-element list ``["Movie not found in dataset!"]``
        is returned (unchanged from the previous behaviour).
    """
    if movie_name not in indices:
        return ["Movie not found in dataset!"]

    idx = indices[movie_name]
    # A title that occurs more than once yields a Series of rows; use the
    # first occurrence instead of failing later when rows are compared.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Drop the query movie by position rather than assuming it sorts first:
    # another movie with identical features also scores 1.0 and, being a
    # stable-sort tie, could otherwise push the query movie into the results.
    # NOTE(review): idx is used as a row position in cosine_sim — this assumes
    # df has the default 0..n-1 index; confirm if df is ever re-indexed.
    scored = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[idx])
        if pos != idx
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    best_positions = [pos for pos, _ in scored[:max(top_n, 0)]]
    return df['MovieName'].iloc[best_positions].tolist()


##### Creates a mapping between each movie name and its index for quick lookup.

##### The recommend() function finds the most similar movies using cosine similarity scores.

##### Returns the top 5 movies that are most similar to the selected movie based on their features.

In [14]:
# Query title — replace with any movie name from your dataset
movie = "Adithya Varma"

print(f" Recommended movies for '{movie}':\n")
suggestions = recommend(movie)
print(suggestions)

 Recommended movies for 'Adithya Varma':

['Deiva Thirumagal', 'Kennedy Club', 'I', 'Nota', '24']


##### The input movie “Adithya Varma” is used to generate recommendations from the dataset.

##### The model compares its features with all other movies using cosine similarity.

##### The system successfully suggests five similar movies: Deiva Thirumagal, Kennedy Club, I, Nota, and 24.

In [12]:
import joblib

# Persist the DataFrame (including combined_features) and the similarity
# matrix so they can be reloaded later without recomputing them.
# Only load these .pkl files from a trusted source: unpickling can run
# arbitrary code.
joblib.dump(df, 'movie_data.pkl')          # Save dataset
joblib.dump(cosine_sim, 'similarity.pkl')  # Save similarity matrix

print("✅ Model and data saved successfully with Joblib!")


✅ Model and data saved successfully with Joblib!


##### Saves the processed dataset and similarity matrix as .pkl files using Joblib for easy reuse without recalculating.

#### Overall Insights

#### This project is a content-based Tamil movie recommendation system that suggests similar movies based on their features such as genre, director, actor, and content rating. 
#### The dataset is cleaned and combined into a single text feature, which is then converted into numerical form using TF-IDF Vectorization. 
#### By applying Cosine Similarity, the system measures how closely related two movies are and recommends the top five most similar ones. 
#### Finally, the processed data and similarity model are saved using Joblib for easy future access.