In [46]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [47]:
data = pd.read_csv("IMDb Movies India.csv",encoding="ISO-8859-1")
data.head(10)

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
5,...Aur Pyaar Ho Gaya,(1997),147 min,"Comedy, Drama, Musical",4.7,827.0,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
6,...Yahaan,(2005),142 min,"Drama, Romance, War",7.4,1086.0,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
7,.in for Motion,(2008),59 min,Documentary,,,Anirban Datta,,,
8,?: A Question Mark,(2012),82 min,"Horror, Mystery, Thriller",5.6,326.0,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia
9,@Andheri,(2014),116 min,"Action, Crime, Thriller",4.0,11.0,Biju Bhaskar Nair,Augustine,Fathima Babu,Byon


In [48]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


In [49]:
data.isnull().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [43]:
try:
    data = pd.read_csv("IMDb Movies India.csv", encoding="ISO-8859-1")  # Adjust the encoding as needed
except UnicodeDecodeError:
    print("UnicodeDecodeError encountered. Trying a different encoding...")
    # Try a different encoding, like 'ISO-8859-1' or 'Windows-1252'
    data = pd.read_csv("IMDb Movies India.csv", encoding="Windows-1252")

In [44]:
data

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
...,...,...,...,...,...,...,...,...,...,...
15504,Zulm Ko Jala Doonga,(1988),,Action,4.6,11,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,Zulmi,(1999),129 min,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani
15506,Zulmi Raj,(2005),,Action,,,Kiran Thej,Sangeeta Tiwari,,
15507,Zulmi Shikari,(1988),,Action,,,,,,


In [45]:
# Step 1: Handle Missing Data
# Drop columns with excessive missing values (e.g., 'Duration' with 8269 missing)
data.drop(columns=['Duration'], inplace=True)

# Drop rows where critical information is missing
data.dropna(subset=['Name'], inplace=True)  # Name is critical; drop rows with missing names

# Impute missing numerical values with median
numerical_imputer = SimpleImputer(strategy='median')
data[['Year', 'Rating', 'Votes']] = numerical_imputer.fit_transform(data[['Year', 'Rating', 'Votes']])

# Impute missing categorical values with a constant
categorical_imputer = SimpleImputer(strategy='constant', fill_value='Unknown')
data[['Director', 'Actor 1', 'Actor 2', 'Actor 3']] = categorical_imputer.fit_transform(data[['Director', 'Actor 1', 'Actor 2', 'Actor 3']])

# Step 2: Categorical Encoding
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
categorical_features = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
encoded_features = encoder.fit_transform(data[categorical_features])

# Create DataFrame for encoded features and concatenate with original data
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out())
data = pd.concat([data.drop(columns=categorical_features), encoded_df], axis=1)

# Step 3: Feature Scaling
scaler = StandardScaler()
numerical_features = ['Year', 'Rating', 'Votes']  # Add more if needed
data[numerical_features] = scaler.fit_transform(data[numerical_features])

# Step 4: Train-Test Split
# Separate the target variable (e.g., 'Rating')
target = data['Rating']  # Adjust the target column name as needed

# Drop non-relevant columns and the target column from features
X = data.drop(columns=['Rating', 'Name'])  # Add or adjust column names as needed

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2, random_state=42)

print("Training set size:", len(X_train))
print("Test set size:", len(X_test))


ValueError: Cannot use median strategy with non-numeric data:
could not convert string to float: '(2019)'