In [1]:
import pandas as pd

In [2]:
# Load the raw movie records into a DataFrame, decoding the file as Latin-1.
data = pd.read_csv(filepath_or_buffer='movies_data.csv', encoding='latin1')

In [3]:
# Preview the first five rows to check the columns and the raw value formats.
data.head(n=5)

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [4]:
# (rows, columns) of the loaded frame.
data.shape

(15509, 10)

In [5]:
# Summarise each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


# FIND ANOMALIES IN DATA

In [6]:
# Count the missing values in each column.
data.isna().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [7]:
# Inspect the raw Year values: they are strings wrapped in parentheses,
# e.g. '(2019)', so the brackets must be removed before numeric use.
data['Year'].unique()

array([nan, '(2019)', '(2021)', '(2010)', '(1997)', '(2005)', '(2008)',
       '(2012)', '(2014)', '(2004)', '(2016)', '(1991)', '(1990)',
       '(2018)', '(1987)', '(1948)', '(1958)', '(2017)', '(2020)',
       '(2009)', '(2002)', '(1993)', '(1946)', '(1994)', '(2007)',
       '(2013)', '(2003)', '(1998)', '(1979)', '(1951)', '(1956)',
       '(1974)', '(2015)', '(2006)', '(1981)', '(1985)', '(2011)',
       '(2001)', '(1967)', '(1988)', '(1995)', '(1959)', '(1996)',
       '(1970)', '(1976)', '(2000)', '(1999)', '(1973)', '(1968)',
       '(1943)', '(1953)', '(1986)', '(1983)', '(1989)', '(1982)',
       '(1977)', '(1957)', '(1950)', '(1992)', '(1969)', '(1975)',
       '(1947)', '(1972)', '(1971)', '(1935)', '(1978)', '(1960)',
       '(1944)', '(1963)', '(1940)', '(1984)', '(1934)', '(1955)',
       '(1936)', '(1980)', '(1966)', '(1949)', '(1962)', '(1964)',
       '(1952)', '(1933)', '(1942)', '(1939)', '(1954)', '(1945)',
       '(1961)', '(1965)', '(1938)', '(1941)', '(1931)', 

In [8]:
# Inspect the raw Votes values: they are strings with thousands separators,
# e.g. '1,496', and the commas must be removed before numeric operations.
data['Votes'].unique()

array([nan, '8', '35', ..., '70,344', '408', '1,496'], dtype=object)

In [9]:
# Count the rows whose Name repeats the Name of an earlier row.
data['Name'].duplicated(keep='first').sum()

1671

In [10]:
# Keep only rows where every one of these fields is present; a row missing
# any of them is dropped from `data` in place.
data.dropna(
    axis=0,
    how='any',
    subset=['Name', 'Year', 'Duration', 'Votes', 'Rating'],
    inplace=True,
)

In [11]:
# Votes arrives as text with thousands separators (e.g. '1,496'); strip the
# commas and convert to integers so the column can be used numerically.
# Casting to str first keeps this cell re-runnable: on a second execution the
# column is already integer and would otherwise have no `.str` accessor.
data['Votes'] = (
    data['Votes']
    .astype(str)
    .str.replace(',', '', regex=False)  # literal comma, not a regex pattern
    .astype(int)
)


In [12]:
# Duration arrives as text such as '109 min'; drop the unit and convert to
# integer minutes. Casting to str first keeps this cell re-runnable once the
# column is already integer (an integer column has no `.str` accessor).
data['Duration'] = (
    data['Duration']
    .astype(str)
    .str.replace('min', '', regex=False)  # literal text, not a regex pattern
    .str.strip()  # remove the space left behind by '109 min' -> '109 '
    .astype(int)
)


In [13]:
# Year arrives as text wrapped in parentheses, e.g. '(2019)'; strip the
# brackets and convert to integer. Casting to str first keeps this cell
# re-runnable once the column is already integer.
data['Year'] = data['Year'].astype(str).str.strip('()').astype(int)

In [14]:
# Re-check the per-column missing-value counts after the cleaning steps.
data.isna().sum()

Name          0
Year          0
Duration      0
Genre        31
Rating        0
Votes         0
Director      1
Actor 1      75
Actor 2     117
Actor 3     163
dtype: int64

In [43]:
# Frequency of each genre combination, most common first; missing values
# are excluded from the counts.
data['Genre'].value_counts(dropna=True)

Genre
Drama                       870
Drama, Romance              334
Action, Crime, Drama        329
Action, Drama               207
Comedy, Drama               206
                           ... 
History, Romance              1
Drama, History, Sport         1
Animation, Comedy, Drama      1
Family, Drama, Thriller       1
Romance, Musical, Drama       1
Name: count, Length: 393, dtype: int64

In [44]:
# Fill the remaining missing genres with 'Drama', the most frequent genre.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on data['Genre'] is chained assignment, which raises a FutureWarning in
# pandas 2.x and does not update `data` at all under Copy-on-Write.
data['Genre'] = data['Genre'].fillna('Drama')

In [45]:
# Number of films per director, most prolific first; missing values are
# excluded from the counts.
data['Director'].value_counts(dropna=True)

Director
David Dhawan            41
Mahesh Bhatt            39
Ram Gopal Varma         34
Shakti Samanta          34
Hrishikesh Mukherjee    33
                        ..
Dayanand                 1
Santosh Kashyap          1
Manav Kaul               1
Sajeev Balath            1
Mozez Singh              1
Name: count, Length: 2549, dtype: int64

In [46]:
# Fill the missing director with 'David Dhawan', the most frequent director.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on the selected column is chained assignment, which raises a FutureWarning
# in pandas 2.x and does not update `data` at all under Copy-on-Write.
# NOTE(review): this attributes a film to a real person purely by frequency;
# a placeholder such as 'Unknown' may be more honest - confirm intent.
data['Director'] = data['Director'].fillna('David Dhawan')

# Data Preprocessing

In [64]:
from sklearn.preprocessing import LabelEncoder

# Use one encoder per column. Refitting a single shared encoder on Director
# overwrites the Genre mapping, so Genre codes could no longer be decoded
# (inverse_transform) or applied consistently to new data.
# NOTE(review): integer label codes impose an arbitrary ordering on nominal
# categories when fed to a linear model - consider one-hot or target encoding.
genre_le = LabelEncoder()
director_le = LabelEncoder()
data['Genre_coded'] = genre_le.fit_transform(data['Genre'])
data['Director_coded'] = director_le.fit_transform(data['Director'])

# Backward-compatible alias: `le` previously ended this cell fitted on Director.
le = director_le

In [65]:
# Feature matrix: the numeric columns plus the integer-encoded Genre and Director.
x = data.loc[:, ['Year', 'Duration', 'Votes', 'Genre_coded', 'Director_coded']]

In [66]:
# Target: Rating, kept as a single-column DataFrame (2-D) rather than a Series.
y = data.loc[:, ['Rating']]

# Model Building

In [79]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=10,
)

In [80]:
from sklearn.linear_model import LinearRegression

In [81]:
# Create the linear regression estimator with its default settings.
model=LinearRegression()

In [82]:
# Fit the regression on the training split only.
model.fit(X=x_train, y=y_train)

In [83]:
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error

In [87]:
# Score the held-out split: predict first, then compute the three regression
# metrics from the same predictions.
y_pred = model.predict(x_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)


In [88]:
# Print the metrics one per line, in this order: MSE, MAE, R^2.
print(mse, mae, r2, sep="\n")

1.667008058469811
1.0363597339487027
0.09340126870683907
