In [None]:
#Import libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the movie metadata CSV into a DataFrame
df = pd.read_csv("movie_metadata.csv")
# Preview the first five rows (head() defaults to n=5)
df.head()

In [None]:
# Drop the columns that are not used in the rest of the analysis
# (titles, names, links and other free-text fields), in a single call.
unused_columns = [
    'movie_title', 'language', 'plot_keywords', 'genres', 'movie_imdb_link',
    'color', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name',
]
df.drop(columns=unused_columns, inplace=True)

In [None]:
# Count the missing values in each column
df.isnull().sum()

In [None]:
# Show the column labels that remain in df at this point
df.columns

In [None]:
# Deal with missing values in the dataset:
# drop every row that has a NaN in any of the listed columns.
df.dropna(axis=0,subset=['num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes',
       'actor_1_facebook_likes', 'gross', 'num_voted_users',
       'cast_total_facebook_likes', 'facenumber_in_poster',
       'num_user_for_reviews', 'country', 'content_rating', 'budget',
       'title_year', 'actor_2_facebook_likes', 'imdb_score', 'aspect_ratio',
       'movie_facebook_likes'],inplace=True)

# Replacing values.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on df[col]: the chained in-place form operates on an
# intermediate object, triggers a warning in recent pandas and does not
# update df at all when Copy-on-Write is enabled.
# NOTE(review): all four columns below are also in the dropna subset above,
# so no NaN is left in them here and these fills currently change nothing.
# If imputation (rather than row removal) is intended for these columns,
# remove them from the subset - confirm the intended behaviour.
df["content_rating"] = df["content_rating"].fillna("R18")
df["aspect_ratio"] = df["aspect_ratio"].fillna(df["aspect_ratio"].median())
df["budget"] = df["budget"].fillna(df["budget"].median())
df["gross"] = df["gross"].fillna(df["gross"].median())

In [None]:
# Re-count the missing values per column after the cleaning step
df.isnull().sum()

In [None]:
# Remove duplicated rows from the dataset, keeping the first occurrence of each
df.drop_duplicates(keep='first', inplace=True)

In [None]:
# Combine the Facebook likes of actor 2 and actor 3 into a single feature
df['Other_actor_facebbok_likes'] = df['actor_2_facebook_likes'].add(df['actor_3_facebook_likes'])
# Ratio of critic reviews to user reviews (num_critic_for_reviews / num_user_for_reviews)
df['critic_review_ratio'] = df['num_critic_for_reviews'].div(df['num_user_for_reviews'])
# Drop the source columns of the two new features, together with the cast total
df.drop(
    columns=[
        'actor_2_facebook_likes',
        'actor_3_facebook_likes',
        'cast_total_facebook_likes',
        'num_critic_for_reviews',
        'num_user_for_reviews',
    ],
    inplace=True,
)

In [None]:
# Correlation matrix of the numeric features, shown as an annotated heatmap.
# df still contains the text columns 'country' and 'content_rating' at this
# point (they are only one-hot encoded in a later cell), and DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0, so restrict the input to
# the numeric columns explicitly.
corr = df.select_dtypes(include='number').corr()
sns.set_context("notebook", font_scale=1.0, rc={"lines.linewidth": 2.5})
plt.figure(figsize=(13,7))
a = sns.heatmap(corr, annot=True, fmt='.2f')
# Rotate the tick labels so the long column names stay readable
rotx = a.set_xticklabels(a.get_xticklabels(), rotation=90)
roty = a.set_yticklabels(a.get_yticklabels(), rotation=30)

In [None]:
# Deal with categorical data.
# Keep the two most frequent countries and relabel every other country as 'other'.
top_countries = df['country'].value_counts().index[:2]
df['country'] = df['country'].where(df['country'].isin(top_countries), 'other')
# One-hot encode both categorical columns; drop_first removes the first
# level of each so the dummy columns are not perfectly collinear.
df = pd.get_dummies(
    data=df,
    columns=['country', 'content_rating'],
    prefix=['country', 'content_rating'],
    drop_first=True,
)

In [None]:
def plotHistogram(df,col,tit='',xlabel='',ylabel='',log=False):
    """Draw and show a histogram of one column of a DataFrame.

    The bin edges are 25 evenly spaced points between the column's minimum
    and maximum (i.e. 24 bins), and the x axis is limited to that range.

    Parameters
    ----------
    df : DataFrame holding the data.
    col : label of the column to plot.
    tit : figure title.
    xlabel, ylabel : axis labels.
    log : forwarded to ``plt.hist`` as its ``log`` argument.
    """
    values = df[col]
    lowest, highest = values.min(), values.max()
    edges = np.linspace(lowest, highest, 25)
    plt.xlim([lowest, highest])
    plt.hist(values, bins=edges, alpha=0.5, log=log)
    plt.title(tit)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

# Histogram of the IMDB scores with the log option enabled
plotHistogram(
    df,
    'imdb_score',
    tit='imdb_score logarithmic histogram',
    xlabel='imdb_score',
    ylabel='# of films',
    log=True,
)

In [None]:
# Split the column labels by dtype: object (text) columns vs. all other columns
cat_cols = df.select_dtypes(include='object').columns.tolist()
num_cols = df.select_dtypes(exclude='object').columns.tolist()


In [None]:
# Scatter plot with a fitted regression line of gross against log10(budget)
_, ax = plt.subplots(figsize=(18, 6))
sns.regplot(x=df['budget'].apply(np.log10), y=df['gross'], ax=ax)
# The x values are log10-transformed but the Series is still named 'budget',
# so label the axes explicitly with what is actually plotted.
ax.set_xlabel('log10(budget)')
ax.set_ylabel('gross');

In [None]:
# Bar plot of gross per IMDB score, with the score rounded to the nearest integer
_, ax = plt.subplots(figsize=(14, 6))
sns.barplot(x=df['imdb_score'].apply(np.round), y=df['gross'], ax=ax)

# The original called plt.xlabel twice, so the x axis ended up labelled 'Year'
# (which is not plotted here) and the y axis was never labelled explicitly.
ax.set_xlabel('IMDB Score')
ax.set_ylabel('Gross');

In [None]:
# Pairwise scatter plots / distributions for every column listed in num_cols
sns.pairplot(data=df[num_cols])

In [None]:
# Bucket imdb_score into the intervals (0,4], (4,6], (6,8], (8,10] and store the
# 1-based bucket number in a new column imdb_score_y; the raw score is then removed.
score_bucket = pd.cut(df['imdb_score'], bins=[0, 4, 6, 8, 10], labels=False, right=True)
df['imdb_score_y'] = score_bucket + 1
df.drop(columns='imdb_score', inplace=True)

In [None]:
# Show the column labels of df after encoding and target creation
df.columns

In [None]:
# Splitting the data into training and testing data.
#
# The feature matrix is built from the columns df actually has at this point
# rather than from a hard-coded list. The previous list named columns that an
# earlier cell drops (num_critic_for_reviews, num_user_for_reviews,
# actor_2_facebook_likes, actor_3_facebook_likes, cast_total_facebook_likes);
# pd.DataFrame(columns=..., data=df) silently creates such columns filled with
# NaN, which the estimators fitted below reject. It also left out the two
# engineered features (Other_actor_facebbok_likes, critic_review_ratio).
from sklearn.model_selection import train_test_split

target_col = 'imdb_score_y'
# Cast to float so that one-hot (bool/uint8) columns and numeric columns share a dtype
features = df.drop(columns=[target_col]).astype(float)
# critic_review_ratio is a quotient, so a zero denominator gives inf or NaN;
# keep only the rows whose features are all finite so scaling and fitting work.
finite_rows = np.isfinite(features).all(axis=1)
X = features.loc[finite_rows]
# y stays a one-column DataFrame; the model cells flatten it with np.ravel
y = df.loc[finite_rows, [target_col]]
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.3,random_state=100)

In [None]:
# Feature scaling: learn the scaling parameters on the training set only,
# then apply the same transformation to both the training and the test set.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
# `metrics` is also used by the KNN and Random Forest cells further down
from sklearn import metrics

# fit() returns the fitted estimator, so construction and fitting can be chained;
# the one-column target is flattened to 1-D before fitting
logit = LogisticRegression().fit(X_train, np.ravel(y_train, order='C'))
y_pred = logit.predict(X_test)

# Confusion matrix and accuracy for logistic regression
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
print(cnf_matrix)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [None]:
# K-nearest neighbours classifier with 22 neighbours
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=22).fit(X_train, np.ravel(y_train, order='C'))
knnpred = knn.predict(X_test)

# Confusion matrix and accuracy for KNN
cnf_matrix = metrics.confusion_matrix(y_test, knnpred)
print(cnf_matrix)
print("Accuracy:", metrics.accuracy_score(y_test, knnpred))

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
# A random forest is a randomised estimator; without a fixed seed every run
# produces a slightly different model and accuracy, so the reported figures
# could not be reproduced. Fix the seed.
rfc = RandomForestClassifier(n_estimators=200, random_state=100)  # criterion options: entropy, gini
rfc.fit(X_train, np.ravel(y_train,order='C'))
rfcpred = rfc.predict(X_test)

# Confusion matrix and accuracy for the random forest
cnf_matrix = metrics.confusion_matrix(y_test, rfcpred)
print(cnf_matrix)
print("Accuracy:",metrics.accuracy_score(y_test, rfcpred))

In [None]:
# Model comparison: print the per-class precision / recall / F1 report of each model
from sklearn.metrics import classification_report
for heading, predictions in (
    ('Logistic  Reports\n', y_pred),
    ('KNN Reports\n', knnpred),
    ('Random Forests Reports\n', rfcpred),
):
    print(heading, classification_report(y_test, predictions))

In [None]:
# Conclusion: the Random Forest classifier has the best accuracy, which is around 77%.