In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error



from sklearn.feature_selection import RFE
import statsmodels.formula.api as smf


In [2]:
df = pd.read_csv('cleaned_data.csv')


## Which genre is more profitable? 

# Model Development


In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
#values in gross_)bins
train['gross_bin'].value_counts()

1.000000    755
0.000000    752
0.666667    751
0.333333    740
Name: gross_bin, dtype: int64

### Logistic Regression  : gross revenue

In [5]:
df.columns

Index(['director_name', 'duration', 'director_facebook_likes',
       'actor_3_facebook_likes', 'actor_2_name', 'actor_1_facebook_likes',
       'gross', 'actor_1_name', 'movie_title', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'language', 'country', 'content_rating', 'budget',
       'title_year', 'actor_2_facebook_likes', 'aspect_ratio', 'profit',
       'gross_bin', 'action', 'adventure', 'animation', 'biography', 'comedy',
       'crime', 'documentary', 'drama', 'family', 'fantasy', 'history',
       'horror', 'music', 'musical', 'mystery', 'romance', 'sci_fi', 'sport',
       'thriller', 'war', 'western', 'director_high', 'director_low',
       'actor_1_high', 'actor_1_low', 'actor_2_high', 'actor_2_low',
       'actor_3_high', 'actor_3_low', 'is_english', 'profit_binary'],
      dtype='object')

In [6]:

gross_genre = smf.logit(formula = 'gross_bin ~ animation + family + adventure + sci_fi+ fantasy', data = train).fit() 
gross_genre.summary()


Optimization terminated successfully.
         Current function value: 0.655273
         Iterations 5


0,1,2,3
Dep. Variable:,gross_bin,No. Observations:,2998.0
Model:,Logit,Df Residuals:,2992.0
Method:,MLE,Df Model:,5.0
Date:,"Sun, 05 Mar 2023",Pseudo R-squ.:,0.05463
Time:,23:26:09,Log-Likelihood:,-1964.5
converged:,True,LL-Null:,-2078.0
Covariance Type:,nonrobust,LLR p-value:,4.552e-47

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.2842,0.044,-6.403,0.000,-0.371,-0.197
animation,0.0064,0.229,0.028,0.978,-0.443,0.456
family,0.6697,0.154,4.341,0.000,0.367,0.972
adventure,0.7471,0.107,6.988,0.000,0.538,0.957
sci_fi,0.3003,0.118,2.536,0.011,0.068,0.532
fantasy,0.2382,0.122,1.954,0.051,-0.001,0.477


In [7]:
# predict the values for the test set using the trained model
y_pred = gross_genre.predict(test)

# convert predicted probabilities to class labels (0 or 1)
y_pred_class = np.where(y_pred > 0.5, 1, 0)

# compare predicted class labels with actual class labels in the test set
accuracy = np.mean(y_pred_class == test['gross_bin'])
print('Accuracy:', accuracy)

Accuracy: 0.3453333333333333


### Logistic Regression: Profit

In [8]:
# create bins based on quartiles of the "gross" variable
train['profit'], bins = pd.qcut(train['profit'], q=4, retbins=True, labels=False)

# normalize "gross_bins" to have values in the range [0, 1]
train['profit'] = (train['profit'] - np.min(train['profit'])) / (np.max(train['profit']) - np.min(train['profit']))

test['profit'] = pd.cut(test['profit'], bins=bins, labels=False)
test['profit'] = (test['profit'] - np.min(test['profit'])) / (np.max(test['profit']) - np.min(test['profit']))


In [9]:
 
profit_genre = smf.logit(formula = 'profit ~ animation + family + adventure + sci_fi + fantasy', data = train).fit() #Developing logistic regression model



Optimization terminated successfully.
         Current function value: 0.689764
         Iterations 4


In [10]:
# predict  values for the test set using the trained model
y_pred = profit_genre.predict(test)

# convert predicted probabilities to class labels (0 or 1)
y_pred_class = np.where(y_pred > 0.5, 1, 0)

# compare predicted class labels with actual class labels in the test set
accuracy = np.mean(y_pred_class == test['profit'])
print('Accuracy:', accuracy)


Accuracy: 0.276


### Logistic Regression: Postitive or negative profit

In [11]:
#create binary column for pos or neg profit
train['profit_binary'] = np.where(train['profit'] > 0, 1, 0)
test['profit_binary'] = np.where(test['profit'] > 0, 1, 0)

#create logistic regression model for profit_binary and top 5 genres
profit_binary_genre = smf.logit(formula = 'profit_binary ~ animation + family + adventure + sci_fi + fantasy', data = train).fit() #Developing logistic regression model

# predict  values for the test set using the trained model
y_pred = profit_binary_genre.predict(test)

# convert predicted probabilities to class labels (0 or 1)
y_pred_class = np.where(y_pred > 0.5, 1, 0)

# compare predicted class labels with actual class labels in the test set
accuracy = np.mean(y_pred_class == test['profit_binary'])
print('Accuracy:', accuracy)


Optimization terminated successfully.
         Current function value: 0.551892
         Iterations 5
Accuracy: 0.7226666666666667


In [12]:
#find precision
from sklearn.metrics import precision_score
precision_score(test['profit_binary'], y_pred_class)


0.72911051212938

### Same, all generes 

In [13]:
#logistic regression model for profit_binary and all genres
profit_binary_genre = smf.logit(formula = 'profit_binary ~ action + biography + comedy + crime + documentary + drama + history + horror + music + musical + mystery + romance + sport + thriller + war + western', data = train).fit() #Developing logistic regression model

# predict  values for the test set using the trained model
y_pred = profit_binary_genre.predict(test)

# convert predicted probabilities to class labels (0 or 1)
y_pred_class = np.where(y_pred > 0.5, 1, 0)

# compare predicted class labels with actual class labels in the test set
accuracy = np.mean(y_pred_class == test['profit_binary'])
print('Accuracy:', accuracy)


Optimization terminated successfully.
         Current function value: 0.535659
         Iterations 7
Accuracy: 0.7346666666666667


In [14]:
#find the most important features
profit_binary_genre.params.sort_values(ascending=False)
#list of 5 most important features from profit_binary_genre, excluding intercept 
top_5 = list(profit_binary_genre.params.sort_values(ascending=False)[1:7].index)
top_5 = ['horror', 'musical', 'music', 'comedy', 'romance']
#new model with only top 5 genres
profit_binary_genre = smf.logit(formula = 'profit_binary ~ horror + musical + music + comedy + romance', data = train).fit() #Developing logistic regression model

# predict  values for the test set using the trained model
y_pred = profit_binary_genre.predict(test)

# convert predicted probabilities to class labels (0 or 1)
y_pred_class = np.where(y_pred > 0.5, 1, 0)

# compare predicted class labels with actual class labels in the test set
accuracy = np.mean(y_pred_class == test['profit_binary'])
print('Accuracy:', accuracy)


Optimization terminated successfully.
         Current function value: 0.554714
         Iterations 6
Accuracy: 0.7306666666666667


### Linear Regrssion : Profit 