In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from pathlib import Path
from collections import Counter

In [2]:
# Loading the Dataset
file_path = Path('./Resources/clean_tvfilm.csv')
tvfilm_df = pd.read_csv(file_path)
tvfilm_df.head()

Unnamed: 0,filmtv_id,title,year,genre,duration,country,directors,actors,avg_vote,critics_vote,public_vote,total_votes,description,notes,humor,rhythm,effort,tension,erotism
0,3,18 anni tra una settimana,1991,Drama,98,Italy,Luigi Perelli,"Kim Rossi Stuart, Simona Cavallari, Ennio Fant...",6.5,6.0,7,4,"Samantha, not yet eighteen, leaves the comfort...","Luigi Perelli, the director of the ""Piovra"", o...",0,2,0,2,0
1,17,Ride a Wild Pony,1976,Romantic,91,United States,Don Chaffey,"Michael Craig, John Meillon, Eva Griffith, Gra...",5.6,6.0,5,9,"In the Australia of the pioneers, a boy and a ...","""Ecological"" story with a happy ending, not wi...",1,2,1,0,0
2,18,Diner,1982,Comedy,95,United States,Barry Levinson,"Mickey Rourke, Steve Guttenberg, Ellen Barkin,...",7.0,8.0,6,18,Five boys from Baltimore have a habit of meeti...,A cast of will be famous for Levinson's direct...,2,2,0,1,2
3,20,A che servono questi quattrini?,1942,Comedy,85,Italy,Esodo Pratelli,"Eduardo De Filippo, Peppino De Filippo, Clelia...",5.9,5.33,7,15,"With a stratagem, the penniless and somewhat p...",Taken from the play by Armando Curcio that the...,3,1,1,0,0
4,21,The Uranian Conspiracy,1978,Spy,117,"Italy, Germany, Israel","Gianfranco Baldanello, Menahem Golan","Fabio Testi, Janet Agren, Assaf Dayan, Siegfri...",4.8,3.5,6,3,Two Israeli secret agents discover that traffi...,"Action and chases for half of Europe, espionag...",1,2,0,2,0


In [3]:
# Creating a threshold for 'avg_vote' column
threshold = 7

# Creating a new column determining the films success
tvfilm_df['popular'] = tvfilm_df['avg_vote'].apply(lambda x: 0 
                                                         if x >= threshold else 
                                                         1)
# Splitting the data into training and testing
X = tvfilm_df[['humor', 'rhythm', 'effort', 'tension', 'erotism']]
y = tvfilm_df['popular']

In [4]:
X.describe()

Unnamed: 0,humor,rhythm,effort,tension,erotism
count,17073.0,17073.0,17073.0,17073.0,17073.0
mean,0.891583,2.02501,1.037428,1.373279,0.482692
std,0.981624,0.82701,1.233634,1.097787,0.742706
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,0.0,0.0
50%,1.0,2.0,1.0,1.0,0.0
75%,2.0,2.0,2.0,2.0,1.0
max,5.0,5.0,5.0,5.0,4.0


In [5]:
# Check the balance of our target values
y.value_counts()

1    12887
0     4186
Name: popular, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(12804, 5)

In [7]:
### OVERSAMPLING
## Native Random Oversampling
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 9665, 1: 9665})

In [8]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [9]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7012227006664427

In [10]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[ 668,  379],
       [ 759, 2463]])

In [11]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))


                   pre       rec       spe        f1       geo       iba       sup

          0       0.47      0.64      0.76      0.54      0.70      0.48      1047
          1       0.87      0.76      0.64      0.81      0.70      0.49      3222

avg / total       0.77      0.73      0.67      0.75      0.70      0.49      4269

