In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from pathlib import Path
from collections import Counter

In [2]:
# Loading the Dataset
file_path = Path('./Resources/clean_tvfilm.csv')
tvfilm_df = pd.read_csv(file_path)
tvfilm_df.head()

Unnamed: 0,filmtv_id,title,year,genre,duration,country,directors,actors,avg_vote,critics_vote,public_vote,total_votes,description,notes,humor,rhythm,effort,tension,erotism
0,3,18 anni tra una settimana,1991,Drama,98,Italy,Luigi Perelli,"Kim Rossi Stuart, Simona Cavallari, Ennio Fant...",6.5,6.0,7,4,"Samantha, not yet eighteen, leaves the comfort...","Luigi Perelli, the director of the ""Piovra"", o...",0,2,0,2,0
1,17,Ride a Wild Pony,1976,Romantic,91,United States,Don Chaffey,"Michael Craig, John Meillon, Eva Griffith, Gra...",5.6,6.0,5,9,"In the Australia of the pioneers, a boy and a ...","""Ecological"" story with a happy ending, not wi...",1,2,1,0,0
2,18,Diner,1982,Comedy,95,United States,Barry Levinson,"Mickey Rourke, Steve Guttenberg, Ellen Barkin,...",7.0,8.0,6,18,Five boys from Baltimore have a habit of meeti...,A cast of will be famous for Levinson's direct...,2,2,0,1,2
3,20,A che servono questi quattrini?,1942,Comedy,85,Italy,Esodo Pratelli,"Eduardo De Filippo, Peppino De Filippo, Clelia...",5.9,5.33,7,15,"With a stratagem, the penniless and somewhat p...",Taken from the play by Armando Curcio that the...,3,1,1,0,0
4,21,The Uranian Conspiracy,1978,Spy,117,"Italy, Germany, Israel","Gianfranco Baldanello, Menahem Golan","Fabio Testi, Janet Agren, Assaf Dayan, Siegfri...",4.8,3.5,6,3,Two Israeli secret agents discover that traffi...,"Action and chases for half of Europe, espionag...",1,2,0,2,0


In [3]:
# get top 5 genres
top_5_genre=list(tvfilm_df['genre'].value_counts().head(n=5).index)
# [8, 5, 24, 13, 0]

tvfilm_df['genre_grouped']=tvfilm_df['genre'].apply(lambda x: x if x in top_5_genre else 999)
tvfilm_df['genre_grouped'].value_counts()

Drama       5159
Comedy      4883
999         4152
Thriller    1252
Horror       832
Action       795
Name: genre_grouped, dtype: int64

In [4]:
# binary indicator for US or not
tvfilm_df['is_us']=tvfilm_df['country']=='United States'

In [5]:
# get top 5 country
top_5_country=list(tvfilm_df['country'].value_counts().head(n=5).index)
# 

tvfilm_df['country_grouped']=tvfilm_df['country'].apply(lambda x: x if x in top_5_country else 'other')
tvfilm_df['country_grouped'].value_counts()

United States    7122
other            4160
Italy            3904
France            871
Great Britain     782
Japan             234
Name: country_grouped, dtype: int64

In [6]:
all_actors=tvfilm_df['actors'].str.cat(sep=', ')
actors_list=all_actors.split(', ')
actors_dict={}
for each_actor in actors_list: 
    if each_actor in actors_dict: 
        actors_dict[each_actor]+=1
    else: 
        actors_dict[each_actor]=1
actors_dict

{'Kim Rossi Stuart': 18,
 'Simona Cavallari': 5,
 'Ennio Fantastichini': 38,
 'Orso Maria Guerrini': 9,
 'Silli Togni': 3,
 'Maria Pia Calzone': 8,
 'Luciano Curreli': 3,
 'Francesco Maria Dominedò': 1,
 'Michael Craig': 4,
 'John Meillon': 4,
 'Eva Griffith': 1,
 'Graham Rouse': 1,
 'Robert Bettles': 1,
 'Alfred Bell': 1,
 'John Meillon Jr.': 1,
 'Mickey Rourke': 35,
 'Steve Guttenberg': 11,
 'Ellen Barkin': 20,
 'Daniel Stern': 9,
 'Kevin Bacon': 35,
 'Timothy Daly': 5,
 'Paul Reiser': 10,
 'Kelle Kipp': 1,
 'Colette Blonigan': 1,
 'Eduardo De Filippo': 18,
 'Peppino De Filippo': 51,
 'Clelia Matania': 19,
 'Paolo Stoppa': 44,
 'Nario Bernardi': 1,
 'Augusto Di Giovanni': 3,
 'Edwige Maul': 1,
 'Italia Marchesini': 1,
 'Fabio Testi': 23,
 'Janet Agren': 25,
 'Assaf Dayan': 1,
 'Siegfried Rauch': 4,
 'Oded Kotler': 1,
 'Gianni Rizzo': 5,
 'Herbert Fux': 6,
 'Gian Maria Volonté': 28,
 'Irene Papas': 20,
 'Gabriele Ferzetti': 39,
 'Salvo Randone': 13,
 'Laura Nucci': 2,
 'Mario Scaccia'

In [7]:
def sort_dict_by_value(d, reverse = False):
    return dict(sorted(d.items(), key = lambda x: x[1], reverse = reverse))

actors_dict=sort_dict_by_value(actors_dict, reverse=True)

In [8]:
top_100_actors=list(actors_dict.keys())[:100]
top_100_actors

['Alberto Sordi',
 'Marcello Mastroianni',
 'Ugo Tognazzi',
 'Vittorio Gassman',
 'Totò',
 'Ciccio Ingrassia',
 'Vittorio De Sica',
 'Franco Franchi',
 'Michael Caine',
 'Gérard Depardieu',
 'Robert De Niro',
 'Giancarlo Giannini',
 'Donald Sutherland',
 'Luca Zingaretti',
 'Michele Placido',
 'Samuel L. Jackson',
 'Lino Banfi',
 'Harvey Keitel',
 'Nino Manfredi',
 'Rutger Hauer',
 'Stefania Sandrelli',
 'Willem Dafoe',
 'Christopher Walken',
 'Nicolas Cage',
 'Julianne Moore',
 'Morgan Freeman',
 'Bruce Willis',
 'Mario Carotenuto',
 'Peppino De Filippo',
 'Christian De Sica',
 'Sophia Loren',
 'Alain Delon',
 'Valerio Mastandrea',
 'Antonio Banderas',
 'Franco Nero',
 'Alessandro Haber',
 'Liam Neeson',
 'Walter Chiari',
 'Sylva Koscina',
 'Diego Abatantuono',
 'Tomas Milian',
 'John Wayne',
 'Gene Hackman',
 'Robert Duvall',
 'Paolo Villaggio',
 'Claudia Cardinale',
 'Ornella Muti',
 'Margherita Buy',
 'Max Von Sydow',
 'Susan Sarandon',
 'John Turturro',
 'Paolo Stoppa',
 'Robert D

In [9]:
def has_top_actor(actor_list, top_list): 
    for each_actor in actor_list: 
        if each_actor in top_list: 
            return True
    return False

In [10]:
# get binary indicator
tvfilm_df['has_top_actor']=tvfilm_df['actors'].fillna('').apply(lambda x: has_top_actor(x.split(', '), top_100_actors))


In [11]:
# Creating a threshold for 'avg_vote' column
threshold = 7.0

# Creating a new column determining the films success
tvfilm_df['popular'] = tvfilm_df['avg_vote'].apply(lambda x: 0
                                                         if x >= threshold else 
                                                         1)

In [12]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

X = tvfilm_df[['effort', 'humor', 'rhythm', 'tension', 'erotism', 'year', 'duration', 'country_grouped', 'genre_grouped', 'has_top_actor']]
y = tvfilm_df['popular']

# creates one-hot encoding for desired columns
X = pd.get_dummies(X, columns=['genre_grouped', 'country_grouped'])

In [13]:
tvfilm_df.head()

Unnamed: 0,filmtv_id,title,year,genre,duration,country,directors,actors,avg_vote,critics_vote,...,humor,rhythm,effort,tension,erotism,genre_grouped,is_us,country_grouped,has_top_actor,popular
0,3,18 anni tra una settimana,1991,Drama,98,Italy,Luigi Perelli,"Kim Rossi Stuart, Simona Cavallari, Ennio Fant...",6.5,6.0,...,0,2,0,2,0,Drama,False,Italy,True,1
1,17,Ride a Wild Pony,1976,Romantic,91,United States,Don Chaffey,"Michael Craig, John Meillon, Eva Griffith, Gra...",5.6,6.0,...,1,2,1,0,0,999,True,United States,False,1
2,18,Diner,1982,Comedy,95,United States,Barry Levinson,"Mickey Rourke, Steve Guttenberg, Ellen Barkin,...",7.0,8.0,...,2,2,0,1,2,Comedy,True,United States,False,0
3,20,A che servono questi quattrini?,1942,Comedy,85,Italy,Esodo Pratelli,"Eduardo De Filippo, Peppino De Filippo, Clelia...",5.9,5.33,...,3,1,1,0,0,Comedy,False,Italy,True,1
4,21,The Uranian Conspiracy,1978,Spy,117,"Italy, Germany, Israel","Gianfranco Baldanello, Menahem Golan","Fabio Testi, Janet Agren, Assaf Dayan, Siegfri...",4.8,3.5,...,1,2,0,2,0,999,False,other,False,1


In [14]:
X.describe()

Unnamed: 0,effort,humor,rhythm,tension,erotism,year,duration,genre_grouped_999,genre_grouped_Action,genre_grouped_Comedy,genre_grouped_Drama,genre_grouped_Horror,genre_grouped_Thriller,country_grouped_France,country_grouped_Great Britain,country_grouped_Italy,country_grouped_Japan,country_grouped_United States,country_grouped_other
count,17073.0,17073.0,17073.0,17073.0,17073.0,17073.0,17073.0,17073.0,17073.0,17073.0,17073.0,17073.0,17073.0,17073.0,17073.0,17073.0,17073.0,17073.0,17073.0
mean,1.037428,0.891583,2.02501,1.373279,0.482692,1988.394834,102.544075,0.243191,0.046565,0.286007,0.302173,0.048732,0.073332,0.051016,0.045803,0.228665,0.013706,0.41715,0.24366
std,1.233634,0.981624,0.82701,1.097787,0.742706,22.106267,22.918516,0.429022,0.210711,0.451906,0.459213,0.215313,0.260689,0.220037,0.209064,0.419985,0.11627,0.493103,0.429302
min,0.0,0.0,0.0,0.0,0.0,1913.0,41.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,2.0,0.0,0.0,1971.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,1.0,2.0,1.0,0.0,1996.0,99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2.0,2.0,2.0,2.0,1.0,2006.0,110.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,5.0,5.0,5.0,5.0,4.0,2022.0,729.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
# Check the balance of our target values
y.value_counts()

1    12887
0     4186
Name: popular, dtype: int64

In [16]:
# Testing and training model
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(12804, 20)

In [17]:
### OVERSAMPLING
## Native Random Oversampling
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 9665, 1: 9665})

In [18]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [19]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.726521105793088

In [20]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[ 720,  327],
       [ 756, 2466]])

In [21]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.49      0.69      0.77      0.57      0.73      0.52      1047
          1       0.88      0.77      0.69      0.82      0.73      0.53      3222

avg / total       0.79      0.75      0.71      0.76      0.73      0.53      4269



In [22]:
### OVERSAMPLING: DURATION ###
## SMOTE Oversampleing
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(
    X_train, y_train
)
Counter(y_resampled)

Counter({0: 9665, 1: 9665})

In [23]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [24]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6858071033848595

In [25]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[ 599,  448],
       [ 646, 2576]])

In [26]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.48      0.57      0.80      0.52      0.68      0.45      1047
          1       0.85      0.80      0.57      0.82      0.68      0.47      3222

avg / total       0.76      0.74      0.63      0.75      0.68      0.46      4269



In [27]:
### UNDERSAMPLING
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 3139, 1: 3139})

In [28]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [29]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.6858071033848595

In [30]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[ 599,  448],
       [ 646, 2576]])

In [31]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.48      0.57      0.80      0.52      0.68      0.45      1047
          1       0.85      0.80      0.57      0.82      0.68      0.47      3222

avg / total       0.76      0.74      0.63      0.75      0.68      0.46      4269



In [32]:
### COMBINATION OF (OVER AND UNDER) SAMPBLING
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({0: 10738, 1: 7239})

In [33]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [34]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.6858071033848595

In [35]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[ 799,  248],
       [1053, 2169]])

In [36]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.43      0.76      0.67      0.55      0.72      0.52      1047
          1       0.90      0.67      0.76      0.77      0.72      0.51      3222

avg / total       0.78      0.70      0.74      0.72      0.72      0.51      4269

