In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.4f}'.format
from helpers.encoders import weighted_mean_k_fold_target_encoding, clean_and_split_string, weighted_average_encoding

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
# Feature Ideas:
# 1. use target encoding for genres, directors, authors, actors, production_company
# 3. use frequency encoding for tomatometer_top_critics_count, tomatometer_fresh_critics_count, tomatometer_rotten_critics_count, tomatometer_count
# 5. use seasonal ecoding for original_release_date
# 6. create feature of streaming_release_date - original_release_date named 'original_to_streaming_days'
# 8. one-hot encode the following variables: content_rating, audience_status, top_critic
# 10. Label encode for the target variable tomatometer_status; note: tomatometer_rating is probably a leaky feature
# 11. Use either audience_status or audience_rating, not both
# 12. For audience_rating, use a mean weighted by audience_count (n * option mean + m * overall mean) / (n + m) where n is the audience_count and m is a hyperparameter; 
#       common values for m are 1, 10, 100, 1000. m has a regularization effect. Therefore:  (smaller n = larger m) and variance of option means (larger variance = larger m)
# 13. Use label encoding for content_rating, as it is ordinal
# 14. create feature reviews_avg_days_after_release = sum(review_date - original_release_date) / count(reviews)
# 15. when training, drop movie_title, movie_info, critics_consensus

## Import Data

In [2]:
df_rotten_tomatoes_movies = pd.read_csv('datasets/Meta/rotten_tomatoes_movies.csv')
df_rotten_tomatoes_movies['original_release_date'] = pd.to_datetime(df_rotten_tomatoes_movies['original_release_date'])
df_rotten_tomatoes_movies['streaming_release_date'] = pd.to_datetime(df_rotten_tomatoes_movies['streaming_release_date'])
# convert 'original_release_date' to seasonal encoding
df_rotten_tomatoes_movies['original_release_month'] = df_rotten_tomatoes_movies['original_release_date'].dt.month
movie_seasons = {1: {1, 2}, 2: {3, 4}, 3: {5, 6, 7, 8}, 4: {9,}, 5: {10, 11, 12}}
df_rotten_tomatoes_movies['original_release_season'] = df_rotten_tomatoes_movies['original_release_month'].map({month: season for season, months in movie_seasons.items() for month in months})
# drop columns: movie_title, movie_info, critics_consensus
df_rotten_tomatoes_movies.drop(columns=['movie_title', 'movie_info', 'critics_consensus'], inplace=True)
# split genres into list
df_rotten_tomatoes_movies['genres'] = df_rotten_tomatoes_movies['genres'].apply(clean_and_split_string)
# drop rows with missing tomatometer_rating
df_rotten_tomatoes_movies.dropna(subset=['tomatometer_rating'], inplace=True)
# the time between the original release date and the streaming release date could be a useful feature
df_rotten_tomatoes_movies['original_to_streaming_days'] = (df_rotten_tomatoes_movies['streaming_release_date'] - df_rotten_tomatoes_movies['original_release_date']).dt.days
# Label encode the target variable tomatometer_status
df_rotten_tomatoes_movies['tomatometer_status'] = df_rotten_tomatoes_movies['tomatometer_status'].map({'Rotten': 0, 'Fresh': 1, 'Certified-Fresh': 2})
print(f'shape: {df_rotten_tomatoes_movies.shape}')
df_rotten_tomatoes_movies.head(2)

shape: (17668, 22)


Unnamed: 0,rotten_tomatoes_link,content_rating,genres,directors,authors,actors,original_release_date,streaming_release_date,runtime,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count,original_release_month,original_release_season,original_to_streaming_days
0,m/0814255,PG,"[Action & Adventure, Comedy, Drama, Science Fi...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2015-11-25,119.0,20th Century Fox,0,49.0,149.0,Spilled,53.0,254421.0,43,73,76,2.0,1.0,2112.0
1,m/0878835,R,[Comedy],Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,2012-09-04,90.0,Sony Pictures Classics,2,87.0,142.0,Upright,64.0,11574.0,44,123,19,4.0,2.0,858.0


In [3]:
df_rotten_tomatoes_movies.info()
# df_rotten_tomatoes_movies.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
Index: 17668 entries, 0 to 17711
Data columns (total 22 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   rotten_tomatoes_link              17668 non-null  object        
 1   content_rating                    17668 non-null  object        
 2   genres                            17668 non-null  object        
 3   directors                         17475 non-null  object        
 4   authors                           16134 non-null  object        
 5   actors                            17316 non-null  object        
 6   original_release_date             16514 non-null  datetime64[ns]
 7   streaming_release_date            17316 non-null  datetime64[ns]
 8   runtime                           17384 non-null  float64       
 9   production_company                17175 non-null  object        
 10  tomatometer_status                17668 non-null  i

In [4]:
# read in csv file
df_rotten_tomatoes_critic_reviews = pd.read_csv('datasets/Meta/rotten_tomatoes_critic_reviews_50k.csv')
df_rotten_tomatoes_critic_reviews['review_date'] = pd.to_datetime(df_rotten_tomatoes_critic_reviews['review_date'])
# Label encode review_type with 0 for 'Rotten' and 1 for 'Fresh'
df_rotten_tomatoes_critic_reviews['review_type'] = df_rotten_tomatoes_critic_reviews['review_type'].map({'Rotten': 0, 'Fresh': 1})
# drop rows with missing critic_name
df_rotten_tomatoes_critic_reviews['critic_name'].dropna(inplace=True)
# Frequency encode critic_name
df_rotten_tomatoes_critic_reviews['tomatometer_critics_count'] = df_rotten_tomatoes_critic_reviews['critic_name'].map(df_rotten_tomatoes_critic_reviews['critic_name'].value_counts().to_dict())
# Label encode top_critic with 0 for False and 1 for True
df_rotten_tomatoes_critic_reviews['top_critic'] = df_rotten_tomatoes_critic_reviews['top_critic'].map({False: 0, True: 1})
# df_rotten_tomatoes_critic_reviews['review_score_numerator'] = df_rotten_tomatoes_critic_reviews['review_score'].apply(
#     lambda x: str(x).split('/')[0] if isinstance(x, str) else np.nan
# )
# df_rotten_tomatoes_critic_reviews['review_score_denominator'] = df_rotten_tomatoes_critic_reviews['review_score'].apply(
#     lambda x: str(x).split('/')[1] if isinstance(x, str) and '/' in x else np.nan
# )
# fill na for review_score with mean of review_score
# df_rotten_tomatoes_critic_reviews['review_score'].fillna(df_rotten_tomatoes_critic_reviews['review_score'].mean(), inplace=True)
print(f'shape: {df_rotten_tomatoes_critic_reviews.shape}')
df_rotten_tomatoes_critic_reviews

shape: (50000, 9)


Unnamed: 0,rotten_tomatoes_link,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_content,tomatometer_critics_count
0,m/0814255,Andrew L. Urban,0,Urban Cinefile,1,,2010-02-06,A fantasy adventure that fuses Greek mythology...,46.0000
1,m/0814255,Louise Keller,0,Urban Cinefile,1,,2010-02-06,"Uma Thurman as Medusa, the gorgon with a coiff...",46.0000
2,m/0814255,,0,FILMINK (Australia),1,,2010-02-09,With a top-notch cast and dazzling special eff...,
3,m/0814255,Ben McEachen,0,Sunday Mail (Australia),1,3.5/5,2010-02-09,Whether audiences will get behind The Lightnin...,7.0000
4,m/0814255,Ethan Alter,1,Hollywood Reporter,0,,2010-02-10,What's really lacking in The Lightning Thief i...,25.0000
...,...,...,...,...,...,...,...,...,...
49995,m/1110242-collateral_damage,Chris Hewitt,0,St. Paul Pioneer Press,0,,2002-02-07,The smarter an Arnold Schwarzenegger movie tri...,143.0000
49996,m/1110242-collateral_damage,Nick Carter,0,Milwaukee Journal Sentinel,0,,2002-02-07,A relative letdown.,2.0000
49997,m/1110242-collateral_damage,Gary Dowell,1,Dallas Morning News,0,,2002-02-07,The movie straddles the fence between escapism...,10.0000
49998,m/1110242-collateral_damage,Walter Chaw,0,Film Freak Central,0,0/4,2002-02-07,Enough similarities to Gymkata and Howie Long'...,167.0000


In [5]:
# critic_value_counts = df_rotten_tomatoes_critic_reviews['critic_name'].value_counts()
# # only include critics that are more than 1 standard deviations from the mean
# critic_that_review_alot = critic_value_counts[~(critic_value_counts < critic_value_counts.mean() + 1 * critic_value_counts.std())]
# critic_that_review_alot
# # filter df_rotten_tomatoes_critic_reviews to only include critics that review alot
# df_rotten_tomatoes_big_critic_reviews = df_rotten_tomatoes_critic_reviews[df_rotten_tomatoes_critic_reviews['critic_name'].isin(critic_that_review_alot.index)]
# df_rotten_tomatoes_big_critic_reviews

In [6]:
df_rotten_tomatoes_critic_reviews.info()
# df_rotten_tomatoes_critic_reviews.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   rotten_tomatoes_link       50000 non-null  object        
 1   critic_name                48068 non-null  object        
 2   top_critic                 50000 non-null  int64         
 3   publisher_name             50000 non-null  object        
 4   review_type                50000 non-null  int64         
 5   review_score               36409 non-null  object        
 6   review_date                50000 non-null  datetime64[ns]
 7   review_content             41195 non-null  object        
 8   tomatometer_critics_count  48068 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(2), object(5)
memory usage: 3.4+ MB


## Feature Engineering

In [7]:
# # portion of critics that are top critics
# df_rotten_tomatoes_critic_reviews_grouped = df_rotten_tomatoes_critic_reviews.groupby('rotten_tomatoes_link')['top_critic'].sum().reset_index()

In [None]:
# scale tomatometer_top_critics_count, tomatometer_fresh_critics_count, tomatometer_rotten_critics_count by tomatometer_critics_count
df_rotten_tomatoes_critic_reviews_grouped = df_rotten_tomatoes_critic_reviews.groupby('rotten_tomatoes_link')['tomatometer_critics_count'].count().reset_index()
df_rotten_tomatoes_critic_reviews_grouped

Unnamed: 0,rotten_tomatoes_link,tomatometer_critics_count
0,m/0814255,146
1,m/0878835,141
2,m/10,23
3,m/1000013-12_angry_men,52
4,m/1000079-20000_leagues_under_the_sea,25
...,...,...
1361,m/1109942-big_trouble,109
1362,m/1110008-sidewalks_of_new_york,90
1363,m/1110048-pornographer,7
1364,m/1110236-wash,45


#### Target Encoding for genres, directors, authors, actors, production_company

In [9]:
# Apply weighted mean, k-fold target encoding to genres
df_rotten_tomatoes_movies_exploded = df_rotten_tomatoes_movies.explode('genres').copy()
df_rotten_tomatoes_movies_exploded['genre_target_score'] = weighted_mean_k_fold_target_encoding(
    df=df_rotten_tomatoes_movies_exploded,  # Explode genres to rows
    encode_col='genres',
    target_col='tomatometer_rating',
    k=5,
    smoothing=10
)

# Aggregate back with mean of genre_target_score
df_rotten_tomatoes_genre_grouped = df_rotten_tomatoes_movies_exploded.groupby('rotten_tomatoes_link').agg(
    {'genre_target_score': 'mean'}  # , 'genres': lambda x: x.mode()
).reset_index()
df_rotten_tomatoes_genre_grouped.head(2)

Unnamed: 0,rotten_tomatoes_link,genre_target_score
0,m/0814255,57.1677
1,m/0878835,55.2458


In [10]:
df_rotten_tomatoes_movies_exploded.isnull().sum()

rotten_tomatoes_link                   0
content_rating                         0
genres                                19
directors                            393
authors                             3267
actors                               647
original_release_date               2192
streaming_release_date               733
runtime                              562
production_company                  1005
tomatometer_status                     0
tomatometer_rating                     0
tomatometer_count                      0
audience_status                      706
audience_rating                      424
audience_count                       426
tomatometer_top_critics_count          0
tomatometer_fresh_critics_count        0
tomatometer_rotten_critics_count       0
original_release_month              2192
original_release_season             2192
original_to_streaming_days          2709
genre_target_score                     0
dtype: int64

In [11]:
df_rotten_tomatoes_genre_grouped.describe()

Unnamed: 0,genre_target_score
count,17668.0
mean,61.9409
std,7.0615
min,51.1958
25%,56.7858
50%,59.8296
75%,64.4646
max,81.1163


In [12]:
# Apply weighted mean, k-fold target encoding to directors
df_rotten_tomatoes_movies_exploded = df_rotten_tomatoes_movies.explode('directors').copy()
df_rotten_tomatoes_movies_exploded['director_target_score'] = weighted_mean_k_fold_target_encoding(
    df=df_rotten_tomatoes_movies_exploded,  # Explode directors to rows
    encode_col='directors',
    target_col='tomatometer_rating',
    k=5,
    smoothing=10
)

# Aggregate back with mean of director_target_score
df_rotten_tomatoes_director_grouped = df_rotten_tomatoes_movies_exploded.groupby('rotten_tomatoes_link').agg(
    {'director_target_score': 'mean'}  # , 'directors': lambda x: x.mode()
).reset_index()
df_rotten_tomatoes_director_grouped.head(2)

Unnamed: 0,rotten_tomatoes_link,director_target_score
0,m/0814255,57.9249
1,m/0878835,64.6793


In [13]:
df_rotten_tomatoes_director_grouped.describe()

Unnamed: 0,director_target_score
count,17668.0
mean,61.2532
std,3.3176
min,46.0832
25%,60.5481
50%,60.9328
75%,62.6464
max,80.9342


In [14]:
# Apply weighted mean, k-fold target encoding to authors
df_rotten_tomatoes_movies_exploded = df_rotten_tomatoes_movies.explode('authors').copy()
df_rotten_tomatoes_movies_exploded['author_target_score'] = weighted_mean_k_fold_target_encoding(
    df=df_rotten_tomatoes_movies_exploded,  # Explode authors to rows
    encode_col='authors',
    target_col='tomatometer_rating',
    k=5,
    smoothing=10
)

# Aggregate back with mean of author_target_score
df_rotten_tomatoes_author_grouped = df_rotten_tomatoes_movies_exploded.groupby('rotten_tomatoes_link').agg(
    {'author_target_score': 'mean'}  # , 'authors': lambda x: x.mode()
).reset_index()
df_rotten_tomatoes_author_grouped.head(2)

Unnamed: 0,rotten_tomatoes_link,author_target_score
0,m/0814255,60.7258
1,m/0878835,62.3562


In [15]:
df_rotten_tomatoes_author_grouped.describe()

Unnamed: 0,author_target_score
count,17668.0
mean,62.2132
std,4.3505
min,44.8933
25%,60.7258
50%,60.8455
75%,61.1515
max,75.3336


In [16]:
# Apply weighted mean, k-fold target encoding to actors
df_rotten_tomatoes_movies_exploded = df_rotten_tomatoes_movies.explode('actors').copy()
df_rotten_tomatoes_movies_exploded['actor_target_score'] = weighted_mean_k_fold_target_encoding(
    df=df_rotten_tomatoes_movies_exploded,  # Explode actors to rows
    encode_col='actors',
    target_col='tomatometer_rating',
    k=5,
    smoothing=10
)

# Aggregate back with mean of actor_target_score
df_rotten_tomatoes_actor_grouped = df_rotten_tomatoes_movies_exploded.groupby('rotten_tomatoes_link').agg(
    {'actor_target_score': 'mean'}  # , 'actors': lambda x: x.mode()
).reset_index()
df_rotten_tomatoes_actor_grouped.head(2)

Unnamed: 0,rotten_tomatoes_link,actor_target_score
0,m/0814255,60.8991
1,m/0878835,60.7036


In [17]:
df_rotten_tomatoes_actor_grouped.describe()

Unnamed: 0,actor_target_score
count,17668.0
mean,61.2905
std,2.7902
min,60.1841
25%,60.8991
50%,60.9276
75%,60.9341
max,81.0866


In [18]:
# Apply weighted mean, k-fold target encoding to production_company
df_rotten_tomatoes_movies_exploded = df_rotten_tomatoes_movies.explode('production_company').copy()
df_rotten_tomatoes_movies_exploded['production_company_target_score'] = weighted_mean_k_fold_target_encoding(
    df=df_rotten_tomatoes_movies_exploded,  # Explode production_company to rows
    encode_col='production_company',
    target_col='tomatometer_rating',
    k=5,
    smoothing=10
)

# Aggregate back with mean of production_company_target_score
df_rotten_tomatoes_production_company_grouped = df_rotten_tomatoes_movies_exploded.groupby('rotten_tomatoes_link').agg(
    {'production_company_target_score': 'mean'}  # , 'production_company': lambda x: x.mode()
).reset_index()
df_rotten_tomatoes_production_company_grouped.head(2)

Unnamed: 0,rotten_tomatoes_link,production_company_target_score
0,m/0814255,53.4335
1,m/0878835,75.0383


In [19]:
df_rotten_tomatoes_production_company_grouped.describe()

Unnamed: 0,production_company_target_score
count,17668.0
mean,60.4178
std,8.1447
min,34.2483
25%,56.1854
50%,60.7356
75%,64.743
max,91.7091


#### Merge all the data into one dataframe

In [20]:
# genres, directors, authors, actors, production_company
df_movies = pd.merge(df_rotten_tomatoes_movies, df_rotten_tomatoes_genre_grouped, on='rotten_tomatoes_link', how='outer', validate='one_to_one')
df_movies = pd.merge(df_movies, df_rotten_tomatoes_director_grouped, on='rotten_tomatoes_link', how='outer', validate='one_to_one')
df_movies = pd.merge(df_movies, df_rotten_tomatoes_author_grouped, on='rotten_tomatoes_link', how='outer', validate='one_to_one')
df_movies = pd.merge(df_movies, df_rotten_tomatoes_actor_grouped, on='rotten_tomatoes_link', how='outer', validate='one_to_one')
df_movies = pd.merge(df_movies, df_rotten_tomatoes_production_company_grouped, on='rotten_tomatoes_link', how='outer', validate='one_to_one')
df_movies = pd.merge(df_movies, df_rotten_tomatoes_critic_reviews_grouped, on='rotten_tomatoes_link', how='outer', validate='one_to_one')
df_movies['tomatometer_top_critics_count'] = df_movies['tomatometer_top_critics_count'] / df_movies['tomatometer_critics_count']
df_movies['tomatometer_fresh_critics_count'] = df_movies['tomatometer_fresh_critics_count'] / df_movies['tomatometer_critics_count']
df_movies['tomatometer_rotten_critics_count'] = df_movies['tomatometer_rotten_critics_count'] / df_movies['tomatometer_critics_count']
# bin 'audience_count' into bins based on a log scale
bins = [0, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000]
df_movies['audience_count'] = pd.cut(df_movies['audience_count'], bins=bins, labels=False, duplicates='drop')
# ratings with more critics should be believed more than ratings with fewer critics
df_movies['audience_rating_weighted'] = weighted_average_encoding(df_movies, 'audience_rating', ['audience_count',], 1)
# on-hot encode the following variables: content_rating, top_critic
df_movies = pd.get_dummies(df_movies, columns=['content_rating',], drop_first=True, dtype=int, dummy_na=False)
# drop columns
df_movies.drop(columns=['rotten_tomatoes_link', 'genres', 'directors', 'authors', 'actors', 'production_company', 'tomatometer_fresh_critics_count', 'tomatometer_rotten_critics_count', 'tomatometer_critics_count', 'original_release_date', 'original_release_month', 'audience_status', 'audience_count', 'audience_rating', 'streaming_release_date', 'tomatometer_rating'], inplace=True)
# drop rows with missing values in genre_target_score
df_movies.dropna(subset=['genre_target_score', 'tomatometer_top_critics_count', 'original_to_streaming_days', 'original_release_season', 'runtime'], inplace=True)
print(f'shape: {df_movies.shape}')
df_movies.head()

shape: (1311, 17)


Unnamed: 0,runtime,tomatometer_status,tomatometer_count,tomatometer_top_critics_count,original_release_season,original_to_streaming_days,genre_target_score,director_target_score,author_target_score,actor_target_score,production_company_target_score,audience_rating_weighted,content_rating_NC17,content_rating_NR,content_rating_PG,content_rating_PG-13,content_rating_R
0,119.0,0.0,149.0,0.2945,1.0,2112.0,57.1677,57.9249,60.7258,60.8991,53.4335,65.1789,0,0,1,0,0
1,90.0,2.0,142.0,0.3121,2.0,858.0,55.2458,64.6793,62.3562,60.7036,75.0383,62.2991,0,0,0,0,1
2,122.0,1.0,24.0,0.087,5.0,12711.0,58.0453,61.0585,61.1604,60.7036,60.8109,62.2991,0,0,0,0,1
3,95.0,2.0,54.0,0.1154,2.0,21825.0,70.5514,64.6683,65.0474,60.9341,90.1416,65.1789,0,1,0,0,0
4,127.0,1.0,27.0,0.2,1.0,22806.0,57.5283,58.9589,60.7258,60.9341,57.373,62.2991,0,0,0,0,0


In [21]:
# look for data leakage
df_movies.corr()['tomatometer_status'].sort_values(ascending=False)

tomatometer_status                 1.0000
director_target_score              0.3412
original_to_streaming_days         0.3197
genre_target_score                 0.2962
content_rating_NR                  0.2032
production_company_target_score    0.1750
tomatometer_count                  0.1491
runtime                            0.1375
original_release_season            0.1025
audience_rating_weighted           0.0947
author_target_score                0.0836
content_rating_NC17                0.0399
content_rating_PG                  0.0167
actor_target_score                 0.0036
tomatometer_top_critics_count     -0.0862
content_rating_R                  -0.1283
content_rating_PG-13              -0.1391
Name: tomatometer_status, dtype: float64

In [22]:
# look for multicollinearity
df_movies.corr().unstack().sort_values(ascending=False).drop_duplicates().head(50)

content_rating_R                 content_rating_R                  1.0000
content_rating_NR                original_to_streaming_days        0.5127
tomatometer_count                audience_rating_weighted          0.4866
original_to_streaming_days       director_target_score             0.4677
content_rating_NR                genre_target_score                0.4438
original_to_streaming_days       genre_target_score                0.3654
genre_target_score               director_target_score             0.3544
tomatometer_status               director_target_score             0.3412
director_target_score            content_rating_NR                 0.3284
tomatometer_status               original_to_streaming_days        0.3197
genre_target_score               author_target_score               0.3011
tomatometer_status               genre_target_score                0.2962
production_company_target_score  genre_target_score                0.2662
content_rating_NR                produ

# Build a Classifier with target variable: tomatometer_status

In [23]:
# Splitting features and target
X = df_movies.drop(columns=['tomatometer_status'])
y = df_movies['tomatometer_status']

# Split the dataset into train, validation, and test sets
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, stratify=y_temp, random_state=42)

In [24]:
# Define a list of classifiers to evaluate
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=500, random_state=42), # , class_weight='balanced'
    'Support Vector Machine': SVC(random_state=42), # class_weight='balanced', 
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42), # , class_weight='balanced'
    'XGBoost': XGBClassifier(eval_metric='mlogloss', random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42), # class_weight='balanced'
}

In [25]:
# Create pipelines for each classifier
pipelines = {}
for name, classifier in classifiers.items():
    if name in ['Logistic Regression', 'Support Vector Machine']:
        # Use scaling and SMOTE for classifiers that benefit from scaled data
        pipelines[name] = ImbPipeline(steps=[
            ('scaler', StandardScaler()),
            ('smote', SMOTE(random_state=42)),
            ('classifier', classifier),
        ])
    elif name in ['Random Forest', 'XGBoost', 'Decision Tree']:
        # For tree-based classifiers, skip scaling but use SMOTE
        pipelines[name] = ImbPipeline(steps=[
            ('smote', SMOTE(random_state=42)),
            ('classifier', classifier),
        ])
    else:
        raise ValueError(f'Unknown classifier: {name}')

In [26]:
# Evaluate each pipeline
for name, pipeline in pipelines.items():
    print(f"\n=== Evaluating {name} ===")
    
    # Fit the pipeline on the training data
    pipeline.fit(X_train, y_train)
    
    # Validation predictions
    y_pred = pipeline.predict(X_val)
    
    # Generate classification report
    print(classification_report(y_val, y_pred))

# Final test on the best model (replace with chosen classifier)
# Example: Using Random Forest
best_pipeline = pipelines['Random Forest']
best_pipeline.fit(X_train, y_train)
y_test_pred = best_pipeline.predict(X_test)

print("\n=== Test Set Performance (Random Forest) ===")
print(classification_report(y_test, y_test_pred))


=== Evaluating Logistic Regression ===


              precision    recall  f1-score   support

         0.0       0.63      0.73      0.68       108
         1.0       0.78      0.58      0.67       113
         2.0       0.45      0.56      0.50        41

    accuracy                           0.64       262
   macro avg       0.62      0.63      0.61       262
weighted avg       0.66      0.64      0.64       262


=== Evaluating Support Vector Machine ===
              precision    recall  f1-score   support

         0.0       0.70      0.78      0.74       108
         1.0       0.84      0.67      0.75       113
         2.0       0.59      0.73      0.65        41

    accuracy                           0.73       262
   macro avg       0.71      0.73      0.71       262
weighted avg       0.74      0.73      0.73       262


=== Evaluating Random Forest ===
              precision    recall  f1-score   support

         0.0       0.75      0.81      0.78       108
         1.0       0.90      0.72      0.80       11