In [1]:
# Import Pandas
import pandas as pd

# Import preprocessing methods and train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Import Counter
from collections import Counter

# Import Sampling Methods
from imblearn.over_sampling import RandomOverSampler

# Import Machine Learning Models
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
import tensorflow as tf

# Import Methods for Metric Reporting
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

In [2]:
# Import datetime
from datetime import datetime, date

# Set start time to current time
start_time = datetime.now()

## Inspecting the Data Before Preprocessing

In [3]:
# Load and inspect the merged_spotify_songs.csv dataset
spotify_df = pd.read_csv("../Resources/merged_spotify_songs.csv")
spotify_df.head(10)

Unnamed: 0,id,name,artists,release_date,year,duration_ms,acousticness,danceability,energy,explicit,...,key,liveness,loudness,loudness_scaled,mode,popularity,speechiness,tempo,tempo_scaled,valence
0,02GDntOXexBFUvSgaXLPkd,Morceaux de fantaisie Op. 3: No. 2 Prélude in ...,['Sergei Rachmaninoff'],1921-01-01,1921,218773,0.993,0,0.088,0,...,1,0.363,-21.091,0.609334,0,0.02,0.0456,92.867,0.380461,0.0731
1,08zfJvRLp7pjAb94MA9JmF,Il Etait Syndiqué,['Fortugé'],1921-01-01,1921,196560,0.982,1,0.257,0,...,8,0.504,-16.415,0.682562,1,0.0,0.399,109.378,0.448103,0.771
2,0BMkRpQtDoKjcgzCpnqLNa,Dans La Vie Faut Pas S'en Faire,['Maurice Chevalier'],1921-01-01,1921,147133,0.995,0,0.26,0,...,9,0.258,-16.894,0.675061,1,0.0,0.0557,85.146,0.348829,0.826
3,0eQsdik7GTEy7M3UytCbSN,Morceaux de fantaisie Op. 3: No. 2 Prélude in ...,['Sergei Rachmaninoff'],1921-01-01,1921,218773,0.993,0,0.088,0,...,1,0.363,-21.091,0.609334,0,0.0,0.0456,92.867,0.380461,0.0731
4,0H3k2CvJvHULnWChlbeFgx,La Vipère,['Georgel'],1921-01-01,1921,190800,0.99,0,0.363,0,...,5,0.292,-12.562,0.742902,0,0.0,0.0546,174.532,0.715028,0.493
5,0i7MdVu0tNEyUdgpCBilKe,Je M'donne,['Maurice Chevalier'],1921-01-01,1921,181733,0.996,1,0.274,0,...,2,0.302,-14.001,0.720366,1,0.0,0.041,79.218,0.324543,0.678
6,0LcXzABeA84EgudqpNUN1I,Ud Taksimi,['Mehmet Kemiksiz'],1921-01-01,1921,184973,0.912,0,0.42,0,...,8,0.108,-10.766,0.771028,0,0.0,0.114,70.758,0.289884,0.212
7,0NFeJgmTAV1kDfzSQNK41Z,10 Préludes Op. 23: No. 5 in G Minor. Alla marcia,['Sergei Rachmaninoff' 'Ruth Laredo'],1921-01-01,1921,221013,0.989,0,0.171,0,...,7,0.116,-20.476,0.618965,0,0.03,0.0319,107.698,0.441221,0.282
8,0Nk5f07H3JaEunGrYfbqHM,Come Back To Erin,['Phil Regan'],1921-01-01,1921,186467,0.957,0,0.212,0,...,2,0.236,-13.3,0.731344,1,0.01,0.0358,85.726,0.351205,0.218
9,0osXBirvQzPRfKSUDzHPCv,Korkma Sönmez,['Mehmet Kemiksiz'],1921-01-01,1921,155063,0.388,1,0.698,0,...,4,0.0768,-8.184,0.811463,0,0.0,0.0421,133.951,0.548775,0.723


In [4]:
# Find the dimensions of the dataset
spotify_df.shape

(169797, 21)

In [5]:
# Generate descriptive statistics
spotify_df.describe()

Unnamed: 0,year,duration_ms,acousticness,danceability,energy,explicit,instrumentalness,key,liveness,loudness,loudness_scaled,mode,popularity,speechiness,tempo,tempo_scaled,valence
count,169797.0,169797.0,169797.0,169797.0,169797.0,169797.0,169797.0,169797.0,169797.0,169797.0,169797.0,169797.0,169797.0,169797.0,169797.0,169797.0,169797.0
mean,1977.221712,231407.8,0.493309,0.601047,0.488541,0.084895,0.161962,5.200551,0.206696,-11.370476,0.761562,0.70857,0.31555,0.094066,116.946318,0.47911,0.532034
std,25.599875,121339.9,0.376619,0.489684,0.267381,0.278727,0.309349,3.515181,0.176796,5.667187,0.088751,0.454423,0.215861,0.149961,30.728301,0.125889,0.262407
min,1921.0,5108.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1957.0,171040.0,0.0945,0.0,0.263,0.0,0.0,2.0,0.0984,-14.47,0.713022,0.0,0.12,0.0349,93.512,0.383103,0.322
50%,1978.0,208600.0,0.493,1.0,0.481,0.0,0.000204,5.0,0.135,-10.474,0.775601,1.0,0.33,0.045,114.773,0.470206,0.544
75%,1999.0,262960.0,0.888,1.0,0.71,0.0,0.0868,8.0,0.263,-7.118,0.828158,1.0,0.48,0.0754,135.708,0.555973,0.749
max,2020.0,5403500.0,0.996,1.0,1.0,1.0,1.0,11.0,1.0,3.855,1.0,1.0,1.0,0.969,244.091,1.0,1.0


In [6]:
# Confirm data types
spotify_df.dtypes

id                   object
name                 object
artists              object
release_date         object
year                  int64
duration_ms           int64
acousticness        float64
danceability          int64
energy              float64
explicit              int64
instrumentalness    float64
key                   int64
liveness            float64
loudness            float64
loudness_scaled     float64
mode                  int64
popularity          float64
speechiness         float64
tempo               float64
tempo_scaled        float64
valence             float64
dtype: object

In [7]:
# Find null values
for column in spotify_df.columns:
    print(f"Column {column} has {spotify_df[column].isnull().sum()} null values")

Column id has 0 null values
Column name has 0 null values
Column artists has 0 null values
Column release_date has 0 null values
Column year has 0 null values
Column duration_ms has 0 null values
Column acousticness has 0 null values
Column danceability has 0 null values
Column energy has 0 null values
Column explicit has 0 null values
Column instrumentalness has 0 null values
Column key has 0 null values
Column liveness has 0 null values
Column loudness has 0 null values
Column loudness_scaled has 0 null values
Column mode has 0 null values
Column popularity has 0 null values
Column speechiness has 0 null values
Column tempo has 0 null values
Column tempo_scaled has 0 null values
Column valence has 0 null values


In [8]:
# Check for duplicates
print(f"Duplicated entries: {spotify_df.duplicated().sum()}")

Duplicated entries: 0


In [9]:
# Create a new DataFrame that holds the string columns.
song_names_df = spotify_df[["id", "name", "artists", "release_date", "year", "duration_ms"]].copy()
song_names_df 

Unnamed: 0,id,name,artists,release_date,year,duration_ms
0,02GDntOXexBFUvSgaXLPkd,Morceaux de fantaisie Op. 3: No. 2 Prélude in ...,['Sergei Rachmaninoff'],1921-01-01,1921,218773
1,08zfJvRLp7pjAb94MA9JmF,Il Etait Syndiqué,['Fortugé'],1921-01-01,1921,196560
2,0BMkRpQtDoKjcgzCpnqLNa,Dans La Vie Faut Pas S'en Faire,['Maurice Chevalier'],1921-01-01,1921,147133
3,0eQsdik7GTEy7M3UytCbSN,Morceaux de fantaisie Op. 3: No. 2 Prélude in ...,['Sergei Rachmaninoff'],1921-01-01,1921,218773
4,0H3k2CvJvHULnWChlbeFgx,La Vipère,['Georgel'],1921-01-01,1921,190800
...,...,...,...,...,...,...
169792,3xuUW0IwM2VbVgZSbVORzR,"Ik Vaari Aa (From 'Raabta'"")""",['Arijit Singh'],2020-06-16,2020,274687
169793,4IJOll4wTwDrv0HxGLvWjb,Born This Way,['Lady Gaga'],2020-06-16,2020,261083
169794,2r6agJaA9OsWpQk6cIsy17,Everybody's Gotta Learn Sometime,['G-Eazy'],2020-06-17,2020,177657
169795,345aCD2ReEBsMqKE392DKM,Everybody's Gotta Learn Sometime,['G-Eazy'],2020-06-17,2020,177657


In [10]:
# Drop non numberical and repeat columns from main DataFrame
spotify_df = spotify_df.drop(columns=["id", "name", "artists", "loudness_scaled", "tempo_scaled"])
spotify_df.head()

Unnamed: 0,release_date,year,duration_ms,acousticness,danceability,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence
0,1921-01-01,1921,218773,0.993,0,0.088,0,0.527,1,0.363,-21.091,0,0.02,0.0456,92.867,0.0731
1,1921-01-01,1921,196560,0.982,1,0.257,0,0.0,8,0.504,-16.415,1,0.0,0.399,109.378,0.771
2,1921-01-01,1921,147133,0.995,0,0.26,0,0.0,9,0.258,-16.894,1,0.0,0.0557,85.146,0.826
3,1921-01-01,1921,218773,0.993,0,0.088,0,0.527,1,0.363,-21.091,0,0.0,0.0456,92.867,0.0731
4,1921-01-01,1921,190800,0.99,0,0.363,0,0.0,5,0.292,-12.562,0,0.0,0.0546,174.532,0.493


## Feature Engineering

In [11]:
# Extract month from release_date
spotify_df["release_date"] = pd.to_datetime(spotify_df["release_date"])
spotify_df["month"] = spotify_df["release_date"].dt.month
spotify_df = spotify_df.drop(columns=["release_date"])
spotify_df.head()

Unnamed: 0,year,duration_ms,acousticness,danceability,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence,month
0,1921,218773,0.993,0,0.088,0,0.527,1,0.363,-21.091,0,0.02,0.0456,92.867,0.0731,1
1,1921,196560,0.982,1,0.257,0,0.0,8,0.504,-16.415,1,0.0,0.399,109.378,0.771,1
2,1921,147133,0.995,0,0.26,0,0.0,9,0.258,-16.894,1,0.0,0.0557,85.146,0.826,1
3,1921,218773,0.993,0,0.088,0,0.527,1,0.363,-21.091,0,0.0,0.0456,92.867,0.0731,1
4,1921,190800,0.99,0,0.363,0,0.0,5,0.292,-12.562,0,0.0,0.0546,174.532,0.493,1


In [12]:
# Removing outliers in duration_ms with Percentiles - remove top 5% and bottom 1% of duration
upper_lim = spotify_df['duration_ms'].quantile(.95)
lower_lim = spotify_df['duration_ms'].quantile(.01)

spotify_df = spotify_df[(spotify_df['duration_ms'] < upper_lim) & (spotify_df['duration_ms'] > lower_lim)]

## Data Preprocessing for Machine Learning

In [13]:
# key column represents the key the track is in:
# i.e 0 = C, 1 = C#/D♭, 2 = D, 3 = D#/E♭, ... , 11 = B
# Therefore, key column should be encoded

# Create OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit the encoder and produce encoded Dataframe
encode_df = pd.DataFrame(enc.fit_transform(spotify_df.key.values.reshape(-1,1)))

# Rename encoded columns
encode_df.columns = enc.get_feature_names_out(['key'])
encode_df.head(10)

Unnamed: 0,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Merge the two DataFrames together and drop the key column
spotify_df = spotify_df.merge(encode_df,left_index=True,right_index=True).drop(columns=["key"])
spotify_df.head(10)

Unnamed: 0,year,duration_ms,acousticness,danceability,energy,explicit,instrumentalness,liveness,loudness,mode,...,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11
0,1921,218773,0.993,0,0.088,0,0.527,0.363,-21.091,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1921,196560,0.982,1,0.257,0,0.0,0.504,-16.415,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1921,147133,0.995,0,0.26,0,0.0,0.258,-16.894,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1921,218773,0.993,0,0.088,0,0.527,0.363,-21.091,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1921,190800,0.99,0,0.363,0,0.0,0.292,-12.562,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1921,181733,0.996,1,0.274,0,0.0,0.302,-14.001,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1921,184973,0.912,0,0.42,0,0.89,0.108,-10.766,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7,1921,221013,0.989,0,0.171,0,0.82,0.116,-20.476,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8,1921,186467,0.957,0,0.212,0,0.000222,0.236,-13.3,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1921,155063,0.388,1,0.698,0,2e-06,0.0768,-8.184,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Encoding the month column

# Fit the encoder and produce encoded Dataframe
encode_df = pd.DataFrame(enc.fit_transform(spotify_df.month.values.reshape(-1,1)))

# Rename encoded columns
encode_df.columns = enc.get_feature_names_out(['month'])
encode_df.head(10)

Unnamed: 0,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Merge the two DataFrames together and drop the month column
spotify_df = spotify_df.merge(encode_df,left_index=True,right_index=True).drop(columns=["month"])
spotify_df.head(10)

Unnamed: 0,year,duration_ms,acousticness,danceability,energy,explicit,instrumentalness,liveness,loudness,mode,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,1921,218773,0.993,0,0.088,0,0.527,0.363,-21.091,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1921,196560,0.982,1,0.257,0,0.0,0.504,-16.415,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1921,147133,0.995,0,0.26,0,0.0,0.258,-16.894,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1921,218773,0.993,0,0.088,0,0.527,0.363,-21.091,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1921,190800,0.99,0,0.363,0,0.0,0.292,-12.562,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1921,181733,0.996,1,0.274,0,0.0,0.302,-14.001,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1921,184973,0.912,0,0.42,0,0.89,0.108,-10.766,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1921,221013,0.989,0,0.171,0,0.82,0.116,-20.476,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1921,186467,0.957,0,0.212,0,0.000222,0.236,-13.3,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1921,155063,0.388,1,0.698,0,2e-06,0.0768,-8.184,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
spotify_df.columns

Index(['year', 'duration_ms', 'acousticness', 'danceability', 'energy',
       'explicit', 'instrumentalness', 'liveness', 'loudness', 'mode',
       'popularity', 'speechiness', 'tempo', 'valence', 'key_0', 'key_1',
       'key_2', 'key_3', 'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9',
       'key_10', 'key_11', 'month_1', 'month_2', 'month_3', 'month_4',
       'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10',
       'month_11', 'month_12'],
      dtype='object')

## Split the data into input (X) and output (y)

In [18]:
# Separate the dataset into features (X) and target (y)
y = spotify_df["danceability"]
X = spotify_df.drop(columns=["danceability"])

In [19]:
# Check the balance of our target values
y.value_counts()

1    83312
0    56299
Name: danceability, dtype: int64

## Split the Data into Training and Testing

In [20]:
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y, train_size=0.75)

## Scaling the data

In [21]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the Standard Scaler
X_scaler = scaler.fit(X_train)

# Scaling the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [22]:
X_train_scaled[0]

array([ 0.89416493,  0.04941937, -0.98016795, -0.09715232, -0.2447985 ,
       -0.52307173, -0.56263261,  0.25661048,  0.62143512,  0.35011613,
       -0.40751599, -0.34450061, -0.62435278, -0.37980216, -0.27963683,
       -0.3535325 , -0.2113206 , -0.28971322,  3.07017399, -0.22829729,
       -0.37666799, -0.25745635, -0.34421508, -0.27423972, -0.25794102,
        0.91712425, -0.17159719, -0.19983904, -0.18689549, -0.19983904,
       -0.21487166, -0.19395085, -0.19511421, -0.22452127, -0.22629905,
       -0.23441401, -0.23382397])

In [23]:
# Determine the shape of our training and testing sets.
print(X_train_scaled.shape)
print(X_test_scaled.shape)
print(y_train.shape)
print(y_test.shape)

(104708, 37)
(34903, 37)
(104708,)
(34903,)


## Machine Learning Models:
1. Logistic Regression
2. Decision Tree
3. Balanced Random Forest Classifier
4. Easy Ensemble AdaBoost Classifier
5. Deep Neural Network


## 1. Naive Random Oversampling + Logistic Regression

In [24]:
# Examine Data with Counter: Data needs to be resampled for Logistic Regression
Counter(y_train)

Counter({1: 62484, 0: 42224})

### Resample data with RandomOverSampler

In [25]:
# Resample the data with RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

Counter({1: 62484, 0: 62484})

### Define, Train, and Make Predictions for a Logistic Regression Model

In [26]:
# Define the logistic regression model
model = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

# Train the Logistic Regression model with the resampled data
model.fit(X_resampled, y_resampled)

# Create Make
y_pred = model.predict(X_test_scaled)

### Create the Confusion Matrix

In [27]:
# Create a Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

### Validate the Model

In [28]:
# Calculating the accuracy score.
acc_score = balanced_accuracy_score(y_test, y_pred)

In [29]:
# Displaying results
print("Model: Logistic Regression\n")
print("Confusion Matrix")
display(cm_df)
print(f"Balanced Accuracy Score: {acc_score}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Model: Logistic Regression

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,10756,3319
Actual 1,4944,15884


Balanced Accuracy Score: 0.7634095310282203

Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.76      0.72     14075
           1       0.83      0.76      0.79     20828

    accuracy                           0.76     34903
   macro avg       0.76      0.76      0.76     34903
weighted avg       0.77      0.76      0.76     34903



## 2. Decision Tree

### Define, Train, and Make Predictions for a Decision Tree Model

In [30]:
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier(random_state=1)

# Fitting the model.
model = model.fit(X_train_scaled, y_train)

# Making predictions using the testing data.
y_pred = model.predict(X_test_scaled)

### Create the Confusion Matrix

In [31]:
# Create a Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

### Validate the Model

In [32]:
# Calculating the accuracy score.
acc_score = balanced_accuracy_score(y_test, y_pred)

In [33]:
# Displaying results
print("Model: Decision Tree \n")
print("Confusion Matrix")
display(cm_df)
print(f"Balanced Accuracy Score: {acc_score}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Model: Decision Tree 

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,9701,4374
Actual 1,4330,16498


Balanced Accuracy Score: 0.7406715068968845

Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.69      0.69     14075
           1       0.79      0.79      0.79     20828

    accuracy                           0.75     34903
   macro avg       0.74      0.74      0.74     34903
weighted avg       0.75      0.75      0.75     34903



##  3. Balanced Random Forest Classifier

### Define, Train, and Make Predictions for a Balanced Random Forest Classifier Model

In [34]:
# Define the Balanced Random Forest Classifier Model
rf_model = BalancedRandomForestClassifier(n_estimators=256, random_state=1)

# Resample the training data with BalancedRandomForestClassifier
rf_model.fit(X_train_scaled, y_train)

# Make prediction
y_pred = rf_model.predict(X_test_scaled)

### Create the Confusion Matrix

In [35]:
# Create a Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

### Validate the Model

In [36]:
# Calculating the accuracy score.
acc_score = balanced_accuracy_score(y_test, y_pred)

In [37]:
# Displaying results
print("Model: Balanced Random Forest Classifier\n")
print("Confusion Matrix")
display(cm_df)
print(f"Balanced Accuracy Score: {acc_score}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Model: Balanced Random Forest Classifier

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,11593,2482
Actual 1,3831,16997


Balanced Accuracy Score: 0.8198619412111241

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.82      0.79     14075
           1       0.87      0.82      0.84     20828

    accuracy                           0.82     34903
   macro avg       0.81      0.82      0.81     34903
weighted avg       0.82      0.82      0.82     34903



##  4. Easy Ensemble AdaBoost Classifier

### Define, Train, and Make Predictions for an Easy Ensemble AdaBoost Classifier Model

In [38]:
# Define the Balanced Random Forest Classifier Model
eec = EasyEnsembleClassifier(n_estimators=50, random_state=1)

# Resample the training data with BalancedRandomForestClassifier
eec.fit(X_train_scaled, y_train)

# Make prediction
y_pred = eec.predict(X_test_scaled)

### Create the Confusion Matrix

In [39]:
# Create a Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

### Validate the Model

In [40]:
# Calculating the accuracy score.
acc_score = balanced_accuracy_score(y_test, y_pred)

In [41]:
# Displaying results
print("Model: Easy Ensemble AdaBoost Classifier\n")
print("Confusion Matrix")
display(cm_df)
print(f"Balanced Accuracy Score: {acc_score}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Model: Easy Ensemble AdaBoost Classifier

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,11119,2956
Actual 1,4202,16626


Balanced Accuracy Score: 0.7941172953064617

Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.79      0.76     14075
           1       0.85      0.80      0.82     20828

    accuracy                           0.79     34903
   macro avg       0.79      0.79      0.79     34903
weighted avg       0.80      0.79      0.80     34903



## 5. Deep Neural Network

### Define a Deep Neural Network Model

In [42]:
# Define the basic neural network model
number_input_features = len(X_train_scaled[0]) # X_train.values[0]
hidden_nodes_layer1 = 120
hidden_nodes_layer2 = 60

nn = tf.keras.models.Sequential()

# First Hidden Layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, activation="relu", input_dim=number_input_features))

# Second Hidden Layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output Layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 120)               4560      
                                                                 
 dense_1 (Dense)             (None, 60)                7260      
                                                                 
 dense_2 (Dense)             (None, 1)                 61        
                                                                 
Total params: 11,881
Trainable params: 11,881
Non-trainable params: 0
_________________________________________________________________


### Compile, Train, and Evaluate the Model

In [43]:
# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
1091/1091 - 1s - loss: 0.4595 - accuracy: 0.7980 - 1s/epoch - 994us/step


In [44]:
# Print the results of the model
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Loss: 0.45948246121406555, Accuracy: 0.7980116605758667


In [45]:
# Set endtime to current time and get duration
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

Duration: 0:12:03.016035
