# ISA 514 Project - SVM Regression Modeling

Joey Endres

## 1 - Setup and Preprocessing

In [35]:
# import the necessary libraries
import pandas as pd
import numpy as np

from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score

In [2]:
# Load the model-ready dataset from the CSV in the working directory and preview the first 10 rows
df = pd.read_csv(filepath_or_buffer="model_ready_dataset.csv")
df.head(n=10)

Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,view_count,...,avg_word_len,unique_words,vocab_richness,Rap_Street_Slang_Topic,Romance_Relationships_Topic,Life_Nostalgia_Topic,Party_Dance_Sensuality_Topic,Love_Emotion_Sentiment_Topic,Loss_Struggle_Reflection_Topic,Energy_Vibes_Epic_Topic
0,0.202,0.759,0.699,0.000131,0.443,-5.745,0.0307,92.96,0.907,1118930000.0,...,4.846395,98,0.30721,0.001245,0.001247,0.001247,0.362314,0.385711,0.001244,0.246992
1,0.0393,0.535,0.505,0.0,0.0923,-8.926,0.245,99.935,0.495,220560700.0,...,5.327024,428,0.450053,0.192193,0.030384,0.000396,0.01585,0.760386,0.000396,0.000396
2,0.542,0.698,0.533,0.0,0.333,-6.246,0.0437,134.001,0.275,87564090.0,...,5.063918,141,0.290722,0.000765,0.418962,0.230767,0.000766,0.347208,0.000765,0.000766
3,0.00364,0.767,0.551,0.0,0.0451,-7.328,0.0616,100.904,0.796,10499470.0,...,4.838269,132,0.300683,0.0009,0.000902,0.000901,0.000906,0.99459,0.0009,0.000901
4,0.175,0.398,0.804,0.0,0.181,-5.559,0.0451,186.752,0.709,21090600.0,...,5.375,117,0.365625,0.001012,0.324886,0.046997,0.624069,0.001012,0.001011,0.001014
5,0.00483,0.395,0.843,0.0,0.0404,-4.476,0.0374,112.423,0.481,56592740.0,...,4.977346,120,0.38835,0.001185,0.329814,0.001187,0.001189,0.188675,0.231365,0.246584
6,0.0129,0.599,0.543,0.00204,0.291,-9.226,0.0302,91.105,0.624,19642730.0,...,5.046012,113,0.346626,0.001032,0.001032,0.001034,0.001031,0.426718,0.001033,0.568119
7,0.00125,0.315,0.715,8e-06,0.0942,-8.072,0.0362,155.925,0.497,54270.0,...,4.754098,76,0.311475,0.002512,0.4039,0.244769,0.002516,0.34127,0.002514,0.00252
8,0.949,0.532,0.0744,1.2e-05,0.106,-16.092,0.0355,117.131,0.125,53335640.0,...,5.192982,118,0.517544,0.001449,0.226179,0.443669,0.00145,0.001451,0.175432,0.15037
9,0.0339,0.877,0.534,1.7e-05,0.0441,-6.178,0.15,108.17,0.89,87851150.0,...,5.069337,282,0.434515,0.492625,0.00049,0.242101,0.027884,0.000489,0.235922,0.000489


In [3]:
# Remove the row-display cap (session-wide pandas option) so that every
# column's dtype is listed rather than truncated, then show the dtypes
pd.options.display.max_rows = None
df.dtypes

acousticness                      float64
danceability                      float64
energy                            float64
instrumentalness                  float64
liveness                          float64
loudness                          float64
speechiness                       float64
tempo                             float64
valence                           float64
view_count                        float64
chart_year                          int64
type_Group                          int64
type_Person                         int64
country_CA                          int64
country_GB                          int64
country_Other                       int64
country_US                          int64
key_C                               int64
key_C_Sharp                         int64
key_D                               int64
key_D_Sharp                         int64
key_E                               int64
key_F                               int64
key_F_Sharp                       

In [4]:
# Separate the target (view_count) from the predictors (every other column)
y = df["view_count"]
X = df.drop(columns=["view_count"])

In [5]:
# Log-transformed copy of the target: log1p(y) = log(1 + y)
y_log = y.pipe(np.log1p)

In [6]:
# 70/30 train-test split on the raw target, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1234,
)

In [7]:
# Second 70/30 split, this time carrying the log-transformed target.
# X_train / X_test are reassigned here; the same X, test_size and random_state
# as the raw-target split are used, so the row partition is the same.
X_train, X_test, y_log_train, y_log_test = train_test_split(
    X,
    y_log,
    test_size=0.3,
    random_state=1234,
)

In [8]:
# Standardize the numeric (non-encoded) predictors and the raw target for SVM regression
## columns to scale = every float column plus a hand-picked set of integer columns;
## all remaining columns are left unscaled
extra_int_cols = ["chart_year", "begin_year", "word_count", "unique_words"]
float_cols = list(X_train.select_dtypes(include = "float").columns)
train_cols_to_scale = [*float_cols, *extra_int_cols]

## fit the predictor scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train[train_cols_to_scale])

X_train_scaled = X_train.copy()
X_train_scaled[train_cols_to_scale] = scaler.transform(X_train[train_cols_to_scale])

X_test_scaled = X_test.copy()
X_test_scaled[train_cols_to_scale] = scaler.transform(X_test[train_cols_to_scale])

## the target gets its own scaler (fit on train, applied to test);
## StandardScaler needs a 2-D column, hence reshape(-1, 1) and ravel() back to 1-D
y_scaler = StandardScaler()
y_train_column = y_train.to_numpy().reshape(-1, 1)
y_test_column = y_test.to_numpy().reshape(-1, 1)
y_train_scaled = y_scaler.fit_transform(y_train_column).ravel()
y_test_scaled = y_scaler.transform(y_test_column).ravel()



In [9]:
# Reassemble the full design matrices from the scaled and the unscaled columns
## columns that were not passed through the scaler
other_columns = X_train.columns.difference(train_cols_to_scale)

## place the scaled block next to the unscaled block, then restore the original column order
train_parts = [X_train_scaled[train_cols_to_scale], X_train[other_columns]]
test_parts = [X_test_scaled[train_cols_to_scale], X_test[other_columns]]

X_train_full = pd.concat(train_parts, axis = "columns").loc[:, X_train.columns]
X_test_full = pd.concat(test_parts, axis = "columns").loc[:, X_test.columns]

X_train_full.head()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,chart_year,...,avg_word_len,unique_words,vocab_richness,Rap_Street_Slang_Topic,Romance_Relationships_Topic,Life_Nostalgia_Topic,Party_Dance_Sensuality_Topic,Love_Emotion_Sentiment_Topic,Loss_Struggle_Reflection_Topic,Energy_Vibes_Epic_Topic
4740,0.239603,0.726038,0.08717,-0.154876,-0.211736,0.49477,1.899535,-1.089475,-0.810134,-0.909986,...,-0.060709,0.604413,-0.137735,-0.419352,1.20964,-0.537217,0.07418,0.484062,-0.381746,-0.439759
2391,0.911247,0.211797,0.46768,-0.033053,-0.877748,-0.236437,-0.656976,0.374362,1.391823,-1.019643,...,-0.663361,-0.726035,-0.612656,-0.712224,-0.051157,-0.294563,0.338329,1.377419,-0.375753,-0.435706
2542,1.058456,0.398794,0.773241,-0.154876,-0.137735,1.349241,-0.457061,-0.906449,1.488709,-0.800329,...,1.078526,1.186485,1.281888,1.553928,-0.539756,-0.758628,-0.52459,0.05074,0.041396,-0.440264
990,-0.756822,-0.102091,1.026914,-0.154876,-0.861467,0.402621,-0.604463,-0.42743,-0.382955,-0.032729,...,-0.240026,-0.654761,-0.113978,-0.711467,-0.535153,-0.02446,0.773071,0.893584,0.291093,-0.434207
5331,-0.785804,0.565755,-1.250381,-0.154629,-0.041533,-1.533303,0.010942,1.003649,-1.717341,1.1735,...,-0.121491,1.210243,1.920919,2.471399,-0.539432,-0.915189,-0.517392,-0.747037,-0.219176,-0.439843


In [10]:
# Preview the assembled test-set predictors (first 5 rows)
X_test_full.head(n=5)

Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,chart_year,...,avg_word_len,unique_words,vocab_richness,Rap_Street_Slang_Topic,Romance_Relationships_Topic,Life_Nostalgia_Topic,Party_Dance_Sensuality_Topic,Love_Emotion_Sentiment_Topic,Loss_Struggle_Reflection_Topic,Energy_Vibes_Epic_Topic
2690,-0.816253,-0.409299,1.309414,-0.144372,0.16567,1.064816,-0.437714,-1.056751,0.797294,-1.1293,...,0.515809,-0.559729,-0.323651,-0.643706,-0.068047,-0.230919,0.747719,-0.572628,-0.379811,2.10647
5166,-0.774671,-0.142161,0.75018,-0.154862,0.291472,0.178033,-0.67448,-0.87287,0.339287,0.076928,...,-0.557078,-0.595366,-0.203729,-0.71211,-0.536125,0.194342,1.768332,-0.876379,-0.375459,1.429035
3654,-0.805539,-0.115447,1.80523,-0.154711,1.505093,0.409402,-0.515101,0.700039,-0.079085,-1.677586,...,0.069657,-0.868583,-0.01507,-0.710985,-0.534461,1.48894,-0.030038,-0.374874,0.707258,-0.433299
885,-0.719559,0.251868,0.225537,-0.154876,-0.374539,0.636783,-0.533526,-0.940335,1.008682,0.515557,...,-0.360235,-0.678519,0.73439,-0.711962,-0.535886,-0.912598,0.834006,1.022486,0.981514,0.41421
3197,-0.72692,1.186851,-0.414412,-0.150739,1.349691,0.068332,0.010942,-0.077978,-1.360624,1.283157,...,0.570427,0.663808,0.351951,2.049594,-0.539457,-0.91521,0.489832,-0.879028,-0.381917,-0.439881


## 2 - Modeling with Support Vector Regression
### First Model

In [11]:
## Search space for the three RBF-SVR hyperparameters: C, epsilon and gamma
param_grid = {
    "C": [0.1, 1, 10, 100],
    "epsilon": [0.01, 0.1, 0.2, 0.5],
    "gamma": ["scale", "auto", 0.01, 0.1, 1],
}

## MSE wrapped as a scorer; greater_is_better=False tells scikit-learn that lower is better
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

## 3-fold cross-validated grid search, run in parallel (n_jobs=-1)
grid = GridSearchCV(
    estimator=SVR(kernel="rbf"),
    param_grid=param_grid,
    scoring=mse_scorer,
    cv=3,
    n_jobs=-1,
)

## fit on the scaled predictors and the standardized target
grid.fit(X_train_full, y_train_scaled)

print("Best parameters: ", grid.best_params_)

Best parameters:  {'C': 10, 'epsilon': 0.5, 'gamma': 0.01}


The best values of `epsilon` and `gamma` both landed on a boundary of their search ranges, so the next iteration needs to update those ranges.

In [12]:
# Take the best model found by the grid search (best_estimator_ is already
# fitted on the training data) and predict on the test set
best_svr = grid.best_estimator_
y_pred_best = best_svr.predict(X_test_full)

# Undo the target standardization so the metrics are in raw view-count units
pred_column = y_pred_best.reshape(-1, 1)
y_pred_unscaled = y_scaler.inverse_transform(pred_column).ravel()

test_column = y_test_scaled.reshape(-1, 1)
y_test_unscaled = y_scaler.inverse_transform(test_column).ravel()

# Test-set metrics
mse_test = mean_squared_error(y_test_unscaled, y_pred_unscaled)
r2_test = r2_score(y_test_unscaled, y_pred_unscaled)
print("Test Set MSE:", mse_test)
print("Final R²:", r2_test)

Test Set MSE: 1.781352684374207e+17
Final R²: 0.07580628880148021


### Second Model

In [13]:
## Second search space for the RBF-SVR hyperparameters (C, epsilon, gamma)
param_grid2 = {
    "C": [1, 10, 25, 50, 100, 250],
    "epsilon": [0.01, 0.05, 0.1, 0.2],
    "gamma": ["scale", 0.001, 0.01, 0.05],
}

## MSE scorer; greater_is_better=False marks it as a loss (lower is better)
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

## 3-fold cross-validated grid search, run in parallel
grid2 = GridSearchCV(
    estimator=SVR(kernel="rbf"),
    param_grid=param_grid2,
    scoring=mse_scorer,
    cv=3,
    n_jobs=-1,
)

## fit on the scaled predictors and the standardized target
grid2.fit(X_train_full, y_train_scaled)

print("Best params:", grid2.best_params_)

  _data = np.array(data, dtype=dtype, copy=copy,


Best params: {'C': 1, 'epsilon': 0.2, 'gamma': 0.05}


In [14]:
# Take the best model from the second grid search (already fitted) and predict on the test set
best_svr2 = grid2.best_estimator_
y_pred_best2 = best_svr2.predict(X_test_full)

# Undo the target standardization so predictions are in raw view-count units
pred_column2 = y_pred_best2.reshape(-1, 1)
y_pred_unscaled2 = y_scaler.inverse_transform(pred_column2).ravel()

# Test-set metrics (y_test_unscaled comes from the first model's evaluation cell)
mse_test = mean_squared_error(y_test_unscaled, y_pred_unscaled2)
r2_test = r2_score(y_test_unscaled, y_pred_unscaled2)
print("Test Set MSE:", mse_test)
print("Final R²:", r2_test)

Test Set MSE: 1.7848372392831366e+17
Final R²: 0.07399844706334058


### Third Model

In [15]:
## Third search space for the RBF-SVR hyperparameters (C, epsilon, gamma)
param_grid3 = {
    "C": [0.01, 0.1, 1, 10, 50, 100],
    "epsilon": [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    "gamma": [0.001, 0.01, 0.05, 0.1, 0.5, "scale"],
}

## MSE scorer; greater_is_better=False marks it as a loss (lower is better)
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

## 3-fold cross-validated grid search, run in parallel
grid3 = GridSearchCV(
    estimator=SVR(kernel="rbf"),
    param_grid=param_grid3,
    scoring=mse_scorer,
    cv=3,
    n_jobs=-1,
)

## fit on the scaled predictors and the standardized target
grid3.fit(X_train_full, y_train_scaled)
print("Best parameters: ", grid3.best_params_)

Best parameters:  {'C': 10, 'epsilon': 0.4, 'gamma': 0.01}


In [16]:
# Take the best model from the third grid search (already fitted) and predict on the test set
best_svr3 = grid3.best_estimator_
y_pred_best3 = best_svr3.predict(X_test_full)

# Undo the target standardization so predictions are in raw view-count units
pred_column3 = y_pred_best3.reshape(-1, 1)
y_pred_unscaled3 = y_scaler.inverse_transform(pred_column3).ravel()

# Test-set metrics (y_test_unscaled comes from the first model's evaluation cell)
mse_test = mean_squared_error(y_test_unscaled, y_pred_unscaled3)
r2_test = r2_score(y_test_unscaled, y_pred_unscaled3)
print("Test Set MSE:", mse_test)
print("Final R²:", r2_test)

Test Set MSE: 1.7793448209691824e+17
Final R²: 0.07684799982712198


### Fourth Model

In [17]:
# This model is trained on the log-transformed target (y_log_train) instead of the standardized one
## Search space for the RBF-SVR hyperparameters (C, epsilon, gamma)
param_grid4 = {
    "C": [0.01, 0.1, 1, 10, 50, 100],
    "epsilon": [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    "gamma": [0.001, 0.01, 0.05, 0.1, 0.5, "scale"],
}

## MSE scorer; greater_is_better=False marks it as a loss (lower is better)
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

## 3-fold cross-validated grid search, run in parallel
grid4 = GridSearchCV(
    estimator=SVR(kernel="rbf"),
    param_grid=param_grid4,
    scoring=mse_scorer,
    cv=3,
    n_jobs=-1,
)

## fit on the scaled predictors and the log1p target
grid4.fit(X_train_full, y_log_train)
print("Best parameters: ", grid4.best_params_)

Best parameters:  {'C': 100, 'epsilon': 0.9, 'gamma': 0.001}


In [18]:
# Take the best model from the fourth grid search (already fitted) and predict on the test set
best_svr4 = grid4.best_estimator_
y_pred_best4 = best_svr4.predict(X_test_full)

# Test-set metrics, computed on the log1p scale of the target
# (so the MSE is not in the raw view-count units used for Models 1-3)
mse_test = mean_squared_error(y_log_test, y_pred_best4)
r2_test = r2_score(y_log_test, y_pred_best4)
print("Test Set MSE:", mse_test)
print("Final R²:", r2_test)

Test Set MSE: 3.853371320387035
Final R²: 0.20320734049466727


### Fifth Model

In [19]:
# This model is also trained on the log-transformed target (y_log_train)
## Finer search space for the RBF-SVR hyperparameters (C, epsilon, gamma)
param_grid5 = {
    "C": [100, 150, 200, 250, 300, 350, 400, 450, 500],
    "epsilon": [0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99],
    "gamma": [0.0001, 0.0003, 0.0005, 0.0007, 0.0009, 0.001, "scale"],
}

## MSE scorer; greater_is_better=False marks it as a loss (lower is better)
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

## 3-fold cross-validated grid search, run in parallel
grid5 = GridSearchCV(
    estimator=SVR(kernel="rbf"),
    param_grid=param_grid5,
    scoring=mse_scorer,
    cv=3,
    n_jobs=-1,
)

## fit on the scaled predictors and the log1p target
grid5.fit(X_train_full, y_log_train)
print("Best parameters: ", grid5.best_params_)

  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters:  {'C': 150, 'epsilon': 0.91, 'gamma': 0.001}


In [20]:
# Take the best model from the fifth grid search (already fitted) and predict on the test set
best_svr5 = grid5.best_estimator_
y_pred_best5 = best_svr5.predict(X_test_full)

# Test-set metrics, computed on the log1p scale of the target
mse_test = mean_squared_error(y_log_test, y_pred_best5)
r2_test = r2_score(y_log_test, y_pred_best5)
print("Test Set MSE:", mse_test)
print("Final R²:", r2_test)

Test Set MSE: 3.867250698399612
Final R²: 0.20033738958689684


## 3 - Modeling with Support Vector Classification

For classification we need a new target variable, `viral`, which indicates whether a music video has received more than 100 million views.

In [22]:
# Label each row 'viral' when its view_count exceeds 100 million, otherwise 'not'
# (adds the column to df in place)
is_viral = df['view_count'] > 100_000_000
df['viral'] = np.where(is_viral, 'viral', 'not')
df.head(n=10)

Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,view_count,...,unique_words,vocab_richness,Rap_Street_Slang_Topic,Romance_Relationships_Topic,Life_Nostalgia_Topic,Party_Dance_Sensuality_Topic,Love_Emotion_Sentiment_Topic,Loss_Struggle_Reflection_Topic,Energy_Vibes_Epic_Topic,viral
0,0.202,0.759,0.699,0.000131,0.443,-5.745,0.0307,92.96,0.907,1118930000.0,...,98,0.30721,0.001245,0.001247,0.001247,0.362314,0.385711,0.001244,0.246992,viral
1,0.0393,0.535,0.505,0.0,0.0923,-8.926,0.245,99.935,0.495,220560700.0,...,428,0.450053,0.192193,0.030384,0.000396,0.01585,0.760386,0.000396,0.000396,viral
2,0.542,0.698,0.533,0.0,0.333,-6.246,0.0437,134.001,0.275,87564090.0,...,141,0.290722,0.000765,0.418962,0.230767,0.000766,0.347208,0.000765,0.000766,not
3,0.00364,0.767,0.551,0.0,0.0451,-7.328,0.0616,100.904,0.796,10499470.0,...,132,0.300683,0.0009,0.000902,0.000901,0.000906,0.99459,0.0009,0.000901,not
4,0.175,0.398,0.804,0.0,0.181,-5.559,0.0451,186.752,0.709,21090600.0,...,117,0.365625,0.001012,0.324886,0.046997,0.624069,0.001012,0.001011,0.001014,not
5,0.00483,0.395,0.843,0.0,0.0404,-4.476,0.0374,112.423,0.481,56592740.0,...,120,0.38835,0.001185,0.329814,0.001187,0.001189,0.188675,0.231365,0.246584,not
6,0.0129,0.599,0.543,0.00204,0.291,-9.226,0.0302,91.105,0.624,19642730.0,...,113,0.346626,0.001032,0.001032,0.001034,0.001031,0.426718,0.001033,0.568119,not
7,0.00125,0.315,0.715,8e-06,0.0942,-8.072,0.0362,155.925,0.497,54270.0,...,76,0.311475,0.002512,0.4039,0.244769,0.002516,0.34127,0.002514,0.00252,not
8,0.949,0.532,0.0744,1.2e-05,0.106,-16.092,0.0355,117.131,0.125,53335640.0,...,118,0.517544,0.001449,0.226179,0.443669,0.00145,0.001451,0.175432,0.15037,not
9,0.0339,0.877,0.534,1.7e-05,0.0441,-6.178,0.15,108.17,0.89,87851150.0,...,282,0.434515,0.492625,0.00049,0.242101,0.027884,0.000489,0.235922,0.000489,not


In [23]:
# Classification target and predictors: drop both view_count and the label derived from it
y2 = df["viral"]
X2 = df.drop(columns=["view_count", "viral"])

In [25]:
# 70/30 train-test split for the classification task, with a fixed seed
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    test_size=0.3,
    random_state=1234,
)

In [26]:
# Standardize the numeric (non-encoded) predictors for SVM classification
# (the class label is not scaled)
## reuses train_cols_to_scale from the regression section; the scaler is fit on
## the training rows only and then applied to both splits
scaler = StandardScaler()
scaler.fit(X2_train[train_cols_to_scale])

X2_train_scaled = X2_train.copy()
X2_train_scaled[train_cols_to_scale] = scaler.transform(X2_train[train_cols_to_scale])

X2_test_scaled = X2_test.copy()
X2_test_scaled[train_cols_to_scale] = scaler.transform(X2_test[train_cols_to_scale])

In [27]:
# Reassemble the full classification design matrices from the scaled and the unscaled columns
## columns that were not passed through the scaler
other_columns2 = X2_train.columns.difference(train_cols_to_scale)

## place the scaled block next to the unscaled block, then restore the original column order
train_parts2 = [X2_train_scaled[train_cols_to_scale], X2_train[other_columns2]]
test_parts2 = [X2_test_scaled[train_cols_to_scale], X2_test[other_columns2]]

X2_train_full = pd.concat(train_parts2, axis = "columns").loc[:, X2_train.columns]
X2_test_full = pd.concat(test_parts2, axis = "columns").loc[:, X2_test.columns]

X2_train_full.head()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,chart_year,...,avg_word_len,unique_words,vocab_richness,Rap_Street_Slang_Topic,Romance_Relationships_Topic,Life_Nostalgia_Topic,Party_Dance_Sensuality_Topic,Love_Emotion_Sentiment_Topic,Loss_Struggle_Reflection_Topic,Energy_Vibes_Epic_Topic
4740,0.239603,0.726038,0.08717,-0.154876,-0.211736,0.49477,1.899535,-1.089475,-0.810134,-0.909986,...,-0.060709,0.604413,-0.137735,-0.419352,1.20964,-0.537217,0.07418,0.484062,-0.381746,-0.439759
2391,0.911247,0.211797,0.46768,-0.033053,-0.877748,-0.236437,-0.656976,0.374362,1.391823,-1.019643,...,-0.663361,-0.726035,-0.612656,-0.712224,-0.051157,-0.294563,0.338329,1.377419,-0.375753,-0.435706
2542,1.058456,0.398794,0.773241,-0.154876,-0.137735,1.349241,-0.457061,-0.906449,1.488709,-0.800329,...,1.078526,1.186485,1.281888,1.553928,-0.539756,-0.758628,-0.52459,0.05074,0.041396,-0.440264
990,-0.756822,-0.102091,1.026914,-0.154876,-0.861467,0.402621,-0.604463,-0.42743,-0.382955,-0.032729,...,-0.240026,-0.654761,-0.113978,-0.711467,-0.535153,-0.02446,0.773071,0.893584,0.291093,-0.434207
5331,-0.785804,0.565755,-1.250381,-0.154629,-0.041533,-1.533303,0.010942,1.003649,-1.717341,1.1735,...,-0.121491,1.210243,1.920919,2.471399,-0.539432,-0.915189,-0.517392,-0.747037,-0.219176,-0.439843


#### First SVM Classification Model

In [28]:
## Grid search over the SVM classifier hyperparameters ("C", "gamma", and "kernel").
## gamma is only used by the rbf kernel, so the linear kernel gets its own
## sub-grid without it. Crossing gamma with 'linear' would refit each linear
## candidate once per gamma value and produce identical results every time.
param_grid6 = [
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 0.1, 1, 10]},
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
]

## 3-fold cross-validation; refit=True retrains the best candidate on the full
## training set so grid6 can be used directly for prediction
grid6 = GridSearchCV(
    SVC(),
    param_grid6,
    refit=True,
    cv=3)
grid6.fit(X2_train_full, y2_train)

print("Best parameters:", grid6.best_params_)

Best parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}


In [32]:
# Evaluate the refit best classifier from the first grid search on the held-out test set
y_pred_grid = grid6.predict(X2_test_full)

test_accuracy = accuracy_score(y2_test, y_pred_grid)
print("Accuracy with grid search:", test_accuracy)

# Confusion matrix followed by per-class precision / recall / F1
print(confusion_matrix(y2_test, y_pred_grid))
print(classification_report(y2_test, y_pred_grid))

Accuracy with grid search: 0.6948733786287832
[[965  80]
 [414 160]]
              precision    recall  f1-score   support

         not       0.70      0.92      0.80      1045
       viral       0.67      0.28      0.39       574

    accuracy                           0.69      1619
   macro avg       0.68      0.60      0.59      1619
weighted avg       0.69      0.69      0.65      1619



#### Second SVM Classification Model

In [33]:
## Finer grid search over the SVM classifier hyperparameters ("C", "gamma", and "kernel").
## gamma is only used by the rbf kernel, so the linear kernel gets its own
## sub-grid without it. Crossing the 6 gamma values with 'linear' would fit 60
## linear candidates per fold where only 10 are distinct.
param_grid7 = [
    {
        'kernel': ['rbf'],
        'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'gamma': [0.01, 0.03, 0.05, 0.07, 0.09, 0.1],
    },
    {
        'kernel': ['linear'],
        'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    },
]

## 3-fold cross-validation; refit=True retrains the best candidate on the full
## training set so grid7 can be used directly for prediction
grid7 = GridSearchCV(
    SVC(),
    param_grid7,
    refit=True,
    cv=3)
grid7.fit(X2_train_full, y2_train)

print("Best parameters:", grid7.best_params_)

  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters: {'C': 5, 'gamma': 0.01, 'kernel': 'rbf'}


In [40]:
# Final classifier: same hyperparameters as the grid-search winner, refit with
# probability=True so predict_proba is available for ROC AUC.
# The parameters are read from grid7.best_params_ rather than copied by hand,
# so this cell stays in sync if the search is re-run. random_state fixes the
# internal shuffling that SVC uses for its probability estimates.
svm_final = SVC(
    **grid7.best_params_,
    probability=True,
    random_state=1234)
svm_final.fit(X2_train_full, y2_train)

In [44]:
# Evaluate ONE model end-to-end. The class predictions and the probabilities
# used for ROC AUC both come from svm_final, so accuracy, the confusion matrix,
# the classification report and the AUC all describe the same fitted classifier.
y_pred_grid2 = svm_final.predict(X2_test_full)

# predict_proba columns follow svm_final.classes_; select the 'viral' column by
# name instead of assuming it sits at position 1
viral_col = list(svm_final.classes_).index('viral')
y_prob = svm_final.predict_proba(X2_test_full)[:, viral_col]

print("Accuracy with grid search:", accuracy_score(y2_test, y_pred_grid2))
print(roc_auc_score(y2_test, y_prob))  # ROC AUC for the 'viral' class
print(confusion_matrix(y2_test, y_pred_grid2))
print(classification_report(y2_test, y_pred_grid2))

Accuracy with grid search: 0.6973440395305744
0.7020997615991198
[[936 109]
 [381 193]]
              precision    recall  f1-score   support

         not       0.71      0.90      0.79      1045
       viral       0.64      0.34      0.44       574

    accuracy                           0.70      1619
   macro avg       0.67      0.62      0.62      1619
weighted avg       0.69      0.70      0.67      1619

