# SVM Classifier

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.colab import drive

# Mount Google Drive at /content/drive, forcing a remount if it is already mounted.
drive.mount('/content/drive', force_remount=True)

# Read the aggregated training table from Drive.
# The path is relative to the current working directory
# (presumably /content in Colab, the parent of the mount point -- confirm).
TRAIN_CSV_PATH = 'drive/MyDrive/CS 249 Project/Data/application_train.csv'
train_df = pd.read_csv(TRAIN_CSV_PATH)

Mounted at /content/drive


## Encode categorical features


In [2]:
# Count NaNs per column, then display only the columns that have at least one.
feat_missing_vals = train_df.isna().sum()
feat_missing_vals.loc[feat_missing_vals.gt(0)]

AMT_ANNUITY                       12
AMT_GOODS_PRICE                  278
NAME_TYPE_SUITE                 1292
OWN_CAR_AGE                   202929
OCCUPATION_TYPE                96391
                               ...  
AMT_REQ_CREDIT_BUREAU_DAY      41519
AMT_REQ_CREDIT_BUREAU_WEEK     41519
AMT_REQ_CREDIT_BUREAU_MON      41519
AMT_REQ_CREDIT_BUREAU_QRT      41519
AMT_REQ_CREDIT_BUREAU_YEAR     41519
Length: 67, dtype: int64

In [3]:
# Tally how many columns there are of each dtype.
column_dtypes = train_df.dtypes
column_dtypes.value_counts()

float64    65
int64      41
object     16
dtype: int64

In [4]:
# Keep only the object-dtype (string) columns and show how many distinct
# non-null values each one holds.
categorical_features = train_df.select_dtypes(include='object')
categorical_features.nunique()

NAME_CONTRACT_TYPE             2
CODE_GENDER                    3
FLAG_OWN_CAR                   2
FLAG_OWN_REALTY                2
NAME_TYPE_SUITE                7
NAME_INCOME_TYPE               8
NAME_EDUCATION_TYPE            5
NAME_FAMILY_STATUS             6
NAME_HOUSING_TYPE              6
OCCUPATION_TYPE               18
WEEKDAY_APPR_PROCESS_START     7
ORGANIZATION_TYPE             58
FONDKAPREMONT_MODE             4
HOUSETYPE_MODE                 3
WALLSMATERIAL_MODE             7
EMERGENCYSTATE_MODE            2
dtype: int64

In [5]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the object columns that have at most two distinct entries.
# unique() counts NaN as an entry, so a two-category column that also has
# missing values does not qualify and is left for the one-hot step.
label_encoder = LabelEncoder()

binary_feats = [
    col for col in categorical_features
    if len(train_df[col].unique()) <= 2
]
for col in binary_feats:
    train_df[col] = label_encoder.fit_transform(train_df[col])
train_df.columns

Index(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY',
       ...
       'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
       'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR',
       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR'],
      dtype='object', length=122)

In [6]:
# Expand every object column that is still present into indicator columns.
# drop_first=True omits the indicator for the first level of each column.
train_df = pd.get_dummies(data=train_df, drop_first=True)
train_df.columns

Index(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'FLAG_OWN_CAR',
       'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT',
       'AMT_ANNUITY', 'AMT_GOODS_PRICE',
       ...
       'FONDKAPREMONT_MODE_reg oper spec account',
       'HOUSETYPE_MODE_specific housing', 'HOUSETYPE_MODE_terraced house',
       'WALLSMATERIAL_MODE_Mixed', 'WALLSMATERIAL_MODE_Monolithic',
       'WALLSMATERIAL_MODE_Others', 'WALLSMATERIAL_MODE_Panel',
       'WALLSMATERIAL_MODE_Stone, brick', 'WALLSMATERIAL_MODE_Wooden',
       'EMERGENCYSTATE_MODE_Yes'],
      dtype='object', length=230)

In [7]:
# Add a boolean column marking the rows whose DAYS_EMPLOYED equals the column
# maximum, which this notebook treats as the anomalous value.
# NOTE(review): the flagged value itself is left unchanged in DAYS_EMPLOYED --
# confirm that is intended.
max_days_employed = train_df['DAYS_EMPLOYED'].max()
train_df['DAYS_EMPLOYED_ANOM'] = train_df['DAYS_EMPLOYED'].eq(max_days_employed)

## Impute missing values

In [8]:
# Per-column NaN count and the same count as a percentage of all rows.
missing_values = train_df.isna().sum()
missing_values_percent = missing_values*100/len(train_df)

# Put both measures side by side, one row per column of train_df.
miss_df = pd.DataFrame({
    'missing_val': missing_values,
    'missing_val_percent': missing_values_percent,
})

# Show the ten columns with the largest share of missing values.
miss_df.sort_values(by='missing_val_percent', ascending=False).head(10)

Unnamed: 0,missing_val,missing_val_percent
COMMONAREA_AVG,214865,69.872297
COMMONAREA_MEDI,214865,69.872297
COMMONAREA_MODE,214865,69.872297
NONLIVINGAPARTMENTS_MEDI,213514,69.432963
NONLIVINGAPARTMENTS_MODE,213514,69.432963
NONLIVINGAPARTMENTS_AVG,213514,69.432963
LIVINGAPARTMENTS_MEDI,210199,68.354953
LIVINGAPARTMENTS_AVG,210199,68.354953
LIVINGAPARTMENTS_MODE,210199,68.354953
FLOORSMIN_MEDI,208642,67.84863


In [9]:
# Columns that are kept even when they are mostly missing
features = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH', 'TARGET']

print("Training data shape before dropping columns:", train_df.shape)

# Columns missing >= 48% of their values, according to miss_df
missing_48pct = miss_df.loc[miss_df['missing_val_percent'] >= 48]
missing_48pct_rows = missing_48pct.index.values

print("Number of columns missing 48% or more of the data:", len(missing_48pct_rows))

# Drop all sparse, non-protected columns in a single call instead of copying
# the whole frame once per column. Columns that are already gone are skipped,
# so re-running this cell does not raise a KeyError.
cols_to_drop = [
    col for col in missing_48pct_rows
    if col not in features and col in train_df.columns
]
train_df = train_df.drop(columns=cols_to_drop)

print("Training data shape after dropping columns:", train_df.shape)

Training data shape before dropping columns: (307511, 231)
Number of columns missing 48% or more of the data: 45
Training data shape after dropping columns: (307511, 187)


In [10]:
# Recompute the missing-value summary on the current train_df.
missing_values = train_df.isna().sum()
missing_values_percent = missing_values*100/len(train_df)
miss_df = pd.DataFrame({
    'missing_val': missing_values,
    'missing_val_percent': missing_values_percent,
})

# One entry per column of train_df.
print("Missing values data shape:", missing_values.shape)

# Show the twenty columns with the largest share of missing values.
miss_df.sort_values(by='missing_val_percent', ascending=False).head(20)

Missing values data shape: (187,)


Unnamed: 0,missing_val,missing_val_percent
EXT_SOURCE_1,173378,56.381073
EXT_SOURCE_3,60965,19.825307
AMT_REQ_CREDIT_BUREAU_HOUR,41519,13.501631
AMT_REQ_CREDIT_BUREAU_DAY,41519,13.501631
AMT_REQ_CREDIT_BUREAU_WEEK,41519,13.501631
AMT_REQ_CREDIT_BUREAU_MON,41519,13.501631
AMT_REQ_CREDIT_BUREAU_QRT,41519,13.501631
AMT_REQ_CREDIT_BUREAU_YEAR,41519,13.501631
DEF_30_CNT_SOCIAL_CIRCLE,1021,0.332021
DEF_60_CNT_SOCIAL_CIRCLE,1021,0.332021


In [11]:
# Replace every remaining NaN with the median of its column.
# NOTE(review): the medians are computed on the full table, before the
# train/test split further down, so held-out rows contribute to them --
# confirm this is acceptable.
column_medians = train_df.median()
train_df = train_df.fillna(value=column_medians)

# Display any rows that still contain a NaN; the result is expected to be empty.
rows_with_nan = train_df.isnull().any(axis=1)
train_df.loc[rows_with_nan]

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,...,ORGANIZATION_TYPE_Military,ORGANIZATION_TYPE_Mobile,ORGANIZATION_TYPE_Other,ORGANIZATION_TYPE_Police,ORGANIZATION_TYPE_Postal,ORGANIZATION_TYPE_Realtor,ORGANIZATION_TYPE_Religion,ORGANIZATION_TYPE_Restaurant,ORGANIZATION_TYPE_School,ORGANIZATION_TYPE_Security,ORGANIZATION_TYPE_Security Ministries,ORGANIZATION_TYPE_Self-employed,ORGANIZATION_TYPE_Services,ORGANIZATION_TYPE_Telecom,ORGANIZATION_TYPE_Trade: type 1,ORGANIZATION_TYPE_Trade: type 2,ORGANIZATION_TYPE_Trade: type 3,ORGANIZATION_TYPE_Trade: type 4,ORGANIZATION_TYPE_Trade: type 5,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Trade: type 7,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Transport: type 2,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,ORGANIZATION_TYPE_XNA,FONDKAPREMONT_MODE_org spec account,FONDKAPREMONT_MODE_reg oper account,FONDKAPREMONT_MODE_reg oper spec account,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_Yes,DAYS_EMPLOYED_ANOM


## Split train, validation, and test sets

In [12]:
# Set the loan identifiers and the label aside; every other column is a model input
loan_ids = train_df['SK_ID_CURR']
y = train_df['TARGET']
X = train_df.drop(columns=['SK_ID_CURR', 'TARGET'])

# Extract feature names
feature_names = X.columns

# Convert to np.array.
# The frame mixes a boolean column (DAYS_EMPLOYED_ANOM) with integer and float
# columns, and pandas turns such a mix into an object-dtype array when no dtype
# is given. Requesting float64 explicitly yields a compact numeric matrix
# (booleans become 0.0/1.0).
X = X.to_numpy(dtype=np.float64)
y = np.array(y)

# Split train set into train and test
from sklearn.model_selection import train_test_split
import gc

# Hold out 33% of the rows; random_state fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=0
)

## Train an SVM model with Variance Threshold




In [14]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

# Create pipeline: variance-based feature selection, standardization, then a
# polynomial-kernel SVM.
# The variance filter runs first, on the raw features: standardized features
# all have unit variance, so none would pass the 1.5 threshold afterwards.
# The scaler is there because SVMs are not scale invariant; without it the
# features with the largest raw magnitudes dominate the kernel.
pipeline = Pipeline(
    [('sel', VarianceThreshold(1.5)),
     ('scaler', StandardScaler()),
     ('clf', SVC(probability=True, kernel='poly', gamma='auto'))]
)

# Grid search parameters: polynomial degree of the SVC kernel
parameters = {
    'clf__degree': [1,4],
}

# Stratified KFold
strat_kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Run grid search
# NOTE(review): the saved output of this cell ends in joblib's
# TerminatedWorkerError, i.e. a worker process died. Possibly memory
# exhaustion from fitting SVC on the full training set in parallel --
# consider fewer workers or a subsample; confirm the actual cause.
gs = GridSearchCV(
    pipeline,
    parameters,
    scoring='roc_auc',
    cv=strat_kf,
    n_jobs=-1,
    verbose=10
)
gs.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0279s.) Setting batch_size=2.
exception calling callback for <Future at 0x7f49dbde14e0 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "/usr/local/lib/python3.6/dist-packages/joblib/parallel.py", line 366, in __call__
    self.parallel.dispatch_next()
  File "/usr/local/lib/python3.6/dist-packages/joblib/parallel.py", line 799, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "/usr/local/lib/python3.6/dist-packages/joblib/parallel.py", line 866, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.6/dist-packages/joblib/parallel.py", line 784, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/usr/local/li

TerminatedWorkerError: ignored

In [None]:
# Report the best cross-validated score found by the grid search, followed by
# the value the best estimator uses for each searched hyperparameter.
print("Best score: {}".format(gs.best_score_))
print("Best parameter set:")
best_parameters = gs.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t{}: {}".format(param_name, best_parameters[param_name]))

## Make predictions

In [None]:
from sklearn.metrics import roc_curve, auc

# Score the held-out rows. Column index 1 of predict_proba is used as the
# score for the positive class (pos_label=1 below).
y_pred_proba = gs.predict_proba(X_test)[:, 1]

# ROC curve points for the held-out set, and the area under that curve.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba, pos_label=1)
auc_test = auc(fpr, tpr)
print("ROC AUC score: {}".format(auc_test))

In [None]:
# Draw the ROC curve (true positive rate against false positive rate) on an
# explicitly created Axes.
fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')