# Support Vector Machine
This notebook demonstrates the use of Support Vector Machine (SVM) for classifying the UCI Census Income Dataset. The dataset contains demographic information, and the goal is to predict whether a person’s income is above or below $50K based on their features (age, education, occupation, etc.).
<br/><br/>
Link to dataset: https://archive.ics.uci.edu/dataset/20/census+income
<br/>
Link to github repository: https://github.com/faadeola/support-vector-machines

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA

In [2]:
# Define column names based on the attribute information
column_names = [
    'age', 'workclass', 'final-weight', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'
]

# Load training data
train_data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

train_df = pd.read_csv(train_data_url, header=None, names=column_names, na_values=' ?')

# Load test data
test_data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'

test_df = pd.read_csv(test_data_url, header=None, names=column_names, skiprows=1, na_values=' ?')

In [3]:
# Join the training and test data into a single data
df = pd.concat([train_df,test_df],axis=0)

In [4]:
# Show first 5 rows in the dataset
df.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# View dataset info
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48842 entries, 0 to 16280
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       46043 non-null  object
 2   final-weight    48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      46033 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  47985 non-null  object
 14  income          48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 6.0+ MB


In [6]:
''' Since dataset has null values in some columns, I have considered dropping those value '''

df = df.dropna(how='any')

In [7]:
# check for duplicate values
num_of_duplicates = df.duplicated().sum()
print(f'There are {num_of_duplicates} duplicates')

There are 28 duplicates


In [8]:
# drop duplicates
df = df.drop_duplicates(keep='first', ignore_index=True)

In [9]:
# print out the dataframe
df

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45189,33,Private,245211,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States,<=50K.
45190,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
45191,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
45192,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [10]:
# View dataset statistic info
df.describe()

Unnamed: 0,age,final-weight,education-num,capital-gain,capital-loss,hours-per-week
count,45194.0,45194.0,45194.0,45194.0,45194.0,45194.0
mean,38.551755,189737.9,10.119109,1102.112736,88.650308,40.941209
std,13.215806,105640.7,2.552057,7508.705003,405.075515,12.006449
min,17.0,13492.0,1.0,0.0,0.0,1.0
25%,28.0,117392.2,9.0,0.0,0.0,40.0
50%,37.0,178312.0,10.0,0.0,0.0,40.0
75%,47.0,237903.0,13.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


In [11]:
# Print out all unique values in the target column
df['income'].unique()

array([' <=50K', ' >50K', ' <=50K.', ' >50K.'], dtype=object)

In [12]:
# clean up target values to just <=50k and >50k by removing space
df['income'] = df['income'].str.strip()

In [13]:
# count the number of each unique values
df['income'].value_counts()

Unnamed: 0_level_0,count
income,Unnamed: 1_level_1
<=50K,22633
<=50K.,11355
>50K,7506
>50K.,3700


In [14]:
# change the values in income to be numeric
df['income'] = df['income'].replace({
    '<=50K':0,
    '>50K':1
})

# Convert income column to numeric and drop NAN values returned
df['income'] = pd.to_numeric(df['income'], errors='coerce')
df.dropna(subset=['income'], inplace=True)

# convert to int type
df['income'] = df['income'].astype(int)

In [15]:
# recount the number of each unique values
df['income'].value_counts()

Unnamed: 0_level_0,count
income,Unnamed: 1_level_1
0,22633
1,7506


In [16]:
# seperate columns
feature = df.drop('income', axis=1)
target = df['income']

# select categorical and numerical columns
num_columns = feature.select_dtypes(include=['int64','float64']).columns
cat_columns = feature.select_dtypes(include='object').columns

In [17]:
# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(feature, target, test_size=0.2, random_state=42)

In [18]:
# check the shape of the data splits
print(f'X_train has {X_train.shape[0]} rows and {X_train.shape[1]} columns\n')
print(f'X_test has {X_test.shape[0]} rows and {X_test.shape[1]} columns\n')
print(f'y_train has {y_train.shape[0]} rows\n')
print(f'y_test has {y_test.shape[0]} rows\n')

X_train has 24111 rows and 14 columns

X_test has 6028 rows and 14 columns

y_train has 24111 rows

y_test has 6028 rows



In [19]:
# Preprocess data for machine learning
preprocessor = ColumnTransformer([
    ('one_hot',OneHotEncoder(handle_unknown='ignore'),cat_columns),
    ('scaler',StandardScaler(),num_columns)],
    remainder = 'passthrough'
)

In [20]:
# Create pipleine for data preprocessing and model selection
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA()),
    ('model',SVC())
])


# Create parameter grid for the gridsearchcv
param_grid = {
    'model__kernel':['linear','rbf', 'poly'],
    'model__class_weight':[None,'balanced'],
    'model__C': [5, 10],
    'pca__n_components': [5,10,15]
}

In [21]:
# Set up model to use GrdisearchCV
svc_model = GridSearchCV(pipeline, param_grid, cv=5)

In [22]:
# Fit training data into model
svc_model.fit(X_train,y_train)

In [23]:
# make prediction using x_test
y_pred = svc_model.predict(X_test)

In [25]:
# find the best estimator used
svc_model.best_estimator_

In [26]:
# show the best parameter combination for the best result
svc_model.best_params_

{'model__C': 5,
 'model__class_weight': None,
 'model__kernel': 'rbf',
 'pca__n_components': 15}

In [27]:
# show the best average mean score
svc_model.best_score_

np.float64(0.8479116252458561)

In [28]:
# get result from gridsearch
grid_result = svc_model.cv_results_

# create dataframe to hold the results
results = pd.DataFrame(grid_result)

# sort result from highest to lowest using the mean test score
results = results.sort_values(by='mean_test_score', ascending=False)

# display only key columns
results[['param_model__kernel','params', 'mean_test_score']]

Unnamed: 0,param_model__kernel,params,mean_test_score
5,rbf,"{'model__C': 5, 'model__class_weight': None, '...",0.847912
23,rbf,"{'model__C': 10, 'model__class_weight': None, ...",0.847497
22,rbf,"{'model__C': 10, 'model__class_weight': None, ...",0.847041
4,rbf,"{'model__C': 5, 'model__class_weight': None, '...",0.846377
8,poly,"{'model__C': 5, 'model__class_weight': None, '...",0.843142
26,poly,"{'model__C': 10, 'model__class_weight': None, ...",0.842976
25,poly,"{'model__C': 10, 'model__class_weight': None, ...",0.84281
7,poly,"{'model__C': 5, 'model__class_weight': None, '...",0.842561
20,linear,"{'model__C': 10, 'model__class_weight': None, ...",0.840861
2,linear,"{'model__C': 5, 'model__class_weight': None, '...",0.840861


In [29]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# print the accuracy score of the model
print(accuracy_score(y_test,y_pred))

0.8483742534837425


In [30]:
# print confusion matrix showing the True Negative, False Positive, False Negative and True positive
print(confusion_matrix(y_test,y_pred))

[[4234  251]
 [ 663  880]]


In [32]:
# print classification report of the model
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.94      0.90      4485
           1       0.78      0.57      0.66      1543

    accuracy                           0.85      6028
   macro avg       0.82      0.76      0.78      6028
weighted avg       0.84      0.85      0.84      6028

