In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Seed for the stratified split so that train.csv / test.csv (and every
# metric computed from them) are reproducible across kernel restarts.
SEED = 42


def encode_ordinal(series, ordered_labels):
    """Map each label in `series` to its position in `ordered_labels`.

    Parameters
    ----------
    series : pd.Series
        Column of string labels.
    ordered_labels : list of str
        Labels listed in the order that defines their integer codes
        (first label -> 0, second -> 1, ...).

    Returns
    -------
    pd.Series
        int64 codes aligned with the index of `series`.

    Raises
    ------
    ValueError
        If `series` holds a value that is not listed in `ordered_labels`.
    """
    codes = {label: code for code, label in enumerate(ordered_labels)}
    encoded = series.map(codes)
    unmapped = encoded.isna()
    if unmapped.any():
        # Fail here with a clear message instead of letting strings/NaN
        # reach the scaler or the model further down the notebook.
        raise ValueError(
            f"{series.name}: unexpected values {series[unmapped].unique().tolist()}"
        )
    return encoded.astype(np.int64)


# get data
df = pd.read_csv("adult.csv")

# remove redundant columns or unnecessary columns
df = df.drop(['education', 'fnlwgt'], axis=1)

# "?" marks a missing value in these columns: turn it into NaN, then
# impute with the column's most frequent value (mode)
for col in ['workclass', 'occupation', 'native.country']:
    df[col] = df[col].replace("?", np.nan)
    df[col] = df[col].fillna(df[col].mode()[0])

# remove duplicates
df = df.drop_duplicates(keep='first')

# change categorical values into numeric (ordinal where necessary).
# The result is assigned back to the column: calling
# df[col].replace(..., inplace=True) is chained assignment, which does not
# update `df` when pandas Copy-on-Write is active.
ORDINAL_LABELS = {
    'workclass': ['Never-worked', 'Without-pay', 'Private', 'Local-gov',
                  'State-gov', 'Federal-gov', 'Self-emp-not-inc', 'Self-emp-inc'],
    'marital.status': ['Widowed', 'Never-married', 'Divorced', 'Separated',
                       'Married-spouse-absent', 'Married-AF-spouse',
                       'Married-civ-spouse'],
    'relationship': ['Not-in-family', 'Unmarried', 'Own-child',
                     'Other-relative', 'Wife', 'Husband'],
    'income': ['<=50K', '>50K'],
}
for col, labels in ORDINAL_LABELS.items():
    df[col] = encode_ordinal(df[col], labels)

# non ordinal: integer category codes
# NOTE(review): these codes still impose an arbitrary numeric order on
# nominal features; consider one-hot encoding if that matters for the model.
for col in ['occupation', 'race', 'sex', 'native.country']:
    df[col] = df[col].astype('category').cat.codes.astype(np.int64)


# stratified split into test/train (same income class ratio in both parts)
train, test = train_test_split(
    df, stratify=df['income'], test_size=0.25, random_state=SEED
)


# write to file
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)


In [2]:
import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from imblearn.pipeline import make_pipeline as imb_make_pipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.svm import LinearSVC, SVC

In [3]:
# Separate the 'income' target from the feature columns for the
# development (train) and test partitions.
y_dev, y_test = train['income'], test['income']
X_dev = train.drop(columns='income')
X_test = test.drop(columns='income')

In [4]:
# Learn the per-feature min/max on the development split only, then apply
# that same fitted scaler to both splits.
scaler = MinMaxScaler()
scaler.fit(X_dev)
X_dev, X_test = scaler.transform(X_dev), scaler.transform(X_test)

In [5]:
# Fit one SVM with a linear kernel and one with an RBF kernel on the
# development split, then report test accuracy, support-vector matrix shape
# and the time spent on training and on prediction.
#
# time.perf_counter() is a monotonic, high-resolution clock meant for
# measuring elapsed time; time.time() can jump if the system clock is adjusted.
#
# The labels are converted once with np.asarray: Series.ravel() is deprecated
# in recent pandas and the label column is already one-dimensional.
y_dev_values = np.asarray(y_dev)
y_test_values = np.asarray(y_test)

t_start_linear = time.perf_counter()
svm_linear = SVC(kernel="linear")
svm_linear.fit(X_dev, y_dev_values)
t_end_linear = time.perf_counter()
# "prediction time" covers predicting BOTH the development and test splits
p_start_linear = time.perf_counter()
pred_train1 = svm_linear.predict(X_dev)
pred_test1 = svm_linear.predict(X_test)
p_end_linear = time.perf_counter()

t_start_rbf = time.perf_counter()
svm_rbf = SVC(kernel="rbf")
svm_rbf.fit(X_dev, y_dev_values)
t_end_rbf = time.perf_counter()
p_start_rbf = time.perf_counter()
pred_train2 = svm_rbf.predict(X_dev)
pred_test2 = svm_rbf.predict(X_test)
p_end_rbf = time.perf_counter()

# Reuse the test predictions computed above: estimator.score(X_test, y_test)
# would run the whole (slow) prediction on X_test a second time to obtain
# the same accuracy value.
score1 = accuracy_score(y_test_values, pred_test1)
score2 = accuracy_score(y_test_values, pred_test2)

print("Linear kernel SVM test accuracy = ", score1)
print("Rbf kernel SVM test accuracy = ", score2)

# (number of support vectors, number of features) for each model
print(svm_linear.support_vectors_.shape)
print(svm_rbf.support_vectors_.shape)

print(f"Linear kernel SVM train time = {t_end_linear - t_start_linear}")
print(f"Rbf kernel SVM train time = {t_end_rbf - t_start_rbf}")

print(f"Linear kernel SVM prediction time = {p_end_linear - p_start_linear}")
print(f"Rbf kernel SVM prediction time = {p_end_rbf - p_start_rbf}")

Linear kernel SVM test accuracy =  0.8363937594919232
Rbf kernel SVM test accuracy =  0.8381885958856827
(8580, 12)
(8242, 12)
Linear kernel SVM train time = 18.770982265472412
Rbf kernel SVM train time = 12.613884925842285
Linear kernel SVM prediction time = 11.084616661071777
Rbf kernel SVM prediction time = 16.245892763137817


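Though the RBF kernel performs slightly better, the difference is minimal (0.838 vs. 0.836 test accuracy). In this run the linear kernel was faster at prediction (11.1 s vs. 16.2 s) but slower to train (18.8 s vs. 12.6 s).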

In [6]:
from sklearn.model_selection import KFold, GridSearchCV

# Cross-validated grid search over the regularisation strength C of an
# RBF-kernel SVM, scored by accuracy.

# NOTE(review): `kf` is created but never handed to GridSearchCV, so the
# search below runs with GridSearchCV's default `cv` setting. Pass `cv=kf`
# if plain (unstratified) K-fold was the intent.
kf = KFold()

# Four log-spaced candidates for C between 1e-2 and 1e2; only the RBF kernel
# is searched.
params = {"C": np.logspace(-2, 2, 4), 'kernel': ['rbf']}

clf = GridSearchCV(SVC(), params, scoring='accuracy')
# np.asarray instead of Series.ravel(), which is deprecated in recent pandas;
# the label column is already one-dimensional.
clf.fit(X_dev, np.asarray(y_dev))

# Labels say "Rbf" because the grid contains only kernel='rbf'; the previous
# "Linear kernel SVM" wording mislabelled these results.
print("Best score Rbf kernel SVM", clf.best_score_)
print("Best parameters Rbf kernel SVM", clf.best_params_)

Best score Linear kernel SVM 0.8430984194028927
Best parameters Linear kernel SVM {'C': 4.6415888336127775, 'kernel': 'rbf'}
