# Current Population Survey Support Vector Machine Classification
Predict whether an individual is married. Then predict whether an individual's family income is $50,000 or greater.

In [1]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
from sklearn.svm import SVC  
from sklearn.model_selection import train_test_split, cross_val_score, KFold
%matplotlib inline

In [2]:
# Read the survey CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer="../datasets/CPS2016_UPDATE.csv")

# Every row of the dataset is used for the models in this notebook.
# For a quicker trial run on a 1,000-row random sample, uncomment below:
# data = data.sample(n=1000)

# Predict if Individual is Married

In [3]:
# Predictor columns for the marital-status model
X = data[['age', 'educ', 'num_child',
          'num_in_house', 'weekly_hrs', 'fam_income']]
# Target: the 'ismarried' column
y = data['ismarried']

# Apply 80/20 training/testing split.
# random_state pins the shuffle so the same rows land in each partition
# on every run (Restart Kernel -> Run All gives the same split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)

## Linear SVM

In [4]:
# Build a linear-kernel SVM and train it on the training partition.
# fit() returns the estimator itself, so the calls can be chained.
svclassifier = SVC(gamma='scale', kernel='linear').fit(X_train, y_train)
# Trailing expression displays the fitted estimator's parameters
svclassifier

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [5]:
# 5-fold cross-validation over the full X, y.
# cross_val_score fits a fresh clone of the estimator on each fold.
# random_state pins the fold shuffle so the mean score is reproducible
# from run to run.
cvscore = cross_val_score(
    svclassifier, X, y,
    cv=KFold(n_splits=5, shuffle=True, random_state=42))
cvscore.mean()

0.7583297971437241

## Gaussian Kernel SVM

In [6]:
# Build an RBF (Gaussian) kernel SVM and train it on the training partition.
# fit() returns the estimator itself, so the calls can be chained.
svclassifier = SVC(gamma='scale', kernel='rbf').fit(X_train, y_train)
# Trailing expression displays the fitted estimator's parameters
svclassifier

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [7]:
# 5-fold cross-validation over the full X, y.
# cross_val_score fits a fresh clone of the estimator on each fold.
# random_state pins the fold shuffle so the mean score is reproducible
# from run to run.
cvscore = cross_val_score(
    svclassifier, X, y,
    cv=KFold(n_splits=5, shuffle=True, random_state=42))
cvscore.mean()

0.8066349131607427

This score is slightly lower than the score achieved by Random Forest Decision Tree Classification for predicting if an individual is married, which was around 0.83.

# Predict if Family Income >50,000

In [8]:
# Predictor columns for the family-income model
X = data[['age', 'educ', 'num_child',
          'num_in_house', 'weekly_hrs', 'ismarried']]
# Target: the 'faminc_50' column
y = data['faminc_50']

# Apply 80/20 training/testing split.
# random_state pins the shuffle so the same rows land in each partition
# on every run (Restart Kernel -> Run All gives the same split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)

## Linear SVM

In [9]:
# Build a linear-kernel SVM and train it on the training partition.
# fit() returns the estimator itself, so the calls can be chained.
svclassifier = SVC(gamma='scale', kernel='linear').fit(X_train, y_train)
# Trailing expression displays the fitted estimator's parameters
svclassifier

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [10]:
# 5-fold cross-validation over the full X, y.
# cross_val_score fits a fresh clone of the estimator on each fold.
# random_state pins the fold shuffle so the mean score is reproducible
# from run to run.
cvscore = cross_val_score(
    svclassifier, X, y,
    cv=KFold(n_splits=5, shuffle=True, random_state=42))
cvscore.mean()

0.7310734427096748

## Gaussian Kernel SVM

In [11]:
# Build an RBF (Gaussian) kernel SVM and train it on the training partition.
# fit() returns the estimator itself, so the calls can be chained.
svclassifier = SVC(gamma='scale', kernel='rbf').fit(X_train, y_train)
# Trailing expression displays the fitted estimator's parameters
svclassifier

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [12]:
# 5-fold cross-validation over the full X, y.
# cross_val_score fits a fresh clone of the estimator on each fold.
# random_state pins the fold shuffle so the mean score is reproducible
# from run to run.
cvscore = cross_val_score(
    svclassifier, X, y,
    cv=KFold(n_splits=5, shuffle=True, random_state=42))
cvscore.mean()

0.7393443790635847

This score is slightly lower than the score achieved by Random Forest Decision Tree Classification for predicting if an individual's family income is over $50,000, which was around 0.745.