A Support Vector Machine (SVM) is a classifier defined by a separating hyperplane between the classes. A hyperplane is the generalization of a line to N dimensions. Given labeled training data for a binary classification problem, the SVM finds the optimal hyperplane that separates the training data into two classes. The approach extends readily to problems with more than two classes.

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsOneClassifier
from sklearn.model_selection import cross_validate

In [5]:
# Path of the text file holding the income records; each line is split on ', '
# when the file is read, and the last field is the income label
input_file = 'income_data.txt'

In [9]:
# Containers for the parsed records and, later, their labels
X, y = [], []

# Number of records kept so far for each of the two income classes
count_class1 = count_class2 = 0

# Upper bound on the number of records kept per class
max_datapoints = 25000

In [10]:
# Load at most max_datapoints complete records per income class from the file
with open(input_file, 'r') as f:
    # Iterate the file lazily; readlines() would load every line into memory
    # even though the loop may stop early
    for line in f:
        # Stop as soon as both classes have reached their quota
        if count_class1 >= max_datapoints and count_class2 >= max_datapoints:
            break

        # Records with a missing attribute contain '?'; skip them
        if '?' in line:
            continue

        # Strip only the line terminator. Slicing with line[:-1] would chop the
        # last character of the label when the final line of the file has no
        # trailing newline, silently dropping that record.
        data = line.rstrip('\r\n').split(', ')

        # '<=50K' and '>50K' are the labels exactly as spelled in the dataset;
        # the two cases are mutually exclusive
        label = data[-1]
        if label == '<=50K' and count_class1 < max_datapoints:
            X.append(data)
            count_class1 += 1
        elif label == '>50K' and count_class2 < max_datapoints:
            X.append(data)
            count_class2 += 1

In [11]:
# Convert the list of parsed rows into a NumPy array of strings so that
# whole columns can be selected with X[:, i] in the encoding step
X = np.array(X)

If an attribute is a string, we need to encode it as a number; if it is already numeric, we can keep it as is. Each string attribute gets its own label encoder, so we end up with multiple encoders and need to keep track of all of them.

In [20]:
# Build a numeric copy of X: numeric columns are copied through unchanged,
# every other column is integer-coded by its own LabelEncoder.
# The first row decides whether a column counts as numeric.
label_encoder = []  # one fitted encoder per non-numeric column, in column order
X_encoded = np.empty(X.shape)
for i, item in enumerate(X[0]):
    column = X[:, i]
    if not item.isdigit():
        # Register the encoder first, then fit it on the whole column
        encoder = preprocessing.LabelEncoder()
        label_encoder.append(encoder)
        column = encoder.fit_transform(column)
    X_encoded[:, i] = column

# The last column is the (encoded) income label; everything before it is a feature
x = X_encoded[:, :-1].astype(int)
y = X_encoded[:, -1].astype(int)

In [21]:
# Display the raw records; X still holds the original string values
X

array([['39', 'State-gov', '77516', ..., '40', 'United-States', '<=50K'],
       ['50', 'Self-emp-not-inc', '83311', ..., '13', 'United-States',
        '<=50K'],
       ['38', 'Private', '215646', ..., '40', 'United-States', '<=50K'],
       ...,
       ['58', 'Private', '151910', ..., '40', 'United-States', '<=50K'],
       ['22', 'Private', '201490', ..., '20', 'United-States', '<=50K'],
       ['52', 'Self-emp-inc', '287927', ..., '40', 'United-States',
        '>50K']], dtype='<U26')

In [22]:
# Display the numeric version of the records produced by the encoding step
X_encoded

array([[3.90000e+01, 5.00000e+00, 7.75160e+04, ..., 4.00000e+01,
        3.80000e+01, 0.00000e+00],
       [5.00000e+01, 4.00000e+00, 8.33110e+04, ..., 1.30000e+01,
        3.80000e+01, 0.00000e+00],
       [3.80000e+01, 2.00000e+00, 2.15646e+05, ..., 4.00000e+01,
        3.80000e+01, 0.00000e+00],
       ...,
       [5.80000e+01, 2.00000e+00, 1.51910e+05, ..., 4.00000e+01,
        3.80000e+01, 0.00000e+00],
       [2.20000e+01, 2.00000e+00, 2.01490e+05, ..., 2.00000e+01,
        3.80000e+01, 0.00000e+00],
       [5.20000e+01, 3.00000e+00, 2.87927e+05, ..., 4.00000e+01,
        3.80000e+01, 1.00000e+00]])

In [30]:
# NOTE(review): `le` is a brand-new encoder that has never been fitted, so it
# has no learned classes to map back to and the inverse_transform call below
# raises NotFittedError. Presumably the intent was to inspect one of the fitted
# encoders stored in `label_encoder` -- confirm and use that instead.
le = preprocessing.LabelEncoder()
list(le.inverse_transform(range(5)))

NotFittedError: This LabelEncoder instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [33]:
# IPython help lookup: show the documentation for LabelEncoder
?preprocessing.LabelEncoder

In [31]:
# Build the classifier: a linear SVM with a fixed random seed,
# wrapped in a one-vs-one multiclass strategy
base_svm = LinearSVC(random_state=0)
classifier = OneVsOneClassifier(base_svm)

In [32]:
# IPython help lookup: show the type and documentation of the classifier object
?classifier