In [12]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsOneClassifier
from sklearn import model_selection

# Input file containing data
input_file = 'income_data.txt'

# In order to load the data from the file, we need to preprocess it so that we can prepare it for
# classification. We will use at most 25,000 data points for each class:
# Read the data
X = []
y = []
count_class1 = 0
count_class2 = 0
max_datapoints = 25000

# Open the file and start reading the lines:
with open(input_file, 'r') as f:
    for line in f.readlines():
        if count_class1 >= max_datapoints and count_class2 >=max_datapoints:
            break
            
        if '?' in line:
            continue
            
        # Each line is comma separated, so we need to split it accordingly. The last element in each
        # line represents the label. Depending on that label, we will assign it to a class:

        data = line[:-1].split(', ')

        if data[-1] == '<=50K' and count_class1 < max_datapoints:
            X.append(data)
            count_class1 += 1
    
        if data[-1] == '>50K' and count_class2 < max_datapoints:
            X.append(data)
            count_class2 += 1
    
# Convert the list into a numpy array so that we can give it as an input to the sklearn function:
# Convert to numpy array
X = np.array(X)

# If any attribute is a string, then we need to encode it. If it is a number, we can keep it as it is.
# Note that we will end up with multiple label encoders and we need to keep track of all of them:

# Convert string data to numerical data
label_encoder = []
X_encoded = np.empty(X.shape)
for i,item in enumerate(X[0]):
    if item.isdigit():
        X_encoded[:, i] = X[:, i]
    else:
        label_encoder.append(preprocessing.LabelEncoder())
        X_encoded[:, i] = label_encoder[-1].fit_transform(X[:, i])
        
X = X_encoded[:, :-1].astype(int)
y = X_encoded[:, -1].astype(int)


# Create SVM classifier with a linear kernel
classifier = OneVsOneClassifier(LinearSVC(random_state=0))

# Train the classifier
classifier.fit(X, y)

# Perform cross validation using an 80/20 split for training and testing, and then predict the output for training data:
# Cross validation
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y,test_size=0.2, random_state=5)
classifier = OneVsOneClassifier(LinearSVC(random_state=0))
classifier.fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)


# Compute the F1 score of the SVM classifier
f1 = model_selection.cross_val_score(classifier, X, y,scoring='f1_weighted', cv=3)
print("F1 score: " + str(round(100*f1.mean(), 2)) + "%")



F1 score: 70.82%




In [15]:
# Now that the classifier is ready, let's see how to take a random input data point and predict
# the output. Let's define one such data point:
# Predict output for a test datapoint

input_data = ['37', 'Private', '215646', 'HS-grad', '9', 'Never-married','Handlers-cleaners', 'Not-in-family', 'White', 'Male', '0', '0', '40','United-States']

# Before we can perform prediction, we need to encode this data point using the label encoders we created earlier:
# Encode test datapoint

input_data_encoded = [-1] * len(input_data)
count = 0
for i, item in enumerate(input_data):
    if item.isdigit():
        input_data_encoded[i] = int(input_data[i])
    else:
        input_data_encoded[i] = int(label_encoder[count].transform(input_data[i]))
        count += 1

input_data_encoded = np.array(input_data_encoded)

# We are now ready to predict the output using the classifier:
# Run classifier on encoded datapoint and print output
predicted_class = classifier.predict(input_data_encoded)
print(label_encoder[-1].inverse_transform(predicted_class)[0])


ValueError: bad input shape ()