# Census Dataset

Now let's use a dataset with even more attributes, some of which are categorical (i.e. not numerical, e.g. "married" vs. "divorced" vs. "single").

In [None]:
# Numerical library in python, gives us nice things like special kinds of arrays,
# useful math functions, random numbers, etc.
import numpy as np

# Pandas library gives us Series data and DataFrames, which are two really useful ways
# of representing data. Using pandas, we can easily filter, sort, or extract data.
# Very easy to convert this data into native numpy or native Python objects.
import pandas as pd

# Matplotlib is a standard library used in Data Science,
# especially pyplot
import matplotlib.pyplot as plt

# Scikit-learn includes everything we need to take our data and run Supervised Learning
# or Unsupervised Learning algorithms on it.
import sklearn

# Import our Naive Bayes Classifier
from sklearn.naive_bayes import GaussianNB

# Import our Neural Network Classifier (Multi-Layer Perceptron Classifier)
from sklearn.neural_network import MLPClassifier

# Import our KNN Classifier
from sklearn.neighbors import KNeighborsClassifier

# Import our Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

# Import our Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# Import our Support Vector Machine Classifier
from sklearn.svm import SVC

# We'll use this at the end for evaluating our models
from sklearn.metrics import confusion_matrix


In [None]:

# Location of the cleaned-up census CSV used for the rest of this section.
dataset = "datasets/full_census.csv"

# pd.read_csv parses a properly formatted CSV into a pandas DataFrame, which
# supports spreadsheet-style operations such as filtering and sorting.
df = pd.read_csv(dataset)

# Preview the first 5 rows. Because this is the last expression in the cell,
# Jupyter renders the returned DataFrame as a formatted table.
df.head(5)

In [None]:
# Summarize every column of the dataframe. By default describe() only reports
# on the numeric columns, so include="all" is needed to also get the
# count / unique / top / freq statistics for the categorical variables.
df.describe(include="all")


## Your Task

Build a classifier that uses all of the attributes in this dataset to determine whether a person is male or female. Use [this link](http://fastml.com/converting-categorical-data-into-numbers-with-pandas-and-scikit-learn/) to learn how to convert categorical fields into numerical fields for use with Neural Networks, Naive Bayes, and all the other supervised learning algorithms. The tool described there is called __DictVectorizer.__

- Create train/test splits (again, 70% and 30%, respectively)
- Try at least 3 different parameters for your classifiers, plot your test/train errors
- Try at least 2 different ML algorithms: Neural Network (MLPClassifier), Naive Bayes, KNN, Decision Tree, Random Forest (RandomForestClassifier)

- Explain why your best performing model had such high test accuracy

In [None]:

# One-hot encode the census attributes, build a shuffled 70/30 train/test
# split, then train an MLP for several iteration budgets and compare the
# train/test accuracies.

# Column used as the label to predict.
# NOTE(review): the task text above asks for a male/female classifier, but the
# label here is the race column and ' sex' is used as an input feature --
# confirm which target is intended before interpreting the accuracies.
y_value = " race"

# Single seed shared by the shuffle and every MLP so that reruns give the same
# numbers and the runs differ only in their max_iter setting.
RANDOM_SEED = 0

# Columns to replace with one-hot indicator columns (note the leading space in
# each column name).
# NOTE(review): ' education-num', ' capital-gain', ' capital-loss' and
# ' hours-per-week' look numeric; one-hot encoding them creates one column per
# distinct value -- confirm this is intended.
cols = [' workclass', ' education', ' education-num', ' marital-status', ' occupation',
        ' relationship', ' sex', ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',
        ' wealth_class']

df = pd.get_dummies(data=df, columns=cols)

# This is how many examples we have.
n_samples = df.shape[0]
print(n_samples)

# 70% of the rows are used for training, the remainder for testing.
train_size = int(n_samples * 0.70)
test_size = n_samples - train_size

print("Training set size: {}".format(train_size))
print("Test set size: {}".format(test_size))

# Shuffle the rows before slicing so the split does not depend on the order in
# which the rows happen to appear in the CSV file.
shuffled = df.sample(frac=1, random_state=RANDOM_SEED)

# Features are every column except the label; the label is the y_value column.
df_X = shuffled.drop(y_value, axis=1)
df_y = shuffled[y_value]

# We'll train on the first 70% of the shuffled rows (.iloc slices by position).
train_X = df_X.iloc[:train_size]
train_y = df_y.iloc[:train_size]

# The remaining rows are for our test set.
test_X = df_X.iloc[train_size:]
test_y = df_y.iloc[train_size:]

epoch_counts = [5, 10, 100, 500, 1000, 2000]  # Values tried for the MLP's max_iter
classifiers = []  # Fitted classifiers, one per entry of epoch_counts

train_accs = []  # Train accuracy for each entry of epoch_counts
test_accs = []  # Test accuracy for each entry of epoch_counts
results = []
for epoch_count in epoch_counts:
    # max_iter caps the number of training iterations of the MLP.
    classifier = MLPClassifier(max_iter=epoch_count, random_state=RANDOM_SEED)
    classifier.fit(train_X, train_y)
    classifiers.append(classifier)

    # score() returns the mean accuracy (not an error rate) on the given data.
    train_acc = classifier.score(train_X, train_y)
    test_acc = classifier.score(test_X, test_y)
    print("Train:", train_acc)
    print("Test", test_acc)
    train_accs.append(train_acc)
    test_accs.append(test_acc)

print("done")

# Now let's plot the results: train accuracy in green, test accuracy in red.
plt.plot(epoch_counts, train_accs, "g", label="train accuracy")
plt.plot(epoch_counts, test_accs, "r", label="test accuracy")
plt.xlabel("max_iter")
plt.ylabel("accuracy")
plt.legend()
plt.show()