In [2]:
## Import all of our necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [3]:
## Load the CSV file into a DataFrame, then preview its first five rows
data = pd.read_csv('data.csv')

data.head()

In [3]:
## Look at summary statistics for our data
## (describe() is displayed as the cell output because it is the last expression)
data.describe()

In [4]:
## Remove the columns that are not necessary for us.
## drop(..., errors='ignore') is used instead of pop() so this cell is safe to
## re-run: pop() raises KeyError once a column has already been removed.
unused_columns = ['fnlwgt', 'capital-gain', 'capital-loss']
data = data.drop(columns=unused_columns, errors='ignore')

data.head(5)
## Looking better now!

In [5]:
## Heatmap: mean of 'income_>50K' for every (race, educational-num) pair
import matplotlib.pyplot as plt
import seaborn as sns

# pivot_table aggregates with the mean by default
data_matrix = pd.pivot_table(
    data,
    values='income_>50K',
    index='race',
    columns='educational-num',
)

# Draw on the axes of a wide figure so the education columns stay readable
fig = plt.figure(figsize=(15, 5))
heat_map = sns.heatmap(data_matrix, ax=fig.gca())
heat_map.set_title("Heatmap of Higher Salary pairing Race and Highest Level of Education Completed")
heat_map.set_xlabel("Highest Education Completed (As Number)")
heat_map.set_ylabel("Race")

## To save the graph as an image, uncomment the two lines below
##graph = heat_map.get_figure()
##graph.savefig("heatmap.png")

In [9]:
## Scatter plot of age against weekly work hours, with points colored by race
fig = plt.figure(figsize=(20, 10))
scatter_plot = sns.scatterplot(
    data=data,
    x="hours-per-week",
    y="age",
    hue="race",
    palette="muted",
    alpha=0.5,  # semi-transparent markers so overlapping points remain visible
    ax=fig.gca(),
)
scatter_plot.set_title("Scatterplot of Correlation between Age and Work Hours per Week")
scatter_plot.set_xlabel("Amount of Work Hours per Week")
scatter_plot.set_ylabel("Age (Years)")

## To save the graph as an image, uncomment the two lines below
##graph = scatter_plot.get_figure()
##graph.savefig("scatter-plot.png")

In [7]:
## Bar graph counting how many rows fall into each occupation
bar_graph = sns.countplot(
    x='occupation',
    data=data,
    ax=plt.figure(figsize=(25, 10)).gca(),  # draw on a fresh, extra-wide figure
)

bar_graph.set_title("Bar Graph of Distribution of Occupations")


## To save the graph as an image, uncomment the two lines below
##graph = bar_graph.get_figure()
##graph.savefig("bar-graph.png")

In [None]:
## Now get rid of the education-as-number column (already provided as a String).
## drop(..., errors='ignore') keeps this cell re-runnable; pop() would raise
## KeyError on a second run because the column is already gone.
data = data.drop(columns=['educational-num'], errors='ignore')

In [None]:
## One-hot encode the data with get_dummies, then preview the first five rows
data = pd.get_dummies(data)
data.head()

In [None]:
## Pull the 'income_>50K' label column out of the full dataset (before the split)
## and keep it as a standalone NumPy array; `data` is left with the features only.
data_labels = data.pop('income_>50K').to_numpy(copy=True)

In [None]:
## Split features and labels into train (75%) and test (25%) sets.
## Stratifying on the labels keeps the class proportions the same in both splits;
## the fixed random_state makes the split reproducible.
train, test, train_labels, test_labels = train_test_split(
    data,
    data_labels,
    test_size=0.25,
    stratify=data_labels,
    random_state=42,
)

In [None]:
## Fill in our missing values.
## Both splits are imputed with the TRAINING means: computing the fill values
## from the test split would leak test-set statistics into preprocessing, and
## those statistics are not available when predicting on new data.
train_means = train.mean()
train = train.fillna(train_means)
test = test.fillna(train_means)

In [None]:
## Build a random forest of 100 trees.
## max_features='sqrt' limits the features considered at each split,
## n_jobs=-1 uses all available cores, verbose=1 prints progress,
## and the fixed random_state keeps results reproducible.
model = RandomForestClassifier(
    n_estimators=100,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

In [None]:
## Fit the forest on the training features and their labels
model.fit(X=train, y=train_labels)

In [None]:
# Use the forest's predict method on the test data
predictions = model.predict(test)
# Flag every misclassified sample (1 = wrong, 0 = correct). Comparing with !=
# instead of subtracting also works when the labels are booleans.
errors = (predictions != test_labels).astype(int)
# This is a classifier, so report the misclassification rate and accuracy
# rather than a regression-style "mean absolute error" in "degrees".
error_rate = np.mean(errors)
print('Misclassification rate:', round(error_rate, 2))
print('Accuracy:', round(1 - error_rate, 2))

In [None]:
# Persist the trained model to disk so that our API can load and use it
import joblib
joblib.dump(value=model, filename='model.pkl')

In [None]:
# Export the names of the columns the model was trained on, in training order.
# NOTE(review): presumably the API needs this list to align incoming one-hot
# encoded features with the training layout — confirm against the API code.
model_columns = train.columns.tolist()
joblib.dump(value=model_columns, filename='model_columns.pkl')