# Attempt to predict actual wages based on a minimalist survey

We will start by simply inspecting some basic information regarding the data, select a few columns to use as predictors for 'fnlwgt'.

In [1]:
import numpy as np
import pandas as pd

## Read and Explore

Below we simply read the data, print out the columns, view the first 5 rows and check out some basic numerical statistics.  We also view the full scope of responses we have.

For now we are going to simply ignore the distribution of data in our data set.

In [2]:
DATA_FILE_PATH = 'C:/data/AdultCensusIncome.csv'
data = pd.read_csv(DATA_FILE_PATH)
print(data.columns.values)
print('###########')
print(data[0:5])
print('###########')
print(data.describe())

['age' ' workclass' ' fnlwgt' ' education' ' education-num'
 ' marital-status' ' occupation' ' relationship' ' race' ' sex'
 ' capital-gain' ' capital-loss' ' hours-per-week' ' native-country'
 ' income']
###########
   age          workclass   fnlwgt   education   education-num  \
0   39          State-gov    77516   Bachelors              13   
1   50   Self-emp-not-inc    83311   Bachelors              13   
2   38            Private   215646     HS-grad               9   
3   53            Private   234721        11th               7   
4   28            Private   338409   Bachelors              13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black    

In [3]:
numeric_cols = data.describe().columns.values
print(numeric_cols)

['age' ' fnlwgt' ' education-num' ' capital-gain' ' capital-loss'
 ' hours-per-week']


In [4]:
categorical_df = data.drop(numeric_cols, axis=1)
print(categorical_df.columns.values)

[' workclass' ' education' ' marital-status' ' occupation' ' relationship'
 ' race' ' sex' ' native-country' ' income']


In [5]:
for col in categorical_df.columns.values:
    print('------- COLUMN: ' + col + ' ----------')
    unique_vals = set(data[col])
    print(unique_vals)
    print('num_unique: ' + str(len(unique_vals)))

------- COLUMN:  workclass ----------
{' Private', ' Without-pay', ' Self-emp-not-inc', ' Never-worked', ' State-gov', ' Self-emp-inc', ' ?', ' Local-gov', ' Federal-gov'}
num_unique: 9
------- COLUMN:  education ----------
{' 7th-8th', ' Assoc-acdm', ' Bachelors', ' Prof-school', ' Masters', ' 9th', ' Doctorate', ' 10th', ' 1st-4th', ' 12th', ' Preschool', ' Assoc-voc', ' 11th', ' 5th-6th', ' HS-grad', ' Some-college'}
num_unique: 16
------- COLUMN:  marital-status ----------
{' Separated', ' Widowed', ' Married-AF-spouse', ' Married-spouse-absent', ' Never-married', ' Married-civ-spouse', ' Divorced'}
num_unique: 7
------- COLUMN:  occupation ----------
{' Machine-op-inspct', ' Tech-support', ' Adm-clerical', ' Sales', ' Protective-serv', ' Handlers-cleaners', ' Other-service', ' Craft-repair', ' ?', ' Armed-Forces', ' Transport-moving', ' Prof-specialty', ' Exec-managerial', ' Farming-fishing', ' Priv-house-serv'}
num_unique: 15
------- COLUMN:  relationship ----------
{' Not-in-fam

## Lets build a survey and think of questions we want to ask.

Age, Sex, Race, Education, workclass, relation, hours per week.  

First step is to build a data frame with just that data AND the column we want to predict.

In [6]:
#
# Start by getting just the data we want.
#

#Some data isn't very clean, so add extra spaces to the front of the column names.
prediction_df = data[['age', ' sex', ' race', ' education', ' workclass', ' relationship', ' hours-per-week', ' fnlwgt']]

#
# Lets go ahead an also clean up the values in each column.
#
for col in prediction_df.columns.values:
    if(prediction_df[col].dtype == 'object'):
        prediction_df[col] = prediction_df[col].apply(lambda x: x.strip())

print(prediction_df[0:5])

   age     sex   race  education         workclass   relationship  \
0   39    Male  White  Bachelors         State-gov  Not-in-family   
1   50    Male  White  Bachelors  Self-emp-not-inc        Husband   
2   38    Male  White    HS-grad           Private  Not-in-family   
3   53    Male  Black       11th           Private        Husband   
4   28  Female  Black  Bachelors           Private           Wife   

    hours-per-week   fnlwgt  
0               40    77516  
1               13    83311  
2               40   215646  
3               40   234721  
4               40   338409  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [7]:
# We one hot encode ONLY categorical data.  
#
# We need to One-Hot Encode data.  To do this, first we need to build a class mapping
# We will extract information from this, and then transform into the proper representation
#

#get mappings
age_map = list(set(prediction_df['age']))
sex_map = list(set(prediction_df[' sex']))
race_map = list(set(prediction_df[' race']))
ed_map = list(set(prediction_df[' education']))
work_map = list(set(prediction_df[' workclass']))
rel_map = list(set(prediction_df[' relationship']))

def One_Hot(value, mapping):
    '''Takes a value and a mapping.  Returns the one hot encoded representation of the value.'''
    return np.eye(len(mapping))[mapping.index(value)]

# Apply the encodings to each column
prediction_df['age'] = prediction_df['age'].apply(lambda x: One_Hot(x, age_map))
prediction_df[' sex'] = prediction_df[' sex'].apply(lambda x: One_Hot(x, sex_map))
prediction_df[' race'] = prediction_df[' race'].apply(lambda x: One_Hot(x, race_map))
prediction_df[' education'] = prediction_df[' education'].apply(lambda x: One_Hot(x, ed_map))
prediction_df[' workclass'] = prediction_df[' workclass'].apply(lambda x: One_Hot(x, work_map))
prediction_df[' relationship'] = prediction_df[' relationship'].apply(lambda x: One_Hot(x, rel_map))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [8]:
print(prediction_df)

                                                     age         sex  \
0      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  [1.0, 0.0]   
1      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  [1.0, 0.0]   
2      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  [1.0, 0.0]   
3      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  [1.0, 0.0]   
4      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  [0.0, 1.0]   
5      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  [0.0, 1.0]   
6      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  [0.0, 1.0]   
7      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  [1.0, 0.0]   
8      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  [0.0, 1.0]   
9      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  [1.0, 0.0]   
10     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  [1.0, 0.0]   
11     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  [1.0, 0.0]   
12     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...  [0.0, 

## Prepare for Machine Learning

Seperate our features from our prediction, Convert to Numpy Matrices and then split data

Train, Test Split

In [9]:
#
# Here we seperate into 'features' or predictors - X
# and 'labels' or truths - Y
#
Y = prediction_df[' fnlwgt']
X = prediction_df.drop([' fnlwgt'], axis=1)
print('------FULL FRAME------')
print(prediction_df.loc[0])
print('-----JUST Y---------')
print(Y.loc[0])
print('-----JUST X--------')
print(X.loc[0])

------FULL FRAME------
age                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
 sex                                                      [1.0, 0.0]
 race                                      [0.0, 0.0, 0.0, 0.0, 1.0]
 education         [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
 workclass             [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
 relationship                         [0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
 hours-per-week                                                   40
 fnlwgt                                                        77516
Name: 0, dtype: object
-----JUST Y---------
77516
-----JUST X--------
age                [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
 sex                                                      [1.0, 0.0]
 race                                      [0.0, 0.0, 0.0, 0.0, 1.0]
 education         [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
 workclass             [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
 relations

In [10]:
#
# Now we convert our X to a numerical matrix
#

def Row_To_Array(row):
    '''Helper function to convert a row to a single array'''
    row_arr = np.empty(shape=0)
    for i in row.iteritems():
        row_arr = np.append(row_arr, i[1])
    return row_arr

#Lets try it out and see our shapes...
for idx, row in X[0:2].iterrows():
    print(Row_To_Array(row).shape)
    print(idx)
print(X.shape)

(112,)
0
(112,)
1
(32561, 7)


### So we have 112 values in 32,561 rows.

Our old data frame had 32561 rows and 7 columns.
We need to build a new matrix with 32561 rows, but 112 values, where the last value is our prediction value.

In [11]:
#
# Actually convert X to a matrix
#
matrix = np.empty(shape=(32561,112))

for idx, row in X.iterrows():
    matrix[idx] = Row_To_Array(row)

In [12]:
#
# Convert Y to a matrix
#

Y = Y.as_matrix()

In [13]:
#Quick Spot check.
print(matrix.shape)
print(matrix[0])
print(Y[0])

(32561, 112)
[  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   1.   0.
   0.   0.   0.   0.   1.   0.   1.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   1.   0.   0.   0.  40.]
77516


In [14]:
from sklearn.model_selection import train_test_split

In [15]:
x_train, x_test, y_train, y_test = train_test_split(matrix, Y, train_size = 0.8, test_size = 0.2)

In [16]:
#Show that it splits effectively.
print(len(X))
print(len(x_train))
print(len(x_test))
print(len(x_train) + len(x_test))

32561
26048
6513
32561


In [17]:
from sklearn.linear_model import LinearRegression
LinReg = LinearRegression(n_jobs = -1)
LinReg.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

In [18]:
# Calculate metrics
from sklearn.metrics import mean_squared_error, median_absolute_error, mean_absolute_error, r2_score
import math

# Get some predictions
predictions = LinReg.predict(x_test)

rmse = math.sqrt(mean_squared_error(y_test, predictions))
mae = mean_absolute_error(y_test, predictions)
med_ae = median_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)


print('RMSE: {}'.format(rmse))
print('MEAN AE: {}'.format(mae))
print('MEDIAN AE: {}'.format(med_ae))
print('R2: {}'.format(r2))

RMSE: 103580.22470316596
MEAN AE: 76043.2007294718
MEDIAN AE: 60349.10135689686
R2: 0.02761449543342631


In [19]:
#
# You can get your weights; which can be very interesting for understanding
# most positively or negatively impactful features
#
print(LinReg.coef_)

[  6795.36801413  18973.86010804  30280.91119152  23987.94684451
  23930.08100851  26572.69142684  27572.30270944  26212.95828645
  20477.10053692  11965.5412312   16848.59768887  10472.60043789
  12396.71710406  12604.79017975  17893.10396096  14756.36915534
  17725.3213672   16165.88181015  14973.83741569  11625.28806353
  10014.38324268   2880.54990453   9879.09877979   9221.31357131
   1994.59866922   7913.36131535   8614.92954199  10713.31485042
   3888.67643494  11363.57888695   7643.32832974  -1802.67161353
  -4963.68937659 -10391.54852831  -9675.47336693  -3476.00947351
  -3918.67862604  -2461.93352148    524.59854369   4487.26616046
    779.77869626   5191.23950182    656.09761008 -14661.29655202
 -11648.11908304 -17537.19244989 -11453.73049932  -3256.29133197
  -5625.1806502   -5682.98695022 -11640.59080707  -2324.15217311
  -5389.68623753  11713.60962725 -10831.15221882   5006.14651155
 -17096.32343713 -20360.61018504 -28847.17532987 -11982.87941703
 -19461.57030717  -1410.4