#Load, format cleaned data:

In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv('/Users/fineiskid/Desktop/DSSG_ffineis/Presentation/frank_clean_data.csv', na_values='na',
                  keep_default_na = True)
data.head(5)

Unnamed: 0,aa_primarykey,agep,hincp,np,pap,veh,wkhp,hicov,rwat,cit,fs,mar,sex,dis,rac1p,mil,sch,wif,mv,cow
0,1,1.714639,-0.806771,-0.694199,-0.074189,-0.055156,,1.0,1,4,0,1,1,1,1,3,1,0,6,
1,2,0.860682,0.084138,-0.694199,-0.074189,-0.055156,,,1,1,0,1,0,0,1,5,1,1,3,1.0
2,3,-0.548348,-0.644095,-0.126756,-0.074189,-0.949111,0.159869,0.0,1,1,0,5,1,0,1,5,1,0,1,1.0
3,4,0.817984,-0.381016,-0.694199,-0.074189,-0.055156,0.159869,,1,1,0,1,1,0,1,5,1,1,6,5.0
4,5,0.860682,-0.360681,-0.694199,-0.074189,-0.055156,,,1,1,0,3,0,0,1,5,1,1,5,3.0


#Let's separate people with healthcare coverage (training data) from those without (test data), and then grab all of the clean training data (no missing values). Also, make sure Python knows you have categorical data.

#######Note: 
First we need to find the locations of the NA data and then separate, as it's really hard to identify NA values once we make the *hicov* data categorical...

In [2]:
#Separate known health coverage from unknown health coverage
data_known = data.loc[~np.isnan(data.hicov)] #Eventual training data
data_unknown = data.loc[np.isnan(data.hicov)] #Eventual test data

train_data = data_known.dropna(axis = 0)

test_data = data_unknown[data_unknown.columns[data_unknown.columns != 'hicov']].dropna(axis = 0)
test_data['hicov'] = np.nan

In [3]:
#Make categorical data astype('category')
train_data.loc[:,"cit"] = train_data.loc[:,"cit"].astype("category")
train_data.loc[:,"dis"] = train_data.loc[:,"dis"].astype("category")
train_data.loc[:,"fs"] = train_data.loc[:,"fs"].astype("category")
train_data.loc[:,"hicov"] = train_data.loc[:,"hicov"].astype("category")
train_data.loc[:,"mar"] = train_data.loc[:,"mar"].astype("category")
train_data.loc[:,"rac1p"] = train_data.loc[:,"rac1p"].astype("category")
train_data.loc[:,"rwat"] = train_data.loc[:,"rwat"].astype("category")
train_data.loc[:,"mil"] = train_data.loc[:,"mil"].astype("category")
train_data.loc[:,"sex"] = train_data.loc[:,"sex"].astype("category")

test_data.loc[:,"cit"] = test_data.loc[:,"cit"].astype("category")
test_data.loc[:,"dis"] = test_data.loc[:,"dis"].astype("category")
test_data.loc[:,"fs"] = test_data.loc[:,"fs"].astype("category")
test_data.loc[:,"hicov"] = test_data.loc[:,"hicov"].astype("category")
test_data.loc[:,"mar"] = test_data.loc[:,"mar"].astype("category")
test_data.loc[:,"rac1p"] = test_data.loc[:,"rac1p"].astype("category")
test_data.loc[:,"rwat"] = test_data.loc[:,"rwat"].astype("category")
test_data.loc[:,"mil"] = test_data.loc[:,"mil"].astype("category")
test_data.loc[:,"sex"] = test_data.loc[:,"sex"].astype("category")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


## hashtag basic, let's fit a simple linear support vector machine classification model.
For efficiency, let's just fit it/test it a small subset of the known training data.

In [14]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
import random
size = 10000

# for efficiency: select size-many samples from training_data.
samp = random.sample(range(train_data.shape[0]), size)

y = train_data.hicov.as_matrix()[samp]
X = train_data.drop(['hicov', 'aa_primarykey'], axis = 1).as_matrix()[samp,:]

# Split data! test_size: the proportion of the dataset to include in the test split. Default is 0.25.
Xtrain, Xtest, ytrain, answer = train_test_split(X,y, test_size = 0.33, random_state = 20)

from sklearn.svm import SVC # "Support Vector Classifier"
clf = SVC(kernel='linear')
clf.fit(Xtrain, ytrain)

dist = clf.decision_function(Xtest)
pred = clf.predict(Xtest)

error_rate = sum(np.abs(pred-answer))/len(answer)
print('Prediction error rate = {0}%'.format(error_rate*100))
                      

Prediction error rate = 12.9696969697%


In [6]:
y = train_data.hicov.as_matrix()
X = train_data.drop(['hicov', 'aa_primarykey'], axis = 1).as_matrix()

In [9]:
y[0:50]
X[0:50,:]

array([[-0.5483482814689999, -0.644094657317, -0.12675623689,
        -0.07418891462010001, -0.949110626624, 0.159868607687, 1.0, 1, 0,
        5, 1, 0, 1, 5.0, 1.0, 0.0, 1.0, 1.0],
       [-0.249463124505, -0.227235575979, 0.440686489758,
        -0.07418891462010001, -0.0551562532197, 0.159868607687, 1.0, 1, 0,
        5, 0, 0, 2, 5.0, 1.0, 2.0, 1.0, 1.0],
       [-0.164067365373, -0.0810807151446, 0.440686489758,
        -0.07418891462010001, -0.0551562532197, 0.538679805195, 1.0, 1, 0,
        1, 0, 0, 1, 5.0, 1.0, 2.0, 4.0, 3.0],
       [-0.676441920168, -0.15733542514499999, 0.440686489758,
        -0.07418891462010001, 1.7327524935900003, 0.159868607687, 1.0, 1,
        0, 5, 1, 0, 1, 5.0, 1.0, 3.0, 7.0, 1.0],
       [-0.206765244939, 0.415845811694, 1.57557194306,
        -0.07418891462010001, 0.838798120185, -0.5977537873300001, 1.0, 5,
        0, 1, 1, 0, 1, 5.0, 1.0, 1.0, 2.0, 6.0],
       [0.775285985086, -0.341617640981, -0.694198963539,
        -0.07418891462010001, -0.05