In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import  RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score,mean_squared_error 
from sklearn.externals import joblib
import warnings
warnings.filterwarnings('ignore')

In [4]:
train=pd.read_csv('train.csv',sep=',')
test=pd.read_csv('test.csv',sep=',')

In [7]:
train.head()

Unnamed: 0,ID,Age,Workclass,Education,Marital.Status,Occupation,Relationship,Race,Sex,Hours.Per.Week,Native.Country,Income.Group
0,1,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
1,2,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K
2,3,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,4,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
4,5,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K


Predictive Modeling
Since this is a classification problem, we can start with various models like logistic regression, naive bayes, decision trees, etc. Here we will run a decision tree model as an example.

For modeling using sklearn in Pandas, we have to adopt the following steps:

Data Preprocessing: sklearn accepts only numeric data so we've to convert text to numbers
Training model: fit the model on training data
Making predictions: use the model to make predictions on test data
Analyze results: compare the train and test accuracy and derive insights required to improve the model
Lets go through these one by one.

 

In [75]:
from sklearn.preprocessing import LabelEncoder

In [76]:
categorical_variables=train.dtypes.loc[train.dtypes=='object'].index

In [77]:
categorical_variables

Index([], dtype='object')

In [78]:
train['Occupation']=train['Occupation'].fillna("idk")
train['Workclass']=train['Workclass'].fillna('il')
train['Native.Country']=train['Native.Country'].fillna('oo')
train['Income.Group']=train['Income.Group'].fillna('ii')

In [79]:
from scipy.stats import mode

In [80]:
var_to_impute=['Workclass','Occupation','Native.Country','Income.Group']

for var in var_to_impute:
    train[var].fillna(mode(train[var]).mode[0],inplace=True)
    
    

In [128]:
categorical_t=test.dtypes.loc[test.dtypes=='object'].index

In [129]:
categorical_t

Index(['Hours.Per.Week'], dtype='object')

In [130]:
test['Occupation']=test['Occupation'].fillna("idk")
test['Workclass']=test['Workclass'].fillna('il')
test['Native.Country']=test['Native.Country'].fillna('oo')
test['Race']=test['Race'].fillna('ii')
test['Sex']=test['Sex'].fillna('ii')
test['Relationship']=test['Relationship'].fillna('i')
test['Hours.Per.Week']=test['Hours.Per.Week'].fillna(np.mean)

In [136]:
test.apply(lambda x:np.sum(x.isnull()))

ID                0
Age               0
Workclass         0
Education         0
Marital.Status    0
Occupation        0
Relationship      0
Race              0
Sex               0
Hours.Per.Week    0
Native.Country    0
dtype: int64

In [133]:
var_to_impute=['Workclass','Occupation','Native.Country','Relationship', 'Race', 'Sex']

for var in var_to_impute:
    test[var].fillna(mode(test[var]).mode[0],inplace=True)

In [127]:
test.dtypes

ID                 int64
Age                int64
Workclass          int64
Education          int64
Marital.Status     int64
Occupation         int64
Relationship       int64
Race               int64
Sex                int64
Hours.Per.Week    object
Native.Country     int64
dtype: object

In [140]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1372 entries, 0 to 1371
Data columns (total 11 columns):
ID                1372 non-null int64
Age               1372 non-null int64
Workclass         1372 non-null int64
Education         1372 non-null int64
Marital.Status    1372 non-null int64
Occupation        1372 non-null int64
Relationship      1372 non-null int64
Race              1372 non-null int64
Sex               1372 non-null int64
Hours.Per.Week    1372 non-null object
Native.Country    1372 non-null int64
dtypes: int64(10), object(1)
memory usage: 118.0+ KB


In [25]:
le=LabelEncoder()
for var in categorical_variables:
    train[var]=le.fit_transform(train[var])

In [34]:
train.dtypes

ID                int64
Age               int64
Workclass         int64
Education         int64
Marital.Status    int64
Occupation        int64
Relationship      int64
Race              int64
Sex               int64
Hours.Per.Week    int64
Native.Country    int64
Income.Group      int64
dtype: object

In [36]:
train.head()

Unnamed: 0,ID,Age,Workclass,Education,Marital.Status,Occupation,Relationship,Race,Sex,Hours.Per.Week,Native.Country,Income.Group
0,1,39,5,9,4,0,1,4,1,40,24,0
1,2,50,4,9,2,3,0,4,1,13,24,0
2,3,38,2,11,0,5,1,4,1,40,24,0
3,4,53,2,1,2,5,0,2,1,40,24,0
4,5,28,2,9,2,8,5,2,0,40,3,0


In [None]:
#fit the model

In [50]:
from sklearn.tree import DecisionTreeClassifier

In [51]:
dependent_variable='Income.Group'
independen_variable=[x for x in train.columns if x not in ['ID',dependent_variable]]

In [52]:
independen_variable

['Age',
 'Workclass',
 'Education',
 'Marital.Status',
 'Occupation',
 'Relationship',
 'Race',
 'Sex',
 'Hours.Per.Week',
 'Native.Country']

Now that we have the predictors, lets run the model with the following benchmark parameters:

1.max_depth = 10

2.min_samples_leaf = 100

3.max_features = 'sqrt'

In [151]:
model=DecisionTreeClassifier()

In [152]:
model.fit(train[independen_variable],train[dependent_variable])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [153]:
pred_train=model.predict(train[independen_variable])

In [155]:
from sklearn.metrics import  accuracy_score

In [156]:
acc=accuracy_score(train[dependent_variable],pred_train)

In [157]:
acc

0.9959677419354839

RANDOM FOREST

In [141]:
from sklearn.ensemble import RandomForestClassifier

In [147]:
model2=RandomForestClassifier()

In [148]:
model2.fit(train[independen_variable],train[dependent_variable])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [149]:
pred_train2=model2.predict(train[independen_variable])

In [150]:
accuracy_score(pred_train2,train[dependent_variable])

0.9899193548387096