In [1]:
import pandas as pd
import numpy as np

In [2]:
#loading the data
train=pd.read_csv("data/train.csv")
test=pd.read_csv("data/test.csv")

In [3]:
#checking out the data-set
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         30725 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education.num     32561 non-null int64
marital.status    32561 non-null object
occupation        30718 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital.gain      32561 non-null int64
capital.loss      32561 non-null int64
hours.per.week    32561 non-null int64
native.country    31978 non-null object
target            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [4]:
print("Number of rows and columns in training data: ",train.shape)
print("Number of rows and columns in test data: ",test.shape)

('Number of rows and columns in training data: ', (32561, 15))
('Number of rows and columns in test data: ', (16281, 15))


In [5]:
#ok let me see what does the data look like
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


#### Let's check for missing values in the data(if present)

In [6]:
train_missing=train.shape[0]-train.dropna().shape[0]
print("%d rows have missing values in the training data" % train_missing)
test_missing=test.shape[0]-test.dropna().shape[0]
print("%d rows have missing values in the test data" % test_missing)

2399 rows have missing values in the training data
1221 rows have missing values in the test data


#### Digging deeper and finding out which columns have missing values.

In [7]:
train.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
target               0
dtype: int64

Counting the number of unique values from cahracter variables

In [8]:
cat = train.select_dtypes(include=['O'])
cat.apply(pd.Series.nunique)

workclass          8
education         16
marital.status     7
occupation        14
relationship       6
race               5
sex                2
native.country    41
target             2
dtype: int64

Since missing values are found in all the 3 character variables, let's impute these missing values with their respective modes.

In [9]:
#Workclass
train.workclass.value_counts(sort=True)
train.workclass.fillna('Private',inplace=True)

#occupation
train.occupation.value_counts(sort=True)
train.occupation.fillna('Prof-speciality',inplace=True)

#Native Country 
train['native.country'].value_counts(sort=True)
train['native.country'].fillna('United-States',inplace=True)


Let's check if there are any missing values lift

In [10]:
train.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
target            0
dtype: int64

Now checking the target variable if this data is imbalanced or not

In [11]:
train.target.value_counts()/train.shape[0]  #proportion of target variable

 <=50K    0.75919
 >50K     0.24081
Name: target, dtype: float64

ok. So we can clearly see that 75% of our data belongs to less than $50k class. This implies that even if we took our wildest guess at target prediction as less than 50k, we will get accuracy of 75%. 
Will now create a cross tab of the target variable with education. With this, we will try to understand the influence of education on the target variable.

In [12]:
#every figure below is in %age
crosstab=pd.crosstab(train.education,train.target,margins=True)/train.shape[0]
crosstab*100

target,<=50K,>50K,All
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10th,2.674979,0.190412,2.865391
11th,3.424342,0.18427,3.608612
12th,1.228463,0.101348,1.329812
1st-4th,0.497528,0.018427,0.515955
5th-6th,0.973557,0.049139,1.022696
7th-8th,1.861122,0.122846,1.983969
9th,1.495654,0.082921,1.578576
Assoc-acdm,2.463069,0.813857,3.276926
Assoc-voc,3.135653,1.108688,4.244341
Bachelors,9.625012,6.821044,16.446055


So, clearly we can see that out of 75% of people with <=50k salary , 27% of people are high school graduates. Now coming to the rest, out of 25% people with >=50k salary 6.82% have bachelors and 5% are high school grads. Well, this pattern seems fishy.To tackle this, I am gonna consider some more variables before coming to a conclusion. I will now convert the character variable into numeric value. For this I will use the LabelEncoder function of scikit.learn library. In label encoding, each unique value of a variable gets assigned a number, i.e. let us say a variable color has four values['red','green','blue','pink']. Label encoding this variable will return output as red=2, green=0, blue=1, pink=2

In [13]:
from sklearn import preprocessing

for x in train.columns:
    if train[x].dtype=='object':
        label=preprocessing.LabelEncoder()
        label.fit(list(train[x].values))
        train[x]=label.transform(list(train[x].values))


In [14]:
#checking if values are encoded
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,39,6,77516,9,13,4,0,1,4,1,2174,0,40,38,0
1,50,5,83311,9,13,2,3,0,4,1,0,0,13,38,0
2,38,3,215646,11,9,0,5,1,4,1,0,0,40,38,0
3,53,3,234721,1,7,2,5,0,2,1,0,0,40,38,0
4,28,3,338409,9,13,2,9,5,2,0,0,0,40,4,0


In [15]:
#hoping target var is also encoded
train.target.value_counts()

0    24720
1     7841
Name: target, dtype: int64

Will now proceed to build arandom forest model and will check model accuracy 

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import accuracy_score



In [17]:
y=train['target']
del train['target']

X=train
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=1,stratify=y)

Training the random forest classfier

In [18]:
classifier=RandomForestClassifier(n_estimators=500,max_depth=6)
classifier.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

Now, making a prediction on the test set and checking the model's accuracy

In [20]:
prediction=classifier.predict(X_test)
accuracy=accuracy_score(np.array(y_test),prediction)
print('The accuracy of our Random Forest model is {}'.format(accuracy))

The accuracy of our Random Forest model is 0.852083120074


So, clearly I am getting 85% accuracy. If you can make it better, Fork it and improve it