# Salary Prediction
https://www.hackerearth.com/practice/machine-learning/data-manipulation-visualisation-r-python/tutorial-data-manipulation-numpy-pandas-python/tutorial/

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load data
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         30725 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education.num     32561 non-null int64
marital.status    32561 non-null object
occupation        30718 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital.gain      32561 non-null int64
capital.loss      32561 non-null int64
hours.per.week    32561 non-null int64
native.country    31978 non-null object
target            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


Observation<br/>
- Rows = 32561<br/>
- columns = 15<br/>
<br/>
- integer columns = 6<br/>
- object columns = 9<br/>

In [4]:
print('Shape of train data:')
print(train.shape)

Shape of train data:
(32561, 15)


In [5]:
print('Shape of test data:')
print(test.shape)

Shape of test data:
(16281, 15)


In [6]:
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [7]:
# Check for missing values
nans = train.shape[0] - train.dropna().shape[0]
print ("%d rows have missing values in the train data" %nans)

nand = test.shape[0] - test.dropna().shape[0]
print ("%d rows have missing values in the test data" %nand)


2399 rows have missing values in the train data
1221 rows have missing values in the test data


In [8]:
# Check for null values in each column
train.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
target               0
dtype: int64

__Observation__<br/>
Only 3 columns have missing values<br/>
- Workclass has 1836 missing values
- Occupation has 1843 missing values
- Native.country has 583 missing values

In [9]:
# Let's count the number of unique values from character variables
cat = train.select_dtypes(include=['O'])
cat.apply(pd.Series.nunique)

workclass          8
education         16
marital.status     7
occupation        14
relationship       6
race               5
sex                2
native.country    41
target             2
dtype: int64

In [10]:
# Notice that the Workclass, Occupation and Native.country are object type columns and have missing values
# Let us impute the missing values with 'mode'

# Education
train.workclass.value_counts(sort=True)

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workclass, dtype: int64

In [11]:
#Occupation
train.occupation.value_counts(sort=True)

 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3770
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: occupation, dtype: int64

In [12]:
#Native Country
train['native.country'].value_counts(sort=True)

 United-States                 29170
 Mexico                          643
 Philippines                     198
 Germany                         137
 Canada                          121
 Puerto-Rico                     114
 El-Salvador                     106
 India                           100
 Cuba                             95
 England                          90
 Jamaica                          81
 South                            80
 China                            75
 Italy                            73
 Dominican-Republic               70
 Vietnam                          67
 Guatemala                        64
 Japan                            62
 Poland                           60
 Columbia                         59
 Taiwan                           51
 Haiti                            44
 Iran                             43
 Portugal                         37
 Nicaragua                        34
 Peru                             31
 Greece                           29
 

In [13]:
# Let us impute the columns
train.workclass.fillna('Private',inplace=True)
train.occupation.fillna('Prof-specialty',inplace=True)
train['native.country'].fillna('United-States',inplace=True)

In [14]:
# Verify for null values
train.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
target            0
dtype: int64

In [15]:
#check proportion of target variable
train.target.value_counts()/train.shape[0]

 <=50K    0.75919
 >50K     0.24081
Name: target, dtype: float64

In [16]:
# Show no. of persons in education-wise
pd.crosstab(train.education, train.target, margins=True)

target,<=50K,>50K,All
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10th,871,62,933
11th,1115,60,1175
12th,400,33,433
1st-4th,162,6,168
5th-6th,317,16,333
7th-8th,606,40,646
9th,487,27,514
Assoc-acdm,802,265,1067
Assoc-voc,1021,361,1382
Bachelors,3134,2221,5355


In [17]:
# Show ratio of persons education-wise
edVsSal = pd.crosstab(train.education, train.target,margins=True)/train.shape[0]
edVsSal

target,<=50K,>50K,All
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10th,0.02675,0.001904,0.028654
11th,0.034243,0.001843,0.036086
12th,0.012285,0.001013,0.013298
1st-4th,0.004975,0.000184,0.00516
5th-6th,0.009736,0.000491,0.010227
7th-8th,0.018611,0.001228,0.01984
9th,0.014957,0.000829,0.015786
Assoc-acdm,0.024631,0.008139,0.032769
Assoc-voc,0.031357,0.011087,0.042443
Bachelors,0.09625,0.06821,0.164461


In [18]:
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt

plt.figure(figsize=(10,5))
chart = sns.lineplot(x=edVsSal.index, y=edVsSal[' <=50K'], data=edVsSal)
chart = sns.lineplot(x=edVsSal.index, y=edVsSal[' >50K'], data=edVsSal)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right')

[Text(0, 0, ''),
 Text(0, 0, ''),
 Text(0, 0, ''),
 Text(0, 0, ''),
 Text(0, 0, ''),
 Text(0, 0, ''),
 Text(0, 0, ''),
 Text(0, 0, ''),
 Text(0, 0, ''),
 Text(0, 0, ''),
 Text(0, 0, ''),
 Text(0, 0, ''),
 Text(0, 0, ''),
 Text(0, 0, ''),
 Text(0, 0, ''),
 Text(0, 0, ''),
 Text(0, 0, '')]

Observations:<br/>
- persons with Salary <= 50K are 76%
- persons with Salary > 50K are 24%<br/>
We see that out of 75% people with <=50K salary, 27% people are high school graduates, which is correct as people with lower levels of education are expected to earn less. On the other hand, out of 25% people with >=50K salary, 6% are bachelors and 5% are high-school grads. Now, this pattern seems to be a matter of concern. That's why we'll have to consider more variables before coming to a conclusion.

In [19]:
#load sklearn and encode all object type variables
from sklearn import preprocessing

for col in train.columns:
    if train[col].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[col].values))
        train[col] = lbl.transform(list(train[col].values))

In [20]:
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,39,6,77516,9,13,4,0,1,4,1,2174,0,40,38,0
1,50,5,83311,9,13,2,3,0,4,1,0,0,13,38,0
2,38,3,215646,11,9,0,5,1,4,1,0,0,40,38,0
3,53,3,234721,1,7,2,5,0,2,1,0,0,40,38,0
4,28,3,338409,9,13,2,9,5,2,0,0,0,40,4,0


In [21]:
# we can see, all the variables have been converted to numeric, including the target variable
train.target.value_counts()

0    24720
1     7841
Name: target, dtype: int64

In [22]:
# Let's create a random forest model and check the model's accuracy
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

y = train['target']
del train['target']

X = train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

# train the RF classifier
clf = RandomForestClassifier(n_estimators=500, max_depth=6)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=6, max_features='auto', max_leaf_nodes=None,
                       min_impurity_split=1e-07, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
clf.predict(X_test)

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [23]:
# Make prediction and check model's accuracy

prediction = clf.predict(X_test)
acc = accuracy_score(np.array(y_test), prediction)
print('The accuracy of Random Forest is {}'.format(acc))

The accuracy of Random Forest is 0.8523902139420616


In [24]:
# So far we've worked with train data and split it into train and test and applied RandomForesetClassification.
# We will do the same to the give test data
test.isnull().sum()

age                 0
workclass         963
fnlwgt              0
education           0
education.num       0
marital.status      0
occupation        966
relationship        0
race                0
sex                 0
capital.gain        0
capital.loss        0
hours.per.week      0
native.country    274
target              0
dtype: int64

In [25]:
# Let us impute the values found on train data

test.workclass.fillna('Private',inplace=True)
test.occupation.fillna('Prof-specialty',inplace=True)
test['native.country'].fillna('United-States',inplace=True)

In [26]:
# Check to make sure no more missing values
test.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
target            0
dtype: int64

In [27]:
#load sklearn and encode all object type variables
from sklearn import preprocessing

for col in test.columns:
    if test[col].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(test[col].values))
        test[col] = lbl.transform(list(test[col].values))

In [28]:
test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,25,3,226802,1,7,4,6,3,2,1,0,0,40,37,0
1,38,3,89814,11,9,2,4,0,4,1,0,0,50,37,0
2,28,1,336951,7,12,2,10,0,4,1,0,0,40,37,1
3,44,3,160323,15,10,2,6,0,2,1,7688,0,40,37,1
4,18,8,103497,15,10,4,14,3,4,0,0,0,30,37,0


In [29]:
# Let us apply the RandomForest to test data
X = train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

# train the RF classifier
clf = RandomForestClassifier(n_estimators=500, max_depth=6)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=6, max_features='auto', max_leaf_nodes=None,
                       min_impurity_split=1e-07, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

del test['target']
clf.predict(test)

array([0, 0, 0, ..., 1, 0, 1], dtype=int64)