# Income Prediction

### Understanding and Cleaning the Data

In [None]:
# Core numerical / dataframe libraries and the plotting stack used
# throughout the notebook.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  # the plotting interface lives in 'pyplot'
import seaborn as sns

In [None]:
# Install a global "ignore" filter so that warnings raised by later cells
# are not printed in the cell outputs.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the csv file into the 'Income' dataframe.
# The relative path means the file is looked up in the current working directory.
Income = pd.read_csv('adult_dataset.csv')

In [None]:
# Let's understand the type of values in each column of our dataframe 'Income'.
Income.info()

In [None]:
# Preview the first few rows of the dataframe.
Income.head()

In [None]:
# Rows whose 'workclass' is the placeholder '?', which this notebook
# treats as a missing value.
is_missing_workclass = Income['workclass'] == '?'
values = Income[is_missing_workclass]
values

Now we can check the number of rows in `values`.

In [None]:
# The entry count reported here is the number of rows with workclass == '?'.
values.info()

There are 1836 rows with missing values, which is about 5% of the total data. We choose to simply drop these rows.

In [None]:
# Keep only the rows whose 'workclass' is not the '?' placeholder.
has_workclass = Income['workclass'] != '?'
Income = Income[has_workclass]
Income.head()

Let's see whether any other columns contain a "?". Since "?" is a string, we can apply this check only on the categorical columns.

In [None]:
# "?" is a string, so only the object-dtype (categorical) columns can hold it.
cat = Income.select_dtypes(include=['object'])

# Per-column count of cells that are exactly "?".
is_placeholder = cat == "?"
is_placeholder.sum()

Thus, the columns occupation and native.country contain some "?"s. Let's get rid of them.

In [None]:
# Remove the rows that still carry a "?" in either of the two remaining
# affected columns, filtering one column after the other.
for column in ('occupation', 'native.country'):
    Income = Income[Income[column] != '?']

Now we have a clean dataframe which is ready for model building.<br>

# Summary of the dataframe after the "?" rows have been removed
Income.info()

## Data Preparation







In [None]:
from sklearn import preprocessing


# encode categorical variables using Label Encoder

# select all categorical (object-dtype) variables; these are the columns
# that will be label-encoded
cat = Income.select_dtypes(include=['object'])
cat.head()

In [None]:
# Label-encode every column of 'cat'. The single encoder is re-fitted on
# each column in turn, so every column gets its own integer codes.
le = preprocessing.LabelEncoder()
df_categorical = cat.apply(lambda column: le.fit_transform(column))
df_categorical.head()

In [None]:
# Replace the string-valued categorical columns with their label-encoded
# versions: drop the originals, then append 'df_categorical' (the encoded
# frame). Appending 'cat' instead would simply put the raw strings back.
Income = Income.drop(cat.columns, axis=1)
Income = pd.concat([Income, df_categorical], axis=1)
Income.head()

In [None]:
# look at the column types after the categorical columns were re-attached
Income.info()

In [None]:
# The target column 'income' is stored with the pandas 'category' dtype.
income_as_category = Income['income'].astype('category')
Income['income'] = income_as_category

Now all the categorical variables are suitably encoded. Let's build the model.

<hr>

### Model Building and Evaluation

Let's first build a decision tree with default hyperparameters. Then we'll use cross-validation to tune them.

In [None]:
# Importing train-test-split 
from sklearn.model_selection import train_test_split

In [None]:
# Response variable: the 'income' column.
y = Income['income']

# Feature matrix: every column except the response.
X = Income.drop(columns=['income'])

In [None]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# identical between runs.
split_options = {'test_size': 0.30, 'random_state': 99}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
X_train.head()

In [None]:
# Decision tree classifier from scikit-learn
from sklearn.tree import DecisionTreeClassifier

# Baseline model: all hyperparameters left at their defaults except
# max_depth, capped at 5 so the plotted tree stays readable.
dtree = DecisionTreeClassifier(max_depth=5)
dtree.fit(X=X_train, y=y_train)

In [None]:
# Evaluate the baseline tree on the held-out test set.

# Metric helpers (confusion_matrix and accuracy_score are used by later cells)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Test-set predictions of the baseline tree
y_pred_default = dtree.predict(X_test)

# Text report produced by classification_report for the test-set predictions
default_report = classification_report(y_test, y_pred_default)
print(default_report)

In [None]:
# Print the confusion matrix first, then the overall accuracy, both
# computed on the test-set predictions of the baseline tree.
for metric in (confusion_matrix, accuracy_score):
    print(metric(y_test, y_pred_default))

In [None]:
# Importing required packages for visualization
from io import StringIO  # in-memory text buffer that receives the DOT source
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus, graphviz

# Feature names, in the column order the tree was trained on, so that the
# plotted split labels line up with the model's feature indices.
features = list(X_train.columns)
features

In [None]:
# plotting the baseline tree fitted above (max_depth=5):
# export the tree as DOT text into an in-memory buffer, then render it to PNG
dot_data = StringIO()  
export_graphviz(dtree, out_file=dot_data,
                feature_names=features, filled=True,rounded=True)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())

<hr>

### Hyperparameter Tuning

In [None]:
# GridSearchCV to find optimal max_depth
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV


# specify number of folds for k-fold CV
n_folds = 5

# candidate values to evaluate: max_depth = 1 .. 39
parameters = {'max_depth': range(1, 40)}

# instantiate the model (fixed seed so repeated runs build the same trees)
dtree = DecisionTreeClassifier(criterion="gini",
                               random_state=100)

# Cross-validated search on the training data.
# return_train_score=True is needed because the plotting cell reads
# cv_results_["mean_train_score"], which is not computed by default.
tree = GridSearchCV(dtree, parameters,
                    cv=n_folds,
                    scoring="accuracy",
                    return_train_score=True)
tree.fit(X_train, y_train)

In [None]:
# cross-validation results of the max_depth search; 'scores' is reused by
# the plotting cell that follows
scores = tree.cv_results_
pd.DataFrame(scores).head()

Now let's visualize how the train and test scores change with max_depth.

In [None]:
# Mean training and cross-validation accuracy for each candidate max_depth.
depths = scores["param_max_depth"]
plt.figure()
for score_key, curve_label in (("mean_train_score", "training accuracy"),
                               ("mean_test_score", "test accuracy")):
    plt.plot(depths, scores[score_key], label=curve_label)
plt.xlabel("max_depth")
plt.ylabel("Accuracy")
plt.legend()
plt.show()


You can see that as we increase the value of max_depth, both the training and test scores increase until about max_depth = 10, after which the test score gradually reduces. Note that the scores are average accuracies across the 5 folds.

Thus, it is clear that the model is overfitting the training data if the max_depth is too high. Next, let's see how the model behaves with other hyperparameters.

<hr>

### Tuning min_samples_leaf

The hyperparameter **min_samples_leaf** indicates the minimum number of samples required to be at a leaf.<br>

So if the value of min_samples_leaf is low, say 5, then a leaf will be created even if it contains only 5, 6, etc. observations (and the tree is likely to overfit).<br>

Let's see what will be the optimum value for min_samples_leaf.

In [None]:
# GridSearchCV to find optimal min_samples_leaf
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV


# specify number of folds for k-fold CV
n_folds = 5

# candidate values to evaluate: min_samples_leaf = 5, 25, ..., 185
parameters = {'min_samples_leaf': range(5, 200, 20)}

# instantiate the model (fixed seed so repeated runs build the same trees)
dtree = DecisionTreeClassifier(criterion="gini",
                               random_state=100)

# Cross-validated search on the training data.
# return_train_score=True is needed because the plotting cell reads
# cv_results_["mean_train_score"], which is not computed by default.
tree = GridSearchCV(dtree, parameters,
                    cv=n_folds,
                    scoring="accuracy",
                    return_train_score=True)
tree.fit(X_train, y_train)

In [None]:
# cross-validation results of the min_samples_leaf search; 'scores' is
# reused by the plotting cell that follows
scores = tree.cv_results_
pd.DataFrame(scores).head()

In [None]:
# Mean training and cross-validation accuracy for each candidate
# min_samples_leaf.
leaf_sizes = scores["param_min_samples_leaf"]
curves = [
    ("mean_train_score", "training accuracy"),
    ("mean_test_score", "test accuracy"),
]
plt.figure()
for score_key, curve_label in curves:
    plt.plot(leaf_sizes, scores[score_key], label=curve_label)
plt.xlabel("min_samples_leaf")
plt.ylabel("Accuracy")
plt.legend()
plt.show()


### Tuning min_samples_split

In [None]:
# GridSearchCV to find optimal min_samples_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV


# specify number of folds for k-fold CV
n_folds = 5

# candidate values to evaluate: min_samples_split = 5, 25, ..., 185
parameters = {'min_samples_split': range(5, 200, 20)}

# instantiate the model (fixed seed so repeated runs build the same trees)
dtree = DecisionTreeClassifier(criterion="gini",
                               random_state=100)

# Cross-validated search on the training data.
# return_train_score=True is needed because the plotting cell reads
# cv_results_["mean_train_score"], which is not computed by default.
tree = GridSearchCV(dtree, parameters,
                    cv=n_folds,
                    scoring="accuracy",
                    return_train_score=True)
tree.fit(X_train, y_train)

In [None]:
# cross-validation results of the min_samples_split search; 'scores' is
# reused by the plotting cell that follows
scores = tree.cv_results_
pd.DataFrame(scores).head()

In [None]:
# Mean training and cross-validation accuracy for each candidate
# min_samples_split.
split_sizes = scores["param_min_samples_split"]
train_accuracy = scores["mean_train_score"]
validation_accuracy = scores["mean_test_score"]

plt.figure()
plt.plot(split_sizes, train_accuracy, label="training accuracy")
plt.plot(split_sizes, validation_accuracy, label="test accuracy")
plt.xlabel("min_samples_split")
plt.ylabel("Accuracy")
plt.legend()
plt.show()


This shows that as you increase min_samples_split, the tree overfits less, since the model is less complex.

<hr>

## Grid Search to Find Optimal Hyperparameters

We can now use GridSearchCV to find multiple optimal hyperparameters together. Note that this time, we'll also specify the criterion (gini/entropy or IG).

In [None]:
# Create the parameter grid: 2 depths x 2 leaf sizes x 2 split sizes x
# 2 split criteria = 16 candidate models.
param_grid = {
    'max_depth': range(5, 15, 5),
    'min_samples_leaf': range(50, 150, 50),
    'min_samples_split': range(50, 150, 50),
    'criterion': ["entropy", "gini"],
}

n_folds = 5

# Instantiate the grid search model. The tree is seeded like the other
# tuned trees in this notebook so that repeated runs select the same
# candidate; accuracy is named explicitly to match the earlier searches.
dtree = DecisionTreeClassifier(random_state=100)
grid_search = GridSearchCV(estimator=dtree, param_grid=param_grid,
                           cv=n_folds, scoring="accuracy", verbose=1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

In [None]:
# cv results of the joint grid search as a dataframe, one row per
# parameter combination
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results

In [None]:
# Report the best mean cross-validation accuracy found by the grid search,
# followed by the estimator (with its hyperparameters) that achieved it.
best_cv_accuracy = grid_search.best_score_
best_tree = grid_search.best_estimator_
print("best accuracy", best_cv_accuracy)
print(best_tree)

<hr>

**Running the model with best parameters obtained from grid search.**

In [None]:
# Final model. The hyperparameter values are written out by hand here;
# NOTE(review): presumably copied from the grid search output - confirm
# against grid_search.best_params_.
tuned_params = {
    "criterion": "gini",
    "random_state": 100,
    "max_depth": 10,
    "min_samples_leaf": 50,
    "min_samples_split": 50,
}
clf_gini = DecisionTreeClassifier(**tuned_params)
clf_gini.fit(X_train, y_train)

In [None]:
# accuracy of the fitted tree on the held-out test set
clf_gini.score(X_test,y_test)

In [None]:
# plotting the tree currently held in clf_gini:
# export it as DOT text into an in-memory buffer, then render it to PNG
dot_data = StringIO()  
export_graphviz(clf_gini, out_file=dot_data,feature_names=features,filled=True,rounded=True)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())

You can see that this tree is too complex to understand. Let's try reducing the max_depth and see how the tree looks.

In [None]:
# Same settings as the previous model except for a shallower max_depth of 3,
# which makes the plotted tree small enough to read.
shallow_params = {
    "criterion": "gini",
    "random_state": 100,
    "max_depth": 3,
    "min_samples_leaf": 50,
    "min_samples_split": 50,
}
clf_gini = DecisionTreeClassifier(**shallow_params)
clf_gini.fit(X_train, y_train)

# accuracy on the held-out test set
shallow_accuracy = clf_gini.score(X_test, y_test)
print(shallow_accuracy)

In [None]:
# plotting the tree with max_depth=3 (the model currently held in clf_gini):
# export it as DOT text into an in-memory buffer, then render it to PNG
dot_data = StringIO()  
export_graphviz(clf_gini, out_file=dot_data,feature_names=features,filled=True,rounded=True)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())

In [None]:
# Classification metrics of the current clf_gini model on the test set.
from sklearn.metrics import classification_report,confusion_matrix

# 'y_pred' is kept as a session variable for the cells that follow.
y_pred = clf_gini.predict(X_test)
depth3_report = classification_report(y_test, y_pred)
print(depth3_report)

In [None]:
# confusion matrix of the test-set predictions stored in y_pred
print(confusion_matrix(y_test,y_pred))

In [None]:
# Overall test-set accuracy of the predictions stored in 'y_pred'
# (the variable that holds clf_gini's test predictions in this notebook).
print(accuracy_score(y_test, y_pred))