### Gradient Boosting

In [1]:
# Import our dependencies
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pandas as pd
from pathlib import Path

In [2]:
# Disable warnings: install a blanket "ignore" filter (no category, message or
# module restriction), so warnings raised by the cells below are not shown.
# NOTE(review): this also hides deprecation notices from pandas / scikit-learn;
# consider narrowing the filter by category or module.
import warnings
warnings.filterwarnings('ignore')

In [3]:
def preprocessFile(filename = Path('../Data/AdultCensusIncome.csv')):
    """Load the census income CSV and return a cleaned DataFrame.

    The file is read with explicit dtypes and '?' treated as a missing value.
    Columns are renamed to snake_case, the columns that are not used as
    features are removed, every row that still has a missing value is
    dropped, and ``income`` is encoded as an integer label.

    Parameters
    ----------
    filename : str or path-like, optional
        Location of the CSV file. It is expected to contain the fifteen
        columns listed in ``dtype_mapper``, in that order.

    Returns
    -------
    pandas.DataFrame
        Columns: age, workclass, education_num, marital_status, occupation,
        relationship, race, sex, hours_per_week, income.  ``income`` is int64:
        0 where the raw value is exactly '<=50K', 1 for any other value.
        The original row index is kept (it is not reset after dropping rows).
    """
    print(f'Preprocessing file {filename}')
    dtype_mapper = {'age': 'int64',
                    'workclass': 'string',
                    'fnlwgt': 'int64',
                    'education': 'string',
                    'education.num': 'int64',
                    'marital.status': 'string',
                    'occupation': 'string',
                    'relationship': 'string',
                    'race': 'string',
                    'sex': 'string',
                    'capital.gain': 'int64',
                    'capital.loss': 'int64',
                    'hours.per.week': 'int64',
                    'native.country': 'string',
                    'income': 'string'}
    column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
                    'marital_status', 'occupation', 'relationship', 'race', 'sex',
                    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
                    'income']
    unused_columns = ['fnlwgt',
                      'education',
                      'capital_gain',
                      'capital_loss',
                      'native_country']

    df = pd.read_csv(filename, dtype=dtype_mapper, na_values='?')
    # Positional rename: relies on the file having its columns in this order.
    df.columns = column_names

    # Remove the unused columns first so that a missing value in one of them
    # does not cause an otherwise complete row to be discarded.
    df = df.drop(unused_columns, axis=1).dropna()

    # Encode the label only after incomplete rows are gone.  Encoding first
    # (as a per-element lambda) meant a missing income was either evaluated
    # as a truth value (TypeError on pd.NA) or silently labelled as 1.
    income_label = (df['income'] != '<=50K').astype('int64')
    return df.assign(income=income_label)

# Load and clean the census data using the default file location.
census_df = preprocessFile()

# Preview ten random rows; the fixed seed keeps the preview identical across
# Restart & Run All instead of changing on every execution.
census_df.sample(10, random_state=78)

Preprocessing file ..\Data\AdultCensusIncome.csv


Unnamed: 0,age,workclass,education_num,marital_status,occupation,relationship,race,sex,hours_per_week,income
15373,18,Private,9,Never-married,Machine-op-inspct,Not-in-family,White,Male,37,0
16435,40,Private,10,Married-civ-spouse,Sales,Husband,White,Male,60,0
9441,24,Private,9,Married-civ-spouse,Craft-repair,Husband,White,Male,60,1
3432,36,Private,9,Married-civ-spouse,Craft-repair,Husband,White,Male,55,0
2999,60,Local-gov,13,Separated,Prof-specialty,Unmarried,White,Female,55,0
27322,34,Private,10,Married-civ-spouse,Exec-managerial,Husband,Black,Male,52,0
27830,46,Private,9,Married-civ-spouse,Craft-repair,Husband,White,Male,60,0
14064,34,Private,13,Never-married,Other-service,Not-in-family,White,Female,15,0
31978,22,Private,10,Never-married,Transport-moving,Own-child,White,Male,25,0
10295,47,Federal-gov,10,Divorced,Adm-clerical,Not-in-family,White,Female,40,0


In [5]:
# Target vector: the encoded income label.
y = census_df.income

# Feature matrix: every column except the target.  drop() returns a new frame,
# so census_df itself is left untouched.
X = census_df.drop(columns='income')

# One-hot encode the string columns as 0/1 integer indicator columns.
X = pd.get_dummies(X, dtype='int64')

# Use sklearn to split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create the scaler instance and fit it once, on the training split only, so
# that no statistics from the test split leak into the transformation.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [6]:
# Train a gradient boosting classifier on the scaled training data.
# random_state pins the estimator's internal randomness so repeated runs give
# the same model; 78 matches the seed used for the train/test split.
gb_model = GradientBoostingClassifier(random_state=78)
gb_model = gb_model.fit(X_train_scaled, y_train)

# Making predictions using the testing data
predictions = gb_model.predict(X_test_scaled)


In [7]:
# Calculating the confusion matrix
# The class order is passed explicitly so the matrix is always 2x2 and its
# rows (actual) and columns (predicted) are guaranteed to match the labels
# used for the DataFrame below, even if a class is absent from the data.
class_labels = [0, 1]
cm = confusion_matrix(y_test, predictions, labels=class_labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [8]:
# Displaying results
# Headings are plain text via print(); the confusion-matrix DataFrame goes
# through display() so the notebook renders it as a table.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
# Text report of per-class precision, recall, f1-score and support for the
# test-set predictions.
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5378,409
Actual 1,796,1097


Accuracy Score : 0.8430989583333334
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.93      0.90      5787
           1       0.73      0.58      0.65      1893

    accuracy                           0.84      7680
   macro avg       0.80      0.75      0.77      7680
weighted avg       0.84      0.84      0.84      7680



In [9]:
# The fitted gradient boosting model exposes one importance score per input column
importances = gb_model.feature_importances_
# Pair each score with its column name and list the pairs from most to least important
sorted(zip(importances, X.columns), reverse=True)

[(0.47187663172279176, 'marital_status_Married-civ-spouse'),
 (0.27406141199306094, 'education_num'),
 (0.09103358990262814, 'age'),
 (0.053237202055314126, 'hours_per_week'),
 (0.028860400241496137, 'occupation_Exec-managerial'),
 (0.015323142713306577, 'occupation_Prof-specialty'),
 (0.011333574634002835, 'occupation_Other-service'),
 (0.008351262674048348, 'relationship_Wife'),
 (0.006672370512292665, 'workclass_Self-emp-not-inc'),
 (0.006600788593842233, 'occupation_Farming-fishing'),
 (0.0045269157225797385, 'workclass_Self-emp-inc'),
 (0.004393626304698473, 'sex_Female'),
 (0.003379281250160094, 'occupation_Tech-support'),
 (0.0022489882120719797, 'workclass_Local-gov'),
 (0.002058411844647315, 'relationship_Husband'),
 (0.0019294554218933925, 'occupation_Sales'),
 (0.00180749343695883, 'workclass_Federal-gov'),
 (0.0016431367409097, 'occupation_Handlers-cleaners'),
 (0.0012992115702008932, 'sex_Male'),
 (0.0011354747564795806, 'occupation_Machine-op-inspct'),
 (0.000885481554261