In [18]:
# libraries
import csv
import pandas as pd
import numpy as np
from random import shuffle
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

## 1. The Dataset: Adult Income

#### The Adult Income dataset is a classification dataset: it is used for a prediction task where the goal is to determine whether a person makes over 50k a year. The list of attributes is as follows:
- output variable: **Income**, <=50k and >50k (converted to 0 and 1 respectively)
- input features: 
    - **Age**: a continuous number
    - **WorkClass**: a categorical variable that represents different work sectors, including values such as Federal employee, Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked, Federal worker
    - **Education**: a categorical variable that represents the level of education, including values such as Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool
    - **MaritalStatus**: a categorical variable with values Married-civ-spouse (Married to a civilian spouse), Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse (Married to someone in the Armed Forces) 
    - **Occupation**: a categorical variable with values Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces
    - **EducationNum**: a continuous value that represents the level of education (0 = Preschool, 16 = Doctorate)
    - **Race**: a categorical variable with values White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black 
    - **Gender**: a binary variable, includes Female and Male
    - **CapitalGain**: a continuous number
    - **CapitalLoss**: a continuous number
    - **HoursPerWeek**: a categorical variable including hour ranges of 0-30, 30-60, 60-90, 90+
    - **NativeCountry**: a categorical variable including countries such as United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands

In [23]:
df = pd.read_csv("Data/adult-train.csv")

df.columns = [
        "Age", "WorkClass", "fnlwgt", "Education", "Education-Num",
        "Marital-Status", "Occupation", "Relationship", "Race", "Gender",
        "Capital-Gain", "Capital-Loss", "Hours-per-week", "NativeCountry", "Income"]

# Note for us: we removed relationship and fnlwgt variables because they didn't make sense to us, 
# so asking others to reason about them was not feasible
cols = [
        "Age", "WorkClass", "Education","Education-Num", "Marital-Status", "Occupation", "Race", "Gender",
        "Capital-Gain", "Capital-Loss", "Hours-per-week", "NativeCountry", "Income"]
df = df[cols]

train_cols = df.columns[0:-1]
label = df.columns[-1]
X_df = df[train_cols]
y_df = df[label]


#Converting the response / output variable to a binary class
y_df = y_df.apply(lambda x: 0 if x == " <=50K" else 1)

#Top 5 rows of the original dataset:
#df.head()
y_df

0        0
1        1
2        1
3        0
4        0
        ..
32555    0
32556    1
32557    1
32558    0
32559    0
Name: Income, Length: 32560, dtype: int64

In [24]:
# Use this cell for any data-related exploration

## 2. The Model: GAMs

In [25]:
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.model_selection import train_test_split

# # create a train/test split
# seed = 1
# X_train, X_test, y_train, y_test = train_test_split(dataset['X'],dataset['y'], test_size=0.25, random_state=seed)

# train a GAM for the training dataset
ebm = ExplainableBoostingClassifier()

ebm.fit(X_df, y_df)

ExplainableBoostingClassifier(feature_names=['Age', 'WorkClass', 'Education',
                                             'Education-Num', 'Marital-Status',
                                             'Occupation', 'Race', 'Gender',
                                             'Capital-Gain', 'Capital-Loss',
                                             'Hours-per-week', 'NativeCountry',
                                             'Marital-Status x Gender',
                                             'Marital-Status x Hours-per-week',
                                             'Age x Capital-Loss',
                                             'Education-Num x Marital-Status',
                                             'WorkClass x Race',
                                             'Education-Num x Occupation',...
                                             'WorkClass x Capital-Loss',
                                             'Marital-Status x Occupation'],
                 

In [29]:
# Test set
test_df = pd.read_csv("Data/adult-test.csv")

test_df.columns = [
        "Age", "WorkClass", "fnlwgt", "Education", "Education-Num",
        "Marital-Status", "Occupation", "Relationship", "Race", "Gender",
        "Capital-Gain", "Capital-Loss", "Hours-per-week", "NativeCountry", "Income"]

test_df = test_df[cols]

input_cols = test_df.columns[0:-1]
label = test_df.columns[-1]
test_X_df = df[input_cols]
test_y_df = df[label]


#Converting the response / output variable to a binary class
test_y_df = test_y_df.apply(lambda x: 0 if x == " <=50K" else 1)

#Top 5 rows of the original dataset:
test_df.head()
test_y_df

0        0
1        1
2        1
3        0
4        0
        ..
32555    0
32556    1
32557    1
32558    0
32559    0
Name: Income, Length: 32560, dtype: int64

In [30]:
#Training accuracy
train_pred = ebm.predict(X_df).tolist()
train_pred = [0 if x<=0.5 else 1 for x in train_pred]
accuracy_train = round(sum(train_pred == y_df) / len(train_pred), 5)

#Test set accuracy
predictions = ebm.predict(test_X_df).tolist()
predictions = [0 if x <= 0.5 else 1 for x in predictions]

accuracy_test = round(sum(predictions == test_y_df) / len(predictions), 5)

print("The accuracy of the model on the training set is: ", accuracy_train)
print("The accuracy of the model on the test set is: ", accuracy_test)

#misclassified indices
indices = [index for index, value in enumerate(zip(predictions, test_y_df.tolist())) if value[0] != value[1]]

The accuracy of the model on the training set is:  0.88001
The accuracy of the model on the test set is:  0.88001


### Global feature importance plot and individual feature partial dependence plots (PDP)

In [31]:
from interpret import show

ebm_global = ebm.explain_global()
show(ebm_global)

The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`
  import dash_html_components as html
The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`
  import dash_core_components as dcc
The dash_table package is deprecated. Please replace
`import dash_table` with `from dash import dash_table`

Also, if you're using any of the table format helpers (e.g. Group), replace 
`from dash_table.Format import Group` with 
`from dash.dash_table.Format import Group`
  import dash_table as dt


### Individual Predictions where Predicted and Actual values are both 0

In [36]:
pred_0 = [index for index, value in enumerate(zip(predictions, test_y_df.tolist())) if value[0] == 0 and value[1] == 0]

In [37]:
# To find a good what-if question
ebm_local = ebm.explain_local(test_X_df.iloc[pred_0], test_y_df.iloc[pred_0], name = 'EBM')
show(ebm_local)

#### Local misclassification

In [38]:
ebm_local = ebm.explain_local(test_X_df.iloc[indices], test_y_df.iloc[indices], name = 'EBM')
show(ebm_local)

In [22]:
ebm_local = ebm.explain_local(test_X_df[7:8], test_y_df[7:8], name='EBM')
show(ebm_local)

In [114]:
test_X_df.iloc[pred_0[21]]

Age                               30
WorkClass                Federal-gov
Education               Some-college
Education-Num                     10
Marital-Status    Married-civ-spouse
Occupation              Adm-clerical
Race                           White
Gender                          Male
Capital-Gain                       0
Capital-Loss                       0
Hours-per-week                    40
NativeCountry          United-States
Name: 7864, dtype: object

In [115]:
#Original Datapoint
index = pred_0[21]
orig = ebm.explain_local(test_X_df[index:index+1], test_y_df[index:index+1], name='EBM')
show(orig)

In [134]:
# Updated datapoint
test = test_X_df.iloc[index]
test_y = test_y_df.iloc[index]
# test["Marital-Status"] = "Married-civ-spouse" 
# test["Capital-Gain"] = 5000
# test["Education"] = "Bachelors" 
# test["Race"] = "White" #PIb
# test["Age"] = 40
# test["Capital-Loss"] = 5000
# test["Hours-per-week"] = 55
# test["WorkClass"] = "Private"
# test["Occupation"] = "Exec-managerial"
test = test.to_frame().T
new_y = ebm.predict(test)
print(new_y)
new = ebm.explain_local(test, pd.Series(new_y), name='EBM')
show(new)

[0]




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

