In [107]:
import data_loader
import numpy as np
import pandas


# References
https://medium.com/dunder-data/selecting-subsets-of-data-in-pandas-6fcd0170be9c

https://chartio.com/resources/tutorials/how-to-rename-columns-in-the-pandas-python-library/

https://towardsdatascience.com/handling-missing-values-in-machine-learning-part-2-222154b4b58e

https://stackoverflow.com/questions/45321406/missing-value-imputation-in-python-using-knn

https://stackoverflow.com/questions/44239269/fancyimpute-installation-in-anaconda

https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html#sphx-glr-auto-examples-ensemble-plot-forest-importances-py

https://www.kaggle.com/jieyima/income-classification-model

https://medium.com/@rrfd/cleaning-and-prepping-data-with-python-for-data-science-best-practices-and-helpful-packages-af1edfbe2a3

https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html#sphx-glr-auto-examples-compose-plot-column-transformer-mixed-types-py

# Data Loading

In [108]:
# load data and assign names
trdf, valdf = data_loader.load_train_data("data/adult.data", is_df=True)
## adding columns labels https://chartio.com/resources/tutorials/how-to-rename-columns-in-the-pandas-python-library/
trdf.columns = ["age","workclass","fnlwgt","education","education-num","marital-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country"
,"target"]
valdf.columns = ["age","workclass","fnlwgt","education","education-num","marital-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country"
,"target"]

## Replace '?' with np.nan
'?' in values represents missing data. So we replace it with np.nan

In [109]:
# missign values replaced as np.nan
trdf = trdf.replace(' ?',np.nan)
valdf = valdf.replace(' ?',np.nan)

## Check which features have missing values

In [110]:
trdf.isna().sum()

age                  0
workclass         1662
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1669
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     498
target               0
dtype: int64

In [111]:
df_train = trdf
total = df_train.isna().sum().sort_values(ascending=False)
percent = (df_train.isna().sum()/df_train.isnull().count()).sort_values(ascending=False)
missing_data = pandas.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

Unnamed: 0,Total,Percent
occupation,1669,0.056933
workclass,1662,0.056695
native-country,498,0.016988
target,0,0.0
hours-per-week,0,0.0
capital-loss,0,0.0
capital-gain,0,0.0
sex,0,0.0
race,0,0.0
relationship,0,0.0


In [None]:
trdf.fillna(inplace = True, )

### Observation
We observe that only categorical features have missing values. So we will use techniques to fill missing values for categorical features. First let's assign correct data type to categorical features

## Assign correct data type to categorical values

In [136]:
catcols = [1,3,5,6,7,8,9,13,14] # list of categorical features
trdf.iloc[:,catcols] = trdf.iloc[:,catcols].astype('category')

In [137]:
trdf['workclass'].cat.categories ## here we verify that nan is not treated as a category

Int64Index([-1, 0, 1, 2, 3, 4, 5, 6, 7], dtype='int64')

Now we will impute missing values to nan using k-nearest neighbor imputation technique described [here](https://towardsdatascience.com/handling-missing-values-in-machine-learning-part-2-222154b4b58e)

## Resolve Missing values 


### k-NN Imputation
For using kNN imputation, we will use fancyimpute library because sklearn only supports mean, median, mode impute as shown [here](https://stackoverflow.com/questions/45321406/missing-value-imputation-in-python-using-knn)

In order to install fancyimpute we execute the following commands on the terminal as shown [here](https://stackoverflow.com/questions/44239269/fancyimpute-installation-in-anaconda)

``` bash
conda install ecos  
conda install CVXcanon  
pip install fancyimpute  

```

We were getting errors in install fancyimpute and decided to use IterativeImputer from sklearn. [Here](https://scikit-learn.org/dev/auto_examples/impute/plot_missing_values.html#sphx-glr-auto-examples-impute-plot-missing-values-py) is an official example for using IterativeImputer

In [114]:
#from sklearn.experimental import enable_iterative_imputer  # noq
#from sklearn.impute import IterativeImputer

### Treat nan as a separate category

## Give category codes to each value in every feature

## Data Exploration

## Covariance of features - Principal component analysis to remove redundant features
[covariance finding](https://towardsdatascience.com/handling-missing-values-in-machine-learning-part-2-222154b4b58e)

## Analyze features with missing values

### Missing values in categorical data

# Models

## Naive Bayes - Baseline model


[Selecting subsets of data](https://medium.com/dunder-data/selecting-subsets-of-data-in-pandas-6fcd0170be9c)

TODO: missing values

In [115]:
# trdf.dropna(inplace=True)
# catcols = [1,3,5,6,7,8,9,13,14]
# trdf[catcols] = trdf[catcols].astype('category')
trdf.iloc[:,catcols] = trdf.iloc[:,catcols].apply(lambda x: x.cat.codes)
# trdf
# trdf[1].cat.codes
trdf.iloc[:,14].shape

(29315,)

In [116]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
learner = model.fit(trdf.iloc[:,0:14], trdf.iloc[:,14])


In [117]:
catcols = [1,3,5,6,7,8,9,13,14]
valdf.iloc[:,catcols] = valdf.iloc[:,catcols].astype('category')
valdf.iloc[:,catcols] = valdf.iloc[:,catcols].apply(lambda x: x.cat.codes)
# trdf
# trdf[1].cat.codes
valdf.shape

(3246, 15)

In [118]:
learner.predict(valdf.iloc[:,0:14])

array([0, 1, 1, ..., 0, 0, 1], dtype=int8)

In [119]:
learner.score(valdf.iloc[:,0:14], valdf.iloc[:,14])

0.8009858287122612

In [120]:
auc(learner)

0.6252612717039621

## kNN


### 1 nearest

In [121]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=3)
learner = model.fit(trdf.iloc[:,0:14], trdf.iloc[:,14])
learner.predict(valdf.iloc[:,0:14])

array([0, 1, 1, ..., 0, 0, 1], dtype=int8)

In [122]:
learner.score(valdf.iloc[:,0:14], valdf.iloc[:,14])

0.7495378927911276

In [123]:
auc(learner)

0.6157401080977611

### 5 nearest

## ID3

### Parameterization

## NbTree
[doc](https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html#sphx-glr-auto-examples-ensemble-plot-forest-importances-py)

In [124]:
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
learner = model.fit(trdf.iloc[:,0:14], trdf.iloc[:,14])
learner.predict(valdf.iloc[:,0:14])
learner.score(valdf.iloc[:,0:14], valdf.iloc[:,14])



0.8342575477510783

In [125]:
auc(learner)

0.718937845116592

### Parameterization

In [126]:
# from sklearn.svm import SVC
# svc = SVC(C=1.0, kernel = "linear")
# svc.fit(trdf.iloc[:,0:14], trdf.iloc[:,14])
# svc.predict(valdf.iloc[:,0:14])

In [127]:
# svc.score(valdf.iloc[:,0:14], valdf.iloc[:,14])

## Neural Network
### Parameterization

In [128]:
import sklearn.neural_network.multilayer_perceptron as mlp
model = mlp.MLPClassifier()
learner = model.fit(trdf.iloc[:,0:14], trdf.iloc[:,14])

In [129]:
learner.predict(valdf.iloc[:,0:14])

array([0, 1, 1, ..., 0, 0, 1], dtype=int8)

In [130]:
learner.score(valdf.iloc[:,0:14],  valdf.iloc[:,14])

0.7843499691928527

In [131]:
auc(learner)

0.6245758904240698

## XGBoost
### Parameterization

In [132]:
import xgboost as xgb

In [133]:
model = xgb.XGBClassifier()
learner = model.fit(trdf.iloc[:,0:14], trdf.iloc[:,14])


In [134]:
def auc(learner):
    from sklearn import metrics
    learner.score(valdf.iloc[:,0:14],  valdf.iloc[:,14])
    pred = learner.predict(valdf.iloc[:,0:14])
    fpr, tpr, thresholds = metrics.roc_curve(valdf.iloc[:,14], pred)
    return metrics.auc(fpr, tpr)

In [135]:
auc(learner)

0.7546406799577817