# Exploratory Data Analysis, KNN & Decision Tree Models

* Cleaning data by renaming columns, removing unnecessary data, and making sure there is no damaged data (special characters)
* Applying one-hot-encoding to the categorical features of the data set
* Implementing feature engineering to extrapolate data from existing relationships
* Building models for KNN and Decision Trees to predict for the label

In [16]:
# All import statements
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from matplotlib import pyplot as plt

In [17]:
# Import data from csv files

train = pd.read_csv(r'~/desktop/COMP551/COMP551-p1/Adult/adult.csv')
testDir = r'~/desktop/COMP551/COMP551-p1/Adult/adult-test.csv'

In [18]:
# Columns from description of data set
columns = ['Age', 'Workclass', 'Education Level', 'Education Years',
       'Marital Status', 'Occupation', 'Relationship', 'Race', 'Sex',
       'Capital Gain', 'Capital Loss','[To Be Dropped]', 'Hours Per Week', 'Native Country',
       ' <=50K']
test = pd.read_csv(testDir, skiprows=1, names=columns, sep=', ', engine='python')
train

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32555,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32556,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32557,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32558,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


## Note 0 - Data Cleaning

In [19]:
# Observe columns
train.columns

Index(['39', ' State-gov', ' 77516', ' Bachelors', ' 13', ' Never-married',
       ' Adm-clerical', ' Not-in-family', ' White', ' Male', ' 2174', ' 0',
       ' 40', ' United-States', ' <=50K'],
      dtype='object')

In [20]:
# Renaming columns to appropriate names from the dataset description
train.rename(columns = {" United-States": "Native Country",
                        " 40": "Hours Per Week",
                        " 0": "Capital Loss",
                        " 2174": "Capital Gain",
                        " Male": "Sex",
                        " White": "Race",
                        " Not-in-family": "Relationship",
                        " Adm-clerical": "Occupation",
                        " Never-married": "Marital Status",
                        " 13": "Education Years",
                        " Bachelors": "Education Level",
                        " State-gov": "Workclass",
                        "39": "Age"}, inplace=True)

In [21]:
# Removing this feature because data is incomprehensible
train.drop(" 77516", axis=1, inplace=True)

test.drop("Will Drop You", axis=1, inplace=True)

KeyError: "['Will Drop You'] not found in axis"

In [None]:
train.columns

In [None]:
# Transpose of training data statistics for easier viewability
print("Training data shape: ", train.shape)
train.describe().T

In [None]:
# Understanding numerical and categorical feature counts
num_data = [c for c in train.columns if train[c].dtype !='O']
cat_data = [w for w in train.columns if train[w].dtype == 'O']

print("Numerical Feature count: ", len(num_data))
num_data

In [None]:
print("Categorical Feature count: ", len(cat_data))
cat_data

In [None]:
# null values in data
nullCounter = train.isna().sum().sort_values(ascending=False)/len(train)

nullCounter

In [None]:
# null values in data
nullCounter = test.isna().sum().sort_values(ascending=False)/len(test)

nullCounter

In [None]:
train

In [None]:
zeroCount1 = 0
zeroCount2 = 0
for x, y in zip(train["Capital Loss"],train["Capital Gain"]):
    if x == 0:
        zeroCount1 += 1
    if y == 0:
        zeroCount2 += 1

print(zeroCount1, zeroCount2)

In [None]:
train.drop("Capital Loss", axis=1, inplace=True)
test.drop("Capital Loss", axis=1, inplace=True)

In [None]:
train.drop("Capital Gain", axis=1, inplace=True)
test.drop("Capital Gain", axis=1, inplace=True)

In [None]:
train.columns

In [None]:
test.columns

In [None]:
train.info()

In [None]:
# Now the data below does not have any null values nor does it have irrelevant or incomprehensible features
train

In [None]:
# The next thing is to look for special characters in the data which may indicate damaged data
train.isin([' ?']).sum()

In [None]:
test.isin(['?']).sum()

In [None]:
test.drop(test.index[test['Workclass'] == '?'], inplace = True)
test.drop(test.index[test['Relationship'] == '?'], inplace = True)
test.drop(test.index[test['Native Country'] == '?'], inplace = True)

In [None]:
# The occupation column as a significant "?"-value count
train.drop(train.index[train['Occupation'] == ' ?'], inplace = True)

In [None]:
# The native country column also has these
train.drop(train.index[train['Native Country'] == ' ?'], inplace = True)

In [None]:
# The next thing is to look for special characters in the data which may indicate damaged data
train.isin([' ?']).sum()

In [None]:
test.head(10)

In [None]:
test.columns

## Note 1 - One hot encoding

Now, the training data set has been cleaned relatively well. The next step is to include some feature engineering and make sure the test data is in the same format as the cleaned training data set to ensure the same input space compatibility. However, before moving on to the feature engineering, we must apply one-hot-encoding to the categorical data in the data set.

In [None]:
# Re-listing categorical features to one hot encode
cat_data

In [None]:
train = train.rename(columns={' <=50K': 'incomeGT50'})

In [None]:
test = test.rename(columns={' <=50K': 'incomeGT50'})

In [None]:
inter_train = train[['Workclass',
 'Education Level',
 'Marital Status',
 'Occupation',
 'Relationship',
 'Race',
 'Sex',
 'Native Country',
 'incomeGT50']]

inter_test = test[['Workclass',
 'Education Level',
 'Marital Status',
 'Occupation',
 'Relationship',
 'Race',
 'Sex',
 'Native Country',
 'incomeGT50']]

inter_test

In [None]:
train_ohe = pd.get_dummies(inter_train)
test_ohe = pd.get_dummies(inter_test)
test_ohe.shape

In [None]:
train.head()

In [None]:
# Now we can generate the data set to feed to the knn model (purely numerical data set)
train_original_num = train[['Age', 'Education Years', 'Hours Per Week']]
frames = [train_ohe,train_original_num]
_train_numerical = pd.concat(frames, axis=1)

test_original_num = test[['Age', 'Education Years', 'Hours Per Week']]
frames2 = [test_ohe,test_original_num]
_test_numerical = pd.concat(frames2, axis=1)

# Shapes of the final, cleaned data sets
print(_train_numerical.shape, test.shape)

print(_test_numerical.columns)

In [None]:
df.groupby('Employee')['Age'].apply(lambda group_series: group_series.tolist()).reset_index()

## Note 2 - Feature Engineering

In [None]:
# Distributions of values in categorical features
for col in train.columns:
    plt.figure()
    train.groupby(by=col).size().plot.bar()

In [None]:
for col in train.columns:
    print ("---- %s ---" % col)
    print (train[col].value_counts())

In [None]:
train['IncomeBin'] = train['incomeGT50'].map({' <=50K': 0, ' >50K': 1}).astype(int)

In [None]:
train.groupby('Age').IncomeBin.mean().plot(kind='bar', figsize=(16,14))

In [None]:
train.groupby('Education Years').IncomeBin.mean().plot(kind='bar', figsize=(8,6))

In [None]:
train.groupby('Hours Per Week').IncomeBin.mean().plot(kind='bar', figsize=(16,14))

## Note 3 - KNN Classifier

In [None]:
# Hyper-parameter K
K = 5

Xtrain_df = _train_numerical.drop(['incomeGT50_ <=50K', 'incomeGT50_ >50K'], 1)
X = np.array(Xtrain_df)

#Xtest_df = test.drop(['<=50K'], 1)
#Xtest = np.array(Xtest_df)

Ytrain_df = _train_numerical[['incomeGT50_ <=50K', 'incomeGT50_ >50K']]
Y = np.array(Ytrain_df)

#Y_test = np.array(test)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.20)

In [None]:
clf = KNeighborsClassifier(n_neighbors=K)
clf.fit(X_train, y_train)

In [None]:
print("K = ", K)

# Test on model on the test dataset
y_pred = clf.predict(X_test)
acc_score = accuracy_score(y_pred, y_test)
print("KNN accuracy:", acc_score)

## Note 4 Decision Tree Classifier

In [None]:
clf = DecisionTreeClassifier(random_state=1)
clf.fit(X_train, y_train)

In [None]:
# Test on model on the test dataset
y_pred = clf.predict(X_test)
acc_score = accuracy_score(y_pred, y_test)
print("Decision Tree accuracy:", acc_score)

# Determine our model's score
score = clf.score(X_test, y_test)
print(score)