In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pandas as pd
from pathlib import Path

In [2]:
# Disable warnings
import warnings
warnings.filterwarnings('ignore')

In [43]:
def preprocessFile(filename = Path('../Data/AdultCensusIncome.csv')):
    
    print(f'Preprocessing file {filename}')
    dtype_mapper = {'age': 'int64',
                    'workclass': 'string',
                    'fnlwgt': 'int64',
                    'education': 'string',
                    'education.num': 'int64',
                    'marital.status': 'string',
                    'occupation': 'string',
                    'relationship': 'string',
                    'race': 'string',
                    'sex': 'string',
                    'capital.gain': 'int64',
                    'capital.loss': 'int64',
                    'hours.per.week': 'int64',
                    'native.country': 'string',
                    'income': 'string'}
    df = pd.read_csv(filename, dtype=dtype_mapper, na_values='?')    
    df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income']
    df = df.drop(['fnlwgt',
                  'education', 
                #   'capital_gain', 
                #   'capital_loss',
                  'native_country'], axis=1)
    df = df.dropna()
    return df

census_df = preprocessFile()
census_df.sample(10)


Preprocessing file ..\Data\AdultCensusIncome.csv


Unnamed: 0,age,workclass,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,income
15379,81,Private,14,Widowed,Prof-specialty,Unmarried,White,Male,0,0,60,<=50K
32005,40,Private,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,<=50K
16491,23,Private,9,Never-married,Craft-repair,Unmarried,White,Male,0,0,40,<=50K
26391,33,Private,9,Never-married,Craft-repair,Own-child,Asian-Pac-Islander,Male,0,0,40,<=50K
30564,41,State-gov,14,Divorced,Prof-specialty,Not-in-family,White,Male,0,0,35,<=50K
18505,39,Local-gov,13,Separated,Prof-specialty,Not-in-family,Black,Male,0,0,30,<=50K
32261,31,Private,9,Divorced,Adm-clerical,Unmarried,White,Female,0,0,35,<=50K
32019,62,Self-emp-not-inc,4,Widowed,Farming-fishing,Other-relative,White,Female,0,0,35,<=50K
28621,44,Private,10,Widowed,Sales,Not-in-family,White,Female,0,0,25,<=50K
19731,43,Private,9,Separated,Transport-moving,Not-in-family,White,Male,0,0,40,<=50K


In [17]:
census_df.shape

(30718, 10)

In [27]:
census_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30718 entries, 1 to 32560
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             30718 non-null  int64 
 1   workclass       30718 non-null  string
 2   education_num   30718 non-null  int64 
 3   marital_status  30718 non-null  string
 4   occupation      30718 non-null  string
 5   relationship    30718 non-null  string
 6   race            30718 non-null  string
 7   sex             30718 non-null  string
 8   hours_per_week  30718 non-null  int64 
 9   income          30718 non-null  string
dtypes: int64(3), string(7)
memory usage: 2.6 MB


In [18]:
for col in census_df:
    print('-' * 20)
    print(census_df[col].value_counts())
    print('-' * 20)
    

--------------------
age
36    875
31    869
34    860
37    849
33    849
     ... 
82      7
83      5
88      3
85      3
86      1
Name: count, Length: 72, dtype: int64
--------------------
--------------------
workclass
Private             22696
Self-emp-not-inc     2541
Local-gov            2093
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Name: count, dtype: Int64
--------------------
--------------------
education_num
9     9968
10    6775
13    5182
14    1675
11    1321
7     1056
12    1020
6      831
4      573
15     558
5      463
16     398
8      393
3      303
2      156
1       46
Name: count, dtype: int64
--------------------
--------------------
marital_status
Married-civ-spouse       14339
Never-married             9912
Divorced                  4258
Separated                  959
Widowed                    840
Married-spouse-absent      389
Married-AF-spouse           21
Name: count, dtype: Int64
---------

### Separate the features X from the target y

In [44]:
y = census_df.income

X = census_df.copy()
X.drop('income', axis=1, inplace=True)


In [45]:
X = pd.get_dummies(X, dtype='int64')
X.head()

Unnamed: 0,age,education_num,capital_gain,capital_loss,hours_per_week,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male
1,82,9,0,4356,18,0,0,1,0,0,...,0,0,0,0,0,0,0,1,1,0
3,54,4,0,3900,40,0,0,1,0,0,...,0,1,0,0,0,0,0,1,1,0
4,41,10,0,3900,40,0,0,1,0,0,...,1,0,0,0,0,0,0,1,1,0
5,34,9,0,3770,45,0,0,1,0,0,...,0,1,0,0,0,0,0,1,1,0
6,38,6,0,3770,40,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,1


### Separate the data into training and testing subsets.

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

### Scale the data using `StandardScaler`

In [47]:
# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler =scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled = X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)

In [48]:
# Import the KNeighborsClassifier module from sklearn
from sklearn.neighbors import KNeighborsClassifier

# Instantiate the KNeighborsClassifier model with n_neighbors = 3 
knn = KNeighborsClassifier(n_neighbors=3)

# Train the model using the training data
knn = knn.fit(X_train_scaled, y_train)

In [33]:
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(23038, 44) (7680, 44)
(23038,) (7680,)


In [None]:
# Create predictions using the testing data
y_pred = knn.predict(X_test_scaled)
y_pred

### Random Forest Model

In [50]:
from sklearn.ensemble import RandomForestClassifier

In [51]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=700, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [52]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [54]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5301,486
Actual 1,716,1177


Accuracy Score : 0.8434895833333333
Classification Report
              precision    recall  f1-score   support

       <=50K       0.88      0.92      0.90      5787
        >50K       0.71      0.62      0.66      1893

    accuracy                           0.84      7680
   macro avg       0.79      0.77      0.78      7680
weighted avg       0.84      0.84      0.84      7680



In [42]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf_model.feature_importances_
# We can sort the features by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.29044283296602547, 'age'),
 (0.16058023384104497, 'education_num'),
 (0.14050933217266962, 'hours_per_week'),
 (0.07510300982229313, 'marital_status_Married-civ-spouse'),
 (0.0457306856844235, 'relationship_Husband'),
 (0.02881879352887919, 'marital_status_Never-married'),
 (0.023350981591108756, 'occupation_Exec-managerial'),
 (0.020943470058294507, 'occupation_Prof-specialty'),
 (0.012495619836575614, 'workclass_Private'),
 (0.011950348924270524, 'relationship_Wife'),
 (0.011435737996522786, 'relationship_Not-in-family'),
 (0.010605462049171517, 'workclass_Self-emp-not-inc'),
 (0.010389705231624375, 'relationship_Own-child'),
 (0.010202008039903872, 'occupation_Other-service'),
 (0.009912837185837964, 'sex_Male'),
 (0.009803225308190371, 'sex_Female'),
 (0.008768126707006507, 'race_White'),
 (0.00861639389405347, 'occupation_Sales'),
 (0.008591101339085047, 'occupation_Craft-repair'),
 (0.008038731466725872, 'workclass_Self-emp-inc'),
 (0.007262866933839938, 'marital_status_Divor