In [1]:
# grv08singh@gmail.com

In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [51]:
df = pd.read_csv('census-income.csv')
df.head(2)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Unnamed: 14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


In [52]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'Unnamed: 14'],
      dtype='object')

In [53]:
df['workclass']=df['workclass'].replace('?','Others')

In [54]:
df['occupation']=df['occupation'].replace('?','Other-service')

In [55]:
df['native-country']=df['native-country'].replace('?','United-States')

In [56]:
df = df.rename(columns={'Unnamed: 14':'Salary'})
df.head(2)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


In [57]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'Salary'],
      dtype='object')

In [58]:
df_le = LabelEncoder()
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df_le.fit_transform(df[col])

In [59]:
df.head(2)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Salary
0,39,7,77516,9,13,4,0,1,4,1,2174,0,40,38,0
1,50,6,83311,9,13,2,3,0,4,1,0,0,13,38,0


In [60]:
X = df.drop(columns=['Salary'])
y = df['Salary']

In [61]:
ss = StandardScaler()
X_scaled = pd.DataFrame(ss.fit_transform(X))

In [62]:
X_scaled_train, X_scaled_test, y_train, y_test = train_test_split(X_scaled, y, train_size=0.2, random_state=42)

In [68]:
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

In [81]:
param_dist = {
    'n_estimators':[100,150,200,250,300,350,400,450,500],
    'max_depth':[3,4,5,6,7],
    'min_samples_split':[2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20],
    'min_samples_leaf':[1,2,3,4,5,6,7,8,9,10],
    'max_features':[0.3,0.4,0.5,0.6,0.7],
    'bootstrap':[True,False]
}

In [83]:
grid = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=50, cv=5, scoring='accuracy', n_jobs=-1, random_state=42, verbose=1)

In [84]:
grid.fit(X_scaled_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [85]:
grid.best_estimator_

In [86]:
rf = RandomForestClassifier(max_depth=7, max_features=0.7, min_samples_split=7, n_estimators=350, n_jobs=-1, random_state=42)

In [87]:
rf.fit(X_scaled_train, y_train)

In [88]:
y_pred = rf.predict(X_scaled_test)

In [89]:
accuracy_score(y_test, y_pred)

0.8537371876079696

In [90]:
confusion_matrix(y_test, y_pred)

array([[18883,   893],
       [ 2917,  3356]])

In [91]:
precision_score(y_test, y_pred)

0.789832901859261

In [92]:
recall_score(y_test, y_pred)

0.5349912322652638

In [93]:
f1_score(y_test, y_pred)

0.6379015396312488

# Questions

In [29]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'Salary'],
      dtype='object')

In [36]:
df[df['Salary'] == '>50K'].shape[0] / df.shape[0]

0.2409257153394597

In [35]:
df.duplicated().sum()

np.int64(0)

In [38]:
df['marital-status'].value_counts()

marital-status
Married-civ-spouse       14970
Never-married            10667
Divorced                  4441
Separated                 1025
Widowed                    993
Married-spouse-absent      418
Married-AF-spouse           23
Name: count, dtype: int64

In [33]:
df = df.drop_duplicates()

In [39]:
df[(df['marital-status'] == 'Never-married') & (df['hours-per-week'] < 20 )].shape[0]

903

In [41]:
df['age'].min(), df['age'].max(), df['age'].median(), 

(17, 90, 37.0)

In [42]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'Salary'],
      dtype='object')

In [48]:
df.groupby('native-country').count().T.sum().sort_values(ascending=True)

native-country
Holand-Netherlands                14
Scotland                         168
Hungary                          182
Honduras                         182
Outlying-US(Guam-USVI-etc)       196
Yugoslavia                       224
Laos                             252
Thailand                         252
Trinadad&Tobago                  266
Cambodia                         266
Hong                             280
Ireland                          336
Ecuador                          392
Greece                           406
France                           406
Peru                             434
Nicaragua                        476
Portugal                         518
Iran                             602
Haiti                            616
Taiwan                           714
Columbia                         826
Poland                           840
Japan                            868
Guatemala                        868
Vietnam                          938
Dominican-Republic     