# IMPORT LIBRARIES

In [623]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

In [624]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
from xgboost import XGBClassifier

# DATASET

In [725]:
# Read the Adult Census Income CSV into the working frame `df`.
DATA_PATH = '../input/adult-census-income/adult.csv'
df = pd.read_csv(DATA_PATH)

In [726]:
# Preview only the first rows; the full shape is reported in the next cell.
df.head()

In [727]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

# DATA INFO

In [667]:
# Column dtypes and non-null counts of the raw data.
df.info()

The data marks missing values with `?` rather than NaN, so we replace `?` with NaN.

In [668]:
# Turn the '?' placeholder into real missing values so pandas can count them.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df = df.replace("?", np.nan)

# Missing values per column.
df.isnull().sum()

So we see that workclass, occupation and native.country have null values.

In [669]:
# Encode the target as a binary label: '<=50K' -> 0, '>50K' -> 1.
# Any value not matching one of these two keys becomes NaN under .map().
df['income']=df['income'].map({'<=50K':0,'>50K':1})

In [670]:
# Re-check dtypes and non-null counts after the '?' replacement and income encoding.
df.info()

# DATA EXPLORATION

## CATEGORICAL VARIABLES

In [671]:
import matplotlib

# Default to a wide canvas for the plots that follow.
matplotlib.rcParams.update({'figure.figsize': (20, 10)})

In [672]:
# Record counts per workclass, split by income label.
sns.countplot(x='workclass',data=df,hue='income')

In [673]:
# Percentage share of each workclass category.
df['workclass'].value_counts(normalize=True)*100

In [674]:
# Record counts per education level, split by income label.
sns.countplot(x='education',data=df,hue='income')

In [675]:
# Percentage share of each education level.
df['education'].value_counts(normalize=True)*100

In [676]:
# Record counts per marital status, split by income label.
sns.countplot(x='marital.status',data=df,hue='income')

In [677]:
# Record counts per relationship category, split by income label.
sns.countplot(x='relationship',data=df,hue='income')

In [678]:
# Record counts per occupation, split by income label.
sns.countplot(x='occupation',data=df,hue='income')

In [679]:
# Record counts per race category, split by income label.
sns.countplot(x='race',data=df,hue='income')

In [680]:
# Record counts per sex, split by income label.
sns.countplot(x='sex',data=df,hue='income')

In [681]:
# Record counts per native country, split by income label.
sns.countplot(x='native.country',data=df,hue='income')

In [682]:
# Percentage share of each native country.
df['native.country'].value_counts(normalize=True)*100

# OBSERVATIONS

1) The Private workclass is the majority, and very few people in it have income >50K.

2) The Without-pay and Never-worked classes are tiny minorities, at less than 1%.

3) Only the self-employed (running a business, etc.) have more people earning above 50K than people earning less than 50K.

4) The education levels 1st-4th, 5th-6th, 12th, 11th, 9th, Preschool, 7th-8th, 10th, HS-grad, Some-college, Bachelors, Assoc-voc and Assoc-acdm all have more 0's than 1's.

5) Marital status does not appear to provide any information about salary, as being married or not has no effect on income (also because the relationship graph shows a rough balance of the target for husbands and wives, while the other categories are seriously imbalanced).

6) For race, White is the dominant category while all the others are minorities.

7) The United States is the native country for the large majority of records, while the other countries are each a small minority.

## ACTIONS

1) Since Private is the majority workclass and the remaining categories (except self-employed) are minorities, we can club the remaining categories together, or drop the column altogether.

2) We can combine all school-level education categories (Preschool, 1st-4th, 5th-6th, 7th-8th, 9th, 10th, 11th, 12th) into a single "School" category.

3)We can classify marital status as married and not married

4)We can combine non white race as Others.

5)We can combine the country as US and non-US

## CONTINUOUS VARIABLE

In [683]:
# Age distribution, coloured by income label.
sns.histplot(x='age',data=df,hue='income')

In [684]:
# Explore Age vs Income: one panel per income label.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay and density
# scaling is the documented replacement for its default appearance.
g = sns.FacetGrid(df, col='income')
g = g.map(sns.histplot, "age", kde=True, stat="density")

In [685]:
# Capital-gain distribution, coloured by income label.
sns.histplot(x='capital.gain',data=df,hue='income')

In [686]:
# Summary statistics for capital.gain.
df['capital.gain'].describe()

In [687]:
# Capital-loss distribution, coloured by income label.
sns.histplot(x='capital.loss',data=df,hue='income')

In [688]:
# Summary statistics for capital.loss.
df['capital.loss'].describe()

In [689]:
# Weekly working-hours distribution, coloured by income label.
sns.histplot(x='hours.per.week',data=df,hue='income')

In [690]:
# Summary statistics for hours.per.week.
df['hours.per.week'].describe()

# OBSERVATIONS

1) Young people (below 25) do not earn above 50K, middle-aged people (up to about 60) do earn above 50K, and after that very few (almost none) earn above 50K.

2)We see that the distribution of capital gain and loss is centred about zero and there are outliers in both

3)The distribution of hours per week is symmetric

# FEATURE ENGINEERING

In [691]:
# Sex becomes a binary flag: Male -> 0, Female -> 1.
sex_codes = {'Male': 0, 'Female': 1}
df['sex'] = df['sex'].map(sex_codes)

# Collapse marital status into Single / Married, then encode Married as 1
# and Single as 0.
single_labels = ['Never-married', 'Divorced', 'Separated', 'Widowed']
married_labels = ['Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse']
marital = df['marital.status'].replace(single_labels, 'Single')
marital = marital.replace(married_labels, 'Married')
df['marital.status'] = marital.map({'Married': 1, 'Single': 0}).astype(int)

# The relationship column is not used as a feature.
df.drop(columns=['relationship'], inplace=True)

In [692]:
# The numeric education.num column is dropped; only the categorical
# `education` column is kept.
df = df.drop(columns=['education.num'])

# Merge all pre-high-school-graduate levels into one bucket. The result is
# assigned back to the column: calling `.replace(..., inplace=True)` on
# `df['education']` acts on a temporary Series and is not guaranteed to reach
# `df` (it is a no-op under pandas copy-on-write).
# The leading space in ' School' is kept exactly as before so that the label
# encoding applied to this column in the next step is unchanged.
school_levels = ['11th', '9th', '7th-8th', '5th-6th', '10th', '1st-4th', 'Preschool', '12th']
df['education'] = df['education'].replace(school_levels, ' School')

In [693]:
# Integer-encode the listed categorical columns, refitting the shared encoder
# for each column in turn.
categorical = ['education']
label_encoder = LabelEncoder()
for name in categorical:
    df[name] = label_encoder.fit_transform(df[name])

In [694]:
# Binary race indicator: 1 for 'White', 0 for every other category.
# Missing values are treated as 'White'.
# The previous version replaced minority categories with ' Others' (leading
# space) and then mapped the key 'Others', so every non-White row became NaN.
# Comparing directly against 'White' avoids the string mismatch entirely.
race_filled = df['race'].fillna('White')
df['race'] = (race_filled == 'White').astype(int)

In [695]:
# Fill missing native.country values with 'United-States'.
df['native.country']=df['native.country'].fillna('United-States')

In [696]:
def convert(x):
    """Recode values in place: 1 for 'United-States', 0 for anything else.

    Parameters
    ----------
    x : pandas.Series or mutable sequence
        Values to recode. The object is modified in place.

    Returns
    -------
    The same object ``x`` after recoding. Existing callers that ignore the
    return value and rely on the in-place mutation keep working.
    """
    if isinstance(x, pd.Series):
        # Write by position with one vectorised comparison. Integer `x[i]`
        # access on a Series is label-based, so it fails (or touches the
        # wrong rows) whenever the index is not 0..n-1.
        x.iloc[:] = (x == 'United-States').astype(int).to_numpy()
        return x

    # Plain sequences (lists, arrays): positional element-wise recode.
    for i in range(len(x)):
        x[i] = 1 if x[i] == 'United-States' else 0
    return x

In [697]:
# Recode native.country to 1 (United-States) / 0 (other).
# `convert` mutates its argument, so it is run on an explicit copy that is
# then assigned back. Mutating the temporary `df['native.country']` directly
# is not guaranteed to update `df` (it does not under pandas copy-on-write).
country_flags = df['native.country'].copy()
convert(country_flags)
df['native.country'] = country_flags

In [698]:
# Cast the recoded native.country column to an integer dtype.
df['native.country']=df['native.country'].astype('int')

In [699]:
# Occupation counts by income label, shown again ahead of the occupation grouping in the next cell.
sns.countplot(x='occupation',data=df,hue='income')

In [700]:
# Bucket occupations into 'highpay' and 'lowpay'.
# Each occupation appears in exactly one list. Previously 'Tech-support' was
# also listed under low pay (never reached, because the high-pay replacement
# runs first) and 'Other-service' was listed twice; the resulting values are
# unchanged.
high_pay_jobs = ['Exec-managerial', 'Prof-specialty', 'Tech-support', 'Protective-serv']
low_pay_jobs = ['Machine-op-inspct', 'Other-service', 'Adm-clerical', 'Craft-repair',
                'Transport-moving', 'Handlers-cleaners', 'Sales', 'Farming-fishing',
                'Armed-Forces', 'Priv-house-serv']
df['occupation'] = df['occupation'].replace(high_pay_jobs, 'highpay')
df['occupation'] = df['occupation'].replace(low_pay_jobs, 'lowpay')


In [701]:
# Impute missing occupations with the most frequent value.
# The result is assigned back: `fillna(..., inplace=True)` on `df['occupation']`
# acts on a temporary Series and is not guaranteed to update `df`.
df['occupation'] = df['occupation'].fillna(df['occupation'].mode()[0])

In [702]:
# Encode the occupation buckets as integers: highpay -> 1, lowpay -> 0.
occupation_codes = {'highpay': 1, 'lowpay': 0}
df['occupation'] = df['occupation'].map(occupation_codes).astype(int)

In [703]:
# Any race value still missing after encoding is treated as 0 (non-White).
# The result is assigned back: `fillna(..., inplace=True)` on `df['race']`
# acts on a temporary Series and is not guaranteed to update `df`.
df['race'] = df['race'].fillna(0)

In [704]:
# workclass is not used as a feature; remove the column.
df = df.drop(columns='workclass')

# Train Test Split and scaling

In [705]:
# Separate the target from the features, then hold out 30% of the rows for
# testing. The split is stratified on the target and uses a fixed seed.
y = df['income']
x = df.drop(columns='income')

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, stratify=y, random_state=0
)

In [706]:
# Standardise features using statistics fitted on the training split only,
# then apply the same transform to the test split.
scaler = StandardScaler()

# Keep the original row index on the rebuilt frames. Without it they get a
# fresh 0..n-1 index and no longer align with y_train / y_test, which still
# carry the index from the split.
x_train = pd.DataFrame(scaler.fit_transform(x_train), columns=x.columns, index=x_train.index)
x_test = pd.DataFrame(scaler.transform(x_test), columns=x.columns, index=x_test.index)

# MODEL

## Logistic Regression

In [707]:
# Fit logistic regression on the scaled training data and predict test labels.
model_logistic = LogisticRegression(max_iter=100000).fit(x_train, y_train)
y_pred = model_logistic.predict(x_test)

In [708]:
# Test-set metrics for the logistic-regression model.
# ROC AUC is a ranking metric, so it is computed from the predicted
# probability of the positive class rather than from the hard 0/1 labels.
y_score = model_logistic.predict_proba(x_test)[:, 1]

print("-------------------------TEST SCORES-----------------------")
print(f"Recall: { round(recall_score(y_test, y_pred)*100, 4) }")
print(f"Precision: { round(precision_score(y_test, y_pred)*100, 4) }")
print(f"F1-Score: { round(f1_score(y_test, y_pred)*100, 4) }")
print(f"Accuracy score: { round(accuracy_score(y_test, y_pred)*100, 4) }")
print(f"AUC Score: { round(roc_auc_score(y_test, y_score)*100, 4) }")

## Random  Forest (GINI)

In [709]:
# Random forest of 50 trees using the Gini criterion, with a fixed seed.
cf = RandomForestClassifier(n_estimators=50, criterion='gini', random_state=0).fit(x_train, y_train)
y_pred = cf.predict(x_test)

In [710]:
# Test-set metrics for the classifier currently held in `cf` (assumes the
# preceding cell was the last one to fit it).
# ROC AUC is a ranking metric, so it is computed from the predicted
# probability of the positive class rather than from the hard 0/1 labels.
y_score = cf.predict_proba(x_test)[:, 1]

print("-------------------------TEST SCORES-----------------------")
print(f"Recall: { round(recall_score(y_test, y_pred)*100, 4) }")
print(f"Precision: { round(precision_score(y_test, y_pred)*100, 4) }")
print(f"F1-Score: { round(f1_score(y_test, y_pred)*100, 4) }")
print(f"Accuracy score: { round(accuracy_score(y_test, y_pred)*100, 4) }")
print(f"AUC Score: { round(roc_auc_score(y_test, y_score)*100, 4) }")

## Random Forest (Entropy)

In [711]:
# Random forest of 50 trees using the entropy criterion, with a fixed seed.
cf = RandomForestClassifier(n_estimators=50, criterion='entropy', random_state=0).fit(x_train, y_train)
y_pred = cf.predict(x_test)

In [712]:
# Test-set metrics for the classifier currently held in `cf` (assumes the
# preceding cell was the last one to fit it).
# ROC AUC is a ranking metric, so it is computed from the predicted
# probability of the positive class rather than from the hard 0/1 labels.
y_score = cf.predict_proba(x_test)[:, 1]

print("-------------------------TEST SCORES-----------------------")
print(f"Recall: { round(recall_score(y_test, y_pred)*100, 4) }")
print(f"Precision: { round(precision_score(y_test, y_pred)*100, 4) }")
print(f"F1-Score: { round(f1_score(y_test, y_pred)*100, 4) }")
print(f"Accuracy score: { round(accuracy_score(y_test, y_pred)*100, 4) }")
print(f"AUC Score: { round(roc_auc_score(y_test, y_score)*100, 4) }")

## KNN(n=6)

In [713]:
# Elbow plot over K-Means cluster counts on the scaled training features.
# K-Means initialisation is random, so the estimator is seeded to make the
# plot reproducible across runs.
# NOTE(review): this elbow concerns the number of K-Means clusters, which is a
# different quantity from KNN's n_neighbors used in the next cell -- confirm
# how n_neighbors=6 was chosen.
Elbow_M = KElbowVisualizer(KMeans(random_state=0), k=10, timings=False)
Elbow_M.fit(x_train)
Elbow_M.show()
plt.show()

In [714]:
# K-nearest-neighbours classifier with 6 neighbours, votes weighted by distance.
cf=KNeighborsClassifier(n_neighbors=6,weights='distance')
# The fitted estimator is the cell's last expression, so its repr is displayed.
cf.fit(x_train,y_train)

In [715]:
# Predict test-set labels with the classifier currently held in `cf`.
y_pred=cf.predict(x_test)

In [716]:
# Test-set metrics for the classifier currently held in `cf` (assumes the
# preceding cells were the last ones to fit it).
# ROC AUC is a ranking metric, so it is computed from the predicted
# probability of the positive class rather than from the hard 0/1 labels.
y_score = cf.predict_proba(x_test)[:, 1]

print("-------------------------TEST SCORES-----------------------")
print(f"Recall: { round(recall_score(y_test, y_pred)*100, 4) }")
print(f"Precision: { round(precision_score(y_test, y_pred)*100, 4) }")
print(f"F1-Score: { round(f1_score(y_test, y_pred)*100, 4) }")
print(f"Accuracy score: { round(accuracy_score(y_test, y_pred)*100, 4) }")
print(f"AUC Score: { round(roc_auc_score(y_test, y_score)*100, 4) }")

## Decision Trees (GINI)

In [717]:
# Single decision tree using the Gini criterion, with a fixed seed.
cf = DecisionTreeClassifier(criterion="gini", random_state=0).fit(x_train, y_train)
y_pred = cf.predict(x_test)

In [718]:
# Test-set metrics for the classifier currently held in `cf` (assumes the
# preceding cell was the last one to fit it).
# ROC AUC is a ranking metric, so it is computed from the predicted
# probability of the positive class rather than from the hard 0/1 labels.
y_score = cf.predict_proba(x_test)[:, 1]

print("-------------------------TEST SCORES-----------------------")
print(f"Recall: { round(recall_score(y_test, y_pred)*100, 4) }")
print(f"Precision: { round(precision_score(y_test, y_pred)*100, 4) }")
print(f"F1-Score: { round(f1_score(y_test, y_pred)*100, 4) }")
print(f"Accuracy score: { round(accuracy_score(y_test, y_pred)*100, 4) }")
print(f"AUC Score: { round(roc_auc_score(y_test, y_score)*100, 4) }")

## Decision Trees (Entropy)

In [719]:
# Single decision tree using the entropy criterion, with a fixed seed.
cf = DecisionTreeClassifier(criterion="entropy", random_state=0).fit(x_train, y_train)
y_pred = cf.predict(x_test)

In [720]:
# Test-set metrics for the classifier currently held in `cf` (assumes the
# preceding cell was the last one to fit it).
# ROC AUC is a ranking metric, so it is computed from the predicted
# probability of the positive class rather than from the hard 0/1 labels.
y_score = cf.predict_proba(x_test)[:, 1]

print("-------------------------TEST SCORES-----------------------")
print(f"Recall: { round(recall_score(y_test, y_pred)*100, 4) }")
print(f"Precision: { round(precision_score(y_test, y_pred)*100, 4) }")
print(f"F1-Score: { round(f1_score(y_test, y_pred)*100, 4) }")
print(f"Accuracy score: { round(accuracy_score(y_test, y_pred)*100, 4) }")
print(f"AUC Score: { round(roc_auc_score(y_test, y_score)*100, 4) }")

## XGB Classifier

In [721]:
# Gradient-boosted trees with XGBoost's default hyperparameters.
cf = XGBClassifier().fit(x_train, y_train)
y_pred = cf.predict(x_test)

In [722]:
# Test-set metrics for the classifier currently held in `cf` (assumes the
# preceding cell was the last one to fit it).
# ROC AUC is a ranking metric, so it is computed from the predicted
# probability of the positive class rather than from the hard 0/1 labels.
y_score = cf.predict_proba(x_test)[:, 1]

print("-------------------------TEST SCORES-----------------------")
print(f"Recall: { round(recall_score(y_test, y_pred)*100, 4) }")
print(f"Precision: { round(precision_score(y_test, y_pred)*100, 4) }")
print(f"F1-Score: { round(f1_score(y_test, y_pred)*100, 4) }")
print(f"Accuracy score: { round(accuracy_score(y_test, y_pred)*100, 4) }")
print(f"AUC Score: { round(roc_auc_score(y_test, y_score)*100, 4) }")