In [169]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import plot_tree

In [105]:
file_url = "https://media.githubusercontent.com/media/musthave-ML10/data_source/main/salary.csv"
df = pd.read_csv(file_url, skipinitialspace = True)

In [106]:
df.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


In [109]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       46043 non-null  object
 2   education       48842 non-null  object
 3   education-num   48842 non-null  int64 
 4   marital-status  48842 non-null  object
 5   occupation      46033 non-null  object
 6   relationship    48842 non-null  object
 7   race            48842 non-null  object
 8   sex             48842 non-null  object
 9   capital-gain    48842 non-null  int64 
 10  capital-loss    48842 non-null  int64 
 11  hours-per-week  48842 non-null  int64 
 12  native-country  47985 non-null  object
 13  class           48842 non-null  object
dtypes: int64(5), object(9)
memory usage: 5.2+ MB


In [111]:
df.describe()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,10.078089,1079.067626,87.502314,40.422382
std,13.71051,2.570973,7452.019058,403.004552,12.391444
min,17.0,1.0,0.0,0.0,1.0
25%,28.0,9.0,0.0,0.0,40.0
50%,37.0,10.0,0.0,0.0,40.0
75%,48.0,12.0,0.0,0.0,45.0
max,90.0,16.0,99999.0,4356.0,99.0


In [113]:
df['class'].unique()

array(['<=50K', '>50K'], dtype=object)

In [115]:
df['class'] = df['class'].map({"<=50K" : 0, ">50K" : 1})

In [117]:
df.dtypes

age                int64
workclass         object
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
class              int64
dtype: object

In [119]:
obj_list = [i for i in df.columns if df[i].dtype == "object"]

In [121]:
obj_list

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [123]:
for i in obj_list:
    print(i, df[i].nunique())

workclass 8
education 16
marital-status 7
occupation 14
relationship 6
race 5
sex 2
native-country 41


In [125]:
df['education'].value_counts()

education
HS-grad         15784
Some-college    10878
Bachelors        8025
Masters          2657
Assoc-voc        2061
11th             1812
Assoc-acdm       1601
10th             1389
7th-8th           955
Prof-school       834
9th               756
12th              657
Doctorate         594
5th-6th           509
1st-4th           247
Preschool          83
Name: count, dtype: int64

In [127]:
np.sort(df['education-num'].unique())

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16],
      dtype=int64)

In [129]:
df.drop("education", axis = 1, inplace = True)

In [131]:
df["occupation"].value_counts()

occupation
Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
Transport-moving     2355
Handlers-cleaners    2072
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
Priv-house-serv       242
Armed-Forces           15
Name: count, dtype: int64

In [133]:
df[df['occupation'].isnull()]

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
4,18,,10,Never-married,,Own-child,White,Female,0,0,30,United-States,0
6,29,,9,Never-married,,Unmarried,Black,Male,0,0,40,United-States,0
13,58,,9,Married-civ-spouse,,Husband,White,Male,0,0,35,United-States,0
22,72,,4,Divorced,,Not-in-family,White,Female,0,0,6,United-States,0
35,65,,9,Married-civ-spouse,,Husband,White,Male,0,0,40,United-States,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
48811,35,,13,Married-civ-spouse,,Wife,White,Female,0,0,55,United-States,1
48812,30,,13,Never-married,,Not-in-family,Asian-Pac-Islander,Female,0,0,99,United-States,0
48820,71,,16,Married-civ-spouse,,Husband,White,Male,0,0,10,United-States,1
48822,41,,9,Separated,,Not-in-family,Black,Female,0,0,32,United-States,0


In [135]:
df.groupby("native-country").mean(numeric_only = True).sort_values('class')

Unnamed: 0_level_0,age,education-num,capital-gain,capital-loss,hours-per-week,class
native-country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Holand-Netherlands,32.0,10.0,0.0,2205.0,40.0,0.0
Guatemala,32.090909,6.306818,167.875,18.113636,38.715909,0.034091
Outlying-US(Guam-USVI-etc),38.826087,10.043478,0.0,76.608696,41.347826,0.043478
Columbia,39.458824,9.258824,125.364706,65.247059,39.929412,0.047059
Dominican-Republic,37.970874,7.320388,1064.456311,39.029126,41.621359,0.048544
Mexico,33.635121,6.026288,415.954784,32.656151,40.21346,0.049422
Nicaragua,36.285714,9.0,138.653061,69.938776,36.938776,0.061224
El-Salvador,33.380645,6.722581,392.76129,36.367742,36.36129,0.070968
Trinadad&Tobago,39.259259,8.962963,116.185185,156.518519,38.888889,0.074074
Vietnam,34.616279,9.616279,604.802326,86.372093,37.976744,0.081395


In [137]:
country_group = df.groupby("native-country")['class'].mean()

In [139]:
country_group = country_group.reset_index()

In [141]:
country_group

Unnamed: 0,native-country,class
0,Cambodia,0.321429
1,Canada,0.346154
2,China,0.295082
3,Columbia,0.047059
4,Cuba,0.246377
5,Dominican-Republic,0.048544
6,Ecuador,0.133333
7,El-Salvador,0.070968
8,England,0.370079
9,France,0.421053


In [143]:
df = df.merge(country_group, on = 'native-country', how = 'left')

In [145]:
df.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class_x,class_y
0,25,Private,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,0,0.243977
1,38,Private,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,0,0.243977
2,28,Local-gov,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,1,0.243977
3,44,Private,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,1,0.243977
4,18,,10,Never-married,,Own-child,White,Female,0,0,30,United-States,0,0.243977


In [147]:
df.drop('native-country', axis = 1, inplace = True)

In [149]:
df = df.rename(columns = {"class_x":"class", "class_y" : "native-country"})

In [151]:
df.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,class,native-country
0,25,Private,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,0,0.243977
1,38,Private,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,0,0.243977
2,28,Local-gov,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,1,0.243977
3,44,Private,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,1,0.243977
4,18,,10,Never-married,,Own-child,White,Female,0,0,30,0,0.243977


In [153]:
df.isnull().mean()

age               0.000000
workclass         0.057307
education-num     0.000000
marital-status    0.000000
occupation        0.057512
relationship      0.000000
race              0.000000
sex               0.000000
capital-gain      0.000000
capital-loss      0.000000
hours-per-week    0.000000
class             0.000000
native-country    0.017546
dtype: float64

In [155]:
df["native-country"] = df["native-country"].fillna(-99)

In [157]:
df['workclass'].value_counts()

workclass
Private             33906
Self-emp-not-inc     3862
Local-gov            3136
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: count, dtype: int64

In [159]:
df['workclass'] = df['workclass'].fillna('Private')

In [161]:
df['occupation'].value_counts()

occupation
Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
Transport-moving     2355
Handlers-cleaners    2072
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
Priv-house-serv       242
Armed-Forces           15
Name: count, dtype: int64

In [163]:
df['occupation'] = df['occupation'].fillna("Unknown")

In [165]:
df = pd.get_dummies(df, drop_first = True, dtype='int')

In [167]:
df.head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,class,native-country,workclass_Local-gov,workclass_Never-worked,workclass_Private,...,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Male
0,25,7,0,0,40,0,0.243977,0,0,1,...,0,0,1,0,0,0,1,0,0,1
1,38,9,0,0,50,0,0.243977,0,0,1,...,0,0,0,0,0,0,0,0,1,1
2,28,12,0,0,40,1,0.243977,1,0,0,...,0,0,0,0,0,0,0,0,1,1
3,44,10,7688,0,40,1,0.243977,0,0,1,...,0,0,0,0,0,0,1,0,0,1
4,18,10,0,0,30,0,0.243977,0,0,1,...,0,0,1,0,0,0,0,0,1,0


In [193]:
x_train, x_test, y_train, y_test = train_test_split(df.drop("class", axis = 1), df['class'], test_size = 0.25, random_state = 12)

In [205]:
model = DecisionTreeClassifier()

In [207]:
model.fit(x_train, y_train)

In [209]:
pred = model.predict(x_test)

In [211]:
accuracy_score(y_test, pred)

0.8192613217590697

In [213]:
model = DecisionTreeClassifier()
model.fit(x_train, y_train)
train_pred = model.predict(x_train)
test_pred = model.predict(x_test)
print("Train_score:", accuracy_score(y_train, train_pred), "Test_score:", accuracy_score(y_test, test_pred))

Train_score: 0.9753760476099479 Test_score: 0.8203259356318073
