# PART 1 INTRODUCTION TO DATA:

In [1]:
import numpy as np
import pandas as pd
# Markers that pandas should read in as missing values (NaN) while parsing the CSV.
missing_markers = ["#NAME?"]
df = pd.read_csv("adult.csv", na_values=missing_markers)

In [2]:
# Plain-text preview of the first five rows (head() defaults to n=5).
print(df.head())

    age         workclass    fnlwgt  education  education_num  \
0  39.0         State-gov   77516.0  Bachelors           13.0   
1  50.0  Self-emp-not-inc   83311.0  Bachelors           13.0   
2  38.0           Private  215646.0    HS-grad            9.0   
3  53.0           Private  234721.0       11th            7.0   
4  28.0           Private  338409.0  Bachelors           13.0   

       marital_status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black     NaN   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital_gain  capital_loss  hours_per_week native_country income  
0          2174             0              40  United-States  <=50K  
1             0         

In [3]:
# Summary statistics for the numeric columns. A count below the total number of rows
# means that column contains missing values.
df.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,4952.0,4893.0,4943.0,5000.0,5000.0,5000.0
mean,38.58643,190975.2,10.080316,1033.6402,93.6968,40.519
std,13.582256,106574.7,2.535268,7051.802077,410.801418,12.109193
min,17.0,19302.0,1.0,0.0,0.0,1.0
25%,28.0,117747.0,9.0,0.0,0.0,40.0
50%,37.0,179533.0,10.0,0.0,0.0,40.0
75%,47.0,241895.0,12.0,0.0,0.0,45.0
max,90.0,1033222.0,16.0,99999.0,2547.0,99.0


In [4]:
# Display the DataFrame; pandas elides the middle rows in the rendered output.
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,,0,0,40,United-States,<=50K
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,43.0,Private,222971.0,5th-6th,3.0,Never-married,Machine-op-inspct,Unmarried,White,Female,0,0,40,Mexico,<=50K
4996,31.0,Private,259425.0,HS-grad,9.0,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,>50K
4997,47.0,Self-emp-inc,212120.0,HS-grad,9.0,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,>50K
4998,,Private,245880.0,HS-grad,9.0,Never-married,Adm-clerical,Not-in-family,White,Male,0,0,60,United-States,<=50K


In [5]:
# The target column before encoding: string labels "<=50K" / ">50K".
df["income"]

0       <=50K
1       <=50K
2       <=50K
3       <=50K
4       <=50K
        ...  
4995    <=50K
4996     >50K
4997     >50K
4998    <=50K
4999    <=50K
Name: income, Length: 5000, dtype: object

In [6]:
df["income"].value_counts()  # class balance: how many people earn <=50K versus >50K

<=50K    3779
>50K     1221
Name: income, dtype: int64

## Convert the 'income' column's string labels ('<=50K' and '>50K') into numerical values (0 and 1) using a mapping dictionary, updating the DataFrame accordingly.

In [7]:
# Numeric code for each income label.
income_mapping = {
    "<=50K": 0,
    ">50K": 1,
}

# Series.map turns every value that is not a key of the dict into NaN, so running this
# cell a second time (when the column already holds 0/1) would wipe out the target.
# Encoding only while the column is still non-numeric keeps the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(df["income"]):
    df["income"] = df["income"].map(income_mapping)

# For this data the same result can also be written as
#  df["income"] = [0 if x == "<=50K" else 1 for x in df["income"]]
# (note: that form maps anything other than "<=50K" to 1).

In [8]:
# Check the encoding: the column should now hold integer 0/1 values.
df["income"]

0       0
1       0
2       0
3       0
4       0
       ..
4995    0
4996    1
4997    1
4998    0
4999    0
Name: income, Length: 5000, dtype: int64

## Assign X as a DataFrame of features and y as a Series of the outcome variable

In [9]:
# Outcome => the value to be predicted => dependent variable.
y = df["income"]
# Features => every remaining column => independent variables.
# drop(columns=...) removes column labels, the same as passing the label with axis=1
# (axis=0 / 'index' would drop row labels instead).
X = df.drop(columns="income")

In [10]:
# First five feature rows (head() defaults to n=5); the income column is no longer present.
print(X.head())

    age         workclass    fnlwgt  education  education_num  \
0  39.0         State-gov   77516.0  Bachelors           13.0   
1  50.0  Self-emp-not-inc   83311.0  Bachelors           13.0   
2  38.0           Private  215646.0    HS-grad            9.0   
3  53.0           Private  234721.0       11th            7.0   
4  28.0           Private  338409.0  Bachelors           13.0   

       marital_status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black     NaN   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital_gain  capital_loss  hours_per_week native_country  
0          2174             0              40  United-States  
1             0             0         

In [11]:
# First five encoded target values (head() defaults to n=5).
print(y.head())

0    0
1    0
2    0
3    0
4    0
Name: income, dtype: int64


# PART 2 BASIC DATA CLEANING

In [12]:
# Peek at a categorical column: education is stored as text labels.
df["education"].head()

0    Bachelors
1    Bachelors
2      HS-grad
3         11th
4    Bachelors
Name: education, dtype: object

In [13]:
# These are categorical features, so they must be expressed as numerical features to build a model.
# Categorical and ordinal features should be converted to numerical features.
# get_dummies produces one 0/1 indicator column per category; the result is only displayed here, not stored.
pd.get_dummies(X["education"])

Unnamed: 0,10th,11th,12th,1st-4th,5th-6th,7th-8th,9th,?,Assoc-acdm,Assoc-voc,Bachelors,Doctorate,HS-grad,Masters,Preschool,Prof-school,Some-college
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4996,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4997,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4998,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [14]:
# X itself still holds the original text labels for education.
X["education"].head()

0    Bachelors
1    Bachelors
2      HS-grad
3         11th
4    Bachelors
Name: education, dtype: object

In [15]:
# Report how many distinct values each object-typed (text) feature has.
object_cols = [c for c in X.columns if X[c].dtypes == "object"]
for col_name in object_cols:
    # dropna=False counts NaN as its own value, matching len(Series.unique()).
    num = X[col_name].nunique(dropna=False)
    print(f"{col_name} has {num} catagories")


workclass has 8 catagories
education has 17 catagories
marital_status has 7 catagories
occupation has 15 catagories
relationship has 6 catagories
race has 6 catagories
sex has 3 catagories
native_country has 40 catagories


In [16]:
# native_country has by far the most categories (40), so look at how its values are distributed.

In [17]:
# Ten most frequent native_country values, largest count first.
country_counts = X["native_country"].value_counts()
country_counts.sort_values(ascending=False).head(10)

United-States    4465
Mexico            104
?                  97
Canada             28
Philippines        22
Germany            22
El-Salvador        16
Puerto-Rico        16
England            16
China              15
Name: native_country, dtype: int64

In [19]:

# Collapse native_country into two buckets: keep "United-States" and relabel every
# other value as "Other" (this also folds the "?" placeholder rows into "Other").
is_us = X["native_country"] == "United-States"
X["native_country"] = X["native_country"].where(is_us, "Other")
X["native_country"].value_counts().sort_values(ascending=False).head(10)

United-States    4465
Other             535
Name: native_country, dtype: int64