### Importing Libraries and Data Frames

In [56]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import pickle

In [57]:
# Load the census income dataset from a CSV file located relative to the working directory
df = pd.read_csv('CensusAdultIncome.csv')


### Exploring the Data Frames

In [58]:
# Preview the first rows to check the column names and the format of the values
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [59]:
# Summarise dtypes and non-null counts per column to spot columns with missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       47879 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      47876 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  48568 non-null  object
 14  income          48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


- workclass: A categorical feature representing the type of employment, such as private, self-employment, and government employment. Some missing values present.
- fnlwgt: An integer feature with no description provided. No missing values.
- education: A categorical feature representing the level of education.
- education-num: An integer feature representing the numerical encoding of **education** level.
- occupation: A categorical feature representing the type of occupation, such as managerial, technical, and service-related occupations. Some missing values present.
- native-country: A categorical feature representing the country of origin, including various countries such as the United States, Canada, and India. Some missing values present.
- income: The target variable, a binary feature representing income level, with categories >50K and <=50K. No missing values.

#### Cleaning the target

In [60]:
# Some target labels contain a dot, which gives four distinct labels instead of two.
# Remove the dot as a literal character: regex=False is required because '.' is a
# wildcard when the pattern is interpreted as a regular expression.
df['income'] = df['income'].str.replace('.', '', regex=False)

# Replace the two remaining labels with the strings '0' and '1'.
# '<=50K' must be handled as plain text as well, so regex=False is passed consistently.
df['income'] = df['income'].str.replace('<=50K', '0', regex=False)
df['income'] = df['income'].str.replace('>50K', '1', regex=False)

# Convert to integer (raises ValueError if any label other than the two above remains)
df['income'] = df['income'].astype(int)

  df['income'] = df['income'].str.replace('.', '')


In [61]:
# Confirm that 'income' is now stored as an integer column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       47879 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      47876 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  48568 non-null  object
 14  income          48842 non-null  int32 
dtypes: int32(1), int64(6), object(8)
memory usage: 5.4+ MB


Print the count and percentage of each class in the target variable.

In [62]:
# Number of rows per target class in the 'income' column
class_counts = df['income'].value_counts()

# Share of each class as a percentage of all rows
class_percentages = class_counts.div(class_counts.sum()).mul(100)

# Row count before cleaning, kept in `a` so a later cell can report how many rows were dropped
a = df.shape[0]

print('Class counts:\n' ,class_counts, '\n')
print('Percentage of each class: \n' ,class_percentages)
print('\nTotal number of rows: ', a)

Class counts:
 0    37155
1    11687
Name: income, dtype: int64 

Percentage of each class: 
 0    76.071823
1    23.928177
Name: income, dtype: float64

Total number of rows:  48842


#### Cleaning features dataframe

In [63]:
# Some missing values are encoded as the string '?'. Replace every such cell with NaN
# so they are treated like the values pandas already read as missing.
df[df == '?'] = np.nan

In [64]:
# Drop every row that contains a NaN in any column. This covers 'workclass',
# 'occupation' and 'native-country', so no separate subset-based call is needed.
df.dropna(inplace=True)

Checking whether `education` and `education-num` carry the same information.

In [65]:
# Compare the most frequent values of both columns: identical counts suggest that
# 'education-num' is a numeric encoding of 'education'
education_top = df['education'].value_counts().head()
education_num_top = df['education-num'].value_counts().head()
print(education_top, education_num_top, sep='\n\n')

HS-grad         14783
Some-college     9899
Bachelors        7570
Masters          2514
Assoc-voc        1959
Name: education, dtype: int64

9     14783
10     9899
13     7570
14     2514
11     1959
Name: education-num, dtype: int64


In [66]:
# 'education' duplicates the information in 'education-num', which is already numeric,
# so only the numeric column is kept
df = df.drop(columns=['education'])

In [67]:
# Check that 'education' is gone and 'income' now holds 0/1
df.head()

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


# Count and percentage of target class values after dropping NaN values


In [68]:
# Number of rows per target class in the 'income' column after the NaN rows were removed
class_counts = df['income'].value_counts()

# Share of each class as a percentage of the remaining rows
class_percentages = class_counts.div(class_counts.sum()).mul(100)

print('Class counts:\n' ,class_counts, '\n')
print('Percentage of each class: \n' ,class_percentages)

# `a` holds the row count recorded before cleaning; the difference is the number of dropped rows
b = df.shape[0]
print('\nNumber of rows after dropping NaN: ', b)
print('number of rows dropped: ', a-b)

Class counts:
 0    34014
1    11208
Name: income, dtype: int64 

Percentage of each class: 
 0    75.215603
1    24.784397
Name: income, dtype: float64

Number of rows after dropping NaN:  45222
number of rows dropped:  3620


### Creating X and y sets

In [69]:
# Target vector: the 0/1 encoded 'income' column
y = df['income']

# Feature matrix: every column except the target
X = df.drop(columns=['income'])

In [70]:
# Verify that the feature matrix has no missing values and no target column
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45222 entries, 0 to 48841
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             45222 non-null  int64 
 1   workclass       45222 non-null  object
 2   fnlwgt          45222 non-null  int64 
 3   education-num   45222 non-null  int64 
 4   marital-status  45222 non-null  object
 5   occupation      45222 non-null  object
 6   relationship    45222 non-null  object
 7   race            45222 non-null  object
 8   sex             45222 non-null  object
 9   capital-gain    45222 non-null  int64 
 10  capital-loss    45222 non-null  int64 
 11  hours-per-week  45222 non-null  int64 
 12  native-country  45222 non-null  object
dtypes: int64(6), object(7)
memory usage: 4.8+ MB


In [71]:
# Class distribution of the target vector
y.value_counts()

0    34014
1    11208
Name: income, dtype: int64

### Split data into separate fitting and test set

In [72]:
# Split the dataset into a fitting set (80%) and a test set (20%); random_state makes the split reproducible
X_fit, X_test, y_fit, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [73]:
# Inspect the fitting set produced by the split
X_fit.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36177 entries, 4727 to 35636
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             36177 non-null  int64 
 1   workclass       36177 non-null  object
 2   fnlwgt          36177 non-null  int64 
 3   education-num   36177 non-null  int64 
 4   marital-status  36177 non-null  object
 5   occupation      36177 non-null  object
 6   relationship    36177 non-null  object
 7   race            36177 non-null  object
 8   sex             36177 non-null  object
 9   capital-gain    36177 non-null  int64 
 10  capital-loss    36177 non-null  int64 
 11  hours-per-week  36177 non-null  int64 
 12  native-country  36177 non-null  object
dtypes: int64(6), object(7)
memory usage: 3.9+ MB


#### The percentage of each class in the target variable for each set

In [74]:
# Defined a function to calculate  and print the percentage of each class in the target variable
def calculate_class_percentage(y):
    """Return the share of each class in ``y`` as a percentage.

    Args:
        y: Target values. Must support element-wise comparison with ``==``
            (e.g. a pandas Series), because each class is counted via ``y == label``.

    Returns:
        dict mapping each distinct value of ``y`` to its percentage of all
        samples. An empty ``y`` yields an empty dict.
    """
    n_samples = len(y)
    return {
        label: (sum(y == label) / n_samples) * 100
        for label in set(y)
    }

In [75]:
# Class balance of the target in the fit and test sets
fit_class_percentage = calculate_class_percentage(y_fit)
test_class_percentage = calculate_class_percentage(y_test)

# Report the balance of both sets, each dict on the line after its caption
print("Fit set class percentages:", fit_class_percentage, sep="\n")
print("\nTest set class percentages:", test_class_percentage, sep="\n")

# Row counts of features and targets must match within each set
print(
    '\n Number of rows of X fit', X_fit.shape[0],
    '\n Number of rows of X test', X_test.shape[0],
    '\n Number of rows of Y fit', y_fit.shape[0],
    '\n Number of rows of y test', y_test.shape[0],
)


Fit set class percentages:
{0: 75.26328882991956, 1: 24.73671117008044}

Test set class percentages:
{0: 75.02487562189056, 1: 24.975124378109452}

 Number of rows of X fit 36177 
 Number of rows of X test 9045 
 Number of rows of Y fit 36177 
 Number of rows of y test 9045


In [76]:
# Column dtypes before encoding: the object columns still have to be converted to numbers
X_fit.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36177 entries, 4727 to 35636
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             36177 non-null  int64 
 1   workclass       36177 non-null  object
 2   fnlwgt          36177 non-null  int64 
 3   education-num   36177 non-null  int64 
 4   marital-status  36177 non-null  object
 5   occupation      36177 non-null  object
 6   relationship    36177 non-null  object
 7   race            36177 non-null  object
 8   sex             36177 non-null  object
 9   capital-gain    36177 non-null  int64 
 10  capital-loss    36177 non-null  int64 
 11  hours-per-week  36177 non-null  int64 
 12  native-country  36177 non-null  object
dtypes: int64(6), object(7)
memory usage: 3.9+ MB


In [77]:
# Integer-encode the 'workclass' column. The encoder is fitted on the fit set only and
# then reused on the test set, so both sets share the same label-to-integer mapping.
# NOTE: workclass is a nominal feature; the integer codes carry no ordinal meaning.
LE = LabelEncoder()

# Select the column by name and build new frames with `assign` instead of writing through
# `iloc[:, 1]`: this avoids the pandas warning about positional column assignment and
# guarantees the column takes the integer dtype of the encoded values.
X_fit = X_fit.assign(workclass=LE.fit_transform(X_fit['workclass']))
X_test = X_test.assign(workclass=LE.transform(X_test['workclass']))


  X_fit.iloc[:, 1] = LE.fit_transform(X_fit.iloc[:, 1]) # Workclass column
  X_test.iloc[:, 1] = LE.transform(X_test.iloc[:, 1])


### Define a one-hot encoding (dummification) function for the chosen variables

In [78]:
def one_hot_encode_columns(X_fit, X_test, columns_to_one_hot_encode):
    """One-hot encode the given columns of the fit and test sets.

    The encoder is fitted on ``X_fit`` only and then applied to ``X_test``, so both
    results contain the same encoded columns in the same order.

    Args:
        X_fit: DataFrame the encoder is fitted on.
        X_test: DataFrame transformed with the fitted encoder.
        columns_to_one_hot_encode: List of column names to replace with their
            one-hot encoded columns.

    Returns:
        Tuple ``(X_fit, X_test)`` of new DataFrames: the remaining original columns
        first, followed by the encoded columns. Both frames get a fresh 0..n-1
        index; the row order is unchanged. The input frames are not modified.

    Raises:
        ValueError: Raised by the encoder when ``X_test`` contains a category that
            was not seen in ``X_fit`` (the encoder's default ``handle_unknown='error'``).
    """
    # `sparse` was renamed to `sparse_output` in newer scikit-learn releases and later
    # removed; fall back to the old keyword on versions that do not know the new one.
    try:
        one_hot_encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        one_hot_encoder = OneHotEncoder(sparse=False)

    # Fit on the fit set only, then reuse the fitted categories for the test set
    encoded_fit = one_hot_encoder.fit_transform(X_fit[columns_to_one_hot_encode])
    encoded_test = one_hot_encoder.transform(X_test[columns_to_one_hot_encode])
    encoded_names = one_hot_encoder.get_feature_names_out(columns_to_one_hot_encode)

    def _swap_in_encoded(frame, encoded_values):
        # Build a new frame: original columns minus the encoded ones, plus the one-hot columns.
        # A non-inplace drop keeps the caller's DataFrame untouched.
        remaining = frame.drop(columns=columns_to_one_hot_encode).reset_index(drop=True)
        encoded_df = pd.DataFrame(encoded_values, columns=encoded_names)
        return pd.concat([remaining, encoded_df], axis=1)

    return _swap_in_encoded(X_fit, encoded_fit), _swap_in_encoded(X_test, encoded_test)


In [79]:
# Nominal (non-ordinal) categorical columns that are replaced by one-hot columns
columns_to_one_hot_encode = [
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

# Encode both sets with an encoder fitted on the fit set
X_fit, X_test = one_hot_encode_columns(X_fit, X_test, columns_to_one_hot_encode)



In [80]:
# Confirm that every column is numeric after encoding
X_fit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36177 entries, 0 to 36176
Data columns (total 82 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   age                                        36177 non-null  int64  
 1   workclass                                  36177 non-null  int32  
 2   fnlwgt                                     36177 non-null  int64  
 3   education-num                              36177 non-null  int64  
 4   capital-gain                               36177 non-null  int64  
 5   capital-loss                               36177 non-null  int64  
 6   hours-per-week                             36177 non-null  int64  
 7   marital-status_Divorced                    36177 non-null  float64
 8   marital-status_Married-AF-spouse           36177 non-null  float64
 9   marital-status_Married-civ-spouse          36177 non-null  float64
 10  marital-status_Married

In [81]:
# Re-check the sets after encoding: the targets were not touched by the encoding steps,
# so the class percentages should match the values reported right after the split
fit_class_percentage = calculate_class_percentage(y_fit)
test_class_percentage = calculate_class_percentage(y_test)

# Print class percentages for each dataset
print("Fit set class percentages:")
print(fit_class_percentage )
print("\nTest set class percentages:")
print(test_class_percentage)
# Row counts of features and targets must still agree within each set
print('\n Number of rows of X fit', X_fit.shape[0], '\n Number of rows of X test', X_test.shape[0],'\n Number of rows of Y fit', y_fit.shape[0],'\n Number of rows of y test', y_test.shape[0],)


Fit set class percentages:
{0: 75.26328882991956, 1: 24.73671117008044}

Test set class percentages:
{0: 75.02487562189056, 1: 24.975124378109452}

 Number of rows of X fit 36177 
 Number of rows of X test 9045 
 Number of rows of Y fit 36177 
 Number of rows of y test 9045


In [82]:
# Persist the encoded feature sets and their targets as one tuple:
# (X_fit, y_fit, X_test, y_test)
with open('data.pickle', 'wb') as pickle_file:
    pickle.dump((X_fit, y_fit, X_test, y_test), pickle_file)