## libraries

In [6]:
import pandas as pd
import numpy as np


from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pickle

In [7]:
df = pd.read_pickle('df_features.pkl')


## pre process

In [8]:
df.head()


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       47879 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      47876 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  48568 non-null  object
 14  Income          48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [10]:
#  Cleaning target value and making it a binary classification

#Target value has values like '>50K.'.
df['Income'] = df['Income'].str.replace('.', '')

df['Income'] = df['Income'].str.replace('<=50K', '0')  # Replace with string '0'
df['Income'] = df['Income'].str.replace('>50K', '1')   # Replace with string '1'

# Convert to integer
df['Income'] = df['Income'].astype(int)

  df['Income'] = df['Income'].str.replace('.', '')


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       47879 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      47876 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  48568 non-null  object
 14  Income          48842 non-null  int32 
dtypes: int32(1), int64(6), object(8)
memory usage: 5.4+ MB


In [12]:
df['Income'].value_counts()

0    37155
1    11687
Name: Income, dtype: int64

In [13]:
# Count and percentage of Target classes variable

# Calculate the counts of unique values in the 'class' column of 'df_class_feature' and store it in 'class_counts'.
class_counts = df['Income'].value_counts()
# Calculate the percentage of each unique value in the 'class' column by dividing 'class_counts' by its sum and then multiplying by 100.
class_percentages = class_counts / class_counts.sum() * 100

print(class_counts)
print(class_percentages)
print()
a = df.shape[0]
print('Total number of rows: ', a)

0    37155
1    11687
Name: Income, dtype: int64
0    76.071823
1    23.928177
Name: Income, dtype: float64

Total number of rows:  48842


In [14]:
#Some values are like '?'. Replacing them as NaN 
df[df == '?'] = np.nan

In [15]:
# Dropping the rows with NaN values in  'workclass', 'occupation', 'native-country' for the test dataset
df.dropna(subset=['workclass', 'occupation', 'native-country'], inplace=True)
df.dropna( inplace=True)

In [16]:
# Count and percentage of Target classes values after droping NaN values 

# Calculate the counts of unique values in the 'class' column of 'df_class_feature' and store it in 'class_counts'.
class_counts = df['Income'].value_counts()

# Calculate the percentage of each unique value in the 'class' column by dividing 'class_counts' by its sum and then multiplying by 100.
class_percentages = class_counts / class_counts.sum() * 100

print(class_counts)
print(class_percentages)
print()
b = df.shape[0]
print('Number of rows after dropping NaN: ', b)
print('number of rows dropped: ', a-b)

0    34014
1    11208
Name: Income, dtype: int64
0    75.215603
1    24.784397
Name: Income, dtype: float64

Number of rows after dropping NaN:  45222
number of rows dropped:  3620


In [17]:
# Checking wheter Education and Education-number are similar. Print the first 5 rows of 'df_class_feature'
print(df['education'].value_counts().head())
print()
print(df['education-num'].value_counts().head())

HS-grad         14783
Some-college     9899
Bachelors        7570
Masters          2514
Assoc-voc        1959
Name: education, dtype: int64

9     14783
10     9899
13     7570
14     2514
11     1959
Name: education-num, dtype: int64


### Creating X and y sets

In [18]:
# From X set, education column is also dropped as it is similar to Education-num
X = df.drop(['Income', 'education' ], axis=1)

y = df['Income']

In [19]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45222 entries, 0 to 48841
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             45222 non-null  int64 
 1   workclass       45222 non-null  object
 2   fnlwgt          45222 non-null  int64 
 3   education-num   45222 non-null  int64 
 4   marital-status  45222 non-null  object
 5   occupation      45222 non-null  object
 6   relationship    45222 non-null  object
 7   race            45222 non-null  object
 8   sex             45222 non-null  object
 9   capital-gain    45222 non-null  int64 
 10  capital-loss    45222 non-null  int64 
 11  hours-per-week  45222 non-null  int64 
 12  native-country  45222 non-null  object
dtypes: int64(6), object(7)
memory usage: 4.8+ MB


In [20]:
y.value_counts()

0    34014
1    11208
Name: Income, dtype: int64

### Split data into separate fitting and test set

In [21]:
# Split the dataset into fitting data (70%) and test set (30%)
X_fit, X_test, y_fit, y_test = train_test_split(X, y, test_size=0.3, random_state=1)


In [22]:
X_fit.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31655 entries, 34605 to 35636
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             31655 non-null  int64 
 1   workclass       31655 non-null  object
 2   fnlwgt          31655 non-null  int64 
 3   education-num   31655 non-null  int64 
 4   marital-status  31655 non-null  object
 5   occupation      31655 non-null  object
 6   relationship    31655 non-null  object
 7   race            31655 non-null  object
 8   sex             31655 non-null  object
 9   capital-gain    31655 non-null  int64 
 10  capital-loss    31655 non-null  int64 
 11  hours-per-week  31655 non-null  int64 
 12  native-country  31655 non-null  object
dtypes: int64(6), object(7)
memory usage: 3.4+ MB


In [23]:
# Defined a function to calculate  and print the percentage of each class in the target variable
def calculate_class_percentage(y):
    class_percentage = {}
    total_samples = len(y)
    unique_classes = set(y)
    
    for cls in unique_classes:
        class_count = sum(y == cls)
        percentage = (class_count / total_samples) * 100
        class_percentage[cls] = percentage
    
    return class_percentage

# Calculate class percentages for each dataset
fit_class_percentage = calculate_class_percentage(y_fit)
test_class_percentage = calculate_class_percentage(y_test)

# Print class percentages for each dataset
print("Fit set class percentages:")
print(fit_class_percentage )
print("\nTest set class percentages:")
print(test_class_percentage)
print('\n Number of rows of X fit', X_fit.shape[0], '\n Number of rows of X test', X_test.shape[0],'\n Number of rows of Y fit', y_fit.shape[0],'\n Number of rows of y test', y_test.shape[0],)


Fit set class percentages:
{0: 75.13504975517296, 1: 24.864950244827043}

Test set class percentages:
{0: 75.40355273826196, 1: 24.596447261738042}

 Number of rows of X fit 31655 
 Number of rows of X test 13567 
 Number of rows of Y fit 31655 
 Number of rows of y test 13567


In [24]:
X_fit.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31655 entries, 34605 to 35636
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             31655 non-null  int64 
 1   workclass       31655 non-null  object
 2   fnlwgt          31655 non-null  int64 
 3   education-num   31655 non-null  int64 
 4   marital-status  31655 non-null  object
 5   occupation      31655 non-null  object
 6   relationship    31655 non-null  object
 7   race            31655 non-null  object
 8   sex             31655 non-null  object
 9   capital-gain    31655 non-null  int64 
 10  capital-loss    31655 non-null  int64 
 11  hours-per-week  31655 non-null  int64 
 12  native-country  31655 non-null  object
dtypes: int64(6), object(7)
memory usage: 3.4+ MB


In [25]:
# Label encoding for ordinal categorical variables. First fitting and transforming on fit set and then transforming on test set
LE = LabelEncoder()

X_fit.iloc[:, 1] = LE.fit_transform(X_fit.iloc[:, 1]) # Workclass
X_test.iloc[:, 1] = LE.transform(X_test.iloc[:, 1])

'''
X_fit.iloc[:, 4] = LE.fit_transform(X_fit.iloc[:, 4]) # Marital-status
X_test.iloc[:, 4] = LE.transform(X_test.iloc[:, 4])

X_fit.iloc[:, 5] = LE.fit_transform(X_fit.iloc[:, 5]) # Occupation
X_test.iloc[:, 5] = LE.transform(X_test.iloc[:, 5])

X_fit.iloc[:, 6] = LE.fit_transform(X_fit.iloc[:, 6]) # Relationship
X_test.iloc[:, 6] = LE.transform(X_test.iloc[:, 6])

X_fit.iloc[:, 7] = LE.fit_transform(X_fit.iloc[:, 7]) # Race
X_test.iloc[:, 7] = LE.transform(X_test.iloc[:, 7])

X_fit.iloc[:, 8] = LE.fit_transform(X_fit.iloc[:, 8]) # Sex
X_test.iloc[:, 8] = LE.transform(X_test.iloc[:, 8])

X_fit.iloc[:, 12] = LE.fit_transform(X_fit.iloc[:, 12]) # Native-country
X_test.iloc[:, 12] = LE.transform(X_test.iloc[:, 12])'''

  X_fit.iloc[:, 1] = LE.fit_transform(X_fit.iloc[:, 1]) # Workclass
  X_test.iloc[:, 1] = LE.transform(X_test.iloc[:, 1])


'\nX_fit.iloc[:, 4] = LE.fit_transform(X_fit.iloc[:, 4]) # Marital-status\nX_test.iloc[:, 4] = LE.transform(X_test.iloc[:, 4])\n\nX_fit.iloc[:, 5] = LE.fit_transform(X_fit.iloc[:, 5]) # Occupation\nX_test.iloc[:, 5] = LE.transform(X_test.iloc[:, 5])\n\nX_fit.iloc[:, 6] = LE.fit_transform(X_fit.iloc[:, 6]) # Relationship\nX_test.iloc[:, 6] = LE.transform(X_test.iloc[:, 6])\n\nX_fit.iloc[:, 7] = LE.fit_transform(X_fit.iloc[:, 7]) # Race\nX_test.iloc[:, 7] = LE.transform(X_test.iloc[:, 7])\n\nX_fit.iloc[:, 8] = LE.fit_transform(X_fit.iloc[:, 8]) # Sex\nX_test.iloc[:, 8] = LE.transform(X_test.iloc[:, 8])\n\nX_fit.iloc[:, 12] = LE.fit_transform(X_fit.iloc[:, 12]) # Native-country\nX_test.iloc[:, 12] = LE.transform(X_test.iloc[:, 12])'

In [26]:
X_fit.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31655 entries, 34605 to 35636
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             31655 non-null  int64 
 1   workclass       31655 non-null  int32 
 2   fnlwgt          31655 non-null  int64 
 3   education-num   31655 non-null  int64 
 4   marital-status  31655 non-null  object
 5   occupation      31655 non-null  object
 6   relationship    31655 non-null  object
 7   race            31655 non-null  object
 8   sex             31655 non-null  object
 9   capital-gain    31655 non-null  int64 
 10  capital-loss    31655 non-null  int64 
 11  hours-per-week  31655 non-null  int64 
 12  native-country  31655 non-null  object
dtypes: int32(1), int64(6), object(6)
memory usage: 3.3+ MB


#### One-hot-encoding for non-ordinal catagorical variables, or dummification 

In [27]:
# Identify columns that are not ordinal and need one-hot encoding
columns_to_one_hot_encode = ['marital-status', 'occupation',  'relationship', 'race', 'sex', 'native-country'] # List of non-ordinal columns

# Create a OneHotEncoder instance
one_hot_encoder = OneHotEncoder(sparse=False)  # Create a OneHotEncoder instance it indicates that the output should be in dense array format. In dense format, all elements of the transformed data are stored, regardless of whether they are zero or non-zero.

# Apply one-hot encoding to the training dataset
one_hot_encoded_columns_train = one_hot_encoder.fit_transform(X_fit[columns_to_one_hot_encode])
# Apply the same transformation to the test dataset
one_hot_encoded_columns_test = one_hot_encoder.transform(X_test[columns_to_one_hot_encode])

# Convert the one-hot encoded columns to a DataFrame
one_hot_encoded_df_train = pd.DataFrame(one_hot_encoded_columns_train, columns=one_hot_encoder.get_feature_names_out(columns_to_one_hot_encode))
# Convert the one-hot encoded columns to a DataFrame
one_hot_encoded_df_test = pd.DataFrame(one_hot_encoded_columns_test, columns=one_hot_encoder.get_feature_names_out(columns_to_one_hot_encode))

# Drop the original columns from X_train
X_fit.drop(columns_to_one_hot_encode, axis=1, inplace=True)
# Drop the original columns from X_test
X_test.drop(columns_to_one_hot_encode, axis=1, inplace=True)

# Concatenate the original DataFrame with the one-hot encoded DataFrame
X_fit = pd.concat([X_fit.reset_index(drop=True), one_hot_encoded_df_train.reset_index(drop=True)], axis=1)
# Concatenate the original DataFrame with the one-hot encoded DataFrame
X_test = pd.concat([X_test.reset_index(drop=True), one_hot_encoded_df_test.reset_index(drop=True)], axis=1)




In [28]:
X_fit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31655 entries, 0 to 31654
Data columns (total 82 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   age                                        31655 non-null  int64  
 1   workclass                                  31655 non-null  int32  
 2   fnlwgt                                     31655 non-null  int64  
 3   education-num                              31655 non-null  int64  
 4   capital-gain                               31655 non-null  int64  
 5   capital-loss                               31655 non-null  int64  
 6   hours-per-week                             31655 non-null  int64  
 7   marital-status_Divorced                    31655 non-null  float64
 8   marital-status_Married-AF-spouse           31655 non-null  float64
 9   marital-status_Married-civ-spouse          31655 non-null  float64
 10  marital-status_Married

In [29]:
# Defined a function to calculate  and print the percentage of each class in the target variable
def calculate_class_percentage(y):
    class_percentage = {}
    total_samples = len(y)
    unique_classes = set(y)
    
    for cls in unique_classes:
        class_count = sum(y == cls)
        percentage = (class_count / total_samples) * 100
        class_percentage[cls] = percentage
    
    return class_percentage

# Calculate class percentages for each dataset
fit_class_percentage = calculate_class_percentage(y_fit)
test_class_percentage = calculate_class_percentage(y_test)

# Print class percentages for each dataset
print("Fit set class percentages:")
print(fit_class_percentage )
print("\nTest set class percentages:")
print(test_class_percentage)
print('\n Number of rows of X fit', X_fit.shape[0], '\n Number of rows of X test', X_test.shape[0],'\n Number of rows of Y fit', y_fit.shape[0],'\n Number of rows of y test', y_test.shape[0],)


Fit set class percentages:
{0: 75.13504975517296, 1: 24.864950244827043}

Test set class percentages:
{0: 75.40355273826196, 1: 24.596447261738042}

 Number of rows of X fit 31655 
 Number of rows of X test 13567 
 Number of rows of Y fit 31655 
 Number of rows of y test 13567


In [30]:
# Save data to a pickle file
with open('data.pickle', 'wb') as f:
    pickle.dump((X_fit, y_fit, X_test, y_test), f)