1. Data Exploration and Preprocessing

In [1]:
# Read the dataset from CSV and take a first look at it: summary statistics,
# per-column null counts and column dtypes.
import pandas as pd

data = pd.read_csv(r"/content/adult_with_headers.csv")

# Build the three overview tables first, then print each one under its caption.
summary_stats = data.describe()            # descriptive statistics (numeric columns by default)
missing_per_column = data.isnull().sum()   # number of null cells in every column
column_dtypes = data.dtypes                # dtype pandas inferred for every column

print("Summary Statistics:\n", summary_stats, "\n")
print("Check for missing values:\n", missing_per_column, "\n")
print("Data Types:\n", column_dtypes)


Summary Statistics:
                 age        fnlwgt  education_num  capital_gain  capital_loss  \
count  32561.000000  3.256100e+04   32561.000000  32561.000000  32561.000000   
mean      38.581647  1.897784e+05      10.080679   1077.648844     87.303830   
std       13.640433  1.055500e+05       2.572720   7385.292085    402.960219   
min       17.000000  1.228500e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.178270e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.783560e+05      10.000000      0.000000      0.000000   
75%       48.000000  2.370510e+05      12.000000      0.000000      0.000000   
max       90.000000  1.484705e+06      16.000000  99999.000000   4356.000000   

       hours_per_week  
count    32561.000000  
mean        40.437456  
std         12.347429  
min          1.000000  
25%         40.000000  
50%         40.000000  
75%         45.000000  
max         99.000000   

Check for missing values:
 age          

In [2]:
# Missing-value handling: reduce the cell-level null mask to a single boolean
# for the whole frame (first .any() per column, second .any() across columns).
has_missing_values = data.isnull().any().any()
print(has_missing_values)
# The printed value is False, i.e. pandas finds no null cell in this dataset,
# so nothing is imputed or dropped here.
# NOTE(review): isnull() only detects NaN/None; placeholder strings used to mark
# unknown values would not be counted -- confirm the raw file has none.

False


In [3]:
# Scale the numerical features in two alternative ways using scikit-learn:
# StandardScaler (standard scaling) and MinMaxScaler (min-max scaling).
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Numerical columns that take part in scaling
numerical_features = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
numeric_block = data[numerical_features]

# --- Standard scaling ---
# fit_transform returns a plain array, so it is wrapped back into a DataFrame
# carrying the original column names.
scaler_standard = StandardScaler()
data_standard_scaled = pd.DataFrame(
    scaler_standard.fit_transform(numeric_block),
    columns=numerical_features,
)
print("Standard scaling")
print(data_standard_scaled.head())

# --- Min-max scaling ---
scaler_minmax = MinMaxScaler()
data_minmax_scaled = pd.DataFrame(
    scaler_minmax.fit_transform(numeric_block),
    columns=numerical_features,
)
print("Min-Max scaling")
print(data_minmax_scaled.head())


Standard scaling
        age    fnlwgt  education_num  capital_gain  capital_loss  \
0  0.030671 -1.063611       1.134739      0.148453      -0.21666   
1  0.837109 -1.008707       1.134739     -0.145920      -0.21666   
2 -0.042642  0.245079      -0.420060     -0.145920      -0.21666   
3  1.057047  0.425801      -1.197459     -0.145920      -0.21666   
4 -0.775768  1.408176       1.134739     -0.145920      -0.21666   

   hours_per_week  
0       -0.035429  
1       -2.222153  
2       -0.035429  
3       -0.035429  
4       -0.035429  
Min-Max scaling
        age    fnlwgt  education_num  capital_gain  capital_loss  \
0  0.301370  0.044302       0.800000       0.02174           0.0   
1  0.452055  0.048238       0.800000       0.00000           0.0   
2  0.287671  0.138113       0.533333       0.00000           0.0   
3  0.493151  0.151068       0.400000       0.00000           0.0   
4  0.150685  0.221488       0.800000       0.00000           0.0   

   hours_per_week  
0        

2. Encoding Techniques

In [4]:
# Encode the categorical (object-dtype) columns with scikit-learn, choosing the
# technique by the number of distinct values in each column:
#   * at most 5 distinct values   -> One-Hot Encoding (OneHotEncoder)
#   * more than 5 distinct values -> Label Encoding   (LabelEncoder)

from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Split the object columns by cardinality in a single pass over the frame.
categorical_variables = []      # low cardinality  -> one-hot encoded
high_cardinality_columns = []   # high cardinality -> label encoded
for column in data.columns:
    if data[column].dtype != 'object':
        continue
    if len(data[column].unique()) <= 5:
        categorical_variables.append(column)
    else:
        high_cardinality_columns.append(column)
# NOTE(review): on this dataset the low-cardinality group also picks up the
# target column `income` (visible in the printed output) -- confirm that
# encoding the target together with the features is intended.

# Apply One-Hot Encoding to categorical variables with at most 5 categories.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and later removed the old
# keyword; try the current name first and fall back for older installations.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False, drop='first')
onehot_encoded = onehot_encoder.fit_transform(data[categorical_variables])
# drop='first' removes the first category of every column, hence categories[1:].
# The source index is reused so the result stays row-aligned with `data`.
onehot_encoded_df = pd.DataFrame(
    onehot_encoded,
    columns=[f"{column}_{category}"
             for column, categories in zip(categorical_variables, onehot_encoder.categories_)
             for category in categories[1:]],
    index=data.index,
)
print(onehot_encoded_df)

# Use Label Encoding for categorical variables with more than 5 categories.
# A separate fitted encoder is kept per column in `label_encoders` so that the
# integer codes can be mapped back later with inverse_transform.
label_encoders = {}
label_encoder = LabelEncoder()  # keeps the name defined even if no column qualifies
label_encoded_df = data.copy()
for column in high_cardinality_columns:
    label_encoder = LabelEncoder()
    label_encoded_df[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder
print(label_encoded_df)

       race_ Asian-Pac-Islander  race_ Black  race_ Other  race_ White  \
0                           0.0          0.0          0.0          1.0   
1                           0.0          0.0          0.0          1.0   
2                           0.0          0.0          0.0          1.0   
3                           0.0          1.0          0.0          0.0   
4                           0.0          1.0          0.0          0.0   
...                         ...          ...          ...          ...   
32556                       0.0          0.0          0.0          1.0   
32557                       0.0          0.0          0.0          1.0   
32558                       0.0          0.0          0.0          1.0   
32559                       0.0          0.0          0.0          1.0   
32560                       0.0          0.0          0.0          1.0   

       sex_ Male  income_ >50K  
0            1.0           0.0  
1            1.0           0.0  
2           



3. Feature Engineering

In [5]:
# Engineered features expected to be beneficial for the model

import numpy as np

# Feature 1: Capital Change
# Net capital change, obtained by subtracting capital_loss from capital_gain.
# Rationale: capital_gain and capital_loss matter individually, but a single
# combined value can say more about an individual's overall financial status.
# A positive value is a net capital gain, a negative value a net capital loss.
data['capital_change'] = data['capital_gain'].sub(data['capital_loss'])

# Feature 2: Education Years
# Element-wise sum of education_num and age.
# Rationale: education level and age can both influence income; combining them
# into one feature is meant to capture their cumulative effect and might give
# the model better predictive power.
# NOTE(review): despite its name, this column is education_num + age and not a
# count of years spent in education -- confirm that this is the intended feature.
data['education_years'] = data['education_num'].add(data['age'])

# Log transformation of at least one skewed numerical feature: 'capital_gain'.
# Justification: 'capital_gain' is likely to be positively skewed, with a few
# individuals having very high gains compared to the majority. Such skewness
# makes the distribution non-normal, which can hurt algorithms that assume
# normality or require symmetric distributions. A log transformation compresses
# the range of values while maintaining the relative differences between lower
# values, which makes the distribution more symmetric, reduces the impact of
# extreme values, and suits algorithms that assume normality or linear
# relationships between variables.
# np.log1p computes log(1 + x), so a capital gain of 0 maps to 0.
data['capital_gain_log'] = np.log1p(data['capital_gain'])



4. Feature Selection

In [6]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import OneHotEncoder

# Outlier removal with an Isolation Forest: encode the frame numerically, flag
# outliers, and keep only the rows labelled as inliers.

# Encode categorical variables (one indicator column per category) so that the
# model only receives numeric input
categorical_columns = data.select_dtypes(include=['object']).columns
data_encoded = pd.get_dummies(data, columns=categorical_columns)

# Remove missing values (if any). Reassignment is used instead of inplace=True
# so the step is an ordinary expression with no hidden in-place mutation.
data_encoded = data_encoded.dropna()

# Instantiate the Isolation Forest model.
# contamination=0.05 is the assumed share of outliers -- adjust as needed.
# random_state is fixed because the forest is built from random sub-samples;
# without it a different set of rows is removed on every run.
isolation_forest = IsolationForest(contamination=0.05, random_state=42)

# Fit the model to the data and predict outliers (1 = inlier, -1 = outlier)
outlier_labels = isolation_forest.fit_predict(data_encoded)

# Remove outliers from the dataset by keeping only the rows labelled 1
data_no_outliers = data_encoded[outlier_labels == 1]




In [7]:
#To apply the Predictive Power Score (PPS) to find and discuss the relationships between features, we'll use the ppscore library
!pip install ppscore
import ppscore as pps
# Calculate the PPS matrix
pps_matrix = pps.matrix(data)

# Display the PPS matrix
print("PPS Matrix:")
print(pps_matrix)

# Compare with the correlation matrix
correlation_matrix = data.corr()

# Display the correlation matrix
print("\nCorrelation Matrix:")
print(correlation_matrix)





PPS Matrix:
                    x                 y   ppscore            case  \
0                 age               age  1.000000  predict_itself   
1                 age         workclass  0.011232  classification   
2                 age            fnlwgt  0.000000      regression   
3                 age         education  0.052315  classification   
4                 age     education_num  0.000000      regression   
..                ...               ...       ...             ...   
319  capital_gain_log    native_country  0.000000  classification   
320  capital_gain_log            income  0.297578  classification   
321  capital_gain_log    capital_change  0.845392      regression   
322  capital_gain_log   education_years  0.013555      regression   
323  capital_gain_log  capital_gain_log  1.000000  predict_itself   

     is_valid_score               metric  baseline_score   model_score  \
0              True                 None        0.000000      1.000000   
1          

  correlation_matrix = data.corr()
