# data exploration and preprocessing

In [2]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Step 1: Load the dataset and conduct basic data exploration
data = pd.read_csv('adult_with_headers.csv')

# Summary statistics
summary_statistics = data.describe()

# Missing values
missing_values = data.isnull().sum()

# Data types
data_types = data.dtypes

# Step 2: Handle missing values
# Separate numeric and non-numeric columns
numeric_columns = data.select_dtypes(include=['int64', 'float64']).columns
non_numeric_columns = data.select_dtypes(exclude=['int64', 'float64']).columns

# Impute missing values for numeric columns with mean
imputer_numeric = SimpleImputer(strategy='mean')
data[numeric_columns] = imputer_numeric.fit_transform(data[numeric_columns])

# Drop rows with missing values for non-numeric columns
data.dropna(subset=non_numeric_columns, inplace=True)

# Step 3: Apply scaling techniques to numerical features
# Standard Scaling
scaler_standard = StandardScaler()
data_standard_scaled = pd.DataFrame(scaler_standard.fit_transform(data[numeric_columns]), columns=numeric_columns)

# Min-Max Scaling
scaler_minmax = MinMaxScaler()
data_minmax_scaled = pd.DataFrame(scaler_minmax.fit_transform(data[numeric_columns]), columns=numeric_columns)

# Step 4: Discuss scenarios for each scaling technique
# Your discussion of scenarios for each scaling technique here...

# Print summary statistics, missing values, and data types
print("Summary Statistics:")
print(summary_statistics)
print("\nMissing Values:")
print(missing_values)
print("\nData Types:")
print(data_types)


Summary Statistics:
                age        fnlwgt  education_num  capital_gain  capital_loss  \
count  32561.000000  3.256100e+04   32561.000000  32561.000000  32561.000000   
mean      38.581647  1.897784e+05      10.080679   1077.648844     87.303830   
std       13.640433  1.055500e+05       2.572720   7385.292085    402.960219   
min       17.000000  1.228500e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.178270e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.783560e+05      10.000000      0.000000      0.000000   
75%       48.000000  2.370510e+05      12.000000      0.000000      0.000000   
max       90.000000  1.484705e+06      16.000000  99999.000000   4356.000000   

       hours_per_week  
count    32561.000000  
mean        40.437456  
std         12.347429  
min          1.000000  
25%         40.000000  
50%         40.000000  
75%         45.000000  
max         99.000000  

Missing Values:
age               0
workcl

# Encoding techniques

In [6]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Step 1: Load the dataset
data = pd.read_csv('adult_with_headers.csv')

# Step 2: Identify categorical variables
categorical_columns = data.select_dtypes(include=['object']).columns

# Step 3: Apply One-Hot Encoding and Label Encoding
one_hot_columns = []
label_encoded_columns = []

for col in categorical_columns:
    if len(data[col].unique()) <= 5:  # Apply One-Hot Encoding for variables with <= 5 categories
        one_hot_encoder = OneHotEncoder(drop='first')
        encoded_col = pd.DataFrame(one_hot_encoder.fit_transform(data[[col]]).toarray(), columns=[col + '_' + str(int(i)) for i in range(1, len(data[col].unique()))])
        data = pd.concat([data, encoded_col], axis=1)
        one_hot_columns.extend(encoded_col.columns)
    else:  # Apply Label Encoding for variables with > 5 categories
        label_encoder = LabelEncoder()
        data[col] = label_encoder.fit_transform(data[col])
        label_encoded_columns.append(col)

# Step 4: Discuss the pros and cons of One-Hot Encoding and Label Encoding
# Your discussion of pros and cons here...

# Print the encoded dataset
print("Encoded Dataset:")
print(data.head())


Encoded Dataset:
   age  workclass  fnlwgt  education  education_num  marital_status  \
0   39          7   77516          9             13               4   
1   50          6   83311          9             13               2   
2   38          4  215646         11              9               0   
3   53          4  234721          1              7               2   
4   28          4  338409          9             13               2   

   occupation  relationship    race      sex  ...  capital_loss  \
0           1             1   White     Male  ...             0   
1           4             0   White     Male  ...             0   
2           6             1   White     Male  ...             0   
3           6             0   Black     Male  ...             0   
4          10             5   Black   Female  ...             0   

   hours_per_week  native_country  income race_1  race_2  race_3  race_4  \
0              40              39   <=50K    0.0     0.0     0.0     1.0   
1

# Feature engineering

In [11]:
import pandas as pd

# Step 1: Load the dataset
data = pd.read_csv('adult_with_headers.csv')

# Step 2: Convert 'income' and 'capital_gain' columns to numeric data types
data['income'] = pd.to_numeric(data['income'], errors='coerce')
data['capital_gain'] = pd.to_numeric(data['capital_gain'], errors='coerce')

# Step 3: Create new features
# Example 1: Total income from 'income' and 'capital_gain' columns
data['Total_Income'] = data['income'] + data['capital_gain']

# Example 2: Age squared
data['Age_Squared'] = data['age'] ** 2

# Print the updated dataset
print("Updated Dataset:")
print(data.head())


Updated Dataset:
   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  \
0          2174             0              40   Un

In [14]:
!pip install ppscore


Collecting ppscore
  Downloading ppscore-1.3.0.tar.gz (17 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: ppscore
  Building wheel for ppscore (pyproject.toml): started
  Building wheel for ppscore (pyproject.toml): finished with status 'done'
  Created wheel for ppscore: filename=ppscore-1.3.0-py2.py3-none-any.whl size=13200 sha256=df1355bf0e5f10fa21b2b0a27547126015e136ced3ec1153e242b05d32d42557
  Stored in directory: c:\users\siris\appdata\local\pip\cache\wheels\42\87\10\00056aa2d2624f1b9374db6a0d5245da9a3d87bdc9247c1a56
Successfully built ppscore
Installing c

# Feature selection

In [18]:
print(data.columns)


Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income'],
      dtype='object')


In [19]:
import pandas as pd
from sklearn.ensemble import IsolationForest
import ppscore as pps

# Step 1: Load the dataset
data = pd.read_csv('adult_with_headers.csv')

# Step 2: Create new features
# Example 1: Total income from 'fnlwgt' and 'capital_loss' columns
data['Total_Income'] = data['fnlwgt'] + data['capital_loss']

# Example 2: Age squared
data['Age_Squared'] = data['age'] ** 2

# Step 3: Identify and remove outliers using Isolation Forest
# Assuming 'Total_Income' and 'Age_Squared' are the relevant numerical features for outlier detection
outlier_detector = IsolationForest(contamination=0.05)  # Adjust contamination parameter as needed
outlier_detector.fit(data[['Total_Income', 'Age_Squared']])
outlier_mask = outlier_detector.predict(data[['Total_Income', 'Age_Squared']]) == 1  # 1 indicates inliers
cleaned_data = data[outlier_mask]

# Step 4: Apply the PPS (Predictive Power Score) to find and discuss the relationships between features
# PPS analysis requires the installation of the 'ppscore' library, which can be installed via pip
# After installation, you can perform PPS analysis on the DataFrame
pps_matrix = pps.matrix(cleaned_data)

# Step 5: Compare PPS findings with the correlation matrix
correlation_matrix = cleaned_data.corr()

# Print the results
print("Cleaned Data:")
print(cleaned_data.head())
print("\nPPS Matrix:")
print(pps_matrix)
print("\nCorrelation Matrix:")
print(correlation_matrix)




Cleaned Data:
   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  \
0          2174             0              40   Unite