In [None]:
!pip install ucimlrepo

In [None]:
!pip install feature-fabrica --upgrade

# Import Packages

In [None]:
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt

# Load Data

In [None]:
# Download the dataset with repository id 2 from the UCI ML repository.
adult = fetch_ucirepo(id=2)

# Feature and target frames (pandas objects) used by the rest of the notebook.
X, y = adult.data.features, adult.data.targets

# Dataset-level metadata first, then the per-variable information.
print(adult.metadata)
print(adult.variables)


In [None]:
# Preview the first rows of the feature frame.
X.head()

In [None]:
# List the keys available in the dataset metadata.
adult.metadata.keys()

In [None]:
# Show the abstract stored in the dataset metadata.
adult.metadata.abstract

In [None]:
# Display the per-variable information with rich output.
adult.variables

# Data Cleaning

In [None]:
# Treat the '?' placeholder as a missing value. Rebind X to the returned frame
# instead of replacing in place: X is the very object held by
# adult.data.features, so an in-place replace would also alter the fetched data.
X = X.replace('?', pd.NA)

In [None]:
# Inspect the dtype of every feature column.
X.dtypes

In [None]:
# Columns inspected for missing values in this notebook.
columns_with_missing = ['workclass', 'occupation', 'native-country']
for col in columns_with_missing:
    # mean() of the boolean mask is the fraction of missing rows; format it as
    # a real percentage so the number matches the wording of the message.
    missing_share = X[col].isna().mean()
    print(f"Percentage of values missing for column {col}: {missing_share:.2%}")

In [None]:
# 0/1 indicator per affected column (1 = value was missing), named
# "<column>_missing" and appended to the right of X.
missing_one_hot_df = X[columns_with_missing].isna().astype(int)
missing_one_hot_df.columns = [f"{name}_missing" for name in columns_with_missing]
X = pd.concat([X, missing_one_hot_df], axis=1)

In [None]:
# Fill missing values with the mode (most frequent value) of each categorical column.
for col in columns_with_missing:
    # Assign the result back: X[col].fillna(..., inplace=True) operates on a
    # temporary Series, which pandas deprecates and which leaves X unchanged
    # once copy-on-write is enabled.
    X[col] = X[col].fillna(X[col].mode()[0])

In [None]:
# Preview the feature frame after imputation and the added *_missing columns.
X.head()

In [None]:
# Put features and target side by side in one frame (column-wise concat).
df = pd.concat([X, y], axis=1)

In [None]:
# Preview the combined features + target frame.
df.head()

In [None]:
# Label counts. Some labels contain a "." that has to be removed before
# modelling; the Strip transformation of the `income` feature definition does that.
y.value_counts()

# Visualizations

In [None]:
# Histograms of the numeric columns: the data is imbalanced.
# The trailing ';' keeps the array-of-Axes repr out of the cell output.
X.hist(figsize=(15, 10), bins=60);

In [None]:
def pieplot(df, column):
    """Draw a pie chart of the value distribution of ``df[column]``.

    Args:
        df: DataFrame that contains ``column``.
        column: Name of the column to summarise; also used as the plot title.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    counts = df[column].value_counts()
    plt.figure(figsize=(5, 5))
    # Labels must come from the same Series as the wedge sizes: value_counts()
    # is ordered by frequency while unique() is ordered by first appearance,
    # so pairing the two attaches the wrong label to a wedge.
    plt.pie(counts, labels=counts.index, autopct="%.0f%%")
    plt.title(column)
    plt.show()

In [None]:
# Distribution of the `sex` column.
pieplot(X, "sex")

In [None]:
# Distribution of the `education` column.
pieplot(X, "education")

In [None]:
# Distribution of the `race` column.
pieplot(X, "race")

In [None]:
# Distribution of the `marital-status` column.
pieplot(X, "marital-status")

# Baseline Model

In [None]:
# YAML feature definitions for the raw columns, consumed by feature_fabrica.
# Plain (non-f) string: the text has no placeholders, and an f-string would
# misread any `{`/`}` added to the YAML later as a replacement field.
# NOTE(review): the `workclass` stage is keyed `one_hot` but targets LabelEncode.
BASIC_FEATURE_DEFINITION = """
# examples/basic_features.yaml
age:
  description: "age"
  data_type: "int32"

workclass:
  description: "Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked."
  data_type: "int32"
  transformation:
    one_hot:
      _target_: feature_fabrica.transform.LabelEncode

fnlwgt:
  description: "Final weight, which represents the number of people the observation is meant to represent."
  data_type: "int32"

education:
  description: "Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool."
  data_type: "int32"
  transformation:
    one_hot:
      _target_: feature_fabrica.transform.OneHotEncode

education-num:
  description: "Numeric representation of education level."
  data_type: "int32"

marital-status:
  description: "Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse."
  data_type: "int32"
  transformation:
    one_hot:
      _target_: feature_fabrica.transform.OneHotEncode

occupation:
  description: "Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces."
  data_type: "int32"
  transformation:
    one_hot:
      _target_: feature_fabrica.transform.OneHotEncode

relationship:
  description: "Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried."
  data_type: "int32"
  transformation:
    one_hot:
      _target_: feature_fabrica.transform.OneHotEncode

race:
  description: "White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black."
  data_type: "int32"
  transformation:
    one_hot:
      _target_: feature_fabrica.transform.OneHotEncode

sex:
  description: "Female, Male."
  data_type: "int32"
  transformation:
    label_encode:
      _target_: feature_fabrica.transform.LabelEncode

capital-gain:
  description: "Capital gains in dollars."
  data_type: "int32"

capital-loss:
  description: "Capital losses in dollars."
  data_type: "int32"

hours-per-week:
  description: "Hours worked per week."
  data_type: "int32"

native-country:
  description: "United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, etc."
  data_type: "int32"
  transformation:
    one_hot:
      _target_: feature_fabrica.transform.OneHotEncode

income:
  description: "Income classification: >50K or <=50K."
  data_type: "int32"
  transformation:
    strip:
      _target_: feature_fabrica.transform.Strip
      chars: '.'
    label:
      _target_: feature_fabrica.transform.LabelEncode
      categories: ["<=50K", ">50K"]
"""


In [None]:
import yaml

# Parse the YAML text so it can be re-serialised in block style.
feature_definition_dict = yaml.safe_load(BASIC_FEATURE_DEFINITION)

# Target file, written to the current working directory.
yaml_file_path = 'basic_features.yaml'

# Keep the feature order as written (sort_keys=False).
with open(yaml_file_path, 'w') as file:
    yaml.dump(
        feature_definition_dict,
        file,
        default_flow_style=False,
        sort_keys=False,
    )

print(f"YAML file saved as {yaml_file_path}")


In [None]:
# One NumPy array per column of df: object-dtype columns become string
# arrays, every other column is cast to int32.
df_dict = {
    col: df[col].to_numpy(dtype=np.str_ if df[col].dtype == 'object' else np.int32)
    for col in df.columns
}

In [None]:
import numpy as np  # repeated import; numpy is already imported in the imports cell
from feature_fabrica.core import FeatureManager

# Configure the manager with config_name "basic_features" in the current
# directory (presumably resolves to the basic_features.yaml written earlier;
# confirm against feature_fabrica).
feature_manager = FeatureManager(
    config_path="./", config_name="basic_features"
)
# Later cells iterate results.items() and read .ndim on each value.
results = feature_manager.compute_features(df_dict)

In [None]:
# Ask the feature manager for its visual dependency graph (shown as the cell output).
feature_manager.get_visual_dependency_graph()

In [None]:
# Turn the computed features into flat, named columns for a DataFrame.
flattened_dict = {}
for col, values in results.items():
    if values.ndim == 2:
        # Category names are read from `df` (raw features + target), not `X`:
        # `X` is rebound to the processed frame further down, so reading it
        # here would break when this cell is re-run. Sorting is done once per
        # feature rather than once per output column.
        # NOTE(review): assumes the encoder emits one column per category in
        # sorted order. Confirm against feature_fabrica's OneHotEncode.
        categories = sorted(df[col].unique())
        for i in range(values.shape[1]):
            flattened_dict[f"{col}_{categories[i]}"] = values[:, i]
    else:
        # 1D features map to a single column as they are.
        flattened_dict[col] = values

# Create DataFrame from flattened dictionary
df_processed = pd.DataFrame(flattened_dict)

In [None]:
# Preview the flattened, model-ready frame.
df_processed.head()

In [None]:
# Separate the target column from the remaining feature columns.
y = df_processed['income']
X = df_processed.drop(columns='income')
# Both shapes must agree on the number of rows.
print(f'Features shape: {X.shape}')
print(f'Target shape: {y.shape}')
# Hold out 20% of the rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Fit a 100-tree random forest (fixed seed) on the training split;
# fit() returns the estimator itself, so construction and fitting are chained.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows.
y_pred = rf.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of true vs. predicted labels on the test split.
confusion_matrix(y_test, y_pred)

In [None]:
# Importances reported by the fitted forest, aligned with X's columns.
importances = rf.feature_importances_
features = X.columns
# Two-column table, most important feature first.
feature_importances = pd.DataFrame(
    {'feature': features, 'importance': importances}
).sort_values(by='importance', ascending=False)

# Full ranking
print(feature_importances)

In [None]:
# The table is already sorted, so the first ten rows are the top ten.
top_10_features = feature_importances.head(10)

print(top_10_features)

# Horizontal bar chart of the ten largest importances.
plt.figure(figsize=(10, 6))
plt.barh(top_10_features['feature'], top_10_features['importance'], color='skyblue')
plt.title('Top 10 Feature Importances from Random Forest')
plt.xlabel('Importance')
plt.ylabel('Feature')
# Flip the y axis so the most important feature is drawn at the top.
plt.gca().invert_yaxis()
plt.show()

# Let's cook some more features

In [None]:
# YAML definitions of derived features built on top of the basic ones.
# Plain (non-f) string: the text has no placeholders, and an f-string would
# misread any `{`/`}` added to the YAML later as a replacement field.
EXTENDED_FEATURE_DEFINITION = """
work_intensity:
  description: "This gives a rough estimate of the ratio of time worked per week relative to age. Younger individuals working more hours may indicate higher work intensity."
  data_type: "float32"
  dependencies: [age, hours-per-week]
  transformation:
    divide:
      _target_: feature_fabrica.transform.DivideTransform
      numerator: hours-per-week
      denominator: age

cap_gain_hours_week:
  description: "Highlights individuals with substantial capital gains who may work fewer hours but still achieve high income."
  data_type: "float32"
  dependencies: [capital-gain, hours-per-week]
  transformation:
    divide:
      _target_: feature_fabrica.transform.DivideTransform
      numerator: capital-gain
      denominator: hours-per-week

education_investment:
  description: "Helps identify individuals who have invested a higher proportion of their life in education, which might impact income levels."
  data_type: "float32"
  dependencies: [education-num, age]
  transformation:
    divide:
      _target_: feature_fabrica.transform.DivideTransform
      numerator: education-num
      denominator: age

cap_gain_adjusted:
  description: "This adjusted value provides a clearer picture of the actual financial gains from capital."
  data_type: "int32"
  dependencies: [capital-gain, capital-loss]
  transformation:
    subtract:
      _target_: feature_fabrica.transform.SubtractReduce
      iterable: [capital-gain, capital-loss]

financial_gain_to_work_experience:
  description: "This gives ratio of the actual financial gains to the assumed years of work experience"
  data_type: "float32"
  dependencies: [age, education-num, cap_gain_adjusted]
  transformation:
    subtract:
      _target_: feature_fabrica.transform.SubtractReduce
      iterable: [age, education-num]
    divide:
      _target_: feature_fabrica.transform.DivideTransform
      numerator: cap_gain_adjusted

total_cap_gain_workclass:
  description: "Mean capital gain per workclass."
  data_type: "float32"
  dependencies: [cap_gain_adjusted, workclass]
  transformation:
    import:
      _target_: feature_fabrica.transform.FeatureImporter
      feature: cap_gain_adjusted
      transform_stage: subtract
    group:
      _target_: feature_fabrica.transform.GroupByReduce
      key_feature: workclass
      reduce_func: mean
      axis: -1

avg_education_investment_age:
  description: "Mean education ivestment per age."
  data_type: "float32"
  dependencies: [education_investment, age]
  transformation:
    import:
      _target_: feature_fabrica.transform.FeatureImporter
      feature: education_investment
      transform_stage: divide
    group:
      _target_: feature_fabrica.transform.GroupByReduce
      key_feature: age
      reduce_func: mean
      axis: -1

education_investment_to_avg:
  description: "Ratio between person's education investment to the average of their age."
  data_type: "float32"
  dependencies: [education_investment, avg_education_investment_age]
  transformation:
    divide:
      _target_: feature_fabrica.transform.DivideTransform
      numerator: education_investment
      denominator: avg_education_investment_age
"""

In [None]:
# Parse both YAML texts: the basic features and the new derived ones.
basic_feature_definition_dict = yaml.safe_load(BASIC_FEATURE_DEFINITION)
extended_feature_definition_dict = yaml.safe_load(EXTENDED_FEATURE_DEFINITION)
# Merge them; on a key clash the extended definition wins.
feature_definition_dict = {
    **basic_feature_definition_dict,
    **extended_feature_definition_dict,
}

# Target file, written to the current working directory.
yaml_file_path = 'extended_features.yaml'

# Keep the feature order as merged (sort_keys=False).
with open(yaml_file_path, 'w') as file:
    yaml.dump(
        feature_definition_dict,
        file,
        default_flow_style=False,
        sort_keys=False,
    )

print(f"YAML file saved as {yaml_file_path}")


In [None]:
# Rebuild the manager with config_name "extended_features" (presumably
# resolves to the extended_features.yaml written earlier; confirm against
# feature_fabrica) and recompute every feature from the same raw arrays.
feature_manager = FeatureManager(
    config_path="./", config_name="extended_features"
)
# Rebinds `results`; the next cells read the extended feature set from it.
results = feature_manager.compute_features(df_dict)

In [None]:
# Ask the feature manager for its visual dependency graph (shown as the cell output).
feature_manager.get_visual_dependency_graph()

In [None]:
# Turn the computed features into flat, named columns for a DataFrame.
flattened_dict = {}
for col, values in results.items():
    if values.ndim == 2:
        # Sort the category names once per feature instead of once per
        # output column.
        # NOTE(review): assumes the encoder emits one column per category in
        # sorted order. Confirm against feature_fabrica's OneHotEncode.
        categories = sorted(df[col].unique())
        for i in range(values.shape[1]):
            flattened_dict[f"{col}_{categories[i]}"] = values[:, i]
    else:
        # 1D features map to a single column as they are.
        flattened_dict[col] = values

# Create DataFrame from flattened dictionary
df_processed_extended = pd.DataFrame(flattened_dict)

In [None]:
# Sanity check: show some raw inputs next to the derived features and the target.
df_processed_extended[
    [
        'capital-gain',
        'capital-loss',
        'total_cap_gain_workclass',
        'avg_education_investment_age',
        'education_investment',
        'education_investment_to_avg',
        'income',
    ]
].head(20)

In [None]:
# Separate the target column from the remaining (basic + derived) feature columns.
y = df_processed_extended['income']
X = df_processed_extended.drop(columns='income')
# Both shapes must agree on the number of rows.
print(f'Features shape: {X.shape}')
print(f'Target shape: {y.shape}')
# Hold out 20% of the rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Fit a 100-tree random forest (fixed seed) on the training split;
# fit() returns the estimator itself, so construction and fitting are chained.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows.
y_pred = rf.predict(X_test)

# We improved accuracy by 1%

In [None]:
# Per-class precision / recall / F1 on the test split for the extended feature set.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of true vs. predicted labels on the test split.
confusion_matrix(y_test, y_pred)

In [None]:
# Importances reported by the fitted forest, aligned with X's columns.
importances = rf.feature_importances_
features = X.columns
# Two-column table, most important feature first.
feature_importances = pd.DataFrame(
    {'feature': features, 'importance': importances}
).sort_values(by='importance', ascending=False)

# Full ranking
print(feature_importances)

In [None]:
# The table is already sorted, so the first ten rows are the top ten.
top_10_features = feature_importances.head(10)

print(top_10_features)

# Horizontal bar chart of the ten largest importances.
plt.figure(figsize=(10, 6))
plt.barh(top_10_features['feature'], top_10_features['importance'], color='skyblue')
plt.title('Top 10 Feature Importances from Random Forest')
plt.xlabel('Importance')
plt.ylabel('Feature')
# Flip the y axis so the most important feature is drawn at the top.
plt.gca().invert_yaxis()
plt.show()