<a href="https://colab.research.google.com/github/daniel2iq/ai-code/blob/main/Untitled99.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install numpy pandas matplotlib seaborn
%pip install scikit-learn
!pip install category_encoders

import numpy as np
import pandas as pd

# Load the dataset from "adult.csv" (path is relative to the working directory)
df = pd.read_csv("adult.csv")

from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, PolynomialFeatures, PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
from sklearn.model_selection import KFold, cross_val_score
from scipy.stats import zscore
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.linear_model import LogisticRegression

# Sanity check: confirm that the cell executes at all
print("hello world")

# Preview the top rows of the loaded frame
df.head()

# Count missing values per column to decide whether imputation is required
df.isna().sum()

# Features (x): every column except the "income" target
x = df.drop(columns="income")

# Target (y): the income column encoded as integer category codes.
# The TargetEncoder used later averages the target per category, so it needs
# numeric labels rather than the raw strings. Codes follow the sorted order of
# the distinct labels; a missing label would be encoded as -1.
y = df["income"].astype("category").cat.codes

# Numerical feature columns, selected by dtype
numerical_cols = x.select_dtypes(include=["int64", "float64"]).columns

# Categorical feature columns, selected by dtype
categorical_cols = x.select_dtypes(include="object").columns

# Absolute z-scores of the numerical columns, used to flag outliers
z_scores = np.abs(zscore(x[numerical_cols]))

# Boolean row mask: keep a row only when every numerical z-score is below 3.
# The comparison is reduced across columns first and only then used for
# indexing; indexing df with the element-wise comparison itself is invalid
# because it covers just the numerical columns.
# np.asarray gives a plain positional mask whether zscore returned an
# ndarray or a DataFrame.
filtered_df = np.asarray((z_scores < 3).all(axis=1))

# Apply the same mask to features and target so they keep the same rows
x = x[filtered_df]
y = y[filtered_df]

# Absolute skewness of each numerical column
skewed = x[numerical_cols].skew().abs()

# Numerical columns whose absolute skewness exceeds 0.5 get a power transform
skewed_cols = list(skewed.index[skewed > 0.5])

# Every remaining numerical column is treated as non-skewed
non_skewed_cols = [name for name in numerical_cols if name not in skewed_cols]

# Number of distinct values in each categorical column
unique_cols = x[categorical_cols].nunique()

# Low-cardinality categoricals: fewer than 10 distinct values
small_unique = list(unique_cols.index[unique_cols < 10])

# High-cardinality categoricals: 10 or more distinct values
big_unique = list(unique_cols.index[unique_cols >= 10])

# Skewed numerical columns: Yeo-Johnson power transform, then standardize
skewed_cols_pipeline = Pipeline(steps=[
    ("power", PowerTransformer(method="yeo-johnson")),
    ("scale", StandardScaler()),
])

# Non-skewed numerical columns: standardize only
non_skewed_cols_pipeline = Pipeline(steps=[
    ("scale", StandardScaler()),
])

# Low-cardinality categorical columns: one-hot encode, ignoring unseen categories
small_unique_cols_pipeline = Pipeline(steps=[
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# High-cardinality categorical columns: target encode
big_unique_cols_pipeline = Pipeline(steps=[
    ("target", TargetEncoder()),
])

# Route each column group through its own sub-pipeline
preprocessor = ColumnTransformer(transformers=[
    ("skewed", skewed_cols_pipeline, skewed_cols),
    ("non_skewed", non_skewed_cols_pipeline, non_skewed_cols),
    ("small_unique", small_unique_cols_pipeline, small_unique),
    ("big_unique", big_unique_cols_pipeline, big_unique),
])

# End-to-end model: preprocess, keep the 10 best features by mutual information,
# add pairwise interaction terms (no bias column), then fit logistic regression
final_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("feature_selection", SelectKBest(mutual_info_classif, k=10)),
    (
        "polynomial_features",
        PolynomialFeatures(degree=2, interaction_only=True, include_bias=False),
    ),
    ("model", LogisticRegression()),
])

# 5-fold cross-validation, shuffled with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Score the full pipeline on each fold
scores = cross_val_score(estimator=final_pipeline, X=x, y=y, cv=kf)

# Report the column grouping and the cross-validation results
print(
    f"The non-skewed columns are: {non_skewed_cols}",
    f"Cross-validation scores: {scores}",
    f"The mean cross-validation score is: {scores.mean()}",
    f"The skewed columns are: {skewed_cols}",
    sep="\n",
)

# Closing notes: further techniques that were not applied in this notebook
print(
    "Additional techniques I can use are:",
    "-" * 40,
    ". CountVectorizer and TF-IDF vectorizer for text data",
    ". Bayesian smoothing encoding to prevent data leakage",
    ". Custom transformers to fit into pipelines seamlessly",
    ". Feature crossing",
    ". Dimensionality reduction techniques like PCA",
    ". SimpleImputer, KNNImputer, and IterativeImputer for missing values",
    "BIG NOTE - I did not use any of these techniques here because they were not needed for this dataset.",
    sep="\n",
)
