# Statistical Modeling

In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Gathering Data
def loding_data(path):
    """Read the CSV file at ``path`` into a pandas DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame on success. If reading raises any exception, the
        error is printed and ``None`` is returned instead.
    """
    try:
        # low_memory=False makes pandas read the file in one pass.
        loaded_df = pd.read_csv(path, low_memory=False)
    except Exception as e:
        print(f"Error on loading data: {e}")
        return None
    return loaded_df

In [3]:
# Load the cleaned dataset from disk.
path = "../data/clean_acis_data.csv"
clean_acis_df = loding_data(path)

# loding_data() prints the error and returns None when the read fails. Stop here
# with a clear message instead of failing in a later cell with an
# AttributeError on None.
if clean_acis_df is None:
    raise RuntimeError(f"Could not load data from {path!r}; see the error printed above.")

# Data Preparation
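Drop the leftover `Unnamed: 0` column, then identify the categorical and numerical feature columns (the target, `TotalPremium`, is excluded from the numerical features):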

In [4]:
# Remove the 'Unnamed: 0' column (presumably an index saved with the CSV — confirm).
# A non-inplace drop with errors='ignore' keeps this cell idempotent: re-running it,
# or loading a file that lacks the column, no longer raises KeyError.
clean_acis_df = clean_acis_df.drop(columns='Unnamed: 0', errors='ignore')

# Categorize the data by dtype: object columns are treated as categorical,
# int64/float64 columns as numerical. The target, TotalPremium, is kept out of
# the numerical feature list.
categorical_cols = [col for col, dtype in clean_acis_df.dtypes.items() if dtype == 'object']
numerical_cols = [
    col for col, dtype in clean_acis_df.dtypes.items()
    if dtype in ['int64', 'float64'] and col != 'TotalPremium'
]




# Preprocessing Pipelines
Define preprocessing steps for numerical and categorical data:

In [5]:
# Numerical features: fill missing values with the column median, then standardize.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown='ignore' lets categories unseen during fit pass
# through transform without raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])


# Model Pipelines
Define pipelines for each model:

In [6]:
# Every pipeline gets its own clone of the (unfitted) preprocessor. Pipeline does
# not copy its steps, so sharing one instance would mean that fitting any single
# pipeline refits the transformer used by all the others.

# Linear Regression pipeline
pipeline_lr = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('model', LinearRegression())
])

# Decision Tree pipeline
pipeline_tree = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('model', DecisionTreeRegressor(random_state=42))
])

# Random Forest pipeline
pipeline_forest = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])

# XGBoost pipeline
pipeline_xgb = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('model', xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42))
])

# Train-Test Split
Split the data into training and test sets:

In [9]:
# Target is TotalPremium; the features are every remaining column.
y = clean_acis_df['TotalPremium']
X = clean_acis_df.drop(columns='TotalPremium')

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Model Training and Evaluation
Train and evaluate each model:

In [10]:
def evaluate_model(pipeline, X_train, X_test, y_train, y_test):
    """Fit ``pipeline`` on the training split and score it on the test split.

    Parameters
    ----------
    pipeline : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, y_train
        Features and target used for fitting.
    X_test, y_test
        Features and target used for scoring.

    Returns
    -------
    tuple
        ``(mse, mae, r2)`` computed from ``y_test`` and the test-set predictions.
    """
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)

    # Apply the metrics in the order callers unpack them: MSE, MAE, R2.
    metric_functions = (mean_squared_error, mean_absolute_error, r2_score)
    return tuple(metric(y_test, predictions) for metric in metric_functions)

# Every model is fitted and scored on the same train/test split so the metrics
# are directly comparable.

# Linear Regression
mse_lr, mae_lr, r2_lr = evaluate_model(
    pipeline_lr, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
)

# Decision Tree
mse_tree, mae_tree, r2_tree = evaluate_model(
    pipeline_tree, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
)

# Random Forest
mse_forest, mae_forest, r2_forest = evaluate_model(
    pipeline_forest, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
)

# XGBoost
mse_xgb, mae_xgb, r2_xgb = evaluate_model(
    pipeline_xgb, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
)


# Feature Importance Analysis
For tree-based models, you can extract feature importances:

In [None]:
# The models are trained on the preprocessor's output (scaled numerical columns
# plus one one-hot column per category), so feature_importances_ has one entry per
# transformed column. The matching names therefore have to come from the fitted
# preprocessor, not from the raw X_train columns; they carry the transformer
# prefix, e.g. 'num__<column>' and 'cat__<column>_<category>'.

# Random Forest feature importances
forest_importances = pipeline_forest.named_steps['model'].feature_importances_
forest_features = pipeline_forest.named_steps['preprocessor'].get_feature_names_out()
forest_feature_importances = pd.DataFrame({'feature': forest_features, 'importance': forest_importances})
print(forest_feature_importances.sort_values(by='importance', ascending=False))

# XGBoost feature importances
xgb_importances = pipeline_xgb.named_steps['model'].feature_importances_
xgb_features = pipeline_xgb.named_steps['preprocessor'].get_feature_names_out()
xgb_feature_importances = pd.DataFrame({'feature': xgb_features, 'importance': xgb_importances})
print(xgb_feature_importances.sort_values(by='importance', ascending=False))


# Report Comparison Between Each Model's Performance
Summarize and compare the performance metrics of each model:

In [None]:
# Print the test-set MSE, MAE and R2 returned by evaluate_model() for each model,
# one model per pair of lines, so the four can be compared at a glance.
print(f"""
Linear Regression:
  MSE: {mse_lr}, MAE: {mae_lr}, R2: {r2_lr}
Decision Tree:
  MSE: {mse_tree}, MAE: {mae_tree}, R2: {r2_tree}
Random Forest:
  MSE: {mse_forest}, MAE: {mae_forest}, R2: {r2_forest}
XGBoost:
  MSE: {mse_xgb}, MAE: {mae_xgb}, R2: {r2_xgb}
""")