In [31]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from splitter import splitter  # import your function from splitter.py
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score

df = pd.read_csv("topic21_v9_train.csv")
df.head()


# Provide the path to your CSV file
file_path = "topic21_v9_train.csv"

# Split the data
X_train, X_test, y_train, y_test = splitter(file_path)

# Optional: check the shapes
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)



X_train: (6369, 18)
X_test: (1593, 18)
y_train: (6369,)
y_test: (1593,)


***1. Outlier Filtering on X_train (3σ rule)***

In [32]:
# Identify numeric columns in X_train
numeric_cols = X_train.select_dtypes(include=np.number).columns

# Calculate z-scores
z_scores = (X_train[numeric_cols] - X_train[numeric_cols].mean()) / X_train[numeric_cols].std()

# Keep rows where all z-scores are within ±3
mask = (np.abs(z_scores) <= 3).all(axis=1)

# Apply mask to both X_train and y_train
X_train_clean = X_train[mask].reset_index(drop=True)
y_train_clean = y_train[mask].reset_index(drop=True)

print("Cleaned X_train:", X_train_clean.shape)


Cleaned X_train: (4269, 18)


***2. Split a Validation Set from the Cleaned Training Data***

In [33]:
# 20% validation set from the cleaned training data
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train_clean, y_train_clean, test_size=0.2, random_state=42
)

print("Train set:", X_train_final.shape)
print("Validation set:", X_val.shape)

Train set: (3415, 18)
Validation set: (854, 18)


***3. Preprocessing Setup: Imputation, Scaling, One-Hot Encoding***

In [34]:
# Identify categorical and numerical columns
categorical_cols = X_train.select_dtypes(include="object").columns
numerical_cols = X_train.select_dtypes(include=np.number).columns

# Define transformers
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# Combine transformers into a ColumnTransformer
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols)
])


***4. Build Pipeline and Do Cross-Validation***

In [35]:
# Final pipeline
pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression())
])

# Cross-validation on training set
cv_results = cross_validate(pipe, X_train_final, y_train_final, cv=5,
                            scoring="neg_mean_squared_error", return_train_score=True)

# Print mean CV scores
print("Mean CV MSE:", -np.mean(cv_results["test_score"]))

Mean CV MSE: 22630654976.193504


***5. Evaluate on Held-Out Validation Set***


In [36]:
# Fit on train split and predict on validation split
pipe.fit(X_train_final, y_train_final)
y_val_pred = pipe.predict(X_val)

# Compute validation MSE
val_mse = mean_squared_error(y_val, y_val_pred)
print("Validation MSE:", val_mse)


Validation MSE: 24881532092.500015


In [37]:
# MSE
val_mse = mean_squared_error(y_val, y_val_pred)

# MAE
val_mae = mean_absolute_error(y_val, y_val_pred)

# RMSE
val_rmse = np.sqrt(val_mse)

# R^2 Score
val_r2 = r2_score(y_val, y_val_pred)

# Print all metrics
print(f"Validation MSE:  {val_mse:.4f}")
print(f"Validation MAE:  {val_mae:.4f}")
print(f"Validation RMSE: {val_rmse:.4f}")
print(f"Validation R²:   {val_r2:.4f}")

Validation MSE:  24881532092.5000
Validation MAE:  89155.4527
Validation RMSE: 157738.8097
Validation R²:   0.3677


***Trying feature engineering***

In [39]:
from sklearn.preprocessing import FunctionTransformer

def add_custom_features(X):
    X = X.copy()
    
    # Example 1: Create a ratio of two existing columns
    if 'feature1' in X.columns and 'feature2' in X.columns:
        X['feature1_to_feature2'] = X['feature1'] / (X['feature2'] + 1e-5)
    
    # Example 2: Log-transform a numeric feature
    if 'price' in X.columns:
        X['log_price'] = np.log1p(X['price'])

    # Example 3: Interaction term
    if 'age' in X.columns and 'income' in X.columns:
        X['age_income_interaction'] = X['age'] * X['income']

    return X

feature_engineering = FunctionTransformer(add_custom_features)


In [43]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Add the transformer before preprocessing
pipe = Pipeline([
    ('feature_engineering', feature_engineering),
    ('preprocessor', preprocessor),
    ('regressor', model)
])