In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the dataset
file_path = '/Users/sprosad/Downloads/others/new_dea/onboardai/data/regression/house_price_train.csv'  # Update with your file path
df = pd.read_csv(file_path)
threshold = 0.5 * len(df)  # 70% of the total number of rows
target_col = 'SalePrice'

# Drop columns where the number of NaN values is greater than the threshold
df_cleaned = df.dropna(thresh=threshold, axis=1)
df_cleaned = df_cleaned.dropna(subset=[target_col])

# Function to check the percentage of digits in a string
def is_digit_heavy_string(value):
    if not isinstance(value, str):
        return False
    digit_count = sum(c.isdigit() for c in value)
    return digit_count / len(value) >= 0.9 if len(value) > 0 else False

def is_date_string(value):
    try:
        pd.to_datetime(value)
        return True
    except (ValueError, TypeError):
        return False

def custom_is_alphanumeric(value):
    if not isinstance(value, str):
        return False
    
    letters = sum(c.isalpha() for c in value)
    digits = sum(c.isdigit() for c in value)
    
    # Apply the rules
    if len(value) > 4:
        return letters >= 2 and digits >= 2
    else:
        return letters >= 1 and digits >= 1

# Function to tag columns
def tag_columns(df, target_col):
    column_tags = {'number':[],'string':[],'unknown':[],'date':[],'alphanumeric':[], 'predictor':target_col}
    
    for column in df.loc[:, df.columns != target_col]:
        if pd.api.types.is_numeric_dtype(df[column]):
            column_tags['number'].append(column) 
        elif pd.api.types.is_string_dtype(df[column]):
            # Check if at least 90% of the strings in the column are "digit-heavy"
            if df[column].apply(is_digit_heavy_string).mean() >= 0.9:
                column_tags['number'].append(column) 
            elif df[column].apply(is_date_string).mean() >= 0.9:
                column_tags['date'].append(column)
            elif df[column].apply(custom_is_alphanumeric).mean() >= 0.9:
                column_tags['alphanumeric'].append(column)
            else:
                column_tags['string'].append(column) 
        elif pd.api.types.is_datetime64_any_dtype(df[column]):
            column_tags['date'].append(column) 
        else:
            column_tags['unknown'].append(column)
    return column_tags

# Get tags for the columns
column_tags = tag_columns(df_cleaned, target_col)

for col in column_tags['number']:
    df_cleaned[col] = df_cleaned[col].astype(float)

for col in column_tags['string']:
    df_cleaned[col] = df_cleaned[col].astype(str)

for col in column_tags['date']:
    print("dat_col", col)
    df_cleaned[col] = pd.to_datetime(df_cleaned[col], format='%Y_%m_%d')

if pd.api.types.is_numeric_dtype(df_cleaned[target_col]):
    print("Target is Numeric")
elif pd.api.types.is_string_dtype(df[column]):
    df_cleaned[target_column] = df_cleaned[target_column].replace({'$': '', ',': '','%':''}, regex=True)
    if df_cleaned[target_col].astype(str).apply(is_digit_heavy_string).mean() >= 0.9:
        print("Target is not Numeric... converting it into numeric...")
        df_cleaned[target_col] = df_cleaned[target_col].astype(float)
    else:
        print(f'{target_col} cant be converted to numeric column')
else:
    print(f'{target_col} cant be converted to numeric column') 

today = pd.to_datetime('today')
for col in column_tags['date']:
    # Calculate the difference in days
    df_cleaned['days_difference_'+col] = (today - df_cleaned[col]).dt.days
    column_tags['number'].append('days_difference_'+col)
    
df_cleaned = df_cleaned.drop(column_tags['date'], axis=1)

#checking important string columns
for col in column_tags['string']:
    df_cleaned[col].fillna(df_cleaned[col].mode()[0], inplace=True)
# Optionally: Trim whitespace
df_cleaned[column_tags['string']] = df_cleaned[column_tags['string']].apply(lambda x: x.str.strip())

cardinality = {col: df_cleaned[col].nunique() for col in column_tags['string']}
low_cardinality_cols = [col for col in column_tags['string'] if cardinality[col] < 10]
df_encoded = pd.get_dummies(df_cleaned, columns=low_cardinality_cols, drop_first=True)

new_column_names = [col for col in df_encoded.columns if col.startswith(tuple(low_cardinality_cols))]
original_to_new = {col: [new_col for new_col in new_column_names if new_col.startswith(col)] for col in low_cardinality_cols}

significant_cat_cols = []
for col in new_column_names:
    grouped = df_encoded.groupby(col)[target_col].mean()
    f_val, p_val = stats.f_oneway(*[group[target_col].values for name, group in df_encoded.groupby(col)])
    if p_val<=0.05:
       significant_cat_cols.append(col) 

#checking important numeric columns
for col in column_tags['number']:
    df_encoded[col].fillna(df_encoded[col].mean(), inplace=True)  # You can also use median or a specific value

numeric_cols = column_tags['number'].copy()
for col in column_tags['number']:
    if 'year' in col.lower():
        df_encoded['year_difference_'+col] = df_encoded[col].apply(lambda x: (today.year - x))
        numeric_cols.append('year_difference_'+col)
        numeric_cols.remove(col)

# Calculate the correlation matrix
correlation_matrix = df_encoded[numeric_cols+[target_col]].corr()

# Extract the correlation with the target variable
target_correlation = correlation_matrix[target_col].abs().sort_values(ascending=False)
# Set a correlation threshold
correlation_threshold = 0.2

# Filter significant columns based on the threshold
significant_numeric_columns = target_correlation[target_correlation > correlation_threshold].index.tolist()
significant_numeric_columns = [c for c in significant_numeric_columns if c!= target_col]
# print(f"Significant Numeric Columns: {significant_numeric_columns}")

# Prepare X (features) and y (target variable)
X = df_encoded[significant_numeric_columns]
y = df_encoded[target_col]

# Add a constant to the model (intercept)
X = sm.add_constant(X)

# Fit the model
model = sm.OLS(y, X).fit()

# Store the significant columns based on p-values from the model summary
p_values = model.pvalues

# Filter significant predictors (excluding the constant)
significant_predictors = p_values[p_values <= 0.05].index.tolist()
if 'const' in significant_predictors:
    significant_predictors.remove('const')  # Remove the constant term

df_final = df_encoded[significant_cat_cols+significant_predictors+[target_col]]

print("DONE...!!!")

Target is Numeric
DONE...!!!


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
# from lightgbm import LGBMRegressor
# from xgboost import XGBRegressor

In [5]:
# Separate features and target
X = df_final.drop(target_col, axis=1)  # Replace with your target column name
y = df_final[target_col]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
hyperparameter_grids = {
    "Linear Regression": {},
    "Ridge Regression": {
        'alpha': [0.1, 1.0, 10.0]
    },
    "Lasso Regression": {
        'alpha': [0.1, 1.0, 10.0]
    },
    "Decision Tree": {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    },
    "Random Forest": {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    },
    "SVR": {
        'C': [0.1, 1, 10],
        'epsilon': [0.1, 0.2, 0.5]
    },
    "KNN": {
        'n_neighbors': [3, 5, 7, 9]
    },
    "LightGBM": {
        'num_leaves': [31, 50, 100],
        'learning_rate': [0.01, 0.1, 0.2],
        'n_estimators': [100, 200, 300],
        'max_depth': [-1, 10, 20]
    },
    "XGBoost": {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0]
    }
}

models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "SVR": SVR(),
    "KNN": KNeighborsRegressor(),
    # "LightGBM": LGBMRegressor(),
    # "XGBoost": XGBRegressor(eval_metric='rmse')  # Use RMSE for evaluation
}

In [10]:
best_models = {}
best_rmse = {}

for model_name, model in models.items():
    param_grid = hyperparameter_grids[model_name]

    grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=5)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    predictions = best_model.predict(X_test)
    final_rmse = np.sqrt(-grid_search.best_score_)  # Convert negative MSE to RMSE

    best_models[model_name] = best_model
    best_rmse[model_name] = final_rmse
    print(f"Best Model: {model_name}, Best Hyperparameters: {grid_search.best_params_}, Final RMSE: {final_rmse}")

Best Model: Linear Regression, Best Hyperparameters: {}, Final RMSE: 36152.69201329487
Best Model: Ridge Regression, Best Hyperparameters: {'alpha': 1.0}, Final RMSE: 35965.28814310178
Best Model: Lasso Regression, Best Hyperparameters: {'alpha': 10.0}, Final RMSE: 36056.875816777385
Best Model: Decision Tree, Best Hyperparameters: {'max_depth': 20, 'min_samples_split': 10}, Final RMSE: 48718.87620786183
Best Model: Random Forest, Best Hyperparameters: {'max_depth': 30, 'min_samples_split': 2, 'n_estimators': 200}, Final RMSE: 34866.87451213228
Best Model: SVR, Best Hyperparameters: {'C': 10, 'epsilon': 0.1}, Final RMSE: 78915.45563966435
Best Model: KNN, Best Hyperparameters: {'n_neighbors': 9}, Final RMSE: 64013.80168016677


In [11]:
# Identify the best model based on RMSE
best_overall_model_name = min(best_rmse, key=best_rmse.get)
best_overall_rmse = best_rmse[best_overall_model_name]

print("Comparison of RMSE across models:")
for model, rmse in best_rmse.items():
    print(f"{model}: RMSE = {rmse}")

print(f"Best Overall Model: {best_overall_model_name} with RMSE: {best_overall_rmse}")


Comparison of RMSE across models:
Linear Regression: RMSE = 36152.69201329487
Ridge Regression: RMSE = 35965.28814310178
Lasso Regression: RMSE = 36056.875816777385
Decision Tree: RMSE = 48718.87620786183
Random Forest: RMSE = 34866.87451213228
SVR: RMSE = 78915.45563966435
KNN: RMSE = 64013.80168016677
Best Overall Model: Random Forest with RMSE: 34866.87451213228
