In [1]:
# import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
%matplotlib inline

In [2]:
# Silence all warnings for the remainder of the session.
# NOTE(review): this is a blanket filter, so library deprecation notices are hidden too.
warnings.filterwarnings(action='ignore')

#### Data

In [3]:
# Read the training set from disk and preview its first two rows
train_csv = 'Train.csv'
df_train = pd.read_csv(train_csv)
df_train.head(n=2)

Unnamed: 0,Place_ID X Date,Date,Place_ID,target,target_min,target_max,target_variance,target_count,precipitable_water_entire_atmosphere,relative_humidity_2m_above_ground,...,L3_SO2_sensor_zenith_angle,L3_SO2_solar_azimuth_angle,L3_SO2_solar_zenith_angle,L3_CH4_CH4_column_volume_mixing_ratio_dry_air,L3_CH4_aerosol_height,L3_CH4_aerosol_optical_depth,L3_CH4_sensor_azimuth_angle,L3_CH4_sensor_zenith_angle,L3_CH4_solar_azimuth_angle,L3_CH4_solar_zenith_angle
0,010Q650 X 2020-01-02,2020-01-02,010Q650,38.0,23.0,53.0,769.5,92,11.0,60.200001,...,38.593017,-61.752587,22.363665,1793.793579,3227.855469,0.010579,74.481049,37.501499,-62.142639,22.545118
1,010Q650 X 2020-01-03,2020-01-03,010Q650,39.0,25.0,63.0,1319.85,91,14.6,48.799999,...,59.624912,-67.693509,28.614804,1789.960449,3384.226562,0.015104,75.630043,55.657486,-53.868134,19.293652


In [4]:
# Read the test set from disk and preview its first two rows
test_csv = 'Test.csv'
df_test = pd.read_csv(test_csv)
df_test.head(n=2)

Unnamed: 0,Place_ID X Date,Date,Place_ID,precipitable_water_entire_atmosphere,relative_humidity_2m_above_ground,specific_humidity_2m_above_ground,temperature_2m_above_ground,u_component_of_wind_10m_above_ground,v_component_of_wind_10m_above_ground,L3_NO2_NO2_column_number_density,...,L3_SO2_sensor_zenith_angle,L3_SO2_solar_azimuth_angle,L3_SO2_solar_zenith_angle,L3_CH4_CH4_column_volume_mixing_ratio_dry_air,L3_CH4_aerosol_height,L3_CH4_aerosol_optical_depth,L3_CH4_sensor_azimuth_angle,L3_CH4_sensor_zenith_angle,L3_CH4_solar_azimuth_angle,L3_CH4_solar_zenith_angle
0,0OS9LVX X 2020-01-02,2020-01-02,0OS9LVX,11.6,30.200001,0.00409,14.656824,3.956377,0.712605,5.3e-05,...,1.445658,-95.984984,22.942019,,,,,,,
1,0OS9LVX X 2020-01-03,2020-01-03,0OS9LVX,18.300001,42.900002,0.00595,15.026544,4.23043,0.661892,5e-05,...,34.641758,-95.014908,18.539116,,,,,,,


##### Characteristics

In [5]:
# Collect the names of the columns that contain at least one missing value.
# Each frame is inspected on its own: the test list has to be derived from
# df_test, otherwise gaps that exist only in the test set go undetected and
# train-only columns could be looked up in df_test.
columns_with_missing_values_train = df_train.columns[df_train.isnull().any()]
columns_with_missing_values_test = df_test.columns[df_test.isnull().any()]

In [6]:
# Fill missing values in float-type columns of the training frame with the
# column mean.
# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on df_train[column]: that chained form operates on an
# intermediate object and leaves df_train unchanged under pandas Copy-on-Write.
# NOTE(review): the mean is computed over all training rows, before the
# train/validation split performed later in the notebook.
for column in columns_with_missing_values_train:
    if df_train[column].dtype == 'float64':  # only float columns are imputed
        df_train[column] = df_train[column].fillna(df_train[column].mean())

In [7]:
# Fill missing values in float-type columns of the test frame with the mean
# of that same test column.
# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on df_test[column]: that chained form operates on an
# intermediate object and leaves df_test unchanged under pandas Copy-on-Write.
for column in columns_with_missing_values_test:
    if df_test[column].dtype == 'float64':  # only float columns are imputed
        df_test[column] = df_test[column].fillna(df_test[column].mean())

In [8]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [9]:
# Data Preprocessing
place_id = df_train['Place_ID']

# Separate features and target variable
X_train = df_train.drop(columns=['target', 'Place_ID X Date', 'Place_ID', 'target_min', 'target_max', 'target_variance'])
y_train = df_train['target']

In [10]:
# Build the test feature matrix by removing the identifier columns
test_identifier_columns = ['Place_ID X Date', 'Place_ID']
X_test = df_test.drop(columns=test_identifier_columns)

In [11]:
# Preprocessing step: mean-impute the float64 columns of the feature matrix.
# Only the columns selected here are handed to the imputer.
float_feature_columns = X_train.select_dtypes(include=['float64']).columns
mean_imputer = SimpleImputer(strategy='mean')
preprocessor = ColumnTransformer(
    transformers=[('num', mean_imputer, float_feature_columns)]
)

In [12]:
# Hold out 20% of the training rows as a validation set; the fixed seed keeps
# the partition identical between runs.
(X_train_split, X_val_split,
 y_train_split, y_val_split) = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [13]:
from sklearn.model_selection import GridSearchCV

In [14]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [15]:
# Candidate regressors as (display name, estimator, hyperparameter grid) triples.
# Grid keys carry the 'regressor__' prefix so they address the pipeline step
# named 'regressor'; an empty grid means the estimator is fitted with its defaults.
alpha_grid = [0.1, 1, 10]
n_trees_grid = [50, 100, 150]

models = [
    ('Linear Regression',
     LinearRegression(),
     {}),
    ('Ridge Regression',
     Ridge(),
     {'regressor__alpha': list(alpha_grid)}),
    ('Lasso Regression',
     Lasso(),
     {'regressor__alpha': list(alpha_grid)}),
    ('CatBoost Regressor',
     CatBoostRegressor(random_state=42, verbose=False),
     {'regressor__iterations': list(n_trees_grid)}),
    ('ElasticNet Regression',
     ElasticNet(),
     {'regressor__alpha': list(alpha_grid), 'regressor__l1_ratio': [0.1, 0.5, 0.9]}),
    ('KNN Regressor',
     KNeighborsRegressor(),
     {'regressor__n_neighbors': [3, 5, 7]}),
    ('XGBoost Regressor',
     XGBRegressor(random_state=42),
     {'regressor__n_estimators': list(n_trees_grid)}),
    ('LightGBM Regressor',
     LGBMRegressor(random_state=42),
     {'regressor__n_estimators': list(n_trees_grid)}),
    ('SVR',
     SVR(),
     {'regressor__C': [0.1, 1, 10], 'regressor__kernel': ['linear', 'rbf']}),
    ('Random Forest Regressor',
     RandomForestRegressor(random_state=42),
     {'regressor__n_estimators': list(n_trees_grid)}),
    ('Gradient Boosting Regressor',
     GradientBoostingRegressor(random_state=42),
     {'regressor__n_estimators': list(n_trees_grid)}),
]

In [16]:
# Tune, fit and validate every candidate model.
#
# For each (name, estimator, grid) entry the estimator is wrapped in a
# pipeline (preprocessor -> StandardScaler -> regressor), tuned with 5-fold
# GridSearchCV on the training split, and scored by MSE on the validation split.

# (exclusive upper MSE bound, message) pairs, checked in ascending order;
# an MSE that is not below any bound is rated "Model is Terrible".
mse_ratings = [
    (1000, "Model is Great"),
    (2000, "Model is Good"),
    (3000, "Model is Average"),
    (4000, "Model is Bad"),
]

# Per-model outcomes, kept so the candidates can still be compared or selected
# once the loop has finished (the loop variables only hold the last model).
validation_results = {}

for name, model, param_grid in models:
    print(f"Training {name}...")

    # Define the pipeline
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('scaler', StandardScaler()),
        ('regressor', model)
    ])

    # Perform GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(estimator=model_pipeline, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train_split, y_train_split)

    # Get the best model
    best_model = grid_search.best_estimator_

    # Predict on the validation set
    y_pred_val = best_model.predict(X_val_split)

    # Evaluate the model
    mse = mean_squared_error(y_val_split, y_pred_val)
    print(f"Mean Squared Error on Validation Set ({name}): {mse}")

    # Remember this model's result instead of letting the next iteration overwrite it
    validation_results[name] = {
        'mse': mse,
        'best_params': grid_search.best_params_,
        'best_model': best_model,
    }

    # Print the message of the first band whose upper bound exceeds the MSE
    print(next((message for bound, message in mse_ratings if mse < bound), "Model is Terrible"))

Training Linear Regression...
Mean Squared Error on Validation Set (Linear Regression): 1399.976889748849
Model is Good
Training Ridge Regression...
Mean Squared Error on Validation Set (Ridge Regression): 1400.0761131536665
Model is Good
Training Lasso Regression...
Mean Squared Error on Validation Set (Lasso Regression): 1409.3534681018916
Model is Good
Training CatBoost Regressor...
Mean Squared Error on Validation Set (CatBoost Regressor): 915.205161499281
Model is Great
Training ElasticNet Regression...
Mean Squared Error on Validation Set (ElasticNet Regression): 1411.0842957037207
Model is Good
Training KNN Regressor...
Mean Squared Error on Validation Set (KNN Regressor): 1244.9406136806815
Model is Good
Training XGBoost Regressor...
Mean Squared Error on Validation Set (XGBoost Regressor): 904.9931575896635
Model is Great
Training LightGBM Regressor...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022548 seconds.
You can set `force_col