In [22]:
# import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
%matplotlib inline

In [23]:
# suppress warnings
warnings.filterwarnings('ignore')   

#### Data

In [24]:
# Load the training set and preview its first two rows.
df_train = pd.read_csv(filepath_or_buffer='Train.csv')
df_train.iloc[:2]

Unnamed: 0,Place_ID X Date,Date,Place_ID,target,target_min,target_max,target_variance,target_count,precipitable_water_entire_atmosphere,relative_humidity_2m_above_ground,...,L3_SO2_sensor_zenith_angle,L3_SO2_solar_azimuth_angle,L3_SO2_solar_zenith_angle,L3_CH4_CH4_column_volume_mixing_ratio_dry_air,L3_CH4_aerosol_height,L3_CH4_aerosol_optical_depth,L3_CH4_sensor_azimuth_angle,L3_CH4_sensor_zenith_angle,L3_CH4_solar_azimuth_angle,L3_CH4_solar_zenith_angle
0,010Q650 X 2020-01-02,2020-01-02,010Q650,38.0,23.0,53.0,769.5,92,11.0,60.200001,...,38.593017,-61.752587,22.363665,1793.793579,3227.855469,0.010579,74.481049,37.501499,-62.142639,22.545118
1,010Q650 X 2020-01-03,2020-01-03,010Q650,39.0,25.0,63.0,1319.85,91,14.6,48.799999,...,59.624912,-67.693509,28.614804,1789.960449,3384.226562,0.015104,75.630043,55.657486,-53.868134,19.293652


In [25]:
# Load the test set and preview its first two rows.
df_test = pd.read_csv(filepath_or_buffer='Test.csv')
df_test.iloc[:2]

Unnamed: 0,Place_ID X Date,Date,Place_ID,precipitable_water_entire_atmosphere,relative_humidity_2m_above_ground,specific_humidity_2m_above_ground,temperature_2m_above_ground,u_component_of_wind_10m_above_ground,v_component_of_wind_10m_above_ground,L3_NO2_NO2_column_number_density,...,L3_SO2_sensor_zenith_angle,L3_SO2_solar_azimuth_angle,L3_SO2_solar_zenith_angle,L3_CH4_CH4_column_volume_mixing_ratio_dry_air,L3_CH4_aerosol_height,L3_CH4_aerosol_optical_depth,L3_CH4_sensor_azimuth_angle,L3_CH4_sensor_zenith_angle,L3_CH4_solar_azimuth_angle,L3_CH4_solar_zenith_angle
0,0OS9LVX X 2020-01-02,2020-01-02,0OS9LVX,11.6,30.200001,0.00409,14.656824,3.956377,0.712605,5.3e-05,...,1.445658,-95.984984,22.942019,,,,,,,
1,0OS9LVX X 2020-01-03,2020-01-03,0OS9LVX,18.300001,42.900002,0.00595,15.026544,4.23043,0.661892,5e-05,...,34.641758,-95.014908,18.539116,,,,,,,


##### Characteristics

In [26]:
# Collect, for each dataset separately, the columns that contain at least one
# missing value. The test list is derived from df_test itself: the test frame
# has a different set of columns (no target_* columns in its preview) and can
# have missing values in columns where the training data has none.
columns_with_missing_values_train = df_train.columns[df_train.isnull().any()]
columns_with_missing_values_test = df_test.columns[df_test.isnull().any()]

In [27]:
# Fill missing values in float-type columns with the column mean.
# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on df_train[column]: that chained form operates on an
# intermediate object and is not guaranteed to update df_train (it does not
# under pandas Copy-on-Write).
for column in columns_with_missing_values_train:
    if df_train[column].dtype == 'float64':  # only float columns are imputed
        df_train[column] = df_train[column].fillna(df_train[column].mean())

In [28]:
# Fill missing values in float-type test columns with the test column mean.
# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on df_test[column]: that chained form operates on an
# intermediate object and is not guaranteed to update df_test (it does not
# under pandas Copy-on-Write).
for column in columns_with_missing_values_test:
    if df_test[column].dtype == 'float64':  # only float columns are imputed
        df_test[column] = df_test[column].fillna(df_test[column].mean())

In [29]:
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [30]:
# Data Preprocessing
place_id = df_train['Place_ID']

# Separate features and target variable.
# All target-derived columns are removed from the features, including
# target_count, which (like the other target_* columns) does not appear among
# the test columns previewed above and so cannot be used at prediction time.
X_train = df_train.drop(columns=[
    'target', 'Place_ID X Date', 'Place_ID',
    'target_min', 'target_max', 'target_variance', 'target_count',
])
y_train = df_train['target']

In [31]:
# Build the test feature frame by removing the two identifier columns.
X_test = df_test.drop(['Place_ID X Date', 'Place_ID'], axis=1)

In [32]:
# Preprocessing step: mean-impute the float64 columns of X_train.
# Columns not listed here are discarded (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer([
    ('num', SimpleImputer(strategy='mean'), X_train.select_dtypes(include='float64').columns),
])

In [33]:
# Define the model
#linreg_model = Pipeline(steps=[
#    ('preprocessor', preprocessor),
#    ('scaler', StandardScaler()),
#    ('regressor', LinearRegression())
#])

In [34]:
# Define the model
#random_forest_model = Pipeline(steps=[
#    ('preprocessor', preprocessor),
#    ('scaler', StandardScaler()),
#    ('regressor', RandomForestRegressor(random_state=42))
#])

In [35]:
# Define the model
#from xgboost import XGBRegressor
#xg_boost_model = Pipeline(steps=[
#    ('preprocessor', preprocessor),
#    ('scaler', StandardScaler()),
#    ('XGBoost Regressor', XGBRegressor(random_state=42))
#])

In [36]:
# CatBoost pipeline: shared preprocessor -> standard scaling -> CatBoost regressor.
from catboost import CatBoostRegressor

cat_boost_model = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('CatBoost Regressor', CatBoostRegressor(verbose=False, random_state=42)),
])

In [37]:
# Train the model
#linreg_model.fit(X_train, y_train)

In [38]:
# Train the model
#random_forest_model.fit(X_train, y_train)

In [39]:
# Train the model
#xg_boost_model.fit(X_train, y_train)

In [40]:
# Fit the CatBoost pipeline on the full training features and target.
cat_boost_model.fit(X=X_train, y=y_train)

In [41]:
# Predict on the test set
# linear regression
#y_pred_test = linreg_model.predict(X_test)

# random forest
#y_pred_test = random_forest_model.predict(X_test)

# xg boost
#y_pred_test = xg_boost_model.predict(X_test)

In [42]:
# Produce CatBoost predictions for the test features.
y_pred_test = cat_boost_model.predict(X=X_test)

In [43]:
# Build the submission from the sample file, placing the predictions in
# its 'target' column, then write it out without the index.
submission_df = pd.read_csv('SampleSubmission.csv').assign(target=y_pred_test)
submission_df.to_csv('submission.csv', index=False)
print('submission csv successfully downloaded')

submission csv successfully downloaded


In [None]:
# Split the training data into training and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Define the linear-regression pipeline here: no active cell above creates
# linreg_model (its earlier definition is commented out), so fitting it would
# raise NameError on a fresh kernel. The preprocessor is cloned so that fitting
# this pipeline does not refit the instance shared with cat_boost_model.
linreg_model = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression())
])

# Train the model
linreg_model.fit(X_train_split, y_train_split)

# Predict on the validation set
y_pred_val = linreg_model.predict(X_val_split)

# Evaluate the model
mse = mean_squared_error(y_val_split, y_pred_val)
print("Mean Squared Error on Validation Set:", mse)

# Map the MSE onto a coarse quality label (each branch only needs its upper
# bound because the earlier branches have already excluded smaller values).
if mse < 1000:
    print("Model is Great")
elif mse < 2000:
    print("Model is Good")
elif mse < 3000:
    print("Model is Average")
elif mse < 4000:
    print("Model is Bad")
else:
    print("Model is Terrible")

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [None]:
# Candidate regressors, stored as (display name, estimator) pairs.
models = list(zip(
    ('Linear Regression', 'Random Forest Regressor', 'Gradient Boosting Regressor'),
    (
        LinearRegression(),
        RandomForestRegressor(random_state=42),
        GradientBoostingRegressor(random_state=42),
    ),
))

In [None]:
# Fit every candidate on the training split and report its validation MSE.
for name, model in models:
    print(f"Training {name}...")

    # Each candidate gets its own clone of the preprocessor. A Pipeline fits
    # its steps in place, so passing the shared instance would refit the same
    # object on every iteration, including the one inside cat_boost_model.
    model_pipeline = Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('scaler', StandardScaler()),
        ('regressor', model)
    ])

    # Train the model
    model_pipeline.fit(X_train_split, y_train_split)

    # Predict on the validation set
    y_pred_val = model_pipeline.predict(X_val_split)

    # Evaluate the model
    mse = mean_squared_error(y_val_split, y_pred_val)
    print(f"Mean Squared Error on Validation Set ({name}): {mse}")

    # Map the MSE onto a coarse quality label (each branch only needs its
    # upper bound because earlier branches already excluded smaller values).
    if mse < 1000:
        print("Model is Great")
    elif mse < 2000:
        print("Model is Good")
    elif mse < 3000:
        print("Model is Average")
    elif mse < 4000:
        print("Model is Bad")
    else:
        print("Model is Terrible")

In [None]:
# Split the data into training and validation sets.
# The split that used to live here was commented out (and referred to the
# undefined names X / y), which left X_val and y_val undefined for the cells
# below. Reuse the 80/20 split created earlier in the notebook; the feature
# frames are copied so the in-place 'Month' assignment that follows does not
# modify X_train_split / X_val_split.
X_train, X_val = X_train_split.copy(), X_val_split.copy()
y_train, y_val = y_train_split, y_val_split

In [None]:
# Add a 'Month' column (month number parsed from 'Date') to both frames.
for frame in (X_train, X_val):
    frame['Month'] = pd.to_datetime(frame['Date']).dt.month

In [None]:
# Remove the raw 'Date' column from both frames; every other column is kept.
X_train_numeric = X_train.drop('Date', axis=1)
X_val_numeric = X_val.drop('Date', axis=1)

In [None]:
# Mean-impute missing values; the means are learned from the training frame
# only and then applied to both the training and validation frames.
imputer = SimpleImputer(strategy='mean').fit(X_train_numeric)
X_train_imputed = imputer.transform(X_train_numeric)
X_val_imputed = imputer.transform(X_val_numeric)

In [None]:
# Standardize features; the scaling statistics are learned from the training
# array only and then applied to both the training and validation arrays.
scaler = StandardScaler().fit(X_train_imputed)
X_train_scaled = scaler.transform(X_train_imputed)
X_val_scaled = scaler.transform(X_val_imputed)

In [None]:
# Linear Regression: create the estimator and fit it on the scaled
# training features.
linreg_model = LinearRegression()
linreg_model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predict the target for the scaled validation features.
y_pred = linreg_model.predict(X=X_val_scaled)

In [None]:
# Score the validation predictions with mean squared error.
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
print("Mean Squared Error:", mse)

In [None]:
# Perform 5-fold cross-validation on the scaled training features.
# The target must be y_train (the labels matching X_train_scaled row for row);
# the name `y` used here previously is not defined anywhere in the notebook.
mse_scores = cross_val_score(
    linreg_model, X_train_scaled, y_train,
    cv=5, scoring='neg_mean_squared_error',
)
# The scorer returns negated MSE values, so flip the sign of the average.
mean_mse = -mse_scores.mean()

print("Mean Squared Error (Cross-Validated):", mean_mse)