In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import numpy as np

In [2]:
# Read the training and test splits from CSV files in the working directory
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ('Train.csv', 'Test.csv'))

In [3]:
# Preview the first rows of the raw training data to inspect the available columns
train_data.head()

Unnamed: 0,Place_ID X Date,Date,Place_ID,target,target_min,target_max,target_variance,target_count,precipitable_water_entire_atmosphere,relative_humidity_2m_above_ground,...,L3_SO2_sensor_zenith_angle,L3_SO2_solar_azimuth_angle,L3_SO2_solar_zenith_angle,L3_CH4_CH4_column_volume_mixing_ratio_dry_air,L3_CH4_aerosol_height,L3_CH4_aerosol_optical_depth,L3_CH4_sensor_azimuth_angle,L3_CH4_sensor_zenith_angle,L3_CH4_solar_azimuth_angle,L3_CH4_solar_zenith_angle
0,010Q650 X 2020-01-02,2020-01-02,010Q650,38.0,23.0,53.0,769.5,92,11.0,60.200001,...,38.593017,-61.752587,22.363665,1793.793579,3227.855469,0.010579,74.481049,37.501499,-62.142639,22.545118
1,010Q650 X 2020-01-03,2020-01-03,010Q650,39.0,25.0,63.0,1319.85,91,14.6,48.799999,...,59.624912,-67.693509,28.614804,1789.960449,3384.226562,0.015104,75.630043,55.657486,-53.868134,19.293652
2,010Q650 X 2020-01-04,2020-01-04,010Q650,24.0,8.0,56.0,1181.96,96,16.4,33.400002,...,49.839714,-78.342701,34.296977,,,,,,,
3,010Q650 X 2020-01-05,2020-01-05,010Q650,49.0,10.0,55.0,1113.67,96,6.911948,21.300001,...,29.181258,-73.896588,30.545446,,,,,,,
4,010Q650 X 2020-01-06,2020-01-06,010Q650,21.0,9.0,52.0,1164.82,95,13.900001,44.700001,...,0.797294,-68.61248,26.899694,,,,,,,


In [4]:
# Feature Engineering
# Parse 'Date' into datetimes, then expose its calendar components as numeric columns
train_data['Date'] = pd.to_datetime(train_data['Date'])
for date_part in ('day', 'month', 'year'):
    # .dt.<part> yields the integer component for every row
    train_data[date_part] = getattr(train_data['Date'].dt, date_part)

In [5]:
# Derive the same day / month / year columns on the test set
test_data['Date'] = pd.to_datetime(test_data['Date'])
test_dates = test_data['Date'].dt
test_data['day'] = test_dates.day
test_data['month'] = test_dates.month
test_data['year'] = test_dates.year

In [6]:
# Data Preprocessing

In [7]:
# Columns that must never be used as model inputs.
# The target_* columns are statistics of the target itself (would leak the answer).
TARGET_COLUMNS = ['target', 'target_min', 'target_max', 'target_variance', 'target_count']
# 'Place_ID X Date' is a row identifier (values look like '010Q650 X 2020-01-02');
# one-hot encoding it would add roughly one dense column per row and carries no signal.
ROW_ID_COLUMN = 'Place_ID X Date'

# Numeric predictors: every numeric column except the target-related ones
numeric_columns = train_data.select_dtypes(include=['number']).drop(columns=TARGET_COLUMNS).columns
# Categorical predictors: object-typed columns, minus the row identifier
categorical_columns = (
    train_data.select_dtypes(include=['object']).columns.drop(ROW_ID_COLUMN, errors='ignore')
)

# Handle missing values in numeric columns.
# The imputer is fitted on the training data only and reused on the test data.
numeric_imputer = SimpleImputer(strategy='mean')
train_data_numeric = pd.DataFrame(numeric_imputer.fit_transform(train_data[numeric_columns]), columns=numeric_columns)
test_data_numeric = pd.DataFrame(numeric_imputer.transform(test_data[numeric_columns]), columns=numeric_columns)

# Encode categorical variables.
# handle_unknown='ignore' encodes categories unseen during fit as all-zero rows instead of raising.
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
encoded_feature_names = None  # filled in after fitting, shared by train and test frames
train_encoded_matrix = categorical_encoder.fit_transform(train_data[categorical_columns])
encoded_feature_names = categorical_encoder.get_feature_names_out(categorical_columns)
train_data_encoded = pd.DataFrame(train_encoded_matrix.toarray(), columns=encoded_feature_names)
test_data_encoded = pd.DataFrame(categorical_encoder.transform(test_data[categorical_columns]).toarray(),
                                 columns=encoded_feature_names)

# Concatenate numeric and encoded categorical columns (both frames use a fresh 0..n-1 index)
train_data_processed = pd.concat([train_data_numeric, train_data_encoded], axis=1)
test_data_processed = pd.concat([test_data_numeric, test_data_encoded], axis=1)


In [8]:
# Encode categorical variables
# For simplicity, one-hot encode only the 'Place_ID' column; ColumnTransformer's
# default remainder='drop' discards every other column passed to it.
# handle_unknown='ignore' keeps transform() from raising when the data being
# transformed contains a Place_ID that was not present when the encoder was fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), ['Place_ID'])
    ])

In [9]:
# Feature Selection/Dimensionality Reduction
# PCA reducer configured to keep a fixed number of principal components
N_PCA_COMPONENTS = 10
pca = PCA(n_components=N_PCA_COMPONENTS)

In [10]:
# Model Selection and Training
# Instantiate a RandomForestRegressor and a GradientBoostingRegressor.
# Both estimators are stochastic; a fixed seed makes scores and predictions
# reproducible across kernel restarts.
SEED = 42
rf_regressor = RandomForestRegressor(random_state=SEED)
gb_regressor = GradientBoostingRegressor(random_state=SEED)

In [15]:
# Preview the processed training features: imputed numeric columns followed by one-hot columns
train_data_processed.head()

Unnamed: 0,precipitable_water_entire_atmosphere,relative_humidity_2m_above_ground,specific_humidity_2m_above_ground,temperature_2m_above_ground,u_component_of_wind_10m_above_ground,v_component_of_wind_10m_above_ground,L3_NO2_NO2_column_number_density,L3_NO2_NO2_slant_column_number_density,L3_NO2_absorbing_aerosol_index,L3_NO2_cloud_fraction,...,Place_ID_YAQHNNY,Place_ID_YAXBMZ6,Place_ID_YCXA4V5,Place_ID_YDW4K0H,Place_ID_YJENTFL,Place_ID_YLLOKEY,Place_ID_YLZOBFW,Place_ID_YPXSK14,Place_ID_YSIXKFZ,Place_ID_YWSFY6Q
0,11.0,60.200001,0.00804,18.51684,1.996377,-1.227395,7.4e-05,0.000156,-1.23133,0.006507,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,14.6,48.799999,0.00839,22.546533,3.33043,-1.188108,7.6e-05,0.000197,-1.082553,0.01836,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,16.4,33.400002,0.0075,27.03103,5.065727,3.500559,6.7e-05,0.00017,-1.001242,0.015904,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6.911948,21.300001,0.00391,23.971857,3.004001,1.099468,8.3e-05,0.000175,-0.777019,0.055765,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,13.900001,44.700001,0.00535,16.816309,2.621787,2.670559,7e-05,0.000142,0.366323,0.02853,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Build the feature matrix and target used for cross-validation and model fitting.
# For simplicity, cross-validation is run on the entire training set.
# NOTE(review): `preprocessor` only keeps the one-hot encoded 'Place_ID' column, so the
# numeric weather/sensor columns are not part of X_train — confirm this is intended.

# with_mean=False: the one-hot output is a sparse matrix, and StandardScaler refuses to
# center sparse input (centering would densify it); scaling by the std alone is supported.
scaler = StandardScaler(with_mean=False)
X_train = preprocessor.fit_transform(train_data.drop(columns=['Date', 'target']))
X_train_scaled = scaler.fit_transform(X_train)
# Take the target straight from the loaded training frame
y_train = train_data['target']

In [None]:
















def cv_rmse(model, features, target, folds=5):
    """Cross-validate `model` and return (negated-MSE scores, per-fold RMSE).

    scikit-learn reports MSE negated so that larger is better; the sign is
    flipped before taking the square root.
    """
    scores = cross_val_score(model, features, target, cv=folds, scoring='neg_mean_squared_error')
    return scores, np.sqrt(-scores)


def report_training_fit(label, model, features, target):
    """Fit `model`, predict on the same data, and print RMSE / MAE / R-squared.

    Returns (predictions, rmse, mae, r2). These are training-set metrics: the
    model is scored on the data it was fitted on, so they are optimistic
    compared with the cross-validated RMSE.
    """
    model.fit(features, target)
    predictions = model.predict(features)
    rmse = np.sqrt(mean_squared_error(target, predictions))
    mae = mean_absolute_error(target, predictions)
    r2 = r2_score(target, predictions)
    print(f"{label} RMSE on training set: ", rmse)
    print(f"{label} MAE on training set: ", mae)
    print(f"{label} R-squared on training set: ", r2)
    return predictions, rmse, mae, r2


# RandomForestRegressor with cross-validation
rf_scores, rf_rmse_scores = cv_rmse(rf_regressor, X_train_scaled, y_train)
print("Random Forest RMSE: ", rf_rmse_scores.mean())

# GradientBoostingRegressor with cross-validation
gb_scores, gb_rmse_scores = cv_rmse(gb_regressor, X_train_scaled, y_train)
print("Gradient Boosting RMSE: ", gb_rmse_scores.mean())

# Model Evaluation
# Fit each model on the full training set and report RMSE, MAE and R-squared.
y_train_pred_rf, rf_rmse, rf_mae, rf_r2 = report_training_fit(
    "Random Forest", rf_regressor, X_train_scaled, y_train)

# GradientBoostingRegressor
y_train_pred_gb, gb_rmse, gb_mae, gb_r2 = report_training_fit(
    "Gradient Boosting", gb_regressor, X_train_scaled, y_train)

# Ensemble: VotingRegressor averages the predictions of the two regressors
ensemble_regressor = VotingRegressor([('rf', rf_regressor), ('gb', gb_regressor)])
y_train_pred_ensemble, ensemble_rmse, ensemble_mae, ensemble_r2 = report_training_fit(
    "Ensemble", ensemble_regressor, X_train_scaled, y_train)

# Prediction
# Apply the already-fitted preprocessor and scaler to the test set (transform only, no refit)
X_test = preprocessor.transform(test_data.drop(columns=['Date']))
X_test_scaled = scaler.transform(X_test)

# Predictions using the ensemble model
test_predictions = ensemble_regressor.predict(X_test_scaled)

# Submission: one row per test identifier with its predicted target
submission_df = pd.DataFrame({'Place_ID X Date': test_data['Place_ID X Date'], 'target': test_predictions})
submission_df.to_csv('submission.csv', index=False)
