In [1]:
# import split sets

import pandas as pd
import matplotlib
from matplotlib import pylab as plt
import numpy as np

# Read the pre-split feature tables (X) and target tables (y) from disk.
X_train, X_val, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "split_data/X_train.csv",
        "split_data/X_val.csv",
        "split_data/X_test.csv",
    )
)

y_train, y_val, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "split_data/y_train.csv",
        "split_data/y_val.csv",
        "split_data/y_test.csv",
    )
)

# Sanity check: each X/y pair should report the same number of rows.
for split_label, split_frame in (
    ("Shape of X train:", X_train),
    ("Shape of y train:", y_train),
    ("Shape of X val:", X_val),
    ("Shape of y val:", y_val),
    ("Shape of X test:", X_test),
    ("Shape of y test:", y_test),
):
    print(split_label, split_frame.shape)

# Peek at the first rows of the training features.
print(X_train.head())


Shape of X train: (4910, 7)
Shape of y train: (4910, 2)
Shape of X val: (1637, 7)
Shape of y val: (1637, 2)
Shape of X test: (1637, 7)
Shape of y test: (1637, 2)
   Unnamed: 0  no_wrk_aux  no2_wrk_aux  o3_wrk_aux       temp         rh  \
0        2302    0.050299     0.001970    0.036247  36.537770  15.796436   
1         835    0.028139     0.021041    0.033938   7.424897  53.726745   
2        4024    0.041295     0.005716    0.020930  27.725586  53.255664   
3         778    0.029331     0.015959    0.033618  -1.611770  15.023979   
4        2676    0.029299     0.016341    0.035075  16.615753  75.401336   

   t_since_depl  
0          2458  
1           862  
2          4326  
3           805  
4          2873  


In [2]:
# Remove the leading column of every split before feature engineering.
# For the X tables this is the "Unnamed: 0" column shown in the preview above;
# the y tables are presumably laid out the same way -- TODO confirm.
# NOTE: this cell reassigns its own inputs, so running it a second time
# without reloading the CSVs would strip a real data column.
X_train, X_val, X_test = (
    features.drop(columns=features.columns[0])
    for features in (X_train, X_val, X_test)
)

y_train, y_val, y_test = (
    targets.drop(columns=targets.columns[0])
    for targets in (y_train, y_val, y_test)
)

print(X_train.head())


   no_wrk_aux  no2_wrk_aux  o3_wrk_aux       temp         rh  t_since_depl
0    0.050299     0.001970    0.036247  36.537770  15.796436          2458
1    0.028139     0.021041    0.033938   7.424897  53.726745           862
2    0.041295     0.005716    0.020930  27.725586  53.255664          4326
3    0.029331     0.015959    0.033618  -1.611770  15.023979           805
4    0.029299     0.016341    0.035075  16.615753  75.401336          2873


In [3]:
# Fill missing values with a simple strategy
# (to be replaced once more advanced imputation methods are available).
from sklearn.impute import SimpleImputer

# The per-column medians are learned from the training split only and then
# reused unchanged for the validation and test splits.
imputer = SimpleImputer(strategy='median').fit(X_train)

X_train_imputed = imputer.transform(X_train)
X_val_imputed = imputer.transform(X_val)
X_test_imputed = imputer.transform(X_test)

# The imputer returns plain arrays; wrap them back into labelled DataFrames
# and keep the six model input columns in a fixed order.
feature_cols = ['no_wrk_aux', 'no2_wrk_aux', 'o3_wrk_aux', 'temp', 'rh', 't_since_depl']

X_train_imputed_df = pd.DataFrame(X_train_imputed, columns=X_train.columns)[feature_cols]
X_val_imputed_df = pd.DataFrame(X_val_imputed, columns=X_val.columns)[feature_cols]
X_test_imputed_df = pd.DataFrame(X_test_imputed, columns=X_test.columns)[feature_cols]

# Show the imputed training table.
print("X_train after imputation:\n", X_train_imputed_df)




X_train after imputation:
       no_wrk_aux  no2_wrk_aux  o3_wrk_aux       temp         rh  t_since_depl
0       0.050299     0.001970    0.036247  36.537770  15.796436        2458.0
1       0.028139     0.021041    0.033938   7.424897  53.726745         862.0
2       0.041295     0.005716    0.020930  27.725586  53.255664        4326.0
3       0.029331     0.015959    0.033618  -1.611770  15.023979         805.0
4       0.029299     0.016341    0.035075  16.615753  75.401336        2873.0
...          ...          ...         ...        ...        ...           ...
4905    0.030713     0.015821    0.039910  26.719155  65.944951        4246.0
4906    0.031353     0.015692    0.039925  26.781807  66.352556        5254.0
4907    0.029899     0.016538    0.032468  16.780872  51.063011        8149.0
4908    0.030241     0.018904    0.031023 -11.313564  21.136239         811.0
4909    0.034011     0.009700    0.024887  22.239298  58.052576        4038.0

[4910 rows x 6 columns]


In [4]:
# Automatic feature engineering - happens BEFORE scaling, so that the scaled
# engineered features end up with the same mean and standard deviation.

from sklearn.preprocessing import PolynomialFeatures

# Degree 6 with interaction_only=True over the six input columns produces every
# product of distinct features (no squares or higher powers, no bias column).
poly = PolynomialFeatures(6, include_bias=False, interaction_only=True)

# Fit on the training split ONLY. The validation and test splits go through
# transform() so that all three splits are guaranteed to pass through the very
# same fitted transformer, as every other preprocessing step in this notebook does.
X_train_engn = poly.fit_transform(X_train_imputed_df)

# One shared set of output names, taken from the fitted transformer, so the
# three engineered frames cannot end up with diverging column labels.
engn_feature_names = poly.get_feature_names_out(X_train_imputed_df.columns)
X_train_engn_df = pd.DataFrame(X_train_engn, columns=engn_feature_names)

X_val_engn = poly.transform(X_val_imputed_df)
X_val_engn_df = pd.DataFrame(X_val_engn, columns=engn_feature_names)

X_test_engn = poly.transform(X_test_imputed_df)
X_test_engn_df = pd.DataFrame(X_test_engn, columns=engn_feature_names)

# Inspect the engineered column names and the resulting table size.
print(X_val_engn_df.columns)
print(X_val_engn_df.shape)

Index(['no_wrk_aux', 'no2_wrk_aux', 'o3_wrk_aux', 'temp', 'rh', 't_since_depl',
       'no_wrk_aux no2_wrk_aux', 'no_wrk_aux o3_wrk_aux', 'no_wrk_aux temp',
       'no_wrk_aux rh', 'no_wrk_aux t_since_depl', 'no2_wrk_aux o3_wrk_aux',
       'no2_wrk_aux temp', 'no2_wrk_aux rh', 'no2_wrk_aux t_since_depl',
       'o3_wrk_aux temp', 'o3_wrk_aux rh', 'o3_wrk_aux t_since_depl',
       'temp rh', 'temp t_since_depl', 'rh t_since_depl',
       'no_wrk_aux no2_wrk_aux o3_wrk_aux', 'no_wrk_aux no2_wrk_aux temp',
       'no_wrk_aux no2_wrk_aux rh', 'no_wrk_aux no2_wrk_aux t_since_depl',
       'no_wrk_aux o3_wrk_aux temp', 'no_wrk_aux o3_wrk_aux rh',
       'no_wrk_aux o3_wrk_aux t_since_depl', 'no_wrk_aux temp rh',
       'no_wrk_aux temp t_since_depl', 'no_wrk_aux rh t_since_depl',
       'no2_wrk_aux o3_wrk_aux temp', 'no2_wrk_aux o3_wrk_aux rh',
       'no2_wrk_aux o3_wrk_aux t_since_depl', 'no2_wrk_aux temp rh',
       'no2_wrk_aux temp t_since_depl', 'no2_wrk_aux rh t_since_depl',
       

In [5]:
# Standardise the engineered features with StandardScaler.
from sklearn.preprocessing import StandardScaler
import statistics

# The scaling statistics are learned from the training split only and then
# applied unchanged to all three splits.
scaler = StandardScaler().fit(X_train_engn_df)

X_train_engn_scaled = scaler.transform(X_train_engn_df)
X_val_engn_scaled = scaler.transform(X_val_engn_df)
X_test_engn_scaled = scaler.transform(X_test_engn_df)

# The scaler returns plain arrays; restore the engineered column labels.
X_train_engn_scaled_df = pd.DataFrame(data=X_train_engn_scaled, columns=X_train_engn_df.columns)
X_val_engn_scaled_df = pd.DataFrame(data=X_val_engn_scaled, columns=X_val_engn_df.columns)
X_test_engn_scaled_df = pd.DataFrame(data=X_test_engn_scaled, columns=X_test_engn_df.columns)

for scaled_frame in (X_train_engn_scaled_df, X_val_engn_scaled_df, X_test_engn_scaled_df):
    print(scaled_frame.shape)

# Spot check on one training column: after standardisation the mean should be
# approximately 0 and the standard deviation approximately 1.
no2_scaled = X_train_engn_scaled_df['no2_wrk_aux']
print("LCS NO2 mean:", np.mean(no2_scaled))
print("LCS NO2 standard deviation:", np.std(no2_scaled))



(4910, 63)
(1637, 63)
(1637, 63)
LCS NO2 mean: 1.0983746159713158e-15
LCS NO2 standard deviation: 1.0


In [6]:
# Save preprocessed data
import os

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists; otherwise this cell fails on a fresh checkout.
os.makedirs("preprocessed_data", exist_ok=True)

# index=False keeps the row index out of the files, so no extra leading
# column has to be dropped when they are read back.
X_train_engn_scaled_df.to_csv("preprocessed_data/X_train_prepro.csv",index=False)
X_val_engn_scaled_df.to_csv("preprocessed_data/X_val_prepro.csv",index=False)
X_test_engn_scaled_df.to_csv("preprocessed_data/X_test_prepro.csv",index=False)
