In [3]:
!pip install pyspark
!pip install findspark



In [2]:
import findspark
findspark.init()
from pyspark.sql import SQLContext, SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window

# Create (or reuse) the notebook's Spark session: local master, custom app
# name, and the Spark UI bound to port 4050.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Combine CASTNET Files")
    .config("spark.ui.port", "4050")
    .getOrCreate()
)

# Turn on eager evaluation so Spark DataFrames render their rows when displayed.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [3]:
import os
import shutil
import sys
import time
import numpy as np
import pandas as pd
from google.colab import drive

# Mount Google Drive so the shared project folder is reachable from this runtime.
drive_root = "/content/drive"
drive.mount(drive_root)

# Shared project folder that holds the input data.
file_directory = os.path.join(drive_root, "My Drive", "AML Group 24")
# Fail fast with a clear message if the folder is missing, instead of relying
# on a directory listing whose result is thrown away.
if not os.path.isdir(file_directory):
    raise FileNotFoundError(f"Shared data folder not found: {file_directory}")

# Directory (relative to the working directory) for generated artifacts.
# exist_ok=True replaces the racy check-then-create pattern.
FILE_OUTPUT = "output"
os.makedirs(FILE_OUTPUT, exist_ok=True)

Mounted at /content/drive


In [4]:
# Show the shared folder's contents; os.listdir returns entries in arbitrary
# order, so sort them to keep the displayed output reproducible.
sorted(os.listdir(file_directory))

['met_gas_site_cleaned.snappy.parquet']

In [5]:
# Full path of the cleaned parquet dataset inside the shared Drive folder.
pth = os.path.join(
    file_directory,
    "met_gas_site_cleaned.snappy.parquet",
)
print(pth)

/content/drive/My Drive/AML Group 24/met_gas_site_cleaned.snappy.parquet


In [6]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer


#pth = "met_gas_site_cleaned.snappy.parquet"
#pth = os.path.join(file_directory, "met_gas_site_cleaned.snappy.parquet")
#print(pth)
def _split_by_years(df, years, drop_columns, target='OZONE'):
  """Return ``(features, target)`` for the rows of ``df`` whose ``year`` is in ``years``."""
  subset = df[df['year'].isin(list(years))]
  return subset.drop(columns=drop_columns), subset[target]


def generate_datasets(pth,
                      train_years=(2013, 2014, 2015, 2016, 2017, 2018),
                      val_years=(2019, 2020),
                      test_years=(2021, 2022)):
  """Load the parquet file at ``pth`` and build scaled train/validation/test sets.

  The frame is split by calendar year of ``DATE_TIME``. ``LAND_USE`` and
  ``TERRAIN`` are one-hot encoded, the numerical features are standardized
  with a scaler fitted on the training rows only, and every remaining column
  is passed through unchanged. ``OZONE`` is the regression target.

  Args:
    pth: Path of the parquet file to read.
    train_years: Years whose rows form the training set.
    val_years: Years whose rows form the validation set.
    test_years: Years whose rows form the test set.

  Returns:
    A tuple ``(columns, X_train, y_train, X_val, y_val, X_test, y_test)``.
    ``columns`` is a ``pd.Index`` of feature names in the same order as the
    columns of the ``X_*`` matrices. ``X_*`` are the ColumnTransformer
    outputs and ``y_*`` are the ``OZONE`` Series of each split.
  """
  df = pd.read_parquet(pth)
  # Calendar parts used for the year-based split ("year") and as a feature ("month").
  df["year"] = df["DATE_TIME"].dt.year
  df["month"] = df["DATE_TIME"].dt.month
  df["LATITUDE"] = df["LATITUDE"].astype("float")
  df["LONGITUDE"] = df["LONGITUDE"].astype("float")

  categorical_columns = ['LAND_USE', 'TERRAIN']
  for feature in categorical_columns:
    encoder = OneHotEncoder()
    encoded_data = encoder.fit_transform(df[[feature]])
    # One indicator column per category, named after the category value itself.
    # NOTE(review): a value shared by both features would collide on column name.
    df[encoder.categories_[0]] = encoded_data.toarray()

  # Target, split key, identifiers and the raw categorical columns are not model inputs.
  drop_columns = ['OZONE', "year", "DATE_TIME", "SITE_ID", "QA_CODE", "WINDSPEED", "SIGMA_THETA"] + categorical_columns

  numerical_features = ['TEMPERATURE',
                        'RELATIVE_HUMIDITY',
                        'SOLAR_RADIATION',
                        'PRECIPITATION',
                        'WIND_DIRECTION',
                        'FLOW_RATE',
                        'WINDSPEED_SCALAR',
                        'SHELTER_TEMPERATURE',
                        'NO',
                        'NOY',
                        'NOYDIF',
                        'SO2_GA',
                        'LATITUDE',
                        'LONGITUDE',
                        'ELEVATION']

  ct = ColumnTransformer([
          ('Standardize Numerical Features', StandardScaler(), numerical_features)
      ], remainder='passthrough')

  X_train_df, y_train = _split_by_years(df, train_years, drop_columns)
  X_val_df, y_val = _split_by_years(df, val_years, drop_columns)
  X_test_df, y_test = _split_by_years(df, test_years, drop_columns)

  # ColumnTransformer emits the scaled columns first and appends the
  # passthrough columns after them, so build the names in that order rather
  # than trusting the DataFrame's own column order to match.
  passthrough_features = [name for name in X_train_df.columns
                          if name not in numerical_features]
  columns = pd.Index(numerical_features + passthrough_features)

  # Fit the scaler on the training rows only; later splits reuse its statistics.
  X_train = ct.fit_transform(X_train_df)
  X_val = ct.transform(X_val_df)
  X_test = ct.transform(X_test_df)

  return columns, X_train, y_train, X_val, y_val, X_test, y_test

# Build the feature names and the train/validation/test splits from the parquet file at `pth`.
columns, X_train, y_train, X_val, y_val, X_test, y_test = generate_datasets(pth)


In [7]:
# Inspect the transformed training feature matrix returned by generate_datasets.
X_train

array([[-0.55354385,  0.01109277, -0.67026968, ...,  0.        ,
         1.        ,  0.        ],
       [-0.72978554, -0.20436501, -0.67028495, ...,  0.        ,
         1.        ,  0.        ],
       [-0.87781006, -0.14531695, -0.63601419, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-1.46697849,  0.06563334, -0.03849923, ...,  0.        ,
         0.        ,  1.        ],
       [-1.47168136,  0.29100759, -0.67208265, ...,  0.        ,
         0.        ,  1.        ],
       [-1.78777538,  0.47130698, -0.66826588, ...,  0.        ,
         0.        ,  1.        ]])

In [8]:
# Inspect the feature names returned by generate_datasets.
columns

Index(['TEMPERATURE', 'RELATIVE_HUMIDITY', 'SOLAR_RADIATION', 'PRECIPITATION',
       'WIND_DIRECTION', 'FLOW_RATE', 'WINDSPEED_SCALAR',
       'SHELTER_TEMPERATURE', 'NO', 'NOY', 'NOYDIF', 'SO2_GA', 'LATITUDE',
       'LONGITUDE', 'ELEVATION', 'month', 'Agric', 'Forest', 'Range',
       'Complex', 'Flat', 'Rolling'],
      dtype='object')

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import pandas as pd


In [10]:
# Print the R2 score of a fitted model on each split: train, validation, and test
def evaluate_model(model):
  """Print the R2 score of ``model`` on the train, validation and test splits.

  Reads the notebook-level ``X_train``/``y_train``, ``X_val``/``y_val`` and
  ``X_test``/``y_test`` variables. ``model`` must already be fitted and
  expose ``predict``. Returns nothing.
  """
  splits = (
      ('R2 score for train set:', X_train, y_train),
      ('R2 score for validation set:', X_val, y_val),
      ('R2 score for test set:', X_test, y_test),
  )
  # Predict on every split first, then report, so nothing is printed if a prediction fails.
  predictions = [model.predict(features) for _, features, _ in splits]
  for (label, _, target), predicted in zip(splits, predictions):
    print(label, r2_score(target, predicted))


In [None]:
# Baseline: random forest regressor with default hyperparameters (seeded so runs repeat).
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)
evaluate_model(rf_model)


R2 score for train set: 0.9860152939223009
R2 score for validation set: 0.76111346865673
R2 score for test set: 0.7521771835967892


In [None]:
import numpy as np

# Candidate hyperparameter values. Each numeric range is five evenly spaced
# points cast to integers; adjust the ranges as needed.
param_ranges = dict(
    n_estimators=np.linspace(10, 200, 5, dtype=int),
    max_depth=np.linspace(5, 30, 5, dtype=int),
    min_samples_leaf=np.linspace(1, 10, 5, dtype=int),
    max_features=['sqrt', 'log2'],
)

# Show the resulting candidates, one hyperparameter per line.
for param, values in param_ranges.items():
    print(param, values, sep=": ")

n_estimators: [ 10  57 105 152 200]
max_depth: [ 5 11 17 23 30]
min_samples_leaf: [ 1  3  5  7 10]
max_features: ['sqrt', 'log2']


In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
# Randomized hyperparameter search over `param_ranges`, scored by R2 on
# forward-chaining (time-ordered) folds.
# NOTE(review): TimeSeriesSplit splits by row position, so this assumes the
# rows of X_train are in chronological order — confirm against the data.
CV_model = RandomizedSearchCV(estimator=rf_model,
                              param_distributions=param_ranges,
                              cv=TimeSeriesSplit(n_splits=2),
                              verbose=1,
                              scoring='r2',
                              n_jobs=-1,
                              random_state=42,
                              n_iter=25)

CV_model.fit(X_train, y_train)
print(CV_model.best_score_, CV_model.best_params_)

# With the default refit=True, the search has already refitted the best
# estimator on the full training data, so fitting it again would only repeat
# that work.
CV_optim = CV_model.best_estimator_
evaluate_model(CV_optim)

Fitting 2 folds for each of 25 candidates, totalling 50 fits




0.8756135262972915 {'n_estimators': 105, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 23}
R2 score for train set: 0.9758119702093068
R2 score for validation set: 0.7758524014847369
R2 score for test set: 0.7767748248161392


In [None]:
# Importance of each input feature according to the tuned model.
feature_importances = CV_optim.feature_importances_

# Pair each feature name with its importance, ranked from most to least important.
feature_importance_df = (
    pd.DataFrame({'Feature': columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Show the ranked table as plain text.
print("Feature Importances:")
print(feature_importance_df)

Feature Importances:
                Feature  Importance
1     RELATIVE_HUMIDITY    0.257060
0           TEMPERATURE    0.092099
15                month    0.081763
2       SOLAR_RADIATION    0.078652
9                   NOY    0.060989
14            ELEVATION    0.052237
5             FLOW_RATE    0.049481
10               NOYDIF    0.047163
6      WINDSPEED_SCALAR    0.041137
8                    NO    0.040894
17               Forest    0.039673
12             LATITUDE    0.030395
7   SHELTER_TEMPERATURE    0.026539
4        WIND_DIRECTION    0.025643
13            LONGITUDE    0.023783
11               SO2_GA    0.020390
20                 Flat    0.006803
3         PRECIPITATION    0.005879
16                Agric    0.005786
21              Rolling    0.005290
19              Complex    0.004608
18                Range    0.003736


In [None]:
import multiprocessing

# Number of CPUs reported by multiprocessing for this runtime.
cores = multiprocessing.cpu_count()
# Bare expression so the value is shown as the cell's output.
cores

2