In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler

In [2]:
df_feature = pd.read_csv('~data/feature_extraction.csv')

In [3]:
riskforecasting = pd.read_csv('~data/pollution_risk_forecasting.csv')
riskforecasting = riskforecasting[['site','time','warning','riskLevelLabel']]
riskforecasting['time'] = pd.to_datetime(riskforecasting.time).dt.strftime('%Y-%m-%d')
riskforecasting

Unnamed: 0,site,time,warning,riskLevelLabel
0,Seaton Carew North,2022-04-28,Pollution RIsk Forecasts will start soon,normal
1,Seaton Carew North,2022-04-29,Pollution RIsk Forecasts will start soon,normal
2,Seaton Carew North,2022-04-30,Pollution RIsk Forecasts will start soon,normal
3,Seaton Carew North,2022-05-04,No warnings in place,normal
4,Seaton Carew North,2022-05-05,No warnings in place,normal
...,...,...,...,...
63641,Westward Ho!,2022-09-26,No pollution incidents reported,normal
63642,Westward Ho!,2022-09-27,No pollution incidents reported,normal
63643,Westward Ho!,2022-09-28,No pollution incidents reported,normal
63644,Westward Ho!,2022-09-29,No pollution incidents reported,normal


In [4]:
df_merged = pd.merge(df_feature, riskforecasting[['site', 'time', 'riskLevelLabel']], on=['site', 'time'])
df_merged

Unnamed: 0,time,site,TUR_1x1_median,SPM_1x1_median,CHL_1x1_median,TUR_1x1_mean,SPM_1x1_mean,CHL_1x1_mean,TUR_1x1_q1,SPM_1x1_q1,...,TUR_11x11_mean,SPM_11x11_mean,CHL_11x11_mean,TUR_11x11_q1,SPM_11x11_q1,CHL_11x11_q1,TUR_11x11_q3,SPM_11x11_q3,CHL_11x11_q3,riskLevelLabel
0,2022-03-01,Saltburn,,,,,,,,,...,,,,,,,,,,increased
1,2022-03-15,Frinton,,,,,,,,,...,,,,,,,,,,increased
2,2022-03-15,Holland,,,,,,,,,...,,,,,,,,,,increased
3,2022-03-17,Frinton,121.957180,124.314230,9.526196,121.957180,124.314230,9.526196,121.957180,124.314230,...,119.602328,123.807567,8.214204,88.586640,99.837202,7.366510,141.179335,146.104910,9.199348,normal
4,2022-03-17,Holland,,,,,,,,,...,100.306580,120.473114,8.594822,87.195181,106.791605,5.838983,110.272972,130.367225,10.113032,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22676,2022-09-30,Wilsthorpe,,,,,,,,,...,,,,,,,,,,normal
22677,2022-09-30,Winchelsea,41.281174,25.715336,6.568486,41.281174,25.715336,6.568486,41.281174,25.715336,...,23.628239,15.012895,4.302087,14.766009,8.285604,3.285116,30.627869,18.688794,4.585260,normal
22678,2022-09-30,Withernsea,,,,,,,,,...,,,,,,,,,,normal
22679,2022-09-30,Worthing,,,,,,,,,...,,,,,,,,,,normal


## Random Forest

In [5]:
df_dropNaN = df_merged.dropna(subset=["TUR_1x1_median", "TUR_1x1_mean", "TUR_1x1_q1", "TUR_1x1_q3", 
      "TUR_3x3_median", "TUR_3x3_mean", "TUR_3x3_q1", "TUR_3x3_q3", 
      "TUR_5x5_median", "TUR_5x5_mean", "TUR_5x5_q1", "TUR_5x5_q3", 
      "TUR_7x7_median", "TUR_7x7_mean", "TUR_7x7_q1", "TUR_7x7_q3", 
      "TUR_9x9_median", "TUR_9x9_mean", "TUR_9x9_q1", "TUR_9x9_q3", 
      "TUR_11x11_median", "TUR_11x11_mean", "TUR_11x11_q1", "TUR_11x11_q3",
      "SPM_1x1_median", "SPM_1x1_mean", "SPM_1x1_q1", "SPM_1x1_q3", 
      "SPM_3x3_median", "SPM_3x3_mean", "SPM_3x3_q1", "SPM_3x3_q3", 
      "SPM_5x5_median", "SPM_5x5_mean", "SPM_5x5_q1", "SPM_5x5_q3", 
      "SPM_7x7_median", "SPM_7x7_mean", "SPM_7x7_q1", "SPM_7x7_q3", 
      "SPM_9x9_median", "SPM_9x9_mean", "SPM_9x9_q1", "SPM_9x9_q3", 
      "SPM_11x11_median", "SPM_11x11_mean", "SPM_11x11_q1", "SPM_11x11_q3", 
      "CHL_1x1_median", "CHL_1x1_mean", "CHL_1x1_q1", "CHL_1x1_q3",
      "CHL_3x3_median", "CHL_3x3_mean", "CHL_3x3_q1", "CHL_3x3_q3", 
      "CHL_5x5_median", "CHL_5x5_mean", "CHL_5x5_q1", "CHL_5x5_q3",        
      "CHL_7x7_median", "CHL_7x7_mean", "CHL_7x7_q1", "CHL_7x7_q3",        
      "CHL_9x9_median", "CHL_9x9_mean", "CHL_9x9_q1", "CHL_9x9_q3",        
      "CHL_11x11_median", "CHL_11x11_mean", "CHL_11x11_q1", "CHL_11x11_q3"])

X = df_dropNaN.iloc[:, 2:-1]
y = df_dropNaN.iloc[:, -1]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

print('Training X Shape:', X_train.shape)
print('Training y Shape:', y_train.shape)
print('Testing X Shape:', X_test.shape)
print('Testing y Shape:', y_test.shape)

Training X Shape: (1814, 72)
Training y Shape: (1814,)
Testing X Shape: (454, 72)
Testing y Shape: (454,)


In [6]:
# Fit a random forest classifier to the data
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf.predict(X_test)

# Evaluate the performance of the classifier
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9779735682819384


### importance of feature aggregation

In [7]:
importance_dict = dict(zip(X.columns, rf.feature_importances_))

features = ['11x11', '9x9', '7x7', '5x5', '3x3', '1x1', 'TUR', 'SPM', 'CHL']

data = []
for s in features:
    sum_importance = np.sum([importance for feature, importance in importance_dict.items() if s in feature])
    data.append([s, sum_importance])

# Create a DataFrame with the calculated data
df_importances = pd.DataFrame(data, columns=['Feature', 'Sum of Importances'])

def highlight_max(s):
    if len(s) > 3:
        return ['background-color: #DEB887' if v in list(s[np.argsort(s)[-3:]]) else '' for v in s]
    else:
        return ['background-color: #8FBC8F' if v == s.max() else '' for v in s]

df_importances[:6].style.apply(highlight_max, subset=['Sum of Importances'])

Unnamed: 0,Feature,Sum of Importances
0,11x11,0.172817
1,9x9,0.153895
2,7x7,0.162149
3,5x5,0.151036
4,3x3,0.176557
5,1x1,0.356363


In [8]:
df_importances[-3:].style.apply(highlight_max, subset=['Sum of Importances'])

Unnamed: 0,Feature,Sum of Importances
6,TUR,0.327729
7,SPM,0.310229
8,CHL,0.362042


In [9]:
from operator import add
new_col_names = list(set(["_".join(col.split("_")[:2]) for col in df_dropNaN.columns]))

col_dict = {}
for col in df_dropNaN:
    prefix = "_".join(col.split("_")[:2])
    for agg in new_col_names:
        if prefix == agg:
            if prefix in col_dict:
                col_dict[prefix] = list(map(add, col_dict[prefix], df_dropNaN[col]))
            else:
                col_dict[prefix] = df_dropNaN[col]
        else: col_dict[col] = df_dropNaN[col]
        
# Create a new dataframe from the dictionary
new_df = pd.DataFrame(col_dict)
new_df = new_df[['time', 'site', 'CHL_1x1', 'CHL_3x3', 'CHL_5x5', 'CHL_7x7', 'CHL_9x9',
                 'CHL_11x11', 'SPM_1x1', 'SPM_3x3', 'SPM_5x5', 'SPM_7x7', 'SPM_9x9',
                 'SPM_11x11', 'TUR_1x1', 'TUR_3x3', 'TUR_5x5', 'TUR_7x7', 'TUR_9x9',
                 'TUR_11x11', 'riskLevelLabel']]
new_df

Unnamed: 0,time,site,CHL_1x1,CHL_3x3,CHL_5x5,CHL_7x7,CHL_9x9,CHL_11x11,SPM_1x1,SPM_3x3,...,SPM_7x7,SPM_9x9,SPM_11x11,TUR_1x1,TUR_3x3,TUR_5x5,TUR_7x7,TUR_9x9,TUR_11x11,riskLevelLabel
3,2022-03-17,FrintonFrinton,38.104786,38.740747,37.023345,35.326456,34.678608,33.421274,497.256920,637.281440,...,517.658429,499.172746,484.125069,487.828720,558.623636,547.163125,498.291624,476.351049,453.044422,normal
21,2022-04-28,BrightlingseaBrightlingsea,33.879484,35.571056,37.594320,36.724657,35.881442,34.036643,147.816420,162.819754,...,114.132612,105.211252,106.128902,211.221960,227.066708,190.406884,168.280413,157.634456,156.819759,normal
35,2022-04-28,CromerCromer,40.605272,40.617887,37.418345,35.354375,33.312020,32.326862,417.165740,203.943868,...,155.811248,142.823807,130.565348,448.389920,277.397514,224.591595,216.160561,200.569921,185.729459,normal
37,2022-04-28,DovercourtDovercourt,33.248556,32.269614,34.574771,34.669866,34.684473,34.621717,148.998920,399.460968,...,377.574482,362.154061,346.510336,211.529680,473.054242,483.173452,457.323053,438.352965,427.683842,normal
41,2022-04-28,EastbourneEastbourne,34.615084,37.431310,35.475885,36.794119,37.479327,38.494602,28.197863,87.357517,...,68.923113,62.655493,57.737689,35.159108,125.404258,112.944919,104.844208,94.800480,86.216383,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22651,2022-09-30,SheernessSheerness,31.242859,32.562112,33.415042,33.135765,32.868070,33.060532,113.992800,98.864269,...,99.020191,96.136880,96.386383,170.433944,156.530733,156.051865,156.276031,152.250640,153.258725,normal
22653,2022-09-30,ShoeburynessShoeburyness,52.917604,36.992091,34.466266,34.075541,33.616340,33.717163,42.654904,39.957994,...,49.765210,52.205660,53.220718,68.475060,57.322729,62.563490,73.859061,77.115882,78.049625,normal
22655,2022-09-30,SkegnessSkegness,40.522840,42.290793,37.711129,34.762838,32.272673,30.560301,149.874008,291.147580,...,216.432269,207.719791,177.793142,215.546000,354.743558,301.074999,272.485160,265.479178,236.757078,normal
22670,2022-09-30,WaltonWalton,38.834528,28.369066,29.136141,29.872547,30.238851,30.612809,196.241040,235.571399,...,323.515360,337.160351,352.241641,271.272000,309.830135,373.654677,397.681277,414.205230,429.086131,normal


In [10]:
X = new_df.iloc[:, 2:-1]
y = new_df.iloc[:, -1]

y = np.array([1 if x == 'increased' else 0 for x in new_df['riskLevelLabel']])

num_positives, num_negatives = sum(y), len(y)-sum(y)
num_positives_to_repeat = int(0.5 * num_negatives * 2) - num_positives

# Oversampling
ros = RandomOverSampler(sampling_strategy={1: num_positives_to_repeat}, random_state=42)
X, y = ros.fit_resample(X, y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

print('Training X Shape:', X_train.shape)
print('Training y Shape:', y_train.shape)
print('Testing X Shape:', X_test.shape)
print('Testing y Shape:', y_test.shape)

Training X Shape: (3535, 18)
Training y Shape: (3535,)
Testing X Shape: (884, 18)
Testing y Shape: (884,)


In [11]:
# Fit a random forest classifier to the data
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf.predict(X_test)

# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.995475113122172


Remove variables in order of feature importance.

In [15]:
new_X_train = X_train
new_X_test = X_test

for s in sorted(data[:6], key=lambda x: x[1])[:-1]:
    columns_to_drop = [col for col in new_df.columns if s[0] in col]
    
    new_X_train = new_X_train.drop(columns_to_drop, axis=1)
    new_X_test = new_X_test.drop(columns_to_drop, axis=1)
    
    
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(new_X_train, y_train)

    y_pred = rf.predict(new_X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy with", s[0], " removed:", accuracy)

Accuracy with 5x5  removed: 0.995475113122172
Accuracy with 9x9  removed: 0.996606334841629
Accuracy with 7x7  removed: 0.997737556561086
Accuracy with 11x11  removed: 0.996606334841629
Accuracy with 3x3  removed: 0.9920814479638009
