In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
df_feature = pd.read_csv('~data/feature_extraction.csv')

In [3]:
riskforecasting = pd.read_csv('~data/pollution_risk_forecasting.csv')
riskforecasting = riskforecasting[['label','predictedAt','warning','riskLevelLabel']]
riskforecasting['predictedAt'] = pd.to_datetime(riskforecasting.predictedAt).dt.strftime('%Y-%m-%d')
riskforecasting = riskforecasting.rename(columns={'label': 'site', 'predictedAt': 'time'})
riskforecasting

Unnamed: 0,site,time,warning,riskLevelLabel
0,Seaton Carew North,2022-04-28,Pollution RIsk Forecasts will start soon,normal
1,Seaton Carew North,2022-04-29,Pollution RIsk Forecasts will start soon,normal
2,Seaton Carew North,2022-04-30,Pollution RIsk Forecasts will start soon,normal
3,Seaton Carew North,2022-05-04,No warnings in place,normal
4,Seaton Carew North,2022-05-05,No warnings in place,normal
...,...,...,...,...
63641,Westward Ho!,2022-09-26,No pollution incidents reported,normal
63642,Westward Ho!,2022-09-27,No pollution incidents reported,normal
63643,Westward Ho!,2022-09-28,No pollution incidents reported,normal
63644,Westward Ho!,2022-09-29,No pollution incidents reported,normal


In [4]:
df_merged = pd.merge(df_feature, riskforecasting[['site', 'time', 'riskLevelLabel']], on=['site', 'time'])
df_merged

Unnamed: 0,time,site,TUR_1x1_median,SPM_1x1_median,CHL_1x1_median,TUR_1x1_mean,SPM_1x1_mean,CHL_1x1_mean,TUR_1x1_q1,SPM_1x1_q1,...,TUR_11x11_mean,SPM_11x11_mean,CHL_11x11_mean,TUR_11x11_q1,SPM_11x11_q1,CHL_11x11_q1,TUR_11x11_q3,SPM_11x11_q3,CHL_11x11_q3,riskLevelLabel
0,2022-03-01,Saltburn,,,,,,,,,...,,,,,,,,,,increased
1,2022-03-15,Frinton,,,,,,,,,...,,,,,,,,,,increased
2,2022-03-15,Holland,,,,,,,,,...,,,,,,,,,,increased
3,2022-03-17,Frinton,121.957180,124.314230,9.526196,121.957180,124.314230,9.526196,121.957180,124.314230,...,119.602328,123.807567,8.214204,88.586640,99.837202,7.366510,141.179335,146.104910,9.199348,normal
4,2022-03-17,Holland,,,,,,,,,...,100.306580,120.473114,8.594822,87.195181,106.791605,5.838983,110.272972,130.367225,10.113032,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22676,2022-09-30,Wilsthorpe,,,,,,,,,...,,,,,,,,,,normal
22677,2022-09-30,Winchelsea,41.281174,25.715336,6.568486,41.281174,25.715336,6.568486,41.281174,25.715336,...,23.628239,15.012895,4.302087,14.766009,8.285604,3.285116,30.627869,18.688794,4.585260,normal
22678,2022-09-30,Withernsea,,,,,,,,,...,,,,,,,,,,normal
22679,2022-09-30,Worthing,,,,,,,,,...,,,,,,,,,,normal


In [5]:
tur = ["TUR_1x1_median", "TUR_1x1_mean", "TUR_1x1_q1", "TUR_1x1_q3", 
      "TUR_3x3_median", "TUR_3x3_mean", "TUR_3x3_q1", "TUR_3x3_q3", 
      "TUR_5x5_median", "TUR_5x5_mean", "TUR_5x5_q1", "TUR_5x5_q3", 
      "TUR_7x7_median", "TUR_7x7_mean", "TUR_7x7_q1", "TUR_7x7_q3", 
      "TUR_9x9_median", "TUR_9x9_mean", "TUR_9x9_q1", "TUR_9x9_q3", 
      "TUR_11x11_median", "TUR_11x11_mean", "TUR_11x11_q1", "TUR_11x11_q3"]
spm = ["SPM_1x1_median", "SPM_1x1_mean", "SPM_1x1_q1", "SPM_1x1_q3", 
       "SPM_3x3_median", "SPM_3x3_mean", "SPM_3x3_q1", "SPM_3x3_q3", 
       "SPM_5x5_median", "SPM_5x5_mean", "SPM_5x5_q1", "SPM_5x5_q3", 
       "SPM_7x7_median", "SPM_7x7_mean", "SPM_7x7_q1", "SPM_7x7_q3", 
       "SPM_9x9_median", "SPM_9x9_mean", "SPM_9x9_q1", "SPM_9x9_q3", 
       "SPM_11x11_median", "SPM_11x11_mean", "SPM_11x11_q1", "SPM_11x11_q3"]
chl = ["CHL_1x1_median", "CHL_1x1_mean", "CHL_1x1_q1", "CHL_1x1_q3",
       "CHL_3x3_median", "CHL_3x3_mean", "CHL_3x3_q1", "CHL_3x3_q3", 
       "CHL_5x5_median", "CHL_5x5_mean", "CHL_5x5_q1", "CHL_5x5_q3",        
       "CHL_7x7_median", "CHL_7x7_mean", "CHL_7x7_q1", "CHL_7x7_q3",        
       "CHL_9x9_median", "CHL_9x9_mean", "CHL_9x9_q1", "CHL_9x9_q3",        
       "CHL_11x11_median", "CHL_11x11_mean", "CHL_11x11_q1", "CHL_11x11_q3"]
tsc = [(t, s, c) for t, s, c in zip(tur, spm, chl)]

i = 1

for t, s, c in tsc:
    T = f"{t}"
    S = f"{s}"
    C = f"{c}"
        
    df_dropNaN = df_merged.dropna(subset=[T, S, C])
    X = df_dropNaN[[T, S, C]]
    y = df_dropNaN['riskLevelLabel']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=i)
    logreg = LogisticRegression(max_iter=1000)
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    
    # accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")
    i += 1

Accuracy: 0.9882525697503671
Accuracy: 0.9809104258443465
Accuracy: 0.9765051395007343
Accuracy: 0.9779735682819384
Accuracy: 0.9758971612212105
Accuracy: 0.9817889662560257
Accuracy: 0.9764327798607392
Accuracy: 0.9807177289769684
Accuracy: 0.9751142712036567
Accuracy: 0.9751142712036567
Accuracy: 0.9740985271711529
Accuracy: 0.97866937531742
Accuracy: 0.9772267206477733
Accuracy: 0.9746963562753036
Accuracy: 0.9762145748987854
Accuracy: 0.9741902834008097
Accuracy: 0.9802731411229135
Accuracy: 0.9787556904400607
Accuracy: 0.9792615073343449
Accuracy: 0.9797673242286292
Accuracy: 0.9772612430520465
Accuracy: 0.9737241030823648
Accuracy: 0.977766548762001
Accuracy: 0.9818089944416372


In [6]:
# T = 
# S = 
# C =
# logreg.predict([[T, S, C]])

# 72

## logistic

In [7]:
df_dropNaN = df_merged.dropna(subset=["TUR_1x1_median", "TUR_1x1_mean", "TUR_1x1_q1", "TUR_1x1_q3", 
      "TUR_3x3_median", "TUR_3x3_mean", "TUR_3x3_q1", "TUR_3x3_q3", 
      "TUR_5x5_median", "TUR_5x5_mean", "TUR_5x5_q1", "TUR_5x5_q3", 
      "TUR_7x7_median", "TUR_7x7_mean", "TUR_7x7_q1", "TUR_7x7_q3", 
      "TUR_9x9_median", "TUR_9x9_mean", "TUR_9x9_q1", "TUR_9x9_q3", 
      "TUR_11x11_median", "TUR_11x11_mean", "TUR_11x11_q1", "TUR_11x11_q3", 
        "SPM_1x1_median", "SPM_1x1_mean", "SPM_1x1_q1", "SPM_1x1_q3", 
       "SPM_3x3_median", "SPM_3x3_mean", "SPM_3x3_q1", "SPM_3x3_q3", 
       "SPM_5x5_median", "SPM_5x5_mean", "SPM_5x5_q1", "SPM_5x5_q3", 
       "SPM_7x7_median", "SPM_7x7_mean", "SPM_7x7_q1", "SPM_7x7_q3", 
       "SPM_9x9_median", "SPM_9x9_mean", "SPM_9x9_q1", "SPM_9x9_q3", 
       "SPM_11x11_median", "SPM_11x11_mean", "SPM_11x11_q1", "SPM_11x11_q3", 
        "CHL_1x1_median", "CHL_1x1_mean", "CHL_1x1_q1", "CHL_1x1_q3",
       "CHL_3x3_median", "CHL_3x3_mean", "CHL_3x3_q1", "CHL_3x3_q3", 
       "CHL_5x5_median", "CHL_5x5_mean", "CHL_5x5_q1", "CHL_5x5_q3",        
       "CHL_7x7_median", "CHL_7x7_mean", "CHL_7x7_q1", "CHL_7x7_q3",        
       "CHL_9x9_median", "CHL_9x9_mean", "CHL_9x9_q1", "CHL_9x9_q3",        
       "CHL_11x11_median", "CHL_11x11_mean", "CHL_11x11_q1", "CHL_11x11_q3"])
X = df_dropNaN.iloc[:, 2:-1]
y = df_dropNaN['riskLevelLabel']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.986784140969163


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## random forest

In [8]:
df_dropNaN = df_merged.dropna(subset=["TUR_1x1_median", "TUR_1x1_mean", "TUR_1x1_q1", "TUR_1x1_q3", 
      "TUR_3x3_median", "TUR_3x3_mean", "TUR_3x3_q1", "TUR_3x3_q3", 
      "TUR_5x5_median", "TUR_5x5_mean", "TUR_5x5_q1", "TUR_5x5_q3", 
      "TUR_7x7_median", "TUR_7x7_mean", "TUR_7x7_q1", "TUR_7x7_q3", 
      "TUR_9x9_median", "TUR_9x9_mean", "TUR_9x9_q1", "TUR_9x9_q3", 
      "TUR_11x11_median", "TUR_11x11_mean", "TUR_11x11_q1", "TUR_11x11_q3",
      "SPM_1x1_median", "SPM_1x1_mean", "SPM_1x1_q1", "SPM_1x1_q3", 
      "SPM_3x3_median", "SPM_3x3_mean", "SPM_3x3_q1", "SPM_3x3_q3", 
      "SPM_5x5_median", "SPM_5x5_mean", "SPM_5x5_q1", "SPM_5x5_q3", 
      "SPM_7x7_median", "SPM_7x7_mean", "SPM_7x7_q1", "SPM_7x7_q3", 
      "SPM_9x9_median", "SPM_9x9_mean", "SPM_9x9_q1", "SPM_9x9_q3", 
      "SPM_11x11_median", "SPM_11x11_mean", "SPM_11x11_q1", "SPM_11x11_q3", 
      "CHL_1x1_median", "CHL_1x1_mean", "CHL_1x1_q1", "CHL_1x1_q3",
      "CHL_3x3_median", "CHL_3x3_mean", "CHL_3x3_q1", "CHL_3x3_q3", 
      "CHL_5x5_median", "CHL_5x5_mean", "CHL_5x5_q1", "CHL_5x5_q3",        
      "CHL_7x7_median", "CHL_7x7_mean", "CHL_7x7_q1", "CHL_7x7_q3",        
      "CHL_9x9_median", "CHL_9x9_mean", "CHL_9x9_q1", "CHL_9x9_q3",        
      "CHL_11x11_median", "CHL_11x11_mean", "CHL_11x11_q1", "CHL_11x11_q3"])

X = df_dropNaN.iloc[:, 2:-1]
y = df_dropNaN.iloc[:, -1]

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

print('Training X Shape:', X_train.shape)
print('Training y Shape:', y_train.shape)
print('Testing X Shape:', X_test.shape)
print('Testing y Shape:', y_test.shape)

Training X Shape: (1814, 72)
Training y Shape: (1814,)
Testing X Shape: (454, 72)
Testing y Shape: (454,)


In [9]:
# Fit a random forest classifier to the data
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf.predict(X_test)

# Evaluate the performance of the classifier
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9779735682819384


### importance of feature aggregation

In [10]:
importance_dict = dict(zip(X.columns, rf.feature_importances_))

features = ['11x11', '9x9', '7x7', '5x5', '3x3', '1x1', 'TUR', 'SPM', 'CHL']

data = []
for s in features:
    sum_importance = np.sum([importance for feature, importance in importance_dict.items() if s in feature])
    mean_importance = np.mean([importance for feature, importance in importance_dict.items() if s in feature])
    data.append([s, sum_importance, mean_importance])

# Create a DataFrame with the calculated data
df_importances = pd.DataFrame(data, columns=['Feature', 'Sum of Importances', 'Mean of Importances'])

def highlight_max(s):
    if len(s) > 3:
        return ['font-weight: bold' if v in list(s[np.argsort(s)[-3:]]) else '' for v in s]
    else:
        return ['font-weight: bold' if v == s.max() else '' for v in s]

df_importances[:5].style.apply(highlight_max, subset=['Sum of Importances', 'Mean of Importances'])

Unnamed: 0,Feature,Sum of Importances,Mean of Importances
0,11x11,0.172817,0.014401
1,9x9,0.153895,0.012825
2,7x7,0.162149,0.013512
3,5x5,0.151036,0.012586
4,3x3,0.176557,0.014713


In [11]:
df_importances[-3:].style.apply(highlight_max, subset=['Sum of Importances', 'Mean of Importances'])

Unnamed: 0,Feature,Sum of Importances,Mean of Importances
6,TUR,0.327729,0.013655
7,SPM,0.310229,0.012926
8,CHL,0.362042,0.015085


## feature importance

In [12]:
importances = rf.feature_importances_
indices = importances.argsort()[::-1]
for i in indices:
    print(X.columns[i], importances[i])

SPM_11x11_median 0.023504111154456345
TUR_11x11_q1 0.022346046697644915
TUR_3x3_q3 0.020920919648778645
CHL_1x1_mean 0.020740943273007323
CHL_1x1_q1 0.02061298180639737
TUR_7x7_median 0.020113424356244608
TUR_5x5_mean 0.019211783629898088
CHL_1x1_q3 0.019122962038144693
CHL_1x1_median 0.018944535514502463
CHL_3x3_mean 0.018821928959276094
SPM_7x7_q1 0.018673847424574327
SPM_9x9_q1 0.018135353142042104
TUR_3x3_mean 0.018125604243962416
CHL_3x3_q3 0.017628079055127737
CHL_3x3_median 0.01741335901222113
CHL_9x9_mean 0.017096901430809608
CHL_7x7_mean 0.01694715737975972
SPM_7x7_median 0.016798416684272115
SPM_9x9_median 0.016137660334345997
CHL_11x11_q3 0.01606760483838969
CHL_5x5_median 0.01560872956782758
TUR_1x1_mean 0.015448199293912217
TUR_1x1_median 0.015306035720740461
CHL_5x5_q3 0.015187040214435255
CHL_5x5_mean 0.014652293045020956
CHL_9x9_q1 0.014127714487334903
CHL_11x11_mean 0.014076052632731895
SPM_11x11_q1 0.013916071861178244
SPM_1x1_median 0.013891640118138
TUR_11x11_median