In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [4]:
# Load the data
file_path = Path('saveris_alarms_weather.csv')
df = pd.read_csv(file_path)
df.head()

Unnamed: 0.1,Unnamed: 0,date,location,status,alarm_type,limit,value,platform_1,platform_2,deepfreezer_1,...,freezer_2,freezer_3,freezer_4,cooling_1,cooling_2,sensor_alarm,temp,temp_min,temp_max,humidity
0,2,2021-06-24 17:00:00,Freezer,0,recovery,-19,-20.0,2.3,0.1,-23.0,...,-19.9,-20.0,-21.6,3.4,3.5,freezer_3,19.68,18.79,24.18,62.0
1,3,2021-06-24 17:30:00,Deepfreezer,0,recovery,-25,-25.9,2.4,0.6,-25.9,...,-20.8,-21.2,-22.1,3.1,3.2,deepfreezer_1,19.68,18.79,24.18,62.0
2,4,2021-06-24 17:30:00,Deepfreezer,0,recovery,-25,-25.0,2.4,0.6,-25.9,...,-20.8,-21.2,-22.1,3.1,3.2,deepfreezer_2,19.68,18.79,24.18,62.0
3,5,2021-06-24 17:45:00,Platform,1,deficit,1,0.6,2.4,0.6,-26.4,...,-21.0,-21.5,-22.2,3.1,3.2,platform_2,19.68,18.79,24.18,62.0
4,6,2021-06-24 19:15:00,Platform,1,deficit,1,0.8,0.8,-0.9,-27.2,...,-21.0,-22.2,-22.4,3.0,3.1,platform_1,19.68,18.79,24.18,62.0


In [5]:

columns = ["Unnamed: 0", "date", "location", "status", "alarm_type", "limit", "value", "platform_1" "platform_2", "deepfreezer_1", 
          "deepfreezer_2", "freezer_1", "freezer_2", "freezer_3", "freezer_4", "cooling_1", "cooling_2", "sensor_alarm", "temp", 
           "temp_min", "temp_max", "humidity"]

target = ["status"]

In [6]:

df = df.drop(["Unnamed: 0", "date", "location", "alarm_type", "sensor_alarm"], axis=1)
df.head()


Unnamed: 0,status,limit,value,platform_1,platform_2,deepfreezer_1,deepfreezer_2,freezer_1,freezer_2,freezer_3,freezer_4,cooling_1,cooling_2,temp,temp_min,temp_max,humidity
0,0,-19,-20.0,2.3,0.1,-23.0,-24.0,-20.2,-19.9,-20.0,-21.6,3.4,3.5,19.68,18.79,24.18,62.0
1,0,-25,-25.9,2.4,0.6,-25.9,-25.0,-20.9,-20.8,-21.2,-22.1,3.1,3.2,19.68,18.79,24.18,62.0
2,0,-25,-25.0,2.4,0.6,-25.9,-25.0,-20.9,-20.8,-21.2,-22.1,3.1,3.2,19.68,18.79,24.18,62.0
3,1,1,0.6,2.4,0.6,-26.4,-25.4,-21.0,-21.0,-21.5,-22.2,3.1,3.2,19.68,18.79,24.18,62.0
4,1,1,0.8,0.8,-0.9,-27.2,-26.2,-21.1,-21.0,-22.2,-22.4,3.0,3.1,19.68,18.79,24.18,62.0


# Split the Data into Training and Testing

In [7]:
#"limit", "value", "platform_1", "platform_2", "deepfreezer_1", "deepfreezer_2", "freezer_1", "freezer_2", "freezer_3", "freezer_4", "cooling_1", "cooling_2",
# Create our features
X_df = df.drop(columns=['status',"limit", "value", "platform_1", "platform_2", "deepfreezer_1", 
          "deepfreezer_2", "freezer_1", "freezer_2", "freezer_3", "freezer_4", "cooling_1", "cooling_2",])
X = pd.get_dummies(X_df, columns=[ "temp", 
           "temp_min", "temp_max", "humidity"], drop_first = True)

# Create our target
y = df["status"].to_frame()

In [8]:
X.describe()

Unnamed: 0,temp_4.9,temp_5.1,temp_6.13,temp_6.42,temp_6.43,temp_6.98,temp_7.03,temp_7.07,temp_7.25,temp_7.36,...,humidity_91.0,humidity_92.0,humidity_93.0,humidity_94.0,humidity_95.0,humidity_96.0,humidity_97.0,humidity_98.0,humidity_99.0,humidity_100.0
count,6671.0,6671.0,6671.0,6671.0,6671.0,6671.0,6671.0,6671.0,6671.0,6671.0,...,6671.0,6671.0,6671.0,6671.0,6671.0,6671.0,6671.0,6671.0,6671.0,6671.0
mean,0.000899,0.00045,0.0006,0.00075,0.0006,0.00045,0.00015,0.001049,0.00045,0.0003,...,0.024434,0.027432,0.019487,0.028631,0.008544,0.018888,0.009144,0.004197,0.005996,0.000899
std,0.029979,0.021203,0.024481,0.027369,0.024481,0.021203,0.012243,0.032379,0.021203,0.017314,...,0.154404,0.163351,0.138241,0.166781,0.092047,0.136139,0.095193,0.064655,0.077208,0.029979
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Check the balance of our target values
y['status'].value_counts()

1    3367
0    3304
Name: status, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

# Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble AdaBoost classifier . For each algorithm, be sure to complete the folliowing steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.
5. For the Balanced Random Forest Classifier onely, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [11]:
# Scale data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Resample the training data with the RandomOversampler
from imblearn.ensemble import BalancedRandomForestClassifier

brfc = BalancedRandomForestClassifier(n_estimators =1000, random_state=1)
model = brfc.fit(X_train_scaled, y_train)
BalancedRandomForestClassifier()

BalancedRandomForestClassifier()

In [12]:
# Calculate the balanced accuracy score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

predictions = model.predict(X_test_scaled)
accuracy_score(y_test, predictions)

0.421462829736211

In [13]:
# Display the confusion matrix
confusion_matrix(y_test, predictions)

array([[394, 432],
       [533, 309]], dtype=int64)

In [14]:
# Print the imbalanced classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.43      0.48      0.45       826
           1       0.42      0.37      0.39       842

    accuracy                           0.42      1668
   macro avg       0.42      0.42      0.42      1668
weighted avg       0.42      0.42      0.42      1668



In [15]:
# List the features sorted in descending order by feature importance
sorted(zip(model.feature_importances_, X.columns), reverse=True)[:50]

[(0.00765404386805077, 'temp_min_13.18'),
 (0.006847744202266353, 'temp_min_12.18'),
 (0.006288517155920897, 'temp_min_14.18'),
 (0.006266520674537201, 'temp_min_15.18'),
 (0.005545127163471599, 'humidity_86.0'),
 (0.005518158994367253, 'humidity_42.0'),
 (0.0055076820769127365, 'temp_min_11.18'),
 (0.005505911128200364, 'temp_min_10.18'),
 (0.0054278527984573935, 'humidity_91.0'),
 (0.005286791846640689, 'humidity_88.0'),
 (0.005229144597963429, 'humidity_82.0'),
 (0.005112592926275138, 'humidity_79.0'),
 (0.005051563254993504, 'temp_min_9.18'),
 (0.00502373463359844, 'humidity_83.0'),
 (0.004934524363813802, 'humidity_93.0'),
 (0.004931756490242379, 'temp_min_16.18'),
 (0.00493086123029079, 'temp_min_8.18'),
 (0.00447859872936256, 'temp_min_18.18'),
 (0.004467848298154612, 'humidity_52.0'),
 (0.004440216808567484, 'humidity_84.0'),
 (0.00440680976549151, 'humidity_92.0'),
 (0.004399447404799622, 'humidity_57.0'),
 (0.004345053138229096, 'humidity_38.0'),
 (0.004336981960839042, 'humi

### Easy Ensemble AdaBoost Classifier

In [16]:
# Train the EasyEnsembleClassifier

from imblearn.ensemble import EasyEnsembleClassifier

model = EasyEnsembleClassifier(base_estimator=None, n_estimators=100, n_jobs=1, random_state=1, 
                                   replacement=False, sampling_strategy='auto', verbose=0, 
                                   warm_start=False)

model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

EasyEnsembleClassifier()

EasyEnsembleClassifier()

In [17]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
predictions = model.predict(X_test_scaled)
balanced_accuracy_score(y_test, predictions)

0.47949077775157734

In [18]:
# Display the confusion matrix
confusion_matrix(y_test, predictions)

array([[695, 131],
       [743,  99]], dtype=int64)

In [19]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.48      0.84      0.61       826
           1       0.43      0.12      0.18       842

    accuracy                           0.48      1668
   macro avg       0.46      0.48      0.40      1668
weighted avg       0.46      0.48      0.40      1668

