# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load Data

In [None]:
# Read the flights CSV; the first column of the file becomes the DataFrame index.
dataframe = pd.read_csv('Data/DelayedFlights.csv', index_col=[0])

# Create Target Feature

Predicting only cancelled flights would be extremely imbalanced (0.03% positives), so we change the binary classification task to:

- **Predicting if a flight gets *cancelled* or *diverted***

This increases the minority class from 633 samples (0.03%) to 8387 samples (0.43%).

In [None]:
# Class balance of the candidate targets. `value_counts().loc[1]` looks up the
# count (or, with normalize=True, the share) of rows whose flag equals 1.
print('Cancelled Flights')
print(f'cancelled flights (absolute): {dataframe.Cancelled.value_counts().loc[1]}')
print(f'cancelled flights (relative): {dataframe.Cancelled.value_counts(normalize=True).loc[1]}')
print('\nDiverted Flights')
print(f'diverted flights (absolute): {dataframe.Diverted.value_counts().loc[1]}')
print(f'diverted flights (relative): {dataframe.Diverted.value_counts(normalize=True).loc[1]}')
print('\nCombined - Cancelled or Diverted')
# Count the rows where either flag is set (the same condition that defines the
# Target column), so a row carrying both flags is counted once, not twice.
print(f'cancelled or diverted flights (absolute): {((dataframe.Cancelled == 1) | (dataframe.Diverted == 1)).sum()}')
print(f'cancelled or diverted flights (relative): {((dataframe.Cancelled == 1) | (dataframe.Diverted == 1)).mean()}')

In [None]:
# create target feature: 1 if the flight was cancelled or diverted, otherwise 0
dataframe['Target'] = np.where((dataframe.Cancelled == 1) | (dataframe.Diverted == 1), 1, 0)

In [None]:
# drop the columns Cancelled, CancellationCode and Diverted
# (Cancelled and Diverted are the columns the Target was derived from)
dataframe = dataframe.drop(columns=['Cancelled', 'CancellationCode', 'Diverted'])

# Split Data (quarterly/monthly)

In [None]:
def split_dataframe(dataframe=dataframe, method='quarterly'):
    '''
    Split a dataframe into quarterly or monthly subsets based on its Month column.

    Input:
      dataframe: DataFrame with 'Year' and 'Month' columns; Month values 1-12 are matched.
                 NOTE: the default is the global `dataframe` object that exists when
                 this cell is executed, not the one that exists at call time.
      method: 'quarterly' -> 4 frames (months 1-3, 4-6, 7-9, 10-12)
              'monthly'   -> 12 frames (one per month)
    Output: tuple of DataFrames in calendar order, each without 'Year' and 'Month'
    Raises: ValueError for any other method (instead of silently returning None)
    '''
    if method == 'quarterly':
        month_groups = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]
    elif method == 'monthly':
        month_groups = [[month] for month in range(1, 13)]
    else:
        raise ValueError(f"method must be 'quarterly' or 'monthly', got {method!r}")

    # order dataframe by month
    dataframe = dataframe.sort_values(by='Month')

    # one subset per month group; Year and Month are dropped from every subset
    return tuple(
        dataframe[dataframe.Month.isin(months)].drop(columns=['Year', 'Month'])
        for months in month_groups
    )

In [None]:
# Quarterly subsets of the flights data; Q1 is the one used for EDA and training below.
Q1, Q2, Q3, Q4 = split_dataframe(dataframe=dataframe, method='quarterly')

# EDA

In [None]:
# size of the first-quarter subset
print(f'Rows: {Q1.shape[0]}\nColumns: {Q1.shape[1]}')

In [None]:
# first look at the data: 5 randomly drawn rows of Q1
# (no random_state is passed, so the rows differ between runs)
Q1.sample(5)

In [None]:
# descriptive statistics, data types and missing values
def dataset_infos(dataframe):
    '''
    Per-column overview of a dataframe.

    Input: dataframe
    Output: DataFrame with one row per column of the input, holding its dtype,
            the describe(include='all') statistics, the number of nulls and
            the number of empty strings
    '''
    # describe() lists the statistics as rows, so flip it to one row per column
    summary = dataframe.describe(include='all').transpose()

    # put the dtype in front of the statistics
    summary.insert(0, 'dtype', dataframe.dtypes)

    # two flavours of "missing": real nulls and empty strings
    null_counts = dataframe.isnull().sum()
    empty_string_counts = dataframe.eq('').sum()
    missing = pd.concat([null_counts, empty_string_counts], axis=1, keys=['nulls', 'empty strings'])

    return summary.merge(missing, left_index=True, right_index=True)

# overview table for the first-quarter subset (displayed as the cell output)
dataset_infos(Q1)

In [None]:
# duplicate rows: number of Q1 rows that exactly repeat an earlier row
print(f'Duplicated Rows: {Q1.duplicated().sum()}')

# Preprocessing

In [None]:
# separate the features (x) from the label (y) of the Q1 subset
x, y = Q1.drop(columns=['Target']), Q1['Target']

In [None]:
# 80/20 train/test split of Q1, stratified on the label, with a fixed seed
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, stratify=y,random_state=420)

In [2]:
def preprocessing(dataframe, fit=None):
  '''
  Preprocessing function
  Ordinal-encodes the non-numeric columns, standard-scales all columns and
  fills the remaining missing values with the column medians of the result.

  The fitted transformers are kept in the notebook globals `ordenc` and `stdsc`,
  together with the column names they were fitted on (`_preprocessing_columns`).

  Input:
    dataframe: features to transform (not modified; a copy is transformed)
    fit: None  -> fit when nothing is fitted yet or when the columns differ from
                  the fitted ones, otherwise only transform
         True  -> always (re)fit on `dataframe`
         False -> never fit; raises RuntimeError if nothing has been fitted yet
  Output: transformed data (DataFrame with the input column names and a new default index)
  '''
  global ordenc, stdsc, _preprocessing_columns

  # work on a copy so the encoding step does not overwrite the caller's frame
  dataframe = dataframe.copy()

  # get feature names and the non-numeric (categorical) columns
  col_names = list(dataframe.columns)
  categorical_features = list(dataframe.select_dtypes(exclude = np.number).columns)

  state = globals()
  has_fitted = 'ordenc' in state and 'stdsc' in state
  if fit is None:
    # Refit when the columns changed: transformers fitted on one dataset of this
    # notebook must not be reused for a dataset with different columns.
    fit = not has_fitted or state.get('_preprocessing_columns') != col_names
  elif not fit and not has_fitted:
    raise RuntimeError('preprocessing(fit=False) called before any transformer was fitted')

  if fit:
    ordenc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)
    dataframe[categorical_features] = ordenc.fit_transform(dataframe[categorical_features])
    print('Ordinal Encoder - fit & transform...')

    stdsc = StandardScaler()
    dataframe = stdsc.fit_transform(dataframe)
    print('Standard Scaler - fit & transform...')

    # remember which columns the transformers belong to
    _preprocessing_columns = col_names
  else:
    dataframe[categorical_features] = ordenc.transform(dataframe[categorical_features])
    print('Ordinal Encoder - transform...')

    dataframe = stdsc.transform(dataframe)
    print('Standard Scaler - transform...')

  # convert back to dataframe (the scaler returns a plain array)
  dataframe = pd.DataFrame(dataframe, columns=col_names)

  # impute missing values
  # NOTE(review): the medians come from the frame being transformed, not from the
  # data the transformers were fitted on - confirm this is intended.
  dataframe_preprocessed = dataframe.fillna(dataframe.median())

  return dataframe_preprocessed

In [None]:
# Preprocess the training features first: preprocessing() fits its encoder and
# scaler on the first frame it receives, so the order of these cells matters.
xtrain_prepro = preprocessing(xtrain)

In [None]:
# Preprocess the held-out test features of Q1.
xtest_prepro = preprocessing(xtest)

In [None]:
# Q2: preprocessed features and the untouched Target labels
Q2_prepro, Q2_y = preprocessing(Q2.drop(columns=['Target'])), Q2['Target']

In [None]:
# Q3: preprocessed features and the untouched Target labels
Q3_prepro, Q3_y = preprocessing(Q3.drop(columns=['Target'])), Q3['Target']

In [None]:
# Q4: preprocessed features and the untouched Target labels
Q4_prepro, Q4_y = preprocessing(Q4.drop(columns=['Target'])), Q4['Target']

In [None]:
# fit prediction model on train dataset
# (baseline: logistic regression with default hyperparameters, scored on the Q1 test split)
lr = LogisticRegression(random_state=420)
lr.fit(xtrain_prepro, ytrain)
ypred = lr.predict(xtest_prepro)
print(f'Test accuracy on normal dataset: {accuracy_score(ytest, ypred)}')

In [None]:
# GridSearchCV (imported in the imports cell at the top of the notebook)

# define parameter grid
# Only solver/penalty pairs that LogisticRegression accepts are searched:
#   - 'newton-cg' and 'lbfgs' support the 'l2' penalty only
#   - 'liblinear' supports 'l1' and 'l2'
#   - 'elasticnet' needs solver='saga' (plus an l1_ratio), which is not part of this search
# A plain cross-product of these values contains 30 invalid candidates out of 54.
param_grid = [
    {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs']},
    {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2'], 'solver': ['liblinear']},
    ]
# define grid search
# NOTE(review): positives are rare (about 0.43% over the full dataset), so accuracy is
# dominated by the majority class - consider another `scoring`.
grid = GridSearchCV(LogisticRegression(random_state=420), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
# fit grid search
grid.fit(xtrain_prepro, ytrain)
# best score
print(f'Best score: {grid.best_score_}')
# best estimator
print(f'Best estimator: {grid.best_estimator_}')
# best parameters
print(f'Best parameters: {grid.best_params_}')


In [None]:
# fit prediction model on train dataset (gridsearch params)
# `lr` is replaced by a model built from the best parameter combination and
# scored again on the Q1 test split
lr = LogisticRegression(**grid.best_params_, random_state=420)
lr.fit(xtrain_prepro, ytrain)
ypred = lr.predict(xtest_prepro)
print(f'Test accuracy on normal dataset: {accuracy_score(ytest, ypred)}')

In [None]:
# Q2: score the model trained on Q1 on the second-quarter subset
ypred = lr.predict(Q2_prepro)
print(f'Accuracy on Quartal2 dataset: {accuracy_score(Q2_y, ypred)}')

In [None]:
# Q3: score the model trained on Q1 on the third-quarter subset
ypred = lr.predict(Q3_prepro)
print(f'Accuracy on Quartal3 dataset: {accuracy_score(Q3_y, ypred)}')

In [None]:
# Q4: score the model trained on Q1 on the fourth-quarter subset
ypred = lr.predict(Q4_prepro)
print(f'Accuracy on Quartal4 dataset: {accuracy_score(Q4_y, ypred)}')

---

# Sensor

In [None]:
# Read the sensor CSV (first column becomes the index). This rebinds `dataframe`,
# which held the flights data in the previous section.
dataframe = pd.read_csv('Data/sensor.csv', index_col=[0])

In [None]:
# drop sensor_15
# NOTE(review): the reason is not stated in the notebook (presumably an unusable column) - confirm
dataframe = dataframe.drop(columns=['sensor_15'])

In [None]:
# split timestamp column into year, month, day (int)
# The fixed character positions rely on timestamp strings that start with 'YYYY-MM-DD'.
dataframe['Year'] = dataframe['timestamp'].str[:4].astype(int)
dataframe['Month'] = dataframe['timestamp'].str[5:7].astype(int)
dataframe['Day'] = dataframe['timestamp'].str[8:10].astype(int)

In [None]:
def split_dataframe(dataframe=dataframe, months=(4, 5, 6, 7, 8)):
    '''
    Split the sensor dataframe into one subset per month.

    NOTE: this redefines the split_dataframe function of the flights section.

    Input:
      dataframe: DataFrame with 'Year', 'Month' and 'timestamp' columns
                 (default: the global `dataframe` that exists when this cell is executed)
      months: month numbers to extract, in output order (default: 4, 5, 6, 7, 8)
    Output: tuple with one DataFrame per entry of `months`, each without
            'Year', 'Month' and 'timestamp'; all other columns are kept
    '''
    # order dataframe by month
    dataframe = dataframe.sort_values(by='Month')

    # split dataframe (monthly)
    return tuple(
        dataframe[dataframe.Month.isin([month])].drop(columns=['Year', 'Month', 'timestamp'])
        for month in months
    )

df_04, df_05, df_06, df_07, df_08 = split_dataframe(dataframe=dataframe)

In [None]:
# Month 4 is the training month: separate features from the machine_status label,
# then make a stratified 80/20 train/test split with a fixed seed.
x, y = df_04.drop(columns=['machine_status']), df_04['machine_status']
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, stratify=y,random_state=420)

In [None]:
# encoder that turns the machine_status labels into integers
le = LabelEncoder()

In [None]:
# Preprocess the training features and fit the label encoder on the training labels.
# NOTE: ytrain is overwritten with its integer encoding, so re-running this cell
# refits `le` on already encoded labels.
xtrain_prepro, ytrain = preprocessing(xtrain), le.fit_transform(ytrain)

In [None]:
# Preprocess the test features and encode the test labels with the fitted encoder.
# NOTE: ytest is overwritten, so re-running this cell passes already encoded labels to `le`.
xtest_prepro, ytest = preprocessing(xtest), le.transform(ytest)

In [None]:
# df_04: preprocessed features and encoded labels of the complete month-4 subset
# (this subset contains the training rows)
df_04_prepro, df_04_y = preprocessing(df_04.drop(columns=['machine_status'])), le.transform(df_04['machine_status'])

In [None]:
# df_05: preprocessed features and encoded labels of the month-5 subset
df_05_prepro, df_05_y = preprocessing(df_05.drop(columns=['machine_status'])), le.transform(df_05['machine_status'])

In [None]:
# df_06: preprocessed features and encoded labels of the month-6 subset
df_06_prepro, df_06_y = preprocessing(df_06.drop(columns=['machine_status'])), le.transform(df_06['machine_status'])

In [None]:
# df_07: preprocessed features and encoded labels of the month-7 subset
df_07_prepro, df_07_y = preprocessing(df_07.drop(columns=['machine_status'])), le.transform(df_07['machine_status'])

In [None]:
# df_08: preprocessed features and encoded labels of the month-8 subset
df_08_prepro, df_08_y = preprocessing(df_08.drop(columns=['machine_status'])), le.transform(df_08['machine_status'])

In [None]:
# fit prediction model on train dataset
# NOTE: despite its name, `lr` holds a RandomForestClassifier in this cell.
lr = RandomForestClassifier(random_state=420)
lr.fit(xtrain_prepro, ytrain)
ypred = lr.predict(xtest_prepro)
print(f'Test accuracy on normal dataset: {accuracy_score(ytest, ypred)}')

In [None]:
# define parameter grid
# Only solver/penalty pairs that LogisticRegression accepts are searched:
#   - 'newton-cg' and 'lbfgs' support the 'l2' penalty only
#   - 'liblinear' supports 'l1' and 'l2'
#   - 'elasticnet' needs solver='saga' (plus an l1_ratio), which is not part of this search
# A plain cross-product of these values contains 30 invalid candidates out of 54.
param_grid = [
    {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs']},
    {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2'], 'solver': ['liblinear']},
    ]
# define grid search
grid = GridSearchCV(LogisticRegression(random_state=420), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
# fit grid search
grid.fit(xtrain_prepro, ytrain)
# best score
print(f'Best score: {grid.best_score_}')
# best estimator
print(f'Best estimator: {grid.best_estimator_}')
# best parameters
print(f'Best parameters: {grid.best_params_}')

In [None]:
# fit prediction model on train dataset (gridsearch params)
# `lr` is rebound from the random forest to a logistic regression built from
# the best parameter combination of the grid search.
lr = LogisticRegression(**grid.best_params_, random_state=420)
lr.fit(xtrain_prepro, ytrain)
ypred = lr.predict(xtest_prepro)
print(f'Test accuracy on normal dataset: {accuracy_score(ytest, ypred)}')

In [None]:
# df_04: accuracy on the complete month-4 subset (contains the training rows)
ypred = lr.predict(df_04_prepro)
print(f'Accuracy on df_04 dataset: {accuracy_score(df_04_y, ypred)}')

In [None]:
# df_05: accuracy on the month-5 subset
ypred = lr.predict(df_05_prepro)
print(f'Accuracy on df_05 dataset: {accuracy_score(df_05_y, ypred)}')

In [None]:
# df_06: accuracy on the month-6 subset
ypred = lr.predict(df_06_prepro)
print(f'Accuracy on df_06 dataset: {accuracy_score(df_06_y, ypred)}')

In [None]:
# df_07: accuracy on the month-7 subset
ypred = lr.predict(df_07_prepro)
print(f'Accuracy on df_07 dataset: {accuracy_score(df_07_y, ypred)}')

In [None]:
# NOTE(review): the evaluation on df_08 is commented out and the notebook does not
# say why - re-enable it or delete this cell.
# # df_08
# ypred = lr.predict(df_08_prepro)
# print(f'Accuracy on Quartal2 dataset: {accuracy_score(df_08_y, ypred)}')

---

# Smoke Detector

In [3]:
# Read the smoke-detection CSV (first column becomes the index); rebinds `dataframe` again.
dataframe = pd.read_csv('Data/smoke_detection_iot.csv', index_col=[0])

In [4]:
# split a dataframe into three consecutive parts after ordering it by UTC
def split_dataframe(dataframe):
    '''
    Order the rows by their UTC value and cut them into three consecutive parts.

    Input: dataframe with a 'UTC' column
    Output: (first, second, third) part; the first two hold int(rows / 3) rows each,
            the third holds all remaining rows
    '''
    ordered = dataframe.sort_values(by='UTC')

    # size of the first two parts; the last part absorbs the remainder
    part_size = int(ordered.shape[0] / 3)
    first_end, second_end = part_size, part_size * 2

    return ordered.iloc[:first_end], ordered.iloc[first_end:second_end], ordered.iloc[second_end:]

# three consecutive parts of the smoke data in UTC order; df_1 is used for training below
df_1, df_2, df_3 = split_dataframe(dataframe=dataframe)

In [5]:
# separate features from the 'Fire Alarm' label of the first part,
# then make a stratified 80/20 train/test split with a fixed seed
x, y = df_1.drop(columns=['Fire Alarm']), df_1['Fire Alarm']
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, stratify=y,random_state=420)

In [6]:
# preprocess the training features (the recorded output shows the transformers being fitted here)
xtrain_prepro = preprocessing(xtrain)

Ordinal Encoder - fit & transform...
Standard Scaler - fit & transform...


In [7]:
# preprocess the held-out test features of the first part
xtest_prepro = preprocessing(xtest)

Ordinal Encoder - transform...
Standard Scaler - transform...


In [8]:
# part 1 (contains the training rows): preprocessed features and labels
df_1_prepro, df_1_y = preprocessing(df_1.drop(columns=['Fire Alarm'])), df_1['Fire Alarm']

Ordinal Encoder - transform...
Standard Scaler - transform...


In [9]:
# part 2: preprocessed features and labels
df_2_prepro, df_2_y = preprocessing(df_2.drop(columns=['Fire Alarm'])), df_2['Fire Alarm']

Ordinal Encoder - transform...
Standard Scaler - transform...


In [10]:
# part 3: preprocessed features and labels
df_3_prepro, df_3_y = preprocessing(df_3.drop(columns=['Fire Alarm'])), df_3['Fire Alarm']

Ordinal Encoder - transform...
Standard Scaler - transform...


In [11]:
# fit prediction model on train dataset
# NOTE: despite its name, `lr` holds a RandomForestClassifier in this cell.
lr = RandomForestClassifier(random_state=420)
lr.fit(xtrain_prepro, ytrain)
ypred = lr.predict(xtest_prepro)
print(f'Test accuracy on normal dataset: {accuracy_score(ytest, ypred)}')

Test accuracy on normal dataset: 0.9997605363984674


In [12]:
# df_2: score the model trained on part 1 on the second part
# NOTE(review): the printed label still says "normal dataset" although this is df_2.
ypred = lr.predict(df_2_prepro)
print(f'Test accuracy on normal dataset: {accuracy_score(df_2_y, ypred)}')

Test accuracy on normal dataset: 0.8523184518106917


In [13]:
# df_3: score the model trained on part 1 on the third part
# NOTE(review): the printed label still says "normal dataset" although this is df_3.
ypred = lr.predict(df_3_prepro)
print(f'Test accuracy on normal dataset: {accuracy_score(df_3_y, ypred)}')

Test accuracy on normal dataset: 0.7291407222914073
