# Data Preprocessing

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from scipy import stats

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

#import xgboost as xgb

from imblearn.over_sampling import SMOTE, ADASYN
#from imblearn.under_sampling import RandomUnderSampler
#from imblearn.pipeline import Pipeline as ImbPipeline

In [13]:
# Function to load specific datasets
def load_dataset():
    # Determine the current working directory
    current_dir = os.getcwd()

    # Create the path to the raw_data directory relative to the current directory
    base_dir = os.path.abspath(os.path.join(current_dir, "..", "raw_data"))

# Define the path to the All_Airports.csv file
    all_airports_file_path = os.path.join(base_dir, "All_Airports.csv")

    # Load the CSV file into a DataFrame
    df = pd.read_csv(all_airports_file_path, compression='zip')

    return df

df = load_dataset()

In [14]:
#Inspect DF

df.head()

Unnamed: 0,Time,Origin,Dest,Carrier,Cancelled,CancellationReason,Delayed,DepDelayMinutes,CarrierDelay,WeatherDelay,...,LateAircraftDelay,Temperature,Feels_Like_Temperature,Altimeter_Pressure,Sea_Level_Pressure,Visibility,Wind_Speed,Wind_Gust,Precipitation,Ice_Accretion_3hr
0,2021-01-01 09:00:00,LAX,JFK,American Airlines Inc.,False,,False,0.0,,,...,,56.0,55.9,1020.32,1020.1,16093.4,0.0,24.97,0.0,0.0
1,2021-01-02 09:00:00,LAX,JFK,American Airlines Inc.,False,,True,2.0,,,...,,54.0,54.0,1025.74,1025.5,16093.4,0.0,24.97,0.0,0.0
2,2021-01-03 09:00:00,LAX,JFK,American Airlines Inc.,False,,True,28.0,28.0,0.0,...,0.0,52.0,52.0,1024.38,1015.2,4828.02,4.6,24.97,0.0,0.0
3,2021-01-03 09:00:00,LAX,JFK,American Airlines Inc.,False,,True,28.0,28.0,0.0,...,0.0,53.0,53.1,1024.38,1024.1,6437.36,4.6,24.97,0.0,0.0
4,2021-01-04 09:00:00,LAX,JFK,American Airlines Inc.,False,,False,0.0,,,...,,51.0,51.1,1023.03,1015.2,6437.36,4.6,24.97,0.0,0.0


### Missing Values

In [15]:
#Select Numbers only
df_num = df.select_dtypes(include='number')
print(df_num.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15144514 entries, 0 to 15144513
Data columns (total 15 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   DepDelayMinutes         float64
 1   CarrierDelay            float64
 2   WeatherDelay            float64
 3   NASDelay                float64
 4   SecurityDelay           float64
 5   LateAircraftDelay       float64
 6   Temperature             float64
 7   Feels_Like_Temperature  float64
 8   Altimeter_Pressure      float64
 9   Sea_Level_Pressure      float64
 10  Visibility              float64
 11  Wind_Speed              float64
 12  Wind_Gust               float64
 13  Precipitation           float64
 14  Ice_Accretion_3hr       float64
dtypes: float64(15)
memory usage: 1.7 GB
None


In [16]:
# Check Missing Values
missing_values = df.isnull().sum()

print(missing_values)

Time                             0
Origin                           0
Dest                             0
Carrier                          0
Cancelled                        0
CancellationReason        14804803
Delayed                          0
DepDelayMinutes             328891
CarrierDelay              11870186
WeatherDelay              11870186
NASDelay                  11870186
SecurityDelay             11870186
LateAircraftDelay         11870186
Temperature                      0
Feels_Like_Temperature           0
Altimeter_Pressure               0
Sea_Level_Pressure               0
Visibility                       0
Wind_Speed                       0
Wind_Gust                        0
Precipitation                    0
Ice_Accretion_3hr                0
dtype: int64


In [17]:
len(df)

15144514

In [18]:
# Display rows with missing values in 'DepDelayMinutes'
missing_dep_delay = df_num[df_num['DepDelayMinutes'].isnull()]
print(missing_dep_delay.head())

     DepDelayMinutes  CarrierDelay  WeatherDelay  NASDelay  SecurityDelay  \
82               NaN           NaN           NaN       NaN            NaN   
171              NaN           NaN           NaN       NaN            NaN   
172              NaN           NaN           NaN       NaN            NaN   
173              NaN           NaN           NaN       NaN            NaN   
174              NaN           NaN           NaN       NaN            NaN   

     LateAircraftDelay  Temperature  Feels_Like_Temperature  \
82                 NaN         58.0                    57.9   
171                NaN         47.0                    46.9   
172                NaN         47.0                    46.9   
173                NaN         46.0                    46.0   
174                NaN         47.0                    46.9   

     Altimeter_Pressure  Sea_Level_Pressure  Visibility  Wind_Speed  \
82              1019.30              1019.2    16093.40         0.0   
171             

#### Inspect Missing Values in all relevant columns

In [19]:
# Inspect specific columns with missing values
print(df[['DepDelayMinutes', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']].head(20))
print(df[['CancellationReason']].head(20))

    DepDelayMinutes  CarrierDelay  WeatherDelay  NASDelay  SecurityDelay  \
0               0.0           NaN           NaN       NaN            NaN   
1               2.0           NaN           NaN       NaN            NaN   
2              28.0          28.0           0.0       2.0            0.0   
3              28.0          28.0           0.0       2.0            0.0   
4               0.0           NaN           NaN       NaN            NaN   
5               0.0           NaN           NaN       NaN            NaN   
6               0.0           NaN           NaN       NaN            NaN   
7               0.0           NaN           NaN       NaN            NaN   
8               0.0           NaN           NaN       NaN            NaN   
9               0.0           NaN           NaN       NaN            NaN   
10              0.0           NaN           NaN       NaN            NaN   
11              0.0           NaN           NaN       NaN            NaN   
12          

In [20]:
# Filter and inspect rows where DepDelayMinutes is NaN
print(df[df['DepDelayMinutes'].isnull()][['DepDelayMinutes', 'Delayed', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']].head(20))

# Filter and inspect rows where WeatherDelay is NaN
print(df[df['WeatherDelay'].isnull()][['DepDelayMinutes', 'Delayed', 'WeatherDelay', 'CarrierDelay', 'NASDelay']].head(20))

# Filter and inspect rows where CancellationReason is NaN
print(df[df['CancellationReason'].isnull()][['CancellationReason', 'Cancelled']].head(20))

      DepDelayMinutes  Delayed  CarrierDelay  WeatherDelay  NASDelay  \
82                NaN    False           NaN           NaN       NaN   
171               NaN    False           NaN           NaN       NaN   
172               NaN    False           NaN           NaN       NaN   
173               NaN    False           NaN           NaN       NaN   
174               NaN    False           NaN           NaN       NaN   
175               NaN    False           NaN           NaN       NaN   
487               NaN    False           NaN           NaN       NaN   
1318              NaN    False           NaN           NaN       NaN   
1418              NaN    False           NaN           NaN       NaN   
1485              NaN    False           NaN           NaN       NaN   
1778              NaN    False           NaN           NaN       NaN   
1883              NaN    False           NaN           NaN       NaN   
2100              NaN    False           NaN           NaN      

In [21]:
# # Visualize missing data in the delay columns
# plt.figure(figsize=(10, 6))
# sns.heatmap(df[['DepDelayMinutes', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']].isnull(), cbar=False, cmap='viridis')
# plt.title('Missing Data in Delay Columns')
# plt.show()

### Data Imputation for NaN

In [22]:
# Impute NaN in delay columns with 0.0
delay_columns = ['CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']
df[delay_columns] = df[delay_columns].fillna(0.0)

In [23]:
# Impute NaN in DepDelayMinutes with 0.0
df['DepDelayMinutes'] = df['DepDelayMinutes'].fillna(0.0)

In [24]:
# Impute NaN in CancellationReason with 'Not Cancelled'
df['CancellationReason'] = df['CancellationReason'].fillna('Not Cancelled')

In [25]:
# Display the first few rows after imputation to confirm changes
print(df[['DepDelayMinutes', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'CancellationReason']].head(20))

    DepDelayMinutes  CarrierDelay  WeatherDelay  NASDelay  SecurityDelay  \
0               0.0           0.0           0.0       0.0            0.0   
1               2.0           0.0           0.0       0.0            0.0   
2              28.0          28.0           0.0       2.0            0.0   
3              28.0          28.0           0.0       2.0            0.0   
4               0.0           0.0           0.0       0.0            0.0   
5               0.0           0.0           0.0       0.0            0.0   
6               0.0           0.0           0.0       0.0            0.0   
7               0.0           0.0           0.0       0.0            0.0   
8               0.0           0.0           0.0       0.0            0.0   
9               0.0           0.0           0.0       0.0            0.0   
10              0.0           0.0           0.0       0.0            0.0   
11              0.0           0.0           0.0       0.0            0.0   
12          

### Handling Outliers

In [26]:
# Identify outliers using Z-score
z_scores = np.abs(stats.zscore(df.select_dtypes(include=[np.number])))
outliers = np.where(z_scores > 3)
print(f"Outliers found: {len(outliers[0])}")

Outliers found: 1881233


In [27]:
# # Visualize outliers in specific columns
# for column in df.select_dtypes(include=[np.number]).columns:
#     plt.figure(figsize=(10, 6))
#     sns.boxplot(x=df[column])
#     plt.title(f'Boxplot of {column}')
#     plt.show()

### Manually Inspecting Extreme Outliers

In [28]:
# Define a function to identify and print extreme outliers for a given feature
def inspect_outliers(df, feature):
    # Calculate the Interquartile Range (IQR)
    Q1 = df[feature].quantile(0.25)
    Q3 = df[feature].quantile(0.75)
    IQR = Q3 - Q1

    # Define outlier criteria: below Q1 - 1.5*IQR or above Q3 + 1.5*IQR
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Identify extreme outliers
    extreme_outliers = df[(df[feature] < lower_bound) | (df[feature] > upper_bound)]

    # Print information about the outliers
    print(f"Feature: {feature}")
    print(f"Lower Bound: {lower_bound}")
    print(f"Upper Bound: {upper_bound}")
    print(f"Number of Extreme Outliers: {len(extreme_outliers)}")
    print(extreme_outliers[[feature]].head(20))  # Show top 20 outliers for inspection
    print("\n" + "-"*80 + "\n")

# List of features for which to inspect outliers
features_to_inspect = [
    'DepDelayMinutes', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay',
    'LateAircraftDelay', 'Temperature', 'Feels_Like_Temperature', 'Altimeter_Pressure',
    'Sea_Level_Pressure', 'Visibility', 'Wind_Speed', 'Wind_Gust',
    'Precipitation', 'Ice_Accretion_3hr'
]

# Loop through each feature and inspect outliers
for feature in features_to_inspect:
    inspect_outliers(df, feature)

Feature: DepDelayMinutes
Lower Bound: -15.0
Upper Bound: 25.0
Number of Extreme Outliers: 2333489
     DepDelayMinutes
2               28.0
3               28.0
38              76.0
46              34.0
69              87.0
70              87.0
76              34.0
112            265.0
113            265.0
127             44.0
133            390.0
153             35.0
242             40.0
243             40.0
290             30.0
303             42.0
304             42.0
342             72.0
366             40.0
395            355.0

--------------------------------------------------------------------------------

Feature: CarrierDelay
Lower Bound: 0.0
Upper Bound: 0.0
Number of Extreme Outliers: 1925686
     CarrierDelay
2            28.0
3            28.0
38           59.0
46           31.0
54            9.0
112         265.0
113         265.0
127          37.0
133         390.0
153          35.0
177          14.0
181          20.0
236           2.0
242          23.0
243          23.

### Manually Treating Extreme Outliers

In [29]:
# Departure Delay Minutes
df['DepDelayMinutes'] = np.clip(df['DepDelayMinutes'], -15.0, 25.0)

In [30]:
# Carrier Delay
df['CarrierDelay'] = np.clip(df['CarrierDelay'], 0.0, 390.0)

In [31]:
#Weather Delay
df['WeatherDelay'] = np.clip(df['WeatherDelay'], 0.0, 140.0)

In [32]:
#NASDelay
df['NASDelay'] = np.clip(df['NASDelay'], 0.0, 247.0)

In [33]:
#Security Delay
df['SecurityDelay'] = np.clip(df['SecurityDelay'], 0.0, 146.0)

In [34]:
#LateAircraft Delay
df['LateAircraftDelay'] = np.clip(df['LateAircraftDelay'], 0.0, 111.0)

In [35]:
#Temperature
df['Temperature'] = np.clip(df['Temperature'], 13.5, 113.5)

In [36]:
#Feels_Like_Temperature
df['Feels_Like_Temperature'] = np.clip(df['Feels_Like_Temperature'], 13.75, 113.35)

In [37]:
#Altimeter Pressure
df['Altimeter_Pressure'] = np.clip(df['Altimeter_Pressure'], 1000.8449999999998, 1032.005)

In [38]:
#Sea_Level_Pressure
df['Sea_Level_Pressure'] = np.clip(df['Sea_Level_Pressure'], 1003.05, 1029.05)

In [39]:
#Visibility
df['Visibility'] = df['Visibility'].clip(lower=0, upper=16093.4)

In [40]:
#Wind Speed
df['Wind_Speed'] = np.clip(df['Wind_Speed'], 0.0, 23.025)

In [41]:
#Wind Gust
df['Wind_Gust'] = np.clip(df['Wind_Gust'], 20.42, 31.94)


In [42]:
#Precipitation
df['Precipitation'] = np.clip(df['Precipitation'], 0.0, 5.59)

In [43]:
#Ice Accretion
df['Ice_Accretion_3hr'] = np.clip(df['Ice_Accretion_3hr'], 0.0, 1.78)

In [44]:
# # Identify outliers using Z-score
# z_scores = np.abs(stats.zscore(df.select_dtypes(include=[np.number])))
# outliers = np.where(z_scores > 3)
# print(f"Outliers found: {len(outliers[0])}")

# # Visualize outliers in specific columns
# for column in df.select_dtypes(include=[np.number]).columns:
#     plt.figure(figsize=(10, 6))
#     sns.boxplot(x=df[column])
#     plt.title(f'Boxplot of {column}')
#     plt.show()

#### Outliers Summary

After removing extreme outliets, a total of 1,954,889 outliers were kept. This means approximately 12.9% of our data points are classified as outliers using this method.
Given the size of our dataset, a large number of outliers might be expected, especially in a dataset with a variety of delay types and weather conditions.

### Clean Column Names to create a categorical y_pred before creating Train/Test Split

In [45]:
# Check the datatype of the 'Time' column
print(f"Data type of 'Time': {df['Time'].dtype}")

# If not datetime, convert it to datetime
if not pd.api.types.is_datetime64_any_dtype(df['Time']):
    df['Time'] = pd.to_datetime(df['Time'])
    print(f"After conversion, Data type of 'Time': {df['Time'].dtype}")

Data type of 'Time': object
After conversion, Data type of 'Time': datetime64[ns]


In [46]:
# Extract hour, day of the week, and month from the Time column
df['Hour'] = df['Time'].dt.hour
df['Day_Of_Week'] = df['Time'].dt.dayofweek
df['Month'] = df['Time'].dt.month

In [47]:
# Rename the 'WeatherDelay' column to 'Weather_Delay_Length' for clarity before we create a new 'Binary' column
df.rename(columns={'WeatherDelay': 'Weather_Delay_Length'}, inplace=True)

# Verify the change
print(df.columns)

Index(['Time', 'Origin', 'Dest', 'Carrier', 'Cancelled', 'CancellationReason',
       'Delayed', 'DepDelayMinutes', 'CarrierDelay', 'Weather_Delay_Length',
       'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'Temperature',
       'Feels_Like_Temperature', 'Altimeter_Pressure', 'Sea_Level_Pressure',
       'Visibility', 'Wind_Speed', 'Wind_Gust', 'Precipitation',
       'Ice_Accretion_3hr', 'Hour', 'Day_Of_Week', 'Month'],
      dtype='object')


In [48]:

# Create a new binary column 'WeatherDelay_Binary'
df['Weather_Delayed'] = df['Weather_Delay_Length'].apply(lambda x: 1 if x > 0 else 0)

# Check the first few rows to confirm
print(df[['Weather_Delay_Length', 'Weather_Delayed']].head(10))


   Weather_Delay_Length  Weather_Delayed
0                   0.0                0
1                   0.0                0
2                   0.0                0
3                   0.0                0
4                   0.0                0
5                   0.0                0
6                   0.0                0
7                   0.0                0
8                   0.0                0
9                   0.0                0


In [49]:

# Calculate the total number of instances
total = df['Weather_Delayed'].count()

# Calculate the percentage of each class
percent_delayed = (df['Weather_Delayed'].value_counts()[1] / total) * 100
percent_not_delayed = (df['Weather_Delayed'].value_counts()[0] / total) * 100

# Display the results
print(f"Percentage of delayed flights: {percent_delayed:.2f}%")
print(f"Percentage of not delayed flights: {percent_not_delayed:.2f}%")


Percentage of delayed flights: 1.82%
Percentage of not delayed flights: 98.18%


## Save preprocessed file before test/split and Scaling

In [50]:

# Function to save the DataFrame to the raw_data directory
def save_dataset(df,csv_name):
    # Determine the current working directory
    current_dir = os.getcwd()

    # Create the path to the raw_data directory relative to the current directory
    base_dir = os.path.abspath(os.path.join(current_dir, "..", "data"))

    # Define the path to save the treated outliers DataFrame
    treated_outliers_file_path = os.path.join(base_dir, csv_name)

    # Save the DataFrame as a CSV file
    df.to_csv(treated_outliers_file_path, index=False, compression='zip')

# Call the function to save your DataFrame
#save_dataset(df)

## Load preprocessed data

In [51]:
# Load preprocessed data set from /data folder
current_dir = os.getcwd()
base_dir = os.path.abspath(os.path.join(current_dir, "..", "data"))
file_path = os.path.join(base_dir, "preprocessed_treated_outliers.csv")

preprocessed_df = pd.read_csv(file_path)

print(preprocessed_df.head())

                  Time Origin Dest                 Carrier  Cancelled  \
0  2021-01-01 09:00:00    LAX  JFK  American Airlines Inc.      False   
1  2021-01-02 09:00:00    LAX  JFK  American Airlines Inc.      False   
2  2021-01-03 09:00:00    LAX  JFK  American Airlines Inc.      False   
3  2021-01-03 09:00:00    LAX  JFK  American Airlines Inc.      False   
4  2021-01-04 09:00:00    LAX  JFK  American Airlines Inc.      False   

  CancellationReason  Delayed  DepDelayMinutes  CarrierDelay  \
0      Not Cancelled    False              0.0           0.0   
1      Not Cancelled     True              2.0           0.0   
2      Not Cancelled     True             25.0          28.0   
3      Not Cancelled     True             25.0          28.0   
4      Not Cancelled    False              0.0           0.0   

   Weather_Delay_Length  ...  Sea_Level_Pressure  Visibility  Wind_Speed  \
0                   0.0  ...              1020.1    16093.40         0.0   
1                   0.0 

In [52]:
# Define y_pred and drop 'Weather_Delay_Length' to avoid data leakage
#X = preprocessed_df.drop(columns=['Weather_Delayed', 'Weather_Delay_Length'])
#y = preprocessed_df['Weather_Delayed']

In [53]:
X = preprocessed_df.drop(columns=['Weather_Delayed', 'Weather_Delay_Length', 'CancellationReason', 'Cancelled', 'DepDelayMinutes', 'CarrierDelay', 'NASDelay',
                    'SecurityDelay', 'LateAircraftDelay'])
y = preprocessed_df['Delayed']

In [None]:
X_reg = preprocessed_df.drop(columns=['Weather_Delayed', 'Weather_Delay_Length', 'CancellationReason', 'Cancelled', 'Delayed', 'CarrierDelay', 'NASDelay',
                    'SecurityDelay', 'LateAircraftDelay'])

y_reg = preprocessed_df['DepDelayMinutes']

In [54]:
X.columns #X features, input-able features only

Index(['Time', 'Origin', 'Dest', 'Carrier', 'Delayed', 'Temperature',
       'Feels_Like_Temperature', 'Altimeter_Pressure', 'Sea_Level_Pressure',
       'Visibility', 'Wind_Speed', 'Wind_Gust', 'Precipitation',
       'Ice_Accretion_3hr', 'Hour', 'Day_Of_Week', 'Month'],
      dtype='object')

In [55]:
y.head() #Delayed target y

0    False
1     True
2     True
3     True
4    False
Name: Delayed, dtype: bool

In [56]:
#Split into train/test
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# Splitting the dataset

In [57]:
# Sample only 10% of the data for quick testing
X_sample, _, y_sample, _ = train_test_split(
    X, y,
    test_size=0.9,  # Keep only 10% for the sample
    random_state=42,
    stratify=y  # Maintain the class distribution
)

In [58]:
# Split the sampled data into train and test sets
X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split(
    X_sample, y_sample,
    test_size=0.2,  # 80% train, 20% test
    random_state=42, #Reproducibility
    stratify=y_sample  # Maintain the class distribution in the split
)

In [64]:
print(y_train_sample.value_counts(normalize=True))
print(y_test_sample.value_counts(normalize=True))

Delayed
False    0.61072
True     0.38928
Name: proportion, dtype: float64
Delayed
False    0.610721
True     0.389279
Name: proportion, dtype: float64


In [59]:
# Identify categorical, binary, and numeric features
categorical_features = ['Origin', 'Dest', 'Carrier']
#binary_features = ['Cancelled', 'Delayed']
numeric_features = ['Temperature', 'Feels_Like_Temperature',
                    'Altimeter_Pressure', 'Sea_Level_Pressure', 'Visibility', 'Wind_Speed',
                    'Wind_Gust', 'Precipitation', 'Ice_Accretion_3hr', 'Hour', 'Day_Of_Week', 'Month']

# One-hot encode categorical features
encoder = OneHotEncoder()
X_train_encoded = encoder.fit_transform(X_train_sample[categorical_features])
X_test_encoded = encoder.transform(X_test_sample[categorical_features])

# Scale numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_sample[numeric_features]
                                      )
X_test_scaled = scaler.transform(X_test_sample[numeric_features])

#Concatenate arrays to form one array
X_train_preprocessed = np.hstack([X_train_encoded.toarray(), X_train_scaled])
X_test_preprocessed = np.hstack([X_test_encoded.toarray(), X_test_scaled])

In [48]:
'''# Identify categorical, binary, and numeric features
categorical_features = ['CancellationReason', 'Origin', 'Dest', 'Carrier']
binary_features = ['Cancelled', 'Delayed']
numeric_features = ['DepDelayMinutes', 'CarrierDelay', 'NASDelay',
                    'SecurityDelay', 'LateAircraftDelay', 'Temperature', 'Feels_Like_Temperature',
                    'Altimeter_Pressure', 'Sea_Level_Pressure', 'Visibility', 'Wind_Speed',
                    'Wind_Gust', 'Precipitation', 'Ice_Accretion_3hr', 'Hour', 'Day_Of_Week', 'Month']

# One-hot encode categorical features
encoder = OneHotEncoder()
X_train_encoded = encoder.fit_transform(X_train_sample[categorical_features])
X_test_encoded = encoder.transform(X_test_sample[categorical_features])

# Scale numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_sample[numeric_features])
X_test_scaled = scaler.transform(X_test_sample[numeric_features])

#Concatenate arrays to form one array
X_train_preprocessed = np.hstack([X_train_encoded.toarray(), X_train_scaled, X_train_sample[binary_features]])
X_test_preprocessed = np.hstack([X_test_encoded.toarray(), X_test_scaled, X_test_sample[binary_features]])'''

In [49]:
# Apply SMOTE to balance the training data
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_preprocessed, y_train_sample)

In [58]:
save_dataset(pd.DataFrame(X_train_balanced), "X_train_balanced.csv")
save_dataset(pd.DataFrame(y_train_balanced), "y_train_balanced.csv")

#two optimisations to make, use balanced datasets and
# manually take a balanced y class sample of the validation set within
# the neural net

#ask andrii about the slow time of the SVM, use balanced dataset for that
#also use a TPU

#code the graphs and other evaluation visualisations while models train

#regularisation penalty on some layers

In [60]:
X_train_preprocessed_df = pd.DataFrame(X_train_preprocessed)
X_test_preprocessed_df = pd.DataFrame(X_test_preprocessed)
y_train_sample_df = pd.DataFrame(y_train_sample)
y_test_sample_df = pd.DataFrame(y_test_sample)

In [62]:
save_dataset(X_train_preprocessed_df, "X_train_preprocessed_dt_input_features_only.csv")
save_dataset(X_test_preprocessed_df, "X_test_preprocessed_dt_input_features_only.csv")
save_dataset(y_train_sample_df, "y_train_sample_dt.csv")
save_dataset(y_test_sample_df, "y_test_sample_dt.csv")

In [51]:
X_train_preprocessed.shape

(1211560, 423)

In [60]:
y_train_sample = np.array(y_train_sample)
y_test_sample = np.array(y_test_sample)
X_train_preprocessed = np.array(X_train_preprocessed)
X_test_preprocessed = np.array(X_test_preprocessed)

# Modelling with a Dense Neural Network

In [59]:
from tensorflow.keras import Sequential, layers
from tensorflow.keras.layers import Dense

# Assuming you have your input data prepared


model = Sequential()
model.add(layers.Dense(64,activation='relu', input_dim=X_train_preprocessed.shape[1]))
model.add(layers.Dense(64,activation='relu'))
model.add(layers.Dense(32,activation='relu'))
model.add(layers.Dense(16,activation='relu'))
model.add(layers.Dense(1,activation='sigmoid'))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [61]:
model.summary()

#then must compile, fit and predict

In [62]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['recall'])

In [63]:
from tensorflow.keras.callbacks import EarlyStopping

es = EarlyStopping(patience=20, restore_best_weights=True)

In [64]:
model.fit(X_train_preprocessed, y_train_sample, batch_size=32, epochs=100, validation_split=0.3, callbacks =[es])

Epoch 1/100
[1m26503/26503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 514us/step - loss: 0.0460 - recall: 0.3474 - val_loss: 0.0328 - val_recall: 0.5128
Epoch 2/100
[1m26503/26503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 549us/step - loss: 0.0309 - recall: 0.5347 - val_loss: 0.0330 - val_recall: 0.5320
Epoch 3/100
[1m26503/26503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 544us/step - loss: 0.0288 - recall: 0.5644 - val_loss: 0.0308 - val_recall: 0.5814
Epoch 4/100
[1m26503/26503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 523us/step - loss: 0.0279 - recall: 0.5837 - val_loss: 0.0302 - val_recall: 0.5561
Epoch 5/100
[1m26503/26503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 516us/step - loss: 0.0269 - recall: 0.5910 - val_loss: 0.0300 - val_recall: 0.5579
Epoch 6/100
[1m26503/26503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 529us/step - loss: 0.0264 - recall: 0.5941 - val_loss: 0.0297 - val_recall: 0.5505
Epoc

<keras.src.callbacks.history.History at 0x15a98ba90>

In [65]:
loss, recall = model.evaluate(X_test_preprocessed, y_test_sample)

print(f"Test loss:{loss}")
print(f"Test recall:{recall}")

[1m9466/9466[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 246us/step - loss: 0.0284 - recall: 0.6067
Test loss:0.028803272172808647
Test recall:0.6029812693595886


# SVM Classifier Modelling

In [66]:
from sklearn.svm import SVC


In [67]:
sig_svm_clf = SVC(kernel='sigmoid')


In [68]:
sig_svm_clf.fit(X_train_preprocessed, y_train_sample)

In [None]:
y_pred = sig_svm_clf.predict(X_test_preprocessed)

###### Note: Need to investigate "use_label_encoder" error