# Pedestrian Crash Analysis

# Import Modules

In [None]:
from dataset_reader import DatasetFromFiles, DatasetReaderCSV
from pathlib import Path
from dataset_preprocessor import DatasetPreprocessor
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import utilities
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder


# Export simulation datasets from csv

## Without timeseries

In [None]:
# Run the DatasetFromFiles reader over the simulation results folder, with the
# output path pointing at the "no timeseries" CSV.
# Raw strings keep the Windows backslashes literal. The previous literals mixed
# "\\" with unrecognised escapes such as "\g" and "\p", which is deprecated and
# raises a SyntaxWarning on Python 3.12+. The resulting paths are unchanged.
path = Path(r"C:\Users\geork\projects\AIThesis\datasets\20240510\mlres")
out_path = Path(r"C:\Users\geork\projects\AIThesis\src\datasets\crash_simulation_no_timeseries.csv")
r = DatasetFromFiles(path)
# NOTE(review): the meaning of the second argument is defined in dataset_reader — confirm.
r.setOutputPath(out_path, True)
r.read()

## With timeseries

In [None]:
# Same export as the "no timeseries" variant, but with six time-series labels
# (head and sternum X/Y/Z coordinates) registered on the reader first.
# Raw strings keep the Windows backslashes literal. The previous literals relied
# on unrecognised escapes such as "\g" and "\p", which raise a SyntaxWarning on
# Python 3.12+. The resulting paths are unchanged.
path = Path(r"C:\Users\geork\projects\AIThesis\datasets\20240510\mlres")
out_path = Path(r"C:\Users\geork\projects\AIThesis\src\datasets\crash_simulation_timeseries.csv")
r = DatasetFromFiles(path)

# Labels are registered in the same order as before: head first, then sternum.
time_series_labels = (
    "Head_X_Coordinate",
    "Head_Y_Coordinate",
    "Head_Z_Coordinate",
    "Sternum_X_Coordinate",
    "Sternum_Y_Coordinate",
    "Sternum_Z_Coordinate",
)
for time_series_label in time_series_labels:
    r.setTimeSeriesLabel(time_series_label)

# NOTE(review): the meaning of the second argument is defined in dataset_reader — confirm.
r.setOutputPath(out_path, True)
r.read()

# Convert csv file to Dataframe

In [None]:
# Load the "no timeseries" CSV through the project reader and expose it as `df`.
# A raw string keeps the Windows backslashes literal. The previous literal relied
# on unrecognised escapes ("\g", "\s", "\c", ...), which raise a SyntaxWarning on
# Python 3.12+. The resulting path is unchanged.
path = Path(r"C:\Users\geork\projects\AIThesis\src\datasets\crash_simulation_no_timeseries.csv")
reader = DatasetReaderCSV(path)

# The preprocessor is attached to the reader here but not used further in this cell.
preprocessor = DatasetPreprocessor()
preprocessor.setReader(reader)
reader.read()
df = reader.convert_to_dataframe()

# Dataset Analysis

In [None]:
# Number of null entries per column
missing_values_count = df.isna().sum()
print(missing_values_count)

## Dataset Overview

In [None]:
# Render the dataset as a scrollable table, leaving out 'Position' when that column is present
df_without_position = df.drop(columns=["Position"], errors="ignore")
utilities.to_scrollable_table(df_without_position)

## Dataframe description

In [None]:
# Descriptive statistics as computed by DataFrame.describe()
summary_stats = df.describe()
print(summary_stats)

### Car Attributes: Profiles - Velocities

In [None]:
# Side-by-side bar charts: how often each car profile and each velocity occurs,
# with the bars ordered by label.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

for axis, column in zip(axes, ("CarProfile", "Velocity")):
    frequencies = df[column].value_counts().sort_index()
    frequencies.plot(kind='bar', ax=axis)
    axis.set_title(column)
    axis.set_xlabel(column)
    axis.set_ylabel('Count')
    # rotation=0 keeps the tick labels horizontal
    axis.tick_params(axis='x', rotation=0)

# Avoid overlapping labels between the two panels, then render
plt.tight_layout()
plt.show()

### Pedestrian Attributes: Translation - Rotation

In [None]:
# Side-by-side bar charts of the pedestrian translation and rotation labels.
pedestrian_columns = ("Translation", "Rotation")

# Cast both label columns to int in place (this modifies `df`)
for column in pedestrian_columns:
    df[column] = df[column].astype(int)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for axis, column, bar_color in zip(axes, pedestrian_columns, ('skyblue', 'lightgreen')):
    # Occurrences of each unique label, ordered by label
    frequencies = df[column].value_counts().sort_index()
    frequencies.plot(kind='bar', ax=axis, color=bar_color)
    axis.set_title(column)
    axis.set_xlabel(column)
    axis.set_ylabel('Count')
    # rotation=0 keeps the tick labels horizontal
    axis.tick_params(axis='x', rotation=0)

# Avoid overlapping labels between the two panels, then render
plt.tight_layout()
plt.show()

### Possible target value: HIC15_max / HIC36_max

In [None]:
# One panel per candidate target; each sample is drawn as a vertical line from 0
# up to its value, positioned at its DataFrame index.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 4))

hic_panels = (("HIC15_max", 'blue'), ("HIC36_max", 'green'))
for axis, (column, line_color) in zip(axes, hic_panels):
    axis.vlines(df.index, ymin=0, ymax=df[column], color=line_color, alpha=0.5)
    axis.set_title(column)
    axis.set_xlabel('Index')
    axis.set_ylabel(column)

plt.tight_layout()
plt.show()

### Head XYZ Acceleration

In [None]:
# One panel per head-acceleration axis (X, Y, Z); each sample is drawn as a
# vertical line from 0 up to its value, positioned at its DataFrame index.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(16, 4))

head_panels = (
    ("Head_X_Acceleration_abs_max", 'blue'),
    ("Head_Y_Acceleration_abs_max", 'green'),
    ("Head_Z_Acceleration_abs_max", 'red'),
)
for axis, (column, line_color) in zip(axes, head_panels):
    axis.vlines(df.index, ymin=0, ymax=df[column], color=line_color, alpha=0.5)
    axis.set_title(column)
    axis.set_xlabel('Index')
    axis.set_ylabel(column)

plt.tight_layout()
plt.show()

### Brain Injury Criterion (BrIC) - Chest_Resultant_Acceleration_max - Chest_Resultant_Acceleration_CLIP3ms_max

In [None]:
# One panel each for BrIC and the two chest-acceleration metrics; each sample is
# drawn as a vertical line from 0 up to its value, positioned at its DataFrame index.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(16, 4))

injury_panels = (
    ("BrIC_abs_max", 'blue'),
    ("Chest_Resultant_Acceleration_max", 'green'),
    ("Chest_Resultant_Acceleration_CLIP3ms_max", 'red'),
)
for axis, (column, line_color) in zip(axes, injury_panels):
    axis.vlines(df.index, ymin=0, ymax=df[column], color=line_color, alpha=0.5)
    axis.set_title(column)
    axis.set_xlabel('Index')
    axis.set_ylabel(column)

plt.tight_layout()
plt.show()

## Binary classification: HIC15_max

### Convert HIC15 to binary value

In [None]:
# Count how many samples lie above / at-or-below the HIC15 threshold and show
# the two counts as a bar chart.
threshold = 800
over_thres = (df["HIC15_max"] > threshold).sum()
under_thres = (df["HIC15_max"] <= threshold).sum()

print("Entries over the threshold:", over_thres)
print("Entries under the threshold:", under_thres)

# Data for the bar chart
categories = ['Over Threshold', 'Under Threshold']
counts = [over_thres, under_thres]

plt.bar(categories, counts, color=['blue', 'green'])

plt.xlabel('Categories')
plt.ylabel('Counts')
# The title is built from `threshold` so it cannot drift from the value used
# for the counts above (it was previously a separately hard-coded 800).
plt.title(f'Entries Over and Under Threshold: {threshold}')

plt.show()

In [None]:
# Column dtypes, checked before choosing which features to encode or drop
column_types = df.dtypes
print(column_types)

### Preprocessing for classification

In [None]:
# Build the classification frame `dfn`: drop bookkeeping columns and the other
# injury metrics, turn HIC15_max into a binary target and one-hot encode CarProfile.

# The row identifier is referred to as both "Id" and "Unnamed: 0" in this
# notebook. A strict drop raises KeyError for whichever name is absent, so only
# the identifier name(s) actually present in `df` are dropped.
# NOTE(review): confirm which identifier column the CSV reader really produces.
id_columns = [name for name in ("Id", "Unnamed: 0") if name in df.columns]

to_remove_features = id_columns + [
    "Position", "Path",
    "HIC36_max",
    "Head_Z_Acceleration_abs_max", "Head_X_Acceleration_abs_max", "Head_Y_Acceleration_abs_max",
    "BrIC_abs_max",
    "Chest_Resultant_Acceleration_max", "Chest_Resultant_Acceleration_CLIP3ms_max",
]
# Remove the unwanted columns (still strict for every non-identifier column)
dfn = df.drop(columns=to_remove_features)

# Binary target: 1 when HIC15_max exceeds 800, else 0; the raw value is then removed
dfn["HIC15_over_800"] = (dfn["HIC15_max"] > 800).astype(int)
dfn = dfn.drop(columns=["HIC15_max"])

# One-hot encode the car profile, keeping every category column
dfn = pd.get_dummies(dfn, columns=["CarProfile"], drop_first=False)
print(dfn.dtypes)
print(dfn.head())


In [None]:
# Separate the binary target from the features
target_column = "HIC15_over_800"
y = dfn[target_column]
X = dfn.drop(columns=[target_column])

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Show how the one-hot CarProfile columns are distributed in the test split
for profile_column in ('CarProfile_FCR', 'CarProfile_MPV', 'CarProfile_RDS', 'CarProfile_SUV'):
    print(f"Value counts for {profile_column}:")
    print(X_test[profile_column].value_counts())
    print()

# Standardize the features: statistics are fitted on the training split only
# and then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Binary classification

In [None]:

# Fully connected binary classifier: two hidden ReLU layers and a single sigmoid output unit
model = Sequential([
    Dense(64, input_dim=X_train.shape[1], activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# The test split is passed as validation data, which is why the curves below are labelled 'Test'
history = model.fit(X_train, y_train, epochs=50, batch_size=10, validation_data=(X_test, y_test))

# Final score on the test split
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

# Save a diagram of the architecture
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

# One figure per metric: training curve first, then the validation curve
learning_curves = (
    ('accuracy', 'Model accuracy', 'Accuracy'),
    ('loss', 'Model loss', 'Loss'),
)
for metric_key, figure_title, y_axis_label in learning_curves:
    plt.plot(history.history[metric_key])
    plt.plot(history.history['val_' + metric_key])
    plt.title(figure_title)
    plt.ylabel(y_axis_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()

## HIC15 regression

### Preprocessing

In [None]:
# Build the regression frame `dfn_regr`: drop bookkeeping columns and the other
# injury metrics, keep HIC15_max as the continuous target and one-hot encode CarProfile.

# The row identifier is referred to as both "Id" and "Unnamed: 0" in this
# notebook. A strict drop raises KeyError for whichever name is absent, so only
# the identifier name(s) actually present in `df` are dropped.
# NOTE(review): confirm which identifier column the CSV reader really produces.
id_columns = [name for name in ("Id", "Unnamed: 0") if name in df.columns]

to_remove_features = id_columns + [
    "Position", "Path",
    "HIC36_max",
    "Head_Z_Acceleration_abs_max", "Head_X_Acceleration_abs_max", "Head_Y_Acceleration_abs_max",
    "BrIC_abs_max",
    "Chest_Resultant_Acceleration_max", "Chest_Resultant_Acceleration_CLIP3ms_max",
]
# Remove the unwanted columns (still strict for every non-identifier column)
dfn_regr = df.drop(columns=to_remove_features)

# One-hot encode the car profile, keeping every category column
dfn_regr = pd.get_dummies(dfn_regr, columns=["CarProfile"], drop_first=False)
print(dfn_regr.dtypes)
print(dfn_regr.head())


In [None]:
# Separate the continuous target from the features
target_column = "HIC15_max"
y = dfn_regr[target_column]
X = dfn_regr.drop(columns=[target_column])

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Show how the one-hot CarProfile columns are distributed in the test split
for profile_column in ('CarProfile_FCR', 'CarProfile_MPV', 'CarProfile_RDS', 'CarProfile_SUV'):
    print(f"Value counts for {profile_column}:")
    print(X_test[profile_column].value_counts())
    print()

# Standardize the features: statistics are fitted on the training split only
# and then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Regression attempt #1

In [None]:
# Plain fully connected regressor: 64 -> 128 -> 64 ReLU units and one linear output
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1))

# Mean squared error as the loss, mean absolute error as the reported metric
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# 20% of the training split is held out for validation during fitting
history = model.fit(X_train, y_train, epochs=100, validation_split=0.2, batch_size=32, verbose=1)

# Score on the test split
loss, mae = model.evaluate(X_test, y_test, verbose=1)
print(f'Test MAE: {mae}')

# Predictions for the test split
predictions = model.predict(X_test)

### Regression attempt #2

In [None]:
# L2-regularised regressor: 128 -> 64 -> 32 ReLU units and one linear output
model = Sequential()
model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.01), input_shape=(X_train.shape[1],)))
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dense(1))

model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

# No early-stopping callback is passed in this attempt, so all 100 epochs always run.
# 20% of the training split is held out for validation.
history = model.fit(X_train, y_train, epochs=100, validation_split=0.2, batch_size=32, verbose=1)

# Score on the test split
loss, mae = model.evaluate(X_test, y_test, verbose=1)
print(f'Test MSE: {loss}, Test MAE: {mae}')

# Predictions for the test split
predictions = model.predict(X_test)

# Learning curves: loss on the left, MAE on the right
learning_curves = (
    ('loss', 'Model loss', 'Loss'),
    ('mae', 'Model MAE', 'MAE'),
)
plt.figure(figsize=(14, 5))
for panel_index, (metric_key, panel_title, y_axis_label) in enumerate(learning_curves, start=1):
    plt.subplot(1, 2, panel_index)
    plt.plot(history.history[metric_key])
    plt.plot(history.history['val_' + metric_key])
    plt.title(panel_title)
    plt.ylabel(y_axis_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper right')

plt.show()

# Predicted vs. true values; the dashed diagonal marks a perfect prediction
true_value_span = [y_test.min(), y_test.max()]
plt.figure(figsize=(7, 7))
plt.scatter(y_test, predictions)
plt.plot(true_value_span, true_value_span, 'k--', lw=3)
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.title('True Values vs Predictions')
plt.show()

The model's performance, as indicated by the high loss and MAE values, suggests that it may be underfitting or that the dataset has issues that need addressing. Here are several strategies to improve the model:

1. Data Preprocessing
- Feature Engineering: Create new features that might better represent the underlying data patterns.
- Scaling: Ensure all features are properly scaled.
- Outlier Removal: Remove or cap outliers in the dataset.
- Feature Selection: Ensure only the most relevant features are included.
2. Model Architecture
- Increase Complexity: Add more layers or neurons to your model.
- Activation Functions: Try different activation functions such as LeakyReLU or ELU.
- Regularization: Adjust regularization parameters.
3. Hyperparameter Tuning
- Use tools like GridSearchCV or RandomizedSearchCV to find the best hyperparameters.
4. Training Techniques
- Early Stopping: Use EarlyStopping with a more patient threshold.
- Learning Rate Scheduling: Adjust learning rates dynamically during training.
- Batch Normalization: Add batch normalization layers.
5. Visualizing and Debugging
- Residual Analysis: Analyze the residuals to understand model errors.
- Cross-Validation: Use cross-validation to ensure robustness.


### Regression attempt #3

Changes and Additions:
- Batch Normalization: Added after each Dense layer to help stabilize and speed up training.
- Dropout: Added to prevent overfitting.
- Early Stopping: More patient with a restore best weights option.
- Learning Rate Scheduling: Reduce learning rate if validation loss plateaus.


In [None]:
# Regressor with three identical hidden stages (Dense + BatchNormalization + Dropout)
# of 128, 64 and 32 units, followed by one linear output.
model = Sequential()
for stage_index, units in enumerate((128, 64, 32)):
    # Only the first layer declares the input shape
    first_layer_kwargs = {'input_shape': (X_train.shape[1],)} if stage_index == 0 else {}
    model.add(Dense(units, activation='relu', kernel_regularizer=l2(0.01), **first_layer_kwargs))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))
model.add(Dense(1))

model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

# Stop after 20 epochs without val_loss improvement (restoring the best weights),
# and halve the learning rate after 10 such epochs
early_stop = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10)

# 20% of the training split is held out for validation
history = model.fit(X_train, y_train, epochs=200, validation_split=0.2, batch_size=32, verbose=1, callbacks=[early_stop, reduce_lr])

# Score on the test split
loss, mae = model.evaluate(X_test, y_test, verbose=1)
print(f'Test MSE: {loss}, Test MAE: {mae}')

# Predictions for the test split
predictions = model.predict(X_test)

# Learning curves: loss on the left, MAE on the right
learning_curves = (
    ('loss', 'Model loss', 'Loss'),
    ('mae', 'Model MAE', 'MAE'),
)
plt.figure(figsize=(14, 5))
for panel_index, (metric_key, panel_title, y_axis_label) in enumerate(learning_curves, start=1):
    plt.subplot(1, 2, panel_index)
    plt.plot(history.history[metric_key])
    plt.plot(history.history['val_' + metric_key])
    plt.title(panel_title)
    plt.ylabel(y_axis_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper right')

plt.show()

# Predicted vs. true values; the dashed diagonal marks a perfect prediction
true_value_span = [y_test.min(), y_test.max()]
plt.figure(figsize=(7, 7))
plt.scatter(y_test, predictions)
plt.plot(true_value_span, true_value_span, 'k--', lw=3)
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.title('True Values vs Predictions')
plt.show()


## Transforming a Regression Problem into a Balanced Classification Task using Quantile Binning

Splitting the target variable HIC15_max into bins with an approximately equal number of items is a method called quantile binning. <br>
This ensures that each bin contains roughly the same number of samples, regardless of the actual value distribution.<br> This method can help address issues with skewed distributions and provide more balanced classes for classification tasks.

Here's how you can implement quantile binning in Python:

Use pd.qcut to create quantile bins.
Update the target variable and re-split the dataset.
Proceed with training a classification model.<br>
Here's a step-by-step guide and the corresponding code:

**Step-by-Step Guide**

1. **Create Quantile Bins:** Use `pd.qcut` to split HIC15_max into a specified number of quantiles, ensuring each bin has an approximately equal number of samples.
2. **Modify the Target Variable:** Replace the continuous HIC15_max values with the new quantile bins.
3. **Split the Data and Train a Classification Model:**
   - Split the data into training and testing sets.
   - Standardize the features if necessary.
   - Train a classification model and evaluate its performance.

Notes:
Quantile Bins: The pd.qcut function automatically determines the bin edges so that each bin contains approximately the same number of observations. <br>Adjust number_of_bins according to your needs.
Labels: Assign meaningful labels to the bins, such as Q1, Q2, etc.
Model Selection and Evaluation: Different models might perform better on this transformed problem. Experiment with various classifiers and hyperparameters.
By using quantile binning, you ensure that your classes are balanced, which can lead to better performance and more reliable evaluation metrics for your classification models.

In [None]:
# Quantile-binned HIC15_max classified with a random forest.

# Distribution of the continuous target
plt.hist(dfn_regr['HIC15_max'], bins=50)
plt.xlabel('HIC15_max')
plt.ylabel('Frequency')
plt.title('Distribution of HIC15_max')
plt.show()

# Split the target into equally populated bins labelled Q1..Qn
number_of_bins = 8  # You can adjust the number of bins as needed
quantile_labels = [f'Q{rank}' for rank in range(1, number_of_bins + 1)]
y_binned, bin_edges = pd.qcut(dfn_regr['HIC15_max'], q=number_of_bins, labels=quantile_labels, retbins=True)

# Report the value range covered by each bin
print("Ranges for each quantile bin:")
for rank, (lower_edge, upper_edge) in enumerate(zip(bin_edges[:-1], bin_edges[1:]), start=1):
    print(f"Q{rank}: {lower_edge} to {upper_edge}")

# The binned labels replace the continuous target
y = y_binned
X = dfn_regr.drop(columns=["HIC15_max"])
print(y)

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features: statistics are fitted on the training split only
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit the random forest with a fixed seed
classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Evaluate on the test split
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


In [None]:
# Quantile-binned HIC15_max (3 bins) classified with a small dense network.

# Distribution of the continuous target
plt.hist(dfn_regr['HIC15_max'], bins=50)
plt.xlabel('HIC15_max')
plt.ylabel('Frequency')
plt.title('Distribution of HIC15_max')
plt.show()

# Create quantile bins labelled Q1..Qn
number_of_bins = 3  # You can adjust the number of bins as needed
y_binned, bin_edges = pd.qcut(dfn_regr['HIC15_max'], q=number_of_bins, labels=[f'Q{i+1}' for i in range(number_of_bins)], retbins=True)

# Print the ranges of the bins
print("Ranges for each quantile bin:")
for rank, (lower_edge, upper_edge) in enumerate(zip(bin_edges[:-1], bin_edges[1:]), start=1):
    print(f"Q{rank}: {lower_edge} to {upper_edge}")

# Replace the continuous target variable with the new binned variable
X = dfn_regr.drop(columns=["HIC15_max"])
y = y_binned
print(y)

# Integer class ids come from the ordered categorical codes, so class k is
# always quantile Q(k+1). A LabelEncoder sorts the string labels
# lexicographically, which would place 'Q10' before 'Q2' as soon as
# number_of_bins reaches 10; for fewer bins both approaches give the same ids.
y_numeric = y.cat.codes.to_numpy()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_numeric, test_size=0.2, random_state=42)

# Standardize the features: statistics are fitted on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# One-hot encode the class ids for the softmax output
y_train_cat = to_categorical(y_train, num_classes=number_of_bins)
y_test_cat = to_categorical(y_test, num_classes=number_of_bins)

# Two hidden ReLU layers with dropout, one softmax unit per bin
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(number_of_bins, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# The test split is also used as validation data during training
history = model.fit(X_train, y_train_cat, epochs=50, batch_size=32, validation_data=(X_test, y_test_cat))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test_cat)
print(f"Test Accuracy: {accuracy:.4f}")

# Predicted class = index of the highest softmax probability
y_pred_cat = model.predict(X_test)
y_pred = np.argmax(y_pred_cat, axis=1)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Learning curves: accuracy on the left, loss on the right
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')

plt.show()


### Model with hyperparameter tuning

In [None]:
# Quantile-binned HIC15_max (8 bins) classified with a dense network that uses
# batch normalization and dropout.

# Distribution of the continuous target
plt.hist(dfn_regr['HIC15_max'], bins=50)
plt.xlabel('HIC15_max')
plt.ylabel('Frequency')
plt.title('Distribution of HIC15_max')
plt.show()

# Create quantile bins labelled Q1..Qn
number_of_bins = 8  # You can adjust the number of bins as needed
y_binned, bin_edges = pd.qcut(dfn_regr['HIC15_max'], q=number_of_bins, labels=[f'Q{i+1}' for i in range(number_of_bins)], retbins=True)

# Print the ranges of the bins
print("Ranges for each quantile bin:")
for rank, (lower_edge, upper_edge) in enumerate(zip(bin_edges[:-1], bin_edges[1:]), start=1):
    print(f"Q{rank}: {lower_edge} to {upper_edge}")

# Replace the continuous target variable with the new binned variable
X = dfn_regr.drop(columns=["HIC15_max"])
y = y_binned

# Integer class ids come from the ordered categorical codes, so class k is
# always quantile Q(k+1). A LabelEncoder sorts the string labels
# lexicographically, which would place 'Q10' before 'Q2' as soon as
# number_of_bins reaches 10; for fewer bins both approaches give the same ids.
y_numeric = y.cat.codes.to_numpy()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_numeric, test_size=0.2, random_state=42)

# Standardize the features: statistics are fitted on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# One-hot encode the class ids for the softmax output
y_train_cat = to_categorical(y_train, num_classes=number_of_bins)
y_test_cat = to_categorical(y_test, num_classes=number_of_bins)

# Two hidden stages (Dense + BatchNormalization + Dropout), one softmax unit per bin
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(number_of_bins, activation='softmax'))

# Compile with an explicitly configured Adam optimizer (learning_rate=0.001)
optimizer = Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# The test split is also used as validation data during training
history = model.fit(X_train, y_train_cat, epochs=200, batch_size=32, validation_data=(X_test, y_test_cat))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test_cat)
print(f"Test Accuracy: {accuracy:.4f}")

# Predicted class = index of the highest softmax probability
y_pred_cat = model.predict(X_test)
y_pred = np.argmax(y_pred_cat, axis=1)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Learning curves: accuracy on the left, loss on the right
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')

plt.show()


### Regularization with L2

In [None]:
# Quantile-binned HIC15_max (8 bins) classified with a dense network that uses
# L2 weight regularization and dropout.

# Distribution of the continuous target
plt.hist(dfn_regr['HIC15_max'], bins=50)
plt.xlabel('HIC15_max')
plt.ylabel('Frequency')
plt.title('Distribution of HIC15_max')
plt.show()

# Create quantile bins labelled Q1..Qn
number_of_bins = 8  # You can adjust the number of bins as needed
y_binned, bin_edges = pd.qcut(dfn_regr['HIC15_max'], q=number_of_bins, labels=[f'Q{i+1}' for i in range(number_of_bins)], retbins=True)

# Print the ranges of the bins
print("Ranges for each quantile bin:")
for rank, (lower_edge, upper_edge) in enumerate(zip(bin_edges[:-1], bin_edges[1:]), start=1):
    print(f"Q{rank}: {lower_edge} to {upper_edge}")

# Replace the continuous target variable with the new binned variable
X = dfn_regr.drop(columns=["HIC15_max"])
y = y_binned

# Integer class ids come from the ordered categorical codes, so class k is
# always quantile Q(k+1). A LabelEncoder sorts the string labels
# lexicographically, which would place 'Q10' before 'Q2' as soon as
# number_of_bins reaches 10; for fewer bins both approaches give the same ids.
y_numeric = y.cat.codes.to_numpy()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_numeric, test_size=0.2, random_state=42)

# Standardize the features: statistics are fitted on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# One-hot encode the class ids for the softmax output
y_train_cat = to_categorical(y_train, num_classes=number_of_bins)
y_test_cat = to_categorical(y_test, num_classes=number_of_bins)

# Two L2-regularised hidden ReLU layers with dropout, one softmax unit per bin
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.5))
model.add(Dense(number_of_bins, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# The test split is also used as validation data during training
history = model.fit(X_train, y_train_cat, epochs=100, batch_size=32, validation_data=(X_test, y_test_cat))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test_cat)
print(f"Test Accuracy: {accuracy:.4f}")

# Predicted class = index of the highest softmax probability
y_pred_cat = model.predict(X_test)
y_pred = np.argmax(y_pred_cat, axis=1)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Learning curves: accuracy on the left, loss on the right
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')

plt.show()


Determining whether you need more data depends on several factors related to your current model's performance, the complexity of the problem, and the quality of the data. Here are some key considerations to help you decide if acquiring more data would be beneficial:

Indicators That You Might Need More Data
Validation Accuracy Plateau:

If your validation accuracy has plateaued and is not improving significantly despite different training techniques, this might indicate that the model has learned as much as it can from the current dataset and additional data could help.
Overfitting Signs:

If your model's training accuracy is much higher than the validation accuracy, this indicates overfitting. More data can help the model generalize better.
High Variance in Performance:

If there is high variability in model performance across different validation sets, this could indicate that your model is not seeing enough examples to learn robust patterns.
Complexity of the Problem:

Complex problems, especially those involving high-dimensional data or numerous classes, often benefit from larger datasets to capture the necessary variability and nuances.
Current Performance Analysis
Current Accuracy:

Your model's accuracy on the test set is 49%, which suggests it is not performing optimally. While this indicates there is room for improvement, it doesn't necessarily mean more data is the only solution.
Learning Curve Analysis:

Plotting learning curves (accuracy and loss for both training and validation sets over epochs) can help you understand if more data is needed. If the validation curve shows a significant gap from the training curve that doesn't narrow with more epochs, more data could help.
Strategies to Improve Without More Data
Before deciding to gather more data, consider these strategies to improve model performance:

Data Augmentation:

Generate synthetic data through augmentation techniques such as rotations, translations, and noise addition if your data is image-based. This can effectively increase the dataset size and variability without acquiring new data.
Regularization Techniques:

Use dropout, L2 regularization, or batch normalization to prevent overfitting and help the model generalize better.
Hyperparameter Tuning:

Experiment with different hyperparameters like learning rate, batch size, and network architecture to find the optimal settings for your model.
Feature Engineering:

Improve the quality of your features or extract new, more relevant features from your existing data.
Cross-Validation:

Use k-fold cross-validation to ensure your model's performance is robust and not due to random chance in train-test splits.
Conclusion
While more data can often help improve model performance, it is not always the first or most efficient solution. Analyze your current performance, employ advanced techniques to optimize the existing model, and use learning curves to make an informed decision. If after these improvements your model still shows signs of underperformance and overfitting, acquiring more data would likely be beneficial.