# 1. Downloading/Uploading Data

In [None]:
!git clone https://github.com/ciol-researchlab/CIOL-Winter-ML-Bootcamp.git

# 2. Setting up the environment

In [None]:
# Tabular Data Analysis
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Utility
import time
import warnings
warnings.filterwarnings('ignore')

## Loading Data

In [None]:
# Read the cleaned train and test splits from the cloned repository
train_df, test_df = (
    pd.read_csv('/content/CIOL-Winter-ML-Bootcamp/datasets/session3/main/st/train_cleaned.csv'),
    pd.read_csv('/content/CIOL-Winter-ML-Bootcamp/datasets/session3/main/st/test_cleaned.csv'),
)
# Preview the first rows of the training table
train_df.head()

In [None]:
# Shrink both tables to seeded random subsets (500 training rows, 100 test rows);
# the fixed random_state makes the same rows come back on every run.
train_df, test_df = (
    train_df.sample(n=500, random_state=42),
    test_df.sample(n=100, random_state=42),
)

In [None]:
# Target is the 'Transported' column; features are every remaining column
y = train_df['Transported']
X = train_df.drop(columns='Transported')

# 3. Cross Validation

Cross-validation is a technique used in machine learning to evaluate a model's performance and ensure that it generalizes well to unseen data. It involves splitting the dataset into multiple parts, training the model on some parts, and testing it on others. This helps reduce overfitting and provides a more reliable estimate of how the model will perform in real-world scenarios.


- [ML Foundation ➡️ Cross Validation ✅ All Methods](https://www.kaggle.com/code/azminetoushikwasi/ml-foundation-cross-validation-all-methods)
- [Scikit-learn | Model Selection](https://scikit-learn.org/stable/api/sklearn.model_selection.html)

In [None]:
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Baseline Random Forest reused by the cross-validation cells below; seeded with random_state=42
model = RandomForestClassifier(random_state=42)

In [None]:
# 5-fold splitter with a seeded shuffle of the rows
kf = KFold(n_splits=5, random_state=42, shuffle=True)

# Fit and score the model once per fold (accuracy on the held-out fold)
scores = cross_val_score(model, X, y, scoring='accuracy', cv=kf)

# Report the per-fold scores and their average
print("Scores of all folds:", scores)
print("Mean Accuracy:", scores.mean())

In [None]:
# Stratified 5-fold splitter with a seeded shuffle (replaces the plain KFold bound to `kf`)
kf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Fit and score the model once per fold (accuracy on the held-out fold)
scores = cross_val_score(model, X, y, scoring='accuracy', cv=kf)

# Report the per-fold scores and their average
print("Scores of all folds:", scores)
print("Mean Accuracy:", scores.mean())

In [None]:
from sklearn.model_selection import cross_validate

# Metrics to compute on every fold
scoring = ['accuracy', 'precision', 'recall', 'f1']

# Run 5-fold cross-validation once, collecting all metrics plus the training-fold scores
cv_results = cross_validate(model, X, y, scoring=scoring, cv=5, return_train_score=True)

# Per-metric report: the individual fold scores followed by their mean
for metric in scoring:
    label = metric.capitalize()
    fold_scores = cv_results[f'test_{metric}']
    print(f"{label} scores (test): {fold_scores}")
    print(f"Mean {label} (test): {fold_scores.mean():.4f}\n")

# Train vs. test accuracy, fold by fold
print("Train Accuracy Scores:", cv_results['train_accuracy'])
print("Test Accuracy Scores:", cv_results['test_accuracy'])

In [None]:
# Bias-Variance Tradeoff

In [None]:
# Define 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Binary matrix of fold assignments: one row per sample, one column per fold.
# The shape is taken from the data and the splitter (not hard-coded as 500 x 5),
# so the cell keeps working if the sample size or n_splits changes.
n_samples = len(X)
n_folds = kf.get_n_splits()
fold_matrix = np.zeros((n_samples, n_folds))

# kf.split yields positional indices, which index the matrix rows directly
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    fold_matrix[test_index, fold] = 1  # Mark test samples as 1

# Plot the fold assignment matrix
plt.figure(figsize=(20, 10))
plt.imshow(fold_matrix, cmap="coolwarm", aspect="auto")
plt.colorbar(label="Fold Assignment (0=Train, 1=Test)")
plt.xticks(range(n_folds))  # one tick per fold instead of fractional positions
plt.xlabel("Fold")
plt.ylabel("Sample Index")
plt.title("Visualization of Fold Assignments")
plt.show()

# Hyperparameter Tuning

Hyperparameter tuning is the process of selecting the best configuration of hyperparameters for a machine learning model to maximize its performance. Unlike model parameters (which are learned during training), hyperparameters are external configurations set before training begins.

---

**What are Hyperparameters?**
- **Model-Specific Hyperparameters**:
  - Examples:
    - Number of trees in a Random Forest (`n_estimators`).
    - Learning rate in Gradient Boosting.
    - Number of layers in a Neural Network.
- **Training-Specific Hyperparameters**:
  - Examples:
    - Batch size.
    - Number of epochs.
    - Optimizer type (e.g., Adam, SGD).

---

**Why is Hyperparameter Tuning Important?**
The choice of hyperparameters can significantly affect a model's performance. Proper tuning helps:
- Avoid overfitting or underfitting.
- Improve generalization to unseen data.
- Achieve better accuracy, precision, recall, or other metrics.

**Automated Tools**:
   - Libraries like **Optuna**, **Hyperopt**, and **Ray Tune** automate hyperparameter tuning.

---

**Steps in Hyperparameter Tuning**
1. **Define the Search Space**:
   - Decide which hyperparameters to tune and their possible values or ranges.

2. **Select a Search Strategy**:
   - Choose between grid search, random search, or advanced methods.

3. **Cross-Validation**:
   - Use k-fold cross-validation to evaluate model performance for each hyperparameter combination.

4. **Evaluate and Choose the Best**:
   - Compare models based on metrics (e.g., accuracy, precision).
   - Choose the configuration that provides the best validation score.


## **Manual Search**:
   - Trial-and-error approach based on experience.
   - Simple but not scalable for complex models.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Manual hyperparameter tuning: hand-picked configurations to compare
params = [
    {'n_estimators': 50, 'max_depth': 5},
    {'n_estimators': 100, 'max_depth': 10},
    {'n_estimators': 150, 'max_depth': None},
]

# Start below any reachable accuracy so the first candidate always becomes the best
best_score = float('-inf')
best_params = {}

for param in params:
    # Loop-local estimator: the notebook-wide `model` and `scores` stay untouched,
    # so re-running an earlier cross-validation cell still evaluates the baseline.
    candidate = RandomForestClassifier(**param, random_state=42)
    mean_score = cross_val_score(candidate, X, y, cv=5, scoring='accuracy').mean()
    print(f"Params: {param}, Accuracy: {mean_score:.4f}")
    if mean_score > best_score:
        best_score = mean_score
        best_params = param

print("\nBest Parameters (Manual Search):", best_params)

## **Grid Search**:
   - Exhaustively tries all combinations of specified hyperparameter values.
   - Computationally expensive for large search spaces.

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values per hyperparameter; every combination gets evaluated
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[5, 10, None],
    min_samples_split=[2, 5, 10],
)

# Exhaustive search scored by 5-fold accuracy; fit() returns the fitted search object
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
).fit(X, y)

print("\nBest Parameters (Grid Search):", grid_search.best_params_)
print("Best Accuracy (Grid Search):", grid_search.best_score_)

In [None]:
# Collect the grid search's cv_results_ into a DataFrame and display it for inspection
grid_search_res=pd.DataFrame(grid_search.cv_results_)
grid_search_res

## **Random Search**:
   - Randomly samples hyperparameter combinations.
   - Faster than grid search for large spaces, especially when only a subset of hyperparameters has a significant impact.

In [None]:
np.arange(50, 201, 10)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Pools of values that each random draw picks from
param_dist = dict(
    n_estimators=np.arange(50, 201, 10),
    max_depth=[5, 10, None],
    min_samples_split=np.arange(2, 21),
)

# 20 seeded random draws, each scored by 5-fold accuracy; fit() returns the fitted search object
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_dist,
    n_iter=20,
    scoring='accuracy',
    cv=5,
    random_state=42,
    verbose=1,
    n_jobs=-1,
).fit(X, y)

print("\nBest Parameters (Random Search):", random_search.best_params_)
print("Best Accuracy (Random Search):", random_search.best_score_)


## **Bayesian Optimization**:
   - Models the relationship between hyperparameters and performance.
   - Efficiently finds optimal parameters using probabilistic methods.

In [None]:
!pip install optuna

In [None]:
import optuna
from sklearn.model_selection import cross_val_score

# Define the objective function for optimization
def objective(trial):
    """Score one Optuna trial with a Random Forest.

    Draws `n_estimators`, `max_depth` and `min_samples_split` from the trial,
    then evaluates the resulting classifier on the notebook-level `X` and `y`.

    Args:
        trial: Optuna trial object used to sample the integer hyperparameters.

    Returns:
        Mean accuracy over 5-fold cross-validation.
    """
    # Dict literals evaluate in order, so the suggestions are requested in a fixed sequence
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
    }
    forest = RandomForestClassifier(random_state=42, **sampled)
    return cross_val_score(forest, X, y, cv=5, scoring='accuracy').mean()

# Create a study and optimize.
# The sampler is seeded so the search is reproducible, matching the fixed
# seeds (42) used by the other searches in this notebook; without it every
# run explores a different sequence of trials.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

print("\nBest Parameters (Bayesian Optimization):", study.best_params)
print("Best Accuracy (Bayesian Optimization):", study.best_value)


# Dimensionality Reduction

Dimensionality reduction is the process of reducing the number of input variables (features) in a dataset while retaining as much relevant information as possible. It simplifies data analysis and visualization, especially in high-dimensional datasets, by projecting data into a lower-dimensional space.

---

### **Why is Dimensionality Reduction Done?**
1. **Avoid Overfitting**:
   - High-dimensional data often suffers from overfitting due to the curse of dimensionality.

2. **Improve Performance**:
   - Reducing dimensions can speed up training for machine learning models and reduce computational cost.

3. **Visualization**:
   - High-dimensional data is difficult to visualize. Dimensionality reduction techniques like t-SNE and UMAP can project data into 2D or 3D for better understanding.

4. **Noise Removal**:
   - Helps remove irrelevant or redundant features that can add noise to the model.

5. **Reduce Storage**:
   - A compact representation of data can save storage and improve efficiency.


**Curse of dimensionality** refers to the challenges faced when working with high-dimensional data. As the number of features increases, the volume of the space grows exponentially, requiring more data to maintain statistical significance. Distance metrics become less meaningful, leading to difficulties in clustering and classification. This sparsity can cause models to overfit and generalize poorly. Additionally, computational costs and the complexity of visualizing high-dimensional data increase. Mitigating the curse involves techniques like dimensionality reduction (e.g., PCA, t-SNE), feature selection, and regularization to simplify the data and improve model performance.

## **1. Principal Component Analysis (PCA)**
- **What it Does**:
  - Projects data into a new coordinate system where the axes (principal components) capture the maximum variance in the data.
  - Retains only the top `k` principal components to reduce dimensions.

- **How it Works**:
  1. Compute the covariance matrix of the data.
  2. Compute the eigenvalues and eigenvectors of the covariance matrix.
  3. Sort eigenvectors by eigenvalues (largest to smallest) to rank the principal components.
  4. Project data onto the top `k` eigenvectors.

- **Advantages**:
  - Preserves global structure.
  - Linear method: fast and interpretable.

- **Limitations**:
  - Assumes linear relationships.
  - Sensitive to outliers and scaling.

In [None]:
from sklearn.decomposition import PCA

# PCA: keep only the two leading principal components
pca = PCA(n_components=2)

# NOTE(review): X is passed in as-is (no scaling step is visible in this notebook);
# PCA is scale-sensitive — confirm the cleaned CSV is already standardized.
X_pca = pca.fit_transform(X)

In [None]:
X.shape

In [None]:
X_pca.shape

## **2. t-Distributed Stochastic Neighbor Embedding (t-SNE)**
- **What it Does**:
  - Maps high-dimensional data into a lower-dimensional space (often 2D or 3D) while preserving the local structure of the data (i.e., clusters).

- **How it Works**:
  1. Converts distances between points in high-dimensional space into probabilities (similarity).
  2. Does the same for the lower-dimensional space.
  3. Minimizes the Kullback-Leibler (KL) divergence between the two probability distributions.

- **Advantages**:
  - Excellent for visualizing clusters and local structure.
  - Non-linear and handles complex manifolds.

- **Limitations**:
  - Computationally expensive for large datasets.
  - Cannot preserve global structure.
  - Results are non-deterministic without setting a random seed.


In [None]:
from sklearn.manifold import TSNE

# t-SNE: embed X into 2 dimensions; random_state is fixed so reruns use the same seed
# NOTE(review): X is passed unscaled, as with PCA — confirm that is intended
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne = tsne.fit_transform(X)

In [None]:
X_tsne.shape

In [None]:
# Side-by-side scatter plots of the two 2-D embeddings, coloured by the target
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# (embedding, title, x-label, y-label) for each panel, left to right
panels = [
    (X_pca, 'PCA', 'Principal Component 1', 'Principal Component 2'),
    (X_tsne, 't-SNE', 't-SNE Component 1', 't-SNE Component 2'),
]

for ax, (embedding, title, xlabel, ylabel) in zip(axes, panels):
    ax.scatter(embedding[:, 0], embedding[:, 1], c=y, cmap='viridis', alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

# Tidy the spacing between panels and render
plt.tight_layout()
plt.show()

## **3. Uniform Manifold Approximation and Projection (UMAP)**
- **What it Does**:
  - Similar to t-SNE but focuses on both local and global structure. Faster and often better for clustering tasks.

- **How it Works**:
  1. Constructs a high-dimensional graph based on distances between points.
  2. Optimizes a low-dimensional graph to maintain relationships between points.
  3. Uses stochastic gradient descent to map high-dimensional points to a low-dimensional space.

- **Advantages**:
  - Faster and more scalable than t-SNE.
  - Preserves both local and global structure.
  - Reproducible when the random seed (`random_state`) is fixed.

- **Limitations**:
  - Sensitive to hyperparameters (e.g., `n_neighbors` and `min_dist`).


In [None]:
!pip install umap-learn

In [None]:
import umap

# Initialize UMAP for a 2-D embedding; random_state is fixed so reruns use the same seed
umap_model = umap.UMAP(n_components=2, random_state=42)

# Fit on X and return its 2-D coordinates
# NOTE(review): X is passed unscaled, as with PCA and t-SNE — confirm that is intended
X_umap = umap_model.fit_transform(X)

In [None]:
X_umap.shape

In [None]:
# Scatter plot of the UMAP embedding, coloured by the target (object-oriented Matplotlib API)
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(X_umap[:, 0], X_umap[:, 1], c=y, cmap='viridis', alpha=0.7)
ax.set_title('UMAP Dimension Reduction')
ax.set_xlabel('UMAP Component 1')
ax.set_ylabel('UMAP Component 2')
fig.colorbar(points, ax=ax, label='Class Labels')
plt.show()

## Comparison

**Comparison of PCA, t-SNE, and UMAP**

| Feature                | PCA                  | t-SNE                  | UMAP                  |
|------------------------|----------------------|------------------------|-----------------------|
| **Type**              | Linear               | Non-linear             | Non-linear            |
| **Speed**             | Fast                 | Slow                   | Faster than t-SNE     |
| **Scalability**       | High                 | Medium                 | High                  |
| **Global Structure**  | Preserved            | Not preserved          | Partially preserved   |
| **Local Structure**   | Weakly preserved     | Strongly preserved     | Strongly preserved    |
| **Use Case**          | Data compression, preprocessing | Visualization | Visualization, clustering |

---

**How to Choose the Right Algorithm**
- **PCA**:
  - Use when global structure is important, and relationships are linear.
  - Ideal for preprocessing data for machine learning.

- **t-SNE**:
  - Best for exploring and visualizing clusters in small to medium-sized datasets.

- **UMAP**:
  - Preferable for large datasets or when clustering and preserving both local and global structures are essential.


# Artificial Neural Networks

### **What is an Artificial Neural Network (ANN)?**

An **Artificial Neural Network (ANN)** is a computational model inspired by the way biological neural networks in the human brain work. It consists of layers of interconnected nodes (neurons), where each connection has a weight that is adjusted during learning. ANNs are used for a wide range of machine learning tasks, such as classification, regression, and pattern recognition.

- **Structure**:
  - **Input Layer**: Receives the input data.
  - **Hidden Layers**: Layers between input and output that process the data through weighted connections.
  - **Output Layer**: Produces the final output or prediction.

### **What is Deep Learning?**

**Deep Learning** is a subset of machine learning that utilizes deep neural networks, which are ANNs with many hidden layers (hence the term "deep"). Deep learning algorithms automatically learn hierarchical features from data, making them especially powerful for complex tasks like image and speech recognition, natural language processing, and game playing.

- **Deep Networks**: Have multiple hidden layers that allow them to learn increasingly abstract representations of data.
- **Learning**: Deep learning models learn directly from raw data, requiring minimal feature engineering.

### **Why are ANN and Deep Learning Necessary?**

1. **Handling Complex Data**
2. **Automated Feature Extraction**
3. **Improved Accuracy**
4. **Scalability**
5. **End-to-End Learning**
6. **Advances in Hardware**

## Optimizers

Optimizers in deep learning are algorithms used to adjust the weights of the model in order to minimize the loss function. Here’s a breakdown of the most commonly used optimizers:

#### **Stochastic Gradient Descent (SGD)**
   - **How it works**: Updates the weights using a fraction of the training data (mini-batches) in each iteration.
   - **Advantages**: Simple and effective for many problems, works well with large datasets.
   - **Disadvantages**: It can get stuck in local minima, and often requires careful tuning of the learning rate.
   
   **Formula**:  
   $w = w - \eta \cdot \nabla L(w)$
   where $ \eta $ is the learning rate and $ \nabla L(w) $ is the gradient of the loss function with respect to the weights.


#### **Adam (Adaptive Moment Estimation)**
   - **How it works**: Combines the advantages of both SGD with momentum and RMSprop (adaptive learning rate).
   - **Advantages**: Adaptive learning rates for each parameter and typically requires less tuning.
   - **Disadvantages**: Can sometimes lead to overfitting and may be sensitive to large batch sizes.

   **Formula**:  
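   $m_t = \beta_1 m_{t-1} + (1 - \beta_1) \nabla L(w)$
   $v_t = \beta_2 v_{t-1} + (1 - \beta_2) (\nabla L(w))^2$
   $\hat{m}_t = \frac{m_t}{1 - \beta_1^t}, \quad \hat{v}_t = \frac{v_t}{1 - \beta_2^t}$
   $w = w - \frac{\eta}{\sqrt{\hat{v}_t} + \epsilon} \cdot \hat{m}_t$
   where $ m_t $ is the first moment (running mean of the gradients), $ v_t $ is the second moment (running mean of the squared gradients, i.e. the uncentered variance), and $ \hat{m}_t $, $ \hat{v}_t $ are their bias-corrected versions, which compensate for both moments being initialized at zero.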


#### **RMSprop**
   - **How it works**: Modifies Adagrad to keep the learning rate from decaying too fast, especially for non-convex optimization problems.
   - **Advantages**: Works well for a wide range of problems and does not suffer from rapid learning rate decay.
   - **Disadvantages**: May still require fine-tuning of parameters.
   
   **Formula**:  
   $v_t = \beta v_{t-1} + (1 - \beta) (\nabla L(w))^2$
   $w = w - \frac{\eta}{\sqrt{v_t} + \epsilon} \cdot \nabla L(w)$

---


## Loss Functions

Loss functions (or cost functions) quantify the difference between the predicted values and the actual values. Depending on the problem (classification or regression), different loss functions are used.

#### For **Classification**:

1. **Cross-Entropy Loss (Log Loss)**:
   - **Used for**: Multi-class and binary classification tasks.
   - **How it works**: Measures the difference between two probability distributions – the predicted probability distribution and the true distribution.
   - **Formula**:
     - For binary classification:
       $
       L(y, \hat{y}) = -[y \cdot \log(\hat{y}) + (1 - y) \cdot \log(1 - \hat{y})]
    $
     - For multi-class classification:
       $
       L(y, \hat{y}) = - \sum_{i=1}^{C} y_i \cdot \log(\hat{y}_i)
    $
     where $y_i$ is the true label and $\hat{y}_i$ is the predicted probability for class $i$.

2. **Sparse Categorical Cross-Entropy Loss**:
   - **Used for**: Multi-class classification with integer labels (as opposed to one-hot encoding).
   - **How it works**: Similar to cross-entropy loss but accepts integer labels.
   
3. **Binary Cross-Entropy (BCE)**:
   - **Used for**: Binary classification.
   - **Formula**:  
     $
     L(y, \hat{y}) = -[y \log(\hat{y}) + (1 - y) \log(1 - \hat{y})]
  $
     where $y$ is the actual class and $\hat{y}$ is the predicted probability.

#### For **Regression**:

1. **Mean Squared Error (MSE) Loss**:
   - **Used for**: Regression tasks where the output is a continuous value.
   - **How it works**: Measures the average of the squared differences between predicted and actual values.
   - **Formula**:
     $
     L(y, \hat{y}) = \frac{1}{N} \sum_{i=1}^{N} (y_i - \hat{y}_i)^2
  $
     where $y_i$ is the true value and $\hat{y}_i$ is the predicted value.

2. **Mean Absolute Error (MAE) Loss**:
   - **Used for**: Regression tasks, less sensitive to outliers than MSE.
   - **How it works**: Measures the average of the absolute differences between predicted and actual values.
   - **Formula**:
     $
     L(y, \hat{y}) = \frac{1}{N} \sum_{i=1}^{N} |y_i - \hat{y}_i|
  $

3. **Huber Loss**:
   - **Used for**: Regression tasks where you want to combine the benefits of MSE and MAE.
   - **How it works**: Combines the best features of both MSE and MAE by using MSE when errors are small and MAE when errors are large.
   - **Formula**:
     $
     L(y, \hat{y}) =
     \begin{cases}
     \frac{1}{2} (y - \hat{y})^2 & \text{for } |y - \hat{y}| \leq \delta \\
     \delta |y - \hat{y}| - \frac{1}{2} \delta^2 & \text{otherwise}
     \end{cases}
  $
     where $ \delta $ is a hyperparameter that controls the point at which the loss function transitions from quadratic to linear.

### When to Choose Each Optimizer and Loss Function

1. **For Classification**:
   - **Use Cross-Entropy Loss**: When you need a model to predict probabilities across multiple classes.
   - **Optimizer**: Adam is commonly used as it combines the benefits of momentum and adaptive learning rates, making it robust for most tasks. SGD can work well too, but requires careful tuning of the learning rate.

2. **For Regression**:
   - **Use MSE or MAE Loss**: If the task involves predicting continuous values.
   - **Optimizer**: Adam or RMSprop can be used for efficient convergence in regression tasks.

3. **For Binary Classification**:
   - **Use Binary Cross-Entropy Loss**: For binary classification tasks.
   - **Optimizer**: Adam or SGD can be used effectively.

## Activation Functions

Activation functions are mathematical functions applied to the output of a node (or neuron) in an artificial neural network. They introduce non-linearity into the model, which allows the network to learn complex patterns and make better predictions. Without activation functions, any stack of layers collapses into a single linear transformation, so the network would be no more expressive than a linear model and could not solve non-linear problems.

Here are the most commonly used activation functions in deep learning:

---

#### 1. **Sigmoid (Logistic) Function**
   - **Formula**:$\sigma(x) = \frac{1}{1 + e^{-x}}$
   - **Range**: \( (0, 1) \)
   - **Use Case**: Often used in the output layer for binary classification tasks (output probabilities).
   - **Advantages**: Provides outputs that are easy to interpret as probabilities.
   - **Disadvantages**: Can suffer from the **vanishing gradient problem** (gradients become very small for large input values), which can slow down training.
  
---

#### 2. **Hyperbolic Tangent (Tanh)**
   - **Formula**:
     $
     \tanh(x) = \frac{e^x - e^{-x}}{e^x + e^{-x}}
     $
   - **Range**: \( (-1, 1) \)
   - **Use Case**: Often used in hidden layers, especially for tasks where outputs need to be centered around 0.
   - **Advantages**: Unlike Sigmoid, Tanh is zero-centered, meaning the output has both negative and positive values, which helps reduce bias during training.
   - **Disadvantages**: Like the Sigmoid function, Tanh can also suffer from the **vanishing gradient problem** for large values of \( x \).
  
---

#### 3. **ReLU (Rectified Linear Unit)**
   - **Formula**:
     $
     \text{ReLU}(x) = \max(0, x)
     $
   - **Range**: \( [0, \infty) \)
   - **Use Case**: Most commonly used activation function for hidden layers in deep networks.
   - **Advantages**: Computationally efficient and less likely to suffer from the vanishing gradient problem compared to Sigmoid and Tanh.
   - **Disadvantages**: Can suffer from the **dying ReLU problem**, where neurons can become inactive and always output zero if the weights are updated in a way that causes them to fall into a region where the derivative is zero.

---

#### 4. **Leaky ReLU**
   - **Formula**:
     $
     \text{Leaky ReLU}(x) = \max(\alpha x, x)
     $
     where \( \alpha \) is a small constant (e.g., 0.01).
   - **Range**: \( (-\infty, \infty) \)
   - **Use Case**: Used to address the dying ReLU problem by allowing a small negative slope for \( x < 0 \).
   - **Advantages**: Helps to prevent neurons from "dying" by allowing a small negative gradient when \( x < 0 \).
   - **Disadvantages**: The choice of \( \alpha \) is a hyperparameter that needs to be tuned.

---

#### 5. **Parametric ReLU (PReLU)**
   - **Formula**:
     $
     \text{PReLU}(x) = \max(\alpha x, x)
     $
     where \( \alpha \) is learned during training.
   - **Range**: \( (-\infty, \infty) \)
   - **Use Case**: A generalization of Leaky ReLU where the negative slope \( \alpha \) is a parameter that can be learned during training.
   - **Advantages**: More flexible than Leaky ReLU since \( \alpha \) is not fixed and can be optimized during training.
   - **Disadvantages**: Increased model complexity due to learning an additional parameter.

---

#### 6. **Softmax**
   - **Formula**:
     $
     \text{Softmax}(x_i) = \frac{e^{x_i}}{\sum_{j} e^{x_j}}
     $
     where \( x_i \) is the input to the \( i \)-th neuron, and the denominator is the sum of exponentials of all inputs.
   - **Range**: \( (0, 1) \) for each output, and the sum of all outputs equals 1.
   - **Use Case**: Used in the output layer for multi-class classification problems.
   - **Advantages**: Converts raw scores into probabilities, making it suitable for classification tasks with multiple classes.
   - **Disadvantages**: Can be computationally expensive for very large datasets, as it requires calculating the exponentials of all inputs.

---

#### 7. **Swish**
   - **Formula**:
     $
     \text{Swish}(x) = x \cdot \sigma(x)
     $
     where \( \sigma(x) \) is the Sigmoid function.
   - **Range**: \( (-\infty, \infty) \)
   - **Use Case**: Used in deep learning models, especially for very deep networks.
   - **Advantages**: It is a smooth, non-monotonic function that tends to perform better than ReLU in some cases.
   - **Disadvantages**: More computationally expensive than ReLU.

---

#### 8. **Softplus**
   - **Formula**:
     $
     \text{Softplus}(x) = \log(1 + e^x)
     $
   - **Range**: \( (0, \infty) \)
   - **Use Case**: A smooth approximation of ReLU.
   - **Advantages**: Avoids the sharp cutoff that ReLU introduces and is differentiable everywhere (ReLU is not differentiable at 0).
   - **Disadvantages**: Computationally more expensive than ReLU.

---

### Choosing the Right Activation Function

- **For Hidden Layers**:
  - **ReLU** is the most widely used due to its simplicity and effectiveness, especially when used with large networks.
  - **Leaky ReLU** or **PReLU** is often chosen when ReLU's "dying neuron" issue is a concern.
  - **Tanh** can be used when the outputs should be zero-centered.

- **For Output Layer**:
  - **Sigmoid**: Best for binary classification when the output needs to be a probability.
  - **Softmax**: Used for multi-class classification problems where the output represents probabilities of each class.
  - **Linear**: Used for regression tasks where the output can be any real number.

### Summary
- **ReLU** is the default for hidden layers, and **Sigmoid** or **Softmax** is typically used for output layers in classification tasks.
- Activation functions help the neural network learn complex patterns and allow for the non-linear transformation of data, which is crucial for tasks like classification and regression.


## Example 1: Classification (Spaceship Titanic)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, average_precision_score

In [None]:
# Pick the compute device: CUDA when PyTorch reports a usable GPU, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Convert the pandas features/labels to tensors: float32 inputs, int64 class labels
X_tensor = torch.tensor(X.to_numpy(), dtype=torch.float32)
y_tensor = torch.tensor(y.to_numpy(), dtype=torch.long)

# Hold out 20% of the rows for validation (seeded split)
X_train, X_val, y_train, y_val = train_test_split(
    X_tensor, y_tensor, random_state=42, test_size=0.2
)

# Pair inputs with labels, then serve them in batches of 64;
# only the training batches are reshuffled each epoch
train_dataset, val_dataset = TensorDataset(X_train, y_train), TensorDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=64)

In [None]:
X.shape[1]

In [None]:
# Define a simple ANN model
class ANN_Model(nn.Module):
    """Small fully connected classifier: input -> 32 -> 16 -> 2 logits.

    Args:
        input_dim: Number of input features. When omitted, the width is read
            from the notebook-level `X` (the original behaviour). Passing it
            explicitly keeps the model independent of whatever `X` currently
            holds, which matters because a later cell rebinds `X`.
    """

    def __init__(self, input_dim=None):
        super().__init__()
        if input_dim is None:
            # Backward-compatible default: width of the global feature table
            input_dim = X.shape[1]
        self.layer1 = nn.Linear(input_dim, 32)
        self.layer2 = nn.Linear(32, 16)
        self.output = nn.Linear(16, 2)  # two logits, one per class (0, 1)

    def forward(self, x):
        """Return the raw output-layer values (logits); no softmax is applied here."""
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        return self.output(x)

In [None]:
# Initialize model, loss function, and optimizer
model = ANN_Model().to(device)
criterion = nn.CrossEntropyLoss()  # Suitable for multi-class, works with binary classification too
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model: each epoch does one pass over the training batches,
# then one evaluation pass over the validation batches
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0  # sum of per-sample losses seen this epoch
    y_true = []
    y_pred = []

    # Training loop with tqdm
    for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", ncols=100):
        inputs, labels = inputs.to(device), labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # The criterion returns the batch MEAN. Weight it by the batch size so
        # the shorter final batch does not count as much as a full one when
        # the epoch average is taken.
        running_loss += loss.item() * inputs.size(0)

        # Collect the true labels and predictions (class with the largest logit)
        _, predicted = torch.max(outputs, 1)
        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())

    # Calculate training metrics
    accuracy = accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)

    # len(y_true) is the number of samples seen, giving a true per-sample mean loss
    print(f"Train Loss: {running_loss / len(y_true):.4f}, Accuracy: {accuracy:.4f}, Recall: {recall:.4f}, Precision: {precision:.4f}")

    # Validation loop: no gradient tracking, model in eval mode
    model.eval()
    y_true = []
    y_pred = []

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    # Calculate validation metrics
    accuracy = accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)

    print(f"Validation Accuracy: {accuracy:.4f}, Recall: {recall:.4f}, Precision: {precision:.4f}")


## Example 2: Regression (FoodCourt)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

In [None]:
# Regression setup: predict 'FoodCourt' from every column except it and 'Transported'.
# This rebinds the notebook-level X and y used by the classification example.
y = train_df['FoodCourt']
X = train_df.drop(columns=['Transported', 'FoodCourt'])

In [None]:
# X and y are the pandas DataFrame/Series built for the regression task
X_tensor = torch.tensor(X.to_numpy(), dtype=torch.float32)
# Targets become a float column vector of shape (N, 1)
y_tensor = torch.tensor(y.to_numpy(), dtype=torch.float32).reshape(-1, 1)

In [None]:
# Hold out 20% of the rows for validation (seeded split)
X_train, X_val, y_train, y_val = train_test_split(
    X_tensor, y_tensor, random_state=42, test_size=0.2
)

# Pair inputs with targets, then serve them in batches of 64;
# only the training batches are reshuffled each epoch
train_dataset, val_dataset = TensorDataset(X_train, y_train), TensorDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=64)

In [None]:
# Define a simple ANN model for regression
class ANN_Model(nn.Module):
    """Small fully connected regressor: input -> 32 -> 16 -> 1 output.

    Args:
        input_dim: Number of input features. When omitted, the width is read
            from the notebook-level `X_train` (the original behaviour).
            Passing it explicitly keeps the model independent of whichever
            split `X_train` currently refers to.
    """

    def __init__(self, input_dim=None):
        super().__init__()
        if input_dim is None:
            # Backward-compatible default: width of the global training tensor
            input_dim = X_train.shape[1]
        self.layer1 = nn.Linear(input_dim, 32)
        self.layer2 = nn.Linear(32, 16)
        self.output = nn.Linear(16, 1)  # For regression output

    def forward(self, x):
        """Return one unbounded prediction per row; the output layer has no activation."""
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        return self.output(x)

In [None]:
# Initialize model, loss function, and optimizer
# A fresh ANN_Model is moved to the selected device; this rebinds `model`
model = ANN_Model().to(device)
criterion = nn.MSELoss()  # Mean Squared Error loss for regression
# Adam updates all of the model's parameters with a learning rate of 0.001
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Train the model: each epoch does one pass over the training batches,
# then one evaluation pass over the validation batches
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0  # sum of per-sample squared errors seen this epoch
    y_true = []
    y_pred = []

    # Training loop with tqdm
    for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", ncols=100):
        inputs, labels = inputs.to(device), labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # The criterion returns the batch MEAN. Weight it by the batch size so
        # the shorter final batch does not count as much as a full one; the
        # reported loss then agrees with the MSE computed from the same outputs.
        running_loss += loss.item() * inputs.size(0)

        # Collect the true targets and predictions (detach before converting to NumPy)
        y_true.extend(labels.cpu().numpy())
        y_pred.extend(outputs.detach().cpu().numpy())

    # Calculate training metrics
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    # len(y_true) is the number of samples seen, giving a true per-sample mean loss
    print(f"Train Loss: {running_loss / len(y_true):.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")

    # Validation loop: no gradient tracking, model in eval mode
    model.eval()
    y_true = []
    y_pred = []

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(outputs.cpu().numpy())

    # Calculate validation metrics
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"Validation MSE: {mse:.4f}, RMSE: {rmse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")