# Classification on `emnist`

## 1. Create `Readme.md` to document your work

Explain your choices, process, and outcomes.

## 2. Classify all symbols

### Choose a model

Your choice of model! Choose wisely...

### Train away!

Do you need to tune any parameters? Is the model expecting data in a different format?

### Evaluate the model

Evaluate the model on the test set, and analyze the confusion matrix to see where the model performs well and where it struggles.

### Investigate subsets

On which classes does the model perform well? Poorly? Evaluate again, excluding easily confused symbols (such as 'O' and '0').

### Improve performance

Brainstorm for improving the performance. This could include trying different architectures, adding more layers, changing the loss function, or using data augmentation techniques.

## 2. Classify digits vs. letters model showdown

Perform a full showdown classifying digits vs letters:

1. Create a column for whether each row is a digit or a letter
2. Choose an evaluation metric 
3. Choose several candidate models to train
4. Divide data to reserve a validation set that will NOT be used in training/testing
5. K-fold train/test
    1. Create train/test splits from the non-validation dataset 
    2. Train each candidate model (best practice: use the same split for all models)
    3. Apply the model to the test split
    4. (*Optional*) Perform hyper-parametric search
    5. Record the model evaluation metrics
    6. Repeat with a new train/test split
6. Promote winner, apply model to validation set
7. (*Optional*) Perform hyper-parametric search, if applicable
8. Report model performance

In [1]:
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import emnist
from hashlib import sha1

%pip install scikit-learn xgboost

# Import necessary libraries for classification
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report, confusion_matrix




Collecting scikit-learn
  Using cached scikit_learn-1.4.0-1-cp311-cp311-macosx_10_9_x86_64.whl.metadata (11 kB)
Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl.metadata (2.0 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.12.0-cp311-cp311-macosx_10_9_x86_64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Using cached threadpoolctl-3.2.0-py3-none-any.whl.metadata (10.0 kB)
Using cached scikit_learn-1.4.0-1-cp311-cp311-macosx_10_9_x86_64.whl (11.5 MB)
Downloading xgboost-2.0.3-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (2.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.2/2.2 MB 3.9 MB/s eta 0:00:00
Using cached joblib-1.3.2-py3-none-any.whl (302 kB)
Using cached scipy

In [None]:
# Load the EMNIST 'byclass' training and test splits and merge them into one DataFrame

# The size of each image is 28x28 pixels
size = 28


def _samples_to_frame(images, labels):
    """Build a two-column DataFrame with one row per sample.

    'label' holds the entries of *labels*; 'image' holds the matching entry
    of *images* as a single object per row (the pixels are NOT expanded
    into separate columns here).
    """
    frame = pd.DataFrame()
    frame['label'] = labels
    frame['image'] = list(images)
    return frame


# Extract the training split as images and labels
image, label = emnist.extract_training_samples('byclass')
raw_train = _samples_to_frame(image, label)

# Repeat for the test split
image, label = emnist.extract_test_samples('byclass')
raw_test = _samples_to_frame(image, label)

# Each split carries its own 0..n-1 index, so a plain concat would leave
# duplicate row labels in `merged`; ignore_index=True renumbers the rows.
merged = pd.concat([raw_test, raw_train], axis=0, ignore_index=True)



In [None]:

# Classify all symbols with a decision tree whose hyperparameters are chosen by grid search.

# merged['image'] stores one pixel array per row, which scikit-learn cannot
# convert to a numeric feature matrix. Stack the arrays and flatten each image
# into a single row of pixel values (784 columns, assuming 28x28 images — confirm).
X = np.stack(merged["image"].to_numpy()).reshape(len(merged), -1)
y = merged["label"]

# Split the dataset into training and testing sets; stratify so every symbol
# keeps the same share of samples in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Create a Decision Tree Classifier
decision_tree_model = DecisionTreeClassifier(random_state=42)

# To train without tuning instead, fit the model directly:
# decision_tree_model.fit(X_train, y_train)
# y_pred = decision_tree_model.predict(X_test)

# Hyperparameters to tune
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Use GridSearchCV for hyperparameter tuning (3-fold CV, weighted F1)
grid_search = GridSearchCV(
    estimator=decision_tree_model,
    param_grid=param_grid,
    cv=3,
    scoring='f1_weighted',
)
grid_search.fit(X_train, y_train)

# Get the best model from hyperparameter tuning
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model using F1 score and confusion matrix
f1 = f1_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the results
print(f"Best Hyperparameters: {grid_search.best_params_}")
print(f"F1 Score: {f1:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(conf_matrix)

## Part 2

In [None]:
# Step 1: Derive a 'numbers' / 'letters' category from each numeric label (from HW Soln 2)

# Lambda variant: labels found in range(10) are 'numbers'; every other label is 'letters'
merged['cat_lambda'] = merged['label'].apply(
    lambda code: 'letters' if code not in range(10) else 'numbers'
)

# Function variant: translate a numeric label into its category name
def classify_label(x):
    """Return 'numbers' if x is in range(10), 'letters' if in range(10, 62), else None."""
    categories = (('numbers', range(10)), ('letters', range(10, 62)))
    for name, codes in categories:
        if x in codes:
            return name
    return None

# Store classify_label's result for every label in the 'cat_label' column
merged['cat_label'] = merged['label'].apply(func=classify_label)

# # Using direct if statements to identify numbers and letters
# merged['cat_if'] = ['numbers' if x in range(10) else 'letters' if x in range(10, 62) else None for x in merged['label']]

# # Using pandas filters to identify numbers and letters, with the .loc() method to filter the rows
# merged['cat_filter'] = None
# merged.loc[merged['label'].isin(range(10)), 'cat_filter'] = 'numbers'

In [None]:
# Digits-vs-letters showdown: hold out a validation set, cross-validate the
# candidate models, promote the winner, and report its validation performance.

# Features are the flattened pixel values only. Dropping just "cat_label" from
# `merged` would leave "label" and "cat_lambda" among the features (both reveal
# the target) plus the un-flattened "image" objects, which estimators cannot use.
X = np.stack(merged["image"].to_numpy()).reshape(len(merged), -1)

# Encode the target as integers (0 = numbers, 1 = letters): XGBClassifier
# rejects string class labels.
class_names = ["numbers", "letters"]
y = merged["cat_label"].map({name: code for code, name in enumerate(class_names)})
if y.isna().any():
    # Fail loudly instead of silently training on unlabeled rows
    raise ValueError("cat_label contains values other than 'numbers' / 'letters'")
y = y.astype(int).to_numpy()

# Step 2: Reserve 20% as a validation set that is NOT used in K-fold
# training/testing; the remaining 80% feeds the cross-validation below
X_train_test, X_validation, y_train_test, y_validation = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 3: K-fold train/test
# Initialize models
# NOTE(review): LogisticRegression keeps its default settings on raw pixel
# values and may emit convergence warnings — consider scaling / max_iter.
rf_model = RandomForestClassifier(random_state=42)
xgb_model = XGBClassifier(random_state=42)
lr_model = LogisticRegression(random_state=42)

# Combine models into a list for iteration
models = [("Random Forest", rf_model), ("XGBoost", xgb_model), ("Logistic Regression", lr_model)]

# Stratified K-fold needs the labels to build the folds. Materialize the
# folds once so every candidate model is trained and tested on the same splits.
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds = list(stratified_kfold.split(X_train_test, y_train_test))

# One list of per-fold scores for each candidate model
model_metrics = {model_name: [] for model_name, _ in models}

# Step 3.1 to 3.5: K-fold cross-validation
for model_name, model in models:
    print(f"Training and evaluating {model_name}...")

    for train_idx, test_idx in folds:
        X_train, X_test = X_train_test[train_idx], X_train_test[test_idx]
        y_train, y_test = y_train_test[train_idx], y_train_test[test_idx]

        # Train the model
        model.fit(X_train, y_train)

        # Make predictions on the test split
        y_pred = model.predict(X_test)

        # Record the model evaluation metrics
        f1 = f1_score(y_test, y_pred, average='weighted')
        model_metrics[model_name].append(f1)

for model_name, scores in model_metrics.items():
    print(f"{model_name}: mean weighted F1 = {np.mean(scores):.4f}")

# Step 4: Promote the model with the highest mean F1 across folds
best_model_name = max(model_metrics, key=lambda k: np.mean(model_metrics[k]))
best_model = dict(models)[best_model_name]

# Step 5: Refit the winner on all non-validation data, then apply it to the validation set
best_model.fit(X_train_test, y_train_test)
y_validation_pred = best_model.predict(X_validation)

# Step 6: Report model performance (confusion matrix; rows/columns follow class_names)
conf_matrix = confusion_matrix(y_validation, y_validation_pred, labels=[0, 1])

# Step 7: Print results
print(f"Best Model: {best_model_name}")
print("Classification Report:")
print(classification_report(y_validation, y_validation_pred, labels=[0, 1], target_names=class_names))
print("Confusion Matrix:")
print(conf_matrix)
