<a href="https://colab.research.google.com/github/cryptoholder-la/1stplace_notsorandomanymore/blob/master/TRAINING_MODEL_COMPARE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Alphanumeric CSV Data Training
#
# Loads an uploaded CSV file, preprocesses its numeric and categorical
# columns, trains several scikit-learn classifiers on it, and compares their
# test-set accuracy.

# 1. Introduction
print("This notebook demonstrates how to train various machine learning models on alphanumeric CSV data.")

# Import necessary libraries
import io

import numpy as np
import pandas as pd
from google.colab import files
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# 2. Data Loading
print("Please upload your CSV file.")
uploaded = files.upload()

# `uploaded` is indexed by file name below; it is empty when the upload dialog
# is cancelled, so fail with a clear message instead of the bare StopIteration
# that next(iter(...)) would raise.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select a CSV file.")

# Load the first uploaded file into a Pandas DataFrame
uploaded_name = next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))
print(df.head())
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
df.info()

# 3. Data Preprocessing
# Separate features and target
target_column = df.columns[-1]  # Assuming the last column is the target
# NOTE(review): confirm that the last column really is the label for your CSV.
X = df.drop(columns=[target_column])
y = df[target_column]

# Identify numeric and categorical feature columns.
# The two selections are complementary on purpose: ColumnTransformer drops any
# column that is not listed in one of its transformers, so selecting by exact
# dtype names (e.g. only int64/float64/object) would silently discard bool,
# unsigned, smaller-width numeric, category or string columns.
numeric_features = X.select_dtypes(include=['number', 'bool']).columns
categorical_features = X.select_dtypes(exclude=['number', 'bool']).columns

# Create preprocessing pipelines for both numeric and categorical data
# Numeric columns: fill gaps with the median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Model Training
def create_pipeline(model):
    """Build a preprocessing + classifier pipeline around ``model``.

    Parameters
    ----------
    model : estimator
        Classifier placed in the final ``'classifier'`` step.

    Returns
    -------
    Pipeline
        Two-step pipeline: ``'preprocessor'`` (an unfitted copy of the
        module-level ``preprocessor``) followed by ``'classifier'``.
    """
    # Pipeline.fit fits its steps in place, so passing the same `preprocessor`
    # object to every pipeline would make them all share (and overwrite) one
    # fitted transformer. Cloning gives each pipeline its own copy.
    return Pipeline(steps=[('preprocessor', clone(preprocessor)),
                           ('classifier', model)])

def train_and_evaluate(pipeline, X_train, X_test, y_train, y_test, model_name):
    """Fit ``pipeline`` on the training split and report test-set metrics.

    Parameters
    ----------
    pipeline : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, y_train
        Training features and labels passed to ``pipeline.fit``.
    X_test, y_test
        Held-out features and labels used for scoring.
    model_name : str
        Label used as a prefix in the printed output.

    Returns
    -------
    tuple
        ``(pipeline, accuracy)``: the same (now fitted) pipeline object and
        its accuracy on the test split.
    """
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")
    print(f"{model_name} Classification Report:")
    # Classes that never appear in y_pred have an undefined precision.
    # zero_division=0 reports those as 0 (the same value the default produces)
    # without flooding the output with one warning per averaged metric.
    print(classification_report(y_test, y_pred, zero_division=0))
    return pipeline, accuracy

# Every model is trained and scored on the same train/test split.
splits = (X_train, X_test, y_train, y_test)

# a. Decision Trees
dt_model = DecisionTreeClassifier(random_state=42)
dt_pipeline, dt_accuracy = train_and_evaluate(
    create_pipeline(dt_model), *splits, "Decision Tree")

# b. Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_pipeline, rf_accuracy = train_and_evaluate(
    create_pipeline(rf_model), *splits, "Random Forest")

# c. Gradient Boosting Machines (GBM)
gbm_model = GradientBoostingClassifier(random_state=42)
gbm_pipeline, gbm_accuracy = train_and_evaluate(
    create_pipeline(gbm_model), *splits, "Gradient Boosting Machine")

# d. Support Vector Machines (SVM)
svm_model = SVC(random_state=42)
svm_pipeline, svm_accuracy = train_and_evaluate(
    create_pipeline(svm_model), *splits, "Support Vector Machine")

# e. Neural Networks
nn_model = MLPClassifier(random_state=42)
nn_pipeline, nn_accuracy = train_and_evaluate(
    create_pipeline(nn_model), *splits, "Neural Network")

# f. K-Nearest Neighbors (KNN)
knn_model = KNeighborsClassifier()
knn_pipeline, knn_accuracy = train_and_evaluate(
    create_pipeline(knn_model), *splits, "K-Nearest Neighbors")

# g. Ensemble Methods
# Soft-voting ensemble over fresh tree-based estimators (not the pipelines
# fitted above).
voting_model = VotingClassifier(
    estimators=[
        ('dt', DecisionTreeClassifier(random_state=42)),
        ('rf', RandomForestClassifier(random_state=42)),
        ('gbm', GradientBoostingClassifier(random_state=42)),
    ],
    voting='soft',
)
ensemble_pipeline = create_pipeline(voting_model)

ensemble_pipeline, ensemble_accuracy = train_and_evaluate(
    ensemble_pipeline, *splits, "Ensemble")

# 5. Model Evaluation
# Test-set accuracy of every trained model, keyed by its display name.
models = {
    "Decision Tree": dt_accuracy,
    "Random Forest": rf_accuracy,
    "Gradient Boosting Machine": gbm_accuracy,
    "Support Vector Machine": svm_accuracy,
    "Neural Network": nn_accuracy,
    "K-Nearest Neighbors": knn_accuracy,
    "Ensemble": ensemble_accuracy,
}

# Highest-accuracy entry; on a tie the model listed first wins.
best_model, best_accuracy = max(models.items(), key=lambda entry: entry[1])

print("\nModel Comparison:")
for model, accuracy in models.items():
    print(f"{model}: {accuracy}")

# 6. Conclusion
print(f"\nThe best performing model is {best_model} with an accuracy of {best_accuracy}")
print("Future steps could include:")
for suggestion in (
    "1. Hyperparameter tuning for the best performing model",
    "2. Feature engineering to create new informative features",
    "3. Trying other advanced models or ensemble techniques",
    "4. Collecting more data if possible to improve model performance",
    "5. Analyzing feature importance to understand key drivers of the target variable",
):
    print(suggestion)

This notebook demonstrates how to train various machine learning models on alphanumeric CSV data.
Please upload your CSV file.


Saving train.csv to train.csv
       eeg_id  eeg_sub_id  eeg_label_offset_seconds  spectrogram_id  \
0  1628180742           0                       0.0          353733   
1  1628180742           1                       6.0          353733   
2  1628180742           2                       8.0          353733   
3  1628180742           3                      18.0          353733   
4  1628180742           4                      24.0          353733   

   spectrogram_sub_id  spectrogram_label_offset_seconds    label_id  \
0                   0                               0.0   127492639   
1                   1                               6.0  3887563113   
2                   2                               8.0  1142670488   
3                   3                              18.0  2718991173   
4                   4                              24.0  3080632009   

   patient_id expert_consensus  seizure_vote  lpd_vote  gpd_vote  lrda_vote  \
0       42516          Seizure       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Neural Network Accuracy: 0.7690543071161049
Neural Network Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.97      0.94     11607
           1       0.66      0.61      0.64      2694
           2       0.61      0.59      0.60      1934
           3       0.54      0.68      0.60      1497
           4       0.59      0.43      0.50       635
           5       0.62      0.64      0.63       635
           6       0.53      0.55      0.54       420
           7       0.48      0.47      0.48       280
           8       0.38      0.43      0.41       200
           9       0.39      0.47      0.43       237
          10       0.39      0.23      0.29       190
          11       0.20      0.29      0.24       143
          12       0.40      0.17      0.24       173
          13       0.64      0.15      0.24       123
          14       0.33      0.12      0.18       107
          15       0.40      0.02      0.05        81

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Ensemble Accuracy: 0.934316479400749
Ensemble Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98     11607
           1       0.92      0.91      0.91      2694
           2       0.90      0.88      0.89      1934
           3       0.90      0.90      0.90      1497
           4       0.96      0.93      0.94       635
           5       0.95      0.95      0.95       635
           6       0.93      0.95      0.94       420
           7       0.89      0.88      0.88       280
           8       0.77      0.81      0.79       200
           9       0.79      0.79      0.79       237
          10       0.85      0.77      0.81       190
          11       0.66      0.76      0.71       143
          12       0.74      0.66      0.70       173
          13       0.64      0.66      0.65       123
          14       0.70      0.68      0.69       107
          15       0.46      0.51      0.48        81
          16