# Regression Model Training for prediction of future ABR parameters (DeltaMeanThreshold, wave 1 amplitude, wave 1 latency)
This notebook trains and evaluates regression models on auditory brainstem response (ABR) data. The goal is to predict auditory metrics at a future age (the mean difference in thresholds, wave 1 amplitude, and wave 1 latency) from ABR recordings made at earlier ages, using input features such as age and frequency. The notebook includes parameter search, model training, and evaluation steps for regression models, including RandomForestRegressor.

In [None]:
%pylab
%matplotlib inline
%load_ext autoreload
%autoreload 2
import pandas as pd
import sys
import seaborn as sns
sys.path.append('../src')
import abrTools as at
import os
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler,Normalizer
from sklearn.metrics import ConfusionMatrixDisplay,confusion_matrix,classification_report
from collections import Counter
from sklearn.feature_selection import f_classif,mutual_info_classif, SelectFpr, SelectPercentile,f_regression
from sklearn.pipeline import make_pipeline


from sklearn.ensemble import RandomForestRegressor

Using matplotlib backend: <object object at 0x10912e330>
%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


In [None]:
from datetime import date
from joblib import dump, load
# Results are written to a per-day folder: ../results/<YYYY-MM-DD>
savefolder = os.path.join('..','results',str(date.today()))

# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(savefolder, exist_ok=True)

In [None]:
# Global Variables
# Percentile handed to SelectPercentile(f_regression, percentile=...) in the
# training cells, i.e. the percentage of features kept by the ANOVA filter.
anovaPercentile = 10
# Not read by the visible cells: trainRegressionModel has its own
# `inputFreqs` parameter and the training loops pass it explicitly.
inputFreqs = None
# Read as globals inside trainRegressionModel and forwarded to
# at.fitRegModel(n_splits=..., n_repeats=...).
# NOTE(review): presumably the repeated k-fold cross-validation settings - confirm in abrTools.
n_splits = 10
n_repeats = 5

In [None]:
def trainRegressionModel(model,mode,modelName,targetName,savefolder=savefolder,age='12month',inputs=None,targetFreq = None,cvNjobs=1,inputFreqs=None,targetIntensity=None):
    """
    Train a regression model on a dataset built for predicting future ABR parameters and save the results.

    The dataset is created by ``at.createFutureThresholdDataset`` with a 25% test
    split and the strains ``['6N', 'Repaired']``. The model is scored by
    ``at.fitRegModel`` (using the notebook-level ``n_splits`` / ``n_repeats``),
    refit on the training split, and then evaluated on the held-out test split.

    Args:
        model: Estimator (e.g. a scikit-learn pipeline) exposing ``fit`` and ``predict``.
        mode: Forwarded to ``at.createFutureThresholdDataset``; the notebook uses
            ``'meandiff'``, ``'waveamp'`` and ``'wavelatency'``.
        modelName: Label used as the first part of the output file names.
        targetName: Label of the predicted quantity, used in the output file names.
        savefolder: Directory the three output files are written to.
        age: Target age to predict (passed as ``target=`` to the dataset builder).
        inputs: List of ages used as input features. Defaults to
            ``['1month', '3month']`` when omitted.
        targetFreq: Target frequency, forwarded to the dataset builder.
        cvNjobs: ``n_jobs`` forwarded to ``at.fitRegModel``.
        inputFreqs: Input frequencies forwarded to the dataset builder.
        targetIntensity: Target intensity, forwarded to the dataset builder.

    Saves (all in ``savefolder``, sharing the prefix
    ``{modelName}{age}-{targetName}-{inputs}-inputFreq{inputFreqs}``):
    - ``-resultsCV.csv``: the results returned by ``at.fitRegModel``.
    - ``-model.joblib``: the fitted model.
    - ``-resultsTest.csv``: test targets, predictions, mouse IDs and strains.
    """
    # Avoid a mutable default argument; fall back to the original default list.
    if inputs is None:
        inputs = ['1month','3month']

    # NOTE(review): the tail of this call was missing in the source (the call was
    # left unclosed). `targetFreq` and `targetIntensity` were otherwise unused, so
    # they are forwarded here; the keyword names are assumed to match the
    # parameter names - confirm against at.createFutureThresholdDataset.
    (X_train, X_test, y_train, y_test,
     mouseIDtrain, mouseIDtest,
     mouseStrainTrain, mouseStrainTest,
     dataVersion) = at.createFutureThresholdDataset(
        test_size=0.25,inputFreqs=inputFreqs,inputs=inputs,
        strains=['6N','Repaired'],target=age,mode=mode,
        targetFreq=targetFreq,targetIntensity=targetIntensity,
    )

    res = at.fitRegModel(model,X_train,y_train,X_test,y_test,saveToWandb=False,
                         dataVersion=dataVersion,calculateScores=True,makePlot=False,
                         n_jobs=cvNjobs,n_repeats=n_repeats,n_splits=n_splits)

    model.fit(X_train,y_train) # Refit the model for good measure (this should be unnecessary)

    # Save: all three outputs share the same descriptive file-name prefix.
    outputPrefix = os.path.join(savefolder,f'{modelName}{age}-{targetName}-{inputs}-inputFreq{inputFreqs}')
    pd.DataFrame(res).to_csv(f'{outputPrefix}-resultsCV.csv')
    dump(model,f'{outputPrefix}-model.joblib')
    testResults = pd.DataFrame({'y_test':y_test,
                                'y_predict':model.predict(X_test),
                                'Mouse ID':mouseIDtest,
                                'Strain':mouseStrainTest})
    testResults.to_csv(f'{outputPrefix}-resultsTest.csv')
    

# 1 - Mean diff in thresholds

## 1.1 Parameter search for the random forest

In [None]:
###Grid search for random forest

# from sklearn.model_selection import GridSearchCV,RepeatedKFold

# parameters = {'randomforestregressor__n_estimators': [195,200,205,],#,100,500,1000],
#               'randomforestregressor__min_samples_split': [2],
#                'randomforestregressor__min_samples_leaf': [1,2,3]
#               }

# anova_fs = SelectPercentile(f_regression,percentile=anovaPercentile)
# rfr = RandomForestRegressor(n_jobs=-1,random_state=42) # decent
# model = make_pipeline(anova_fs,rfr)

# cv = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=42)
# clf = GridSearchCV(model,param_grid=parameters,scoring='neg_mean_squared_error',cv=5)
# clf.fit(X_train,y_train)
# print(clf.best_params_)



## 1.2 Train and test the random forest regressor

In [None]:
from sklearn.linear_model import SGDRegressor 
from sktime.regression.kernel_based import RocketRegressor
import xgboost as xgb

# Train one random-forest pipeline per (input frequencies, target age, input ages)
# combination for the mean threshold difference ('meandiff' mode).
for inputFreq in [[100]]: # Add None to train on all frequencies
    for age in ['6month','9month','12month']: # Modify this to add different age targets
        for inputAges in [['1month','3month']]: # Modify this to train on different age inputs
            anova_fs = SelectPercentile(f_regression,percentile=anovaPercentile)
            rfr = RandomForestRegressor(n_jobs=-1,min_samples_leaf=2,min_samples_split=2,n_estimators=200)
            # Build a fresh pipeline for every run; without this line `model` is
            # either undefined or a stale pipeline left over from another cell.
            model = make_pipeline(anova_fs,rfr)
            trainRegressionModel(model,'meandiff',modelName='RandomForestRegressor',targetName='DThreshold',inputs=inputAges,inputFreqs=inputFreq,savefolder=savefolder,age=age,cvNjobs=-1)



# 2 - amplitude and latency models

In [None]:
# Amplitude
# Train one random-forest pipeline per (target intensity, input frequencies,
# target age, input ages) combination in 'waveamp' mode.
# NOTE(review): some combinations include the target age (or a later age) among
# the input ages, e.g. target '6month' with inputs up to '9month' - confirm that
# at.createFutureThresholdDataset handles or intends this.
for ti in [55,75,95]:
    for inputFreq in [[100]]:
        for age in ['6month','9month','12month']:
            # `inputAges` rather than `input`, so the builtin input() is not overwritten in the notebook namespace.
            for inputAges in [['1month'],['1month','3month'],['1month','3month','6month'],['1month','3month','6month','9month']]:
                anova_fs = SelectPercentile(f_regression,percentile=anovaPercentile)
                rfr = RandomForestRegressor(n_jobs=-1,min_samples_leaf=2,min_samples_split=2,n_estimators=200) # decent
                model = make_pipeline(anova_fs,rfr)
                trainRegressionModel(model,'waveamp',modelName=f'RandomForestRegressor{ti}dB',targetName='waveamp',inputs=inputAges,inputFreqs=inputFreq,savefolder=savefolder,age=age,cvNjobs=-1, targetFreq=100,targetIntensity=ti)



In [None]:
# Latency
# Train one random-forest pipeline per (target intensity, input frequencies,
# target age, input ages) combination in 'wavelatency' mode.
# NOTE(review): some combinations include the target age (or a later age) among
# the input ages, e.g. target '6month' with inputs up to '9month' - confirm that
# at.createFutureThresholdDataset handles or intends this.
for ti in [55,75,95]:
    for inputFreq in [[100]]:
        for age in ['6month','9month','12month']:
            # `inputAges` rather than `input`, so the builtin input() is not overwritten in the notebook namespace.
            for inputAges in [['1month'],['1month','3month'],['1month','3month','6month'],['1month','3month','6month','9month']]:
                anova_fs = SelectPercentile(f_regression,percentile=anovaPercentile)
                rfr = RandomForestRegressor(n_jobs=-1,min_samples_leaf=2,min_samples_split=2,n_estimators=200) # decent
                model = make_pipeline(anova_fs,rfr)
                trainRegressionModel(model,'wavelatency',modelName=f'RandomForestRegressor{ti}dB',targetName='wavelatency',inputs=inputAges,inputFreqs=inputFreq,savefolder=savefolder,age=age,cvNjobs=-1, targetFreq=100,targetIntensity=ti)
