In [27]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.utils import resample
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.utils import resample

from joblib import Parallel, delayed
from IPython import embed

from src.utils import make_partitions

In [10]:
import numpy as np
import pandas as pd
import importlib
import tqdm
import sys

In [14]:
from src.utils import format_data

In [11]:
# Make modules inside ./configs importable by bare name (presumably the
# experiment_* config modules loaded via importlib in this notebook).
# Guarded so that re-running this cell does not keep appending duplicates.
if './configs' not in sys.path:
    sys.path.append('./configs')

In [29]:
# Load the experiment configuration module by name. Other cells read
# feature_tags, label_tags, seed, rf_n_jobs and n_jobs from this object.
configs=importlib.import_module('experiment_14')

In [25]:
from src.utils import make_partitions

In [30]:
def train_model(df_train,feature_tags,label_tags,seed,rf_n_jobs=None):
    """Fit a random-forest regressor on the selected columns of ``df_train``.

    Parameters
    ----------
    df_train : DataFrame holding both the feature and the label columns.
    feature_tags : column selector for the model inputs.
    label_tags : column selector for the regression targets.
    seed : forwarded as ``random_state`` to ``RandomForestRegressor``.
    rf_n_jobs : forwarded as ``n_jobs`` to ``RandomForestRegressor``.

    Returns
    -------
    The fitted ``RandomForestRegressor``.
    """
    features, targets = split_X_Y(df_train, feature_tags, label_tags)

    forest = RandomForestRegressor(random_state=seed, n_jobs=rf_n_jobs)
    # The estimator is fed plain numpy arrays, not DataFrames.
    forest.fit(features.values, targets.values)

    return forest

def split_X_Y(df,feature_tags,label_tags):
    """Return ``(df[feature_tags], df[label_tags])`` as a 2-tuple.

    Both selections use plain ``df[...]`` indexing, so the result types follow
    pandas' usual rules for the selector that is passed in.
    """
    features, labels = (df[tags] for tags in (feature_tags, label_tags))
    return features, labels
  
def predict(RF_reg, val,feature_tags,label_tags):
    """Score a fitted model on a validation frame.

    Parameters
    ----------
    RF_reg : fitted estimator exposing ``predict``.
    val : DataFrame holding both the feature and the label columns.
    feature_tags : column selector for the model inputs.
    label_tags : column selector for the ground-truth targets.

    Returns
    -------
    tuple ``(r2, MAE, MSE, RMSE, Y_val, predictions)`` where ``Y_val`` is the
    label selection of ``val`` and ``predictions`` is the raw output of
    ``RF_reg.predict``.
    """
    X_val, Y_val = split_X_Y(val, feature_tags, label_tags)

    predictions = RF_reg.predict(X_val.values)

    r2 = r2_score(Y_val, predictions)
    MAE = mean_absolute_error(Y_val, predictions)
    MSE = mean_squared_error(Y_val, predictions)
    # Derive RMSE from the MSE already computed instead of scoring twice.
    RMSE = np.sqrt(MSE)

    return r2, MAE, MSE, RMSE, Y_val, predictions

In [39]:
def cross_val(df,feature_tags=configs.feature_tags,label_tags=configs.label_tags,seed=configs.seed,rf_n_jobs=configs.rf_n_jobs,n_jobs=configs.n_jobs):
    """Run one 5-fold cross-validation pass of the random-forest model.

    Parameters
    ----------
    df : DataFrame with the feature columns, the label columns and a
        ``basename`` column (used to keep validation items out of training).
    feature_tags, label_tags : column selectors forwarded to
        ``train_model`` / ``predict``.
    seed : random state for the forest; also recorded in the ``seed`` column
        of the returned metrics frame.
    rf_n_jobs : ``n_jobs`` forwarded to the forest.
    n_jobs : currently unused; kept so existing callers keep working.
        NOTE(review): presumably intended for a joblib ``Parallel`` loop over
        several seeds -- confirm before wiring it in.

    Returns
    -------
    (df_fold, y_val_all)
        ``df_fold`` has one row per fold with columns ``r2``, ``r``, ``MAE``,
        ``MSE``, ``RMSE``, ``fold``, plus the pooled ``r2_fold`` and ``seed``
        repeated on every row. ``y_val_all`` is the concatenation of the
        per-fold validation labels, in fold order.
    """
    n_folds = 5

    def func(i):
        partition = make_partitions(n_folds)

        metrics_list = []
        fold_predictions = []
        fold_targets = []

        # Partitioning options
        stratify = True

        if stratify:
            df_final = partition.make_strat_folds(df)  # a random seed could be added here
        else:
            df_final = partition.make_folds_by_id(df)

        # Run cross-validation
        for fold in range(n_folds):
            df_val = df_final[df_final['fold'] == float(fold)]
            # Drop every row sharing a basename with the validation fold so the
            # same item never appears on both sides of the split.
            df_train = df_final[~df_final['basename'].isin(df_val.basename)]
            RF_reg = train_model(df_train, feature_tags, label_tags, seed, rf_n_jobs=rf_n_jobs)

            r2_all, MAE_all, MSE_all, RMSE_all, y_val, predictions = predict(RF_reg, df_val, feature_tags, label_tags)
            # 'r' is sqrt(r2); it is NaN whenever r2 is negative.
            metrics_list.append([r2_all, np.sqrt(r2_all), MAE_all, MSE_all, RMSE_all, fold])

            fold_predictions.append(predictions)
            fold_targets.append(y_val)

        # Concatenate once at the end: works for any number of label columns
        # (the previous accumulator was hard-coded to shape (0, 5)).
        predictions_all = np.concatenate(fold_predictions, axis=0)
        y_val_all = pd.concat(fold_targets)

        # r2 over the pooled out-of-fold predictions of all folds.
        r2_fold = r2_score(y_val_all, predictions_all)
        metrics_list = np.transpose(metrics_list)
        df_fold = pd.DataFrame({'r2':metrics_list[0],'r':metrics_list[1],'MAE':metrics_list[2],'MSE':metrics_list[3],'RMSE':metrics_list[4],'fold':metrics_list[5],'r2_fold':r2_fold,'seed':i})

        return df_fold, y_val_all

    # The original body never invoked func and returned undefined names,
    # so every call raised NameError. Run the single pass and return it.
    df_, y_val_all_ = func(seed)

    return df_, y_val_all_

In [67]:
# Feature table and label table, read from paths relative to the working directory.
feat_df=pd.read_csv('data/features/new_partitions-egemaps_all_audio.csv')
labels_df=pd.read_csv('data/labels/stratified_df.csv')
# Path of a list file handed to format_data.
# NOTE(review): this global shadows the built-in `filter` for the rest of the session.
filter='data/lists/all_audio_complete_set.txt'

In [68]:
# Build the working frame from the feature table, label table and list-file path.
# NOTE(review): format_data lives in src.utils and is not visible here; presumably it
# joins features with labels and keeps only the listed items -- confirm.
df=format_data(feat_df,labels_df,filter)

In [41]:
# Training set: 8000 rows drawn with replacement, fixed random_state for repeatability.
df_train=df.sample(n=8000,replace=True,random_state=42)
# Validation set: every row whose basename never appears in the training draw.
df_val=df[~df['basename'].isin(df_train['basename'])]
# Column selectors and seed come from the loaded experiment config.
feature_tags=configs.feature_tags
label_tags=configs.label_tags
seed=configs.seed
# rf_n_jobs is not passed, so train_model uses its default (None).
RF_reg= train_model (df_train,feature_tags,label_tags,seed)

In [69]:
# Score the fitted forest on the held-out rows; also keeps the true labels and raw predictions.
r2_all,MAE_all,MSE_all,RMSE_all,y_val,predictions= predict(RF_reg,df_val,feature_tags,label_tags)

In [70]:
# Collapse predictions and true labels to 1-D arrays so every value is pooled
# into a single sequence.
preds_flat=pd.DataFrame(predictions).values.flatten()
y_val_flat=y_val.values.flatten()

In [72]:
# r2 computed over the pooled (flattened) values rather than per label column.
r2_score(y_val_flat,preds_flat)

0.23902438252371694

In [2]:
def demo(n_samples=1):
    """Print ``n_samples`` to stdout.

    Always returns ``None`` (the return value of ``print``).
    """
    printed = print(n_samples)
    return printed

# Passing None explicitly overrides the default of 1, so this prints "None".
demo(n_samples=None)


None
