Citation

In [None]:
import os, shutil, time, csv,math,scipy,joblib,matplotlib
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
from datetime import datetime
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from scipy.stats import pearsonr,gaussian_kde
#displaying data
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow,figure
plt.rcParams['figure.dpi'] = 300 
%matplotlib inline

## Variables and Functions

In [None]:
# Pointer CSV files, one per data split. Each is read by process_csv_to_datasets,
# which pops the 'Trueval' column as the label and treats the rest as features.
Train_csv = r'D:\Training_Data_Creation\Pointer_files\Train_Tiles40.csv'
Val_csv = r'D:\Training_Data_Creation\Pointer_files\Val_Tiles40.csv'
Test_csv = r'D:\Training_Data_Creation\Pointer_files\Test_Tiles40.csv'
Testp_csv = r'D:\Training_Data_Creation\Pointer_files\TestP40_Tiles50.csv'
# Random seed; presumably meant for the stochastic steps (e.g. the random forest) - confirm where it is consumed.
SEED = 71

In [None]:
def process_csv_to_datasets(incsv, label_col='Trueval', feature_scale=65535, label_scale=100):
    '''Read a pointer CSV and split it into scaled features and scaled labels.

    Parameters
    ----------
    incsv : str or path-like
        CSV file to read with ``pd.read_csv``.
    label_col : str, default 'Trueval'
        Name of the column holding the target value; every other column is
        treated as a feature.
    feature_scale : float, default 65535
        Divisor applied to the features (65535 is the max 16-bit int value).
    label_scale : float, default 100
        Divisor applied to the labels (100 is the max mask pixel value).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(features, labels)``: a 2-D array with one row per CSV row and a
        1-D array of the matching labels.

    Raises
    ------
    KeyError
        If ``label_col`` is not a column of the CSV.
    '''
    dset = pd.read_csv(incsv)
    if label_col not in dset.columns:
        # Same exception type as DataFrame.pop, but with an actionable message.
        raise KeyError("label column {!r} not found in {}; columns are {}".format(
            label_col, incsv, list(dset.columns)))
    labels = dset.pop(label_col)  # removes the label so only features remain in dset
    features = dset.to_numpy() / feature_scale
    labels = labels.to_numpy() / label_scale
    return features, labels
# Load every split and separate it into scaled feature values and scaled labels
(train, train_labels), (val, val_labels), (test, test_labels), (testp, testp_labels) = (
    process_csv_to_datasets(csv_path)
    for csv_path in (Train_csv, Val_csv, Test_csv, Testp_csv)
)

In [None]:
import ipynb.fs 
#Importing metric functions
from .defs.Thesis_Functions import dtime,calculate_metrics,time_and_metrics
#Importing plotting functions
from .defs.Thesis_Functions import make_scatter_from_results,plot_hist_save,Feature_importance

In [None]:
def save_model(path,Results,Results_str,model,model_str):
    '''Save a fitted scikit-learn model to disk with joblib.

    The file is named ``<model_str>_<Results_str>-<mae>_<r2>.joblib`` and is
    written inside ``path``; the directory is created if it does not exist.

    Parameters
    ----------
    path : str
        Directory that receives the model file.
    Results : pandas.DataFrame
        Results table passed to ``time_and_metrics`` to obtain the metrics
        embedded in the file name.
    Results_str : str
        Tag describing the results/dataset, used in the file name.
    model : object
        The fitted model to serialise.
    model_str : str
        Short model identifier used as the file-name prefix.

    Returns
    -------
    str
        Full path of the file that was written.

    Notes
    -----
    ``time_and_metrics`` also returns a timestamp string. It is deliberately
    not part of the file name: the original format string had no placeholder
    for it (the extra argument was silently ignored), and the evaluation cell
    loads a file whose name follows the timestamp-free pattern.
    '''
    mae, r2, _dt_string = time_and_metrics(Results)
    if path:
        # joblib.dump does not create missing directories.
        os.makedirs(path, exist_ok=True)
    file_name = "{}_{}-{}_{}.joblib".format(model_str, Results_str, mae, r2)
    # os.path.join yields the same '<path>\<file>' on Windows and a valid path elsewhere.
    target = os.path.join(path, file_name)
    joblib.dump(model, target, compress=3)
    print('model saved')
    return target
def load_model(path):
    '''Read a joblib-serialised scikit-learn model from ``path`` and return it.'''
    model = joblib.load(path)
    print('model loaded')
    return model

In [None]:
def create_Results_RF(model):
    '''Build the standard Results tables for the validation, test and testp splits.

    A Results table is a pd.DataFrame shared across all models with two columns:
    0 (a copy of the split's labels) and "pred" (the model's prediction).
    Each row corresponds to one instance of the split.

    Returns the three tables as a tuple: (validation, test, testp).
    '''
    tables = []
    for split_values, split_labels in ((val, val_labels), (test, test_labels), (testp, testp_labels)):
        # start from a copy of the labels so the module-level arrays are never touched
        table = pd.DataFrame(split_labels.copy())
        # add the model's predictions for the same rows as a new column
        table["pred"] = model.predict(split_values)
        tables.append(table)
    return tuple(tables)

## Fitting the model

In [None]:
# Grid search over forest size and max_features. USE when you are ready to fully test
# the RF model: every combination is trained, saved and plotted, so this cell is slow.
startmain = time.time()
# Start from "no best yet" instead of a magic sentinel so the final report is always defined.
best_mae, best_n_estimators, best_max_features = float("inf"), None, None
n_estimators = [30,100,300,1000]
max_features,criterion = ["auto","sqrt", "log2"],"absolute_error"
# "auto" was deprecated in scikit-learn 1.1 and removed in 1.3. For a regressor it meant
# "use all features", which is what 1.0 requests, so the label is kept for file names and
# log lines while the estimator receives a value every version accepts.
max_features_arg = {"auto": 1.0}
for estimators in n_estimators:
    for features in max_features:
        start=time.time()
        rf_reg = RandomForestRegressor(
            n_estimators=estimators,
            max_features=max_features_arg.get(features, features),
            criterion=criterion,
            n_jobs=-1,
            random_state=SEED,  # makes each fit reproducible across runs
        )  # Using default values for everything else
        rf_reg.fit(train, train_labels) #fitting the model
        end=time.time()  # only the fit is timed
        rf_val,rf_test,rf_testp=create_Results_RF(rf_reg)
        # Reuse the validation predictions already stored in rf_val instead of predicting twice.
        pred = rf_val["pred"].to_numpy()
        mae = np.mean(np.abs(val_labels - pred))
        save_model(r'D:\Training_Data_Creation\01-logs\RF_reg',rf_val,'RF_val{}{}'.format(str(estimators),features),rf_reg,'RFreg')
        Results=rf_val
        strResults='val{}{}'.format(str(estimators),features)
        make_scatter_from_results(Results,strResults,'RFreg',r'D:\Training_Data_Creation\Results_Scatter')
        print("n_estimators: " + str(estimators) + "; max_features: " + features + "; MAE: " + str(mae))
        print("Training Completed with {} estimator and {} features in {} sec".format(estimators,features,str("%.2f"%(end-start))))
        if mae < best_mae:
            best_mae,best_n_estimators,best_max_features = mae,estimators,features
# format() keeps the message identical but cannot fail if no combination produced a usable MAE.
print("Based on these results we will proceed with {} estimators, and {} features".format(best_n_estimators, best_max_features))
endmain = time.time()
print("Overall runtime is "+str("%.2f"%(endmain-startmain))+' sec')


## Evaluating the best model on different datasets

In [None]:
# Reload the saved model chosen for evaluation; the file name appears to follow
# save_model's '<model_str>_<Results_str>-<mae>_<r2>.joblib' pattern.
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
loaded_rf = joblib.load(r"D:\Training_Data_Creation\01-logs\RF_reg\RFreg_RF_val1000sqrt-5.529_0.735.joblib")

In [None]:
# Create the Results tables (labels + "pred" column) for the validation, test and testp datasets
rf_val,rf_test,rf_testp=create_Results_RF(loaded_rf)

In [None]:
# Choose the Results table to evaluate and the tag (strResults) used when saving its plot
Results, strResults = rf_val, 'val'

make_scatter_from_results(
    Results,
    strResults,
    'RFreg',
    r'D:\Training_Data_Creation\Results_Scatter',
)
mae, r, p = calculate_metrics(Results)


In [None]:
# Displaying feature importance of the loaded model on the validation and test datasets;
# the last argument is the PNG path passed to Feature_importance for each dataset.
Feature_importance(loaded_rf,val,val_labels,r'D:\Training_Data_Creation\Results_Scatter\RFreg\Feature_imp_Val_RF_REG.png')
Feature_importance(loaded_rf,test,test_labels,r'D:\Training_Data_Creation\Results_Scatter\RFreg\Feature_imp_Test_RF_REG.png')