# Random Forest Explainability Analysis

Importing necessary packages:

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import graphviz
import seaborn as sns

UTILS:

In [6]:
import warnings
# Suppress every warning raised from this point on so the notebook
# output stays compact.
warnings.filterwarnings("ignore")

# Root folder whose entries are listed to obtain one sub-folder per smell.
# NOTE(review): absolute, machine-specific path - adjust before running
# on another machine.
DATABASE_DIR = '/home/macdowell/Workspace/Explainability-Model-Analysis/database/'
# Relative folder prefix under which the Random Forest results are written.
OUTPUT_DIR = 'output/RandomForest/'

def get_entries_name_inside_dir(path):
    """Return the names of all entries located directly inside ``path``.

    Thin wrapper over ``os.listdir``: the names come back without the
    directory prefix and in the order reported by the operating system.
    """
    return os.listdir(path)

def dataframe_appended_from_smell_folder(folder_path):
    """Load every CSV file of a smell folder into a single DataFrame.

    Entries whose name contains ``'lock'`` are skipped. Every other entry
    is read with ``pandas.read_csv`` and the results are stacked row-wise;
    each file keeps its own row index, so index labels may repeat.

    Args:
        folder_path: Directory holding the CSV files to load.

    Returns:
        The concatenated ``DataFrame``, or ``None`` when the folder holds
        no entry to load.
    """
    frames = [
        pd.read_csv(os.path.join(folder_path, file_name))
        for file_name in os.listdir(folder_path)
        if 'lock' not in file_name
    ]
    if not frames:
        return None
    # A single concat over all frames avoids re-copying the rows gathered
    # so far once per file, as an incremental append would.
    return pd.concat(frames)

def dict_to_csv(output_path, dict_to_save, column_name):
    """Write a dictionary to ``<output_path>.csv`` through a DataFrame.

    The dictionary is loaded with ``orient='columns'``, so its keys become
    the CSV columns; for a dict of dicts the inner keys form the index,
    which is written as the first column under the header ``column_name``.
    The destination path (without extension) is printed before writing.

    Args:
        output_path: Destination path without the ``.csv`` extension.
        dict_to_save: Data to store, keyed by column name.
        column_name: Header given to the index column.
    """
    print(output_path)

    # ``to_csv`` does not create missing folders, so make sure the
    # destination directory exists before writing.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    table = pd.DataFrame.from_dict(dict_to_save, orient='columns')
    table.index.names = [column_name]
    table.to_csv(output_path + '.csv')

### Effectiveness for different training-set percentages of the database:

- RQ1)

In [7]:
def evaluate_model(X, y, df, train_percent, effectivity_dict, smell, cv_splits,
                   test_size=0.25, random_state=None):
    """Cross-validate a random forest and record its mean score.

    The data is split ``cv_splits`` times with ``ShuffleSplit``; on each
    split a fresh ``RandomForestClassifier`` is handed to
    ``perform_models`` and the returned score is collected. The mean of
    those scores is stored in ``effectivity_dict[smell][train_percent]``.

    Args:
        X: Feature table, indexed positionally through ``.iloc``.
        y: Target values aligned with ``X``, indexed through ``.iloc``.
        df: Forwarded unchanged to ``perform_models``.
        train_percent: Fraction of the samples used for training; also the
            key under which the mean score is stored.
        effectivity_dict: Result mapping; ``effectivity_dict[smell]`` must
            already exist and is updated in place.
        smell: Key of the smell being evaluated.
        cv_splits: Number of random train/test splits.
        test_size: Fraction of the samples held out for testing on each
            split. Defaults to 0.25.
        random_state: Seed passed to both the splitter and the forest so a
            run can be reproduced. ``None`` (the default) leaves both
            unseeded.

    Returns:
        None. The result is written into ``effectivity_dict``.
    """
    splitter = ShuffleSplit(
        n_splits=cv_splits,
        train_size=train_percent,
        test_size=test_size,
        random_state=random_state,
    )

    scores = []
    for train_index, test_index in splitter.split(X, y):
        scores.append(
            perform_models(
                [RandomForestClassifier(random_state=random_state)],
                X.iloc[train_index], X.iloc[test_index],
                y.iloc[train_index], y.iloc[test_index],
                df,
                train_percent,
                effectivity_dict,
                smell,
            )
        )

    # Store the average score over all splits.
    effectivity_dict[smell][train_percent] = sum(scores) / len(scores)
        

def perform_models(classifiers, X_train, X_test, y_train, y_test, df, train_percent, effectivity_dict, smell):
    """Fit every classifier on the training split and score it on the test split.

    Each classifier is trained with ``fit(X_train, y_train)``, used to
    predict ``X_test`` and scored with ``f1_score`` against ``y_test``.

    Args:
        classifiers: Non-empty list of estimators exposing ``fit`` and
            ``predict``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets.
        y_test: Test targets.
        df: Not used by this function; kept so the signature is unchanged.
        train_percent: Not used by this function.
        effectivity_dict: Not used by this function.
        smell: Not used by this function.

    Returns:
        The mean F1 score over all classifiers. With a single classifier
        this is exactly that classifier's F1 score.

    Raises:
        ValueError: If ``classifiers`` is empty.
    """
    if not classifiers:
        raise ValueError("perform_models needs at least one classifier")

    scores = []
    for classifier in classifiers:
        # Train:
        classifier.fit(X_train, y_train)

        # TODO: make the tree visualisation work. NOTE(review):
        # sklearn.tree.export_graphviz expects a single decision tree, so
        # for a forest one fitted member (classifier.estimators_[i]) would
        # presumably have to be exported instead - confirm before enabling.

        # Predicting values with the model:
        predictions = classifier.predict(X_test)

        # Collect the score instead of returning here, so that every
        # classifier in the list is evaluated and not only the first one.
        scores.append(f1_score(y_test, predictions))

    return sum(scores) / len(scores)
        

# RQ1: evaluate every smell found under DATABASE_DIR at several training-set sizes.
smell_folders = get_entries_name_inside_dir(DATABASE_DIR)
train_percents = [0.25, 0.5, 0.75]

# Maps smell name -> {train_percent: mean score}; filled by evaluate_model.
effectivity_dict = {}

for smell in smell_folders:
    smell_metrics_df = dataframe_appended_from_smell_folder(DATABASE_DIR + smell)

    # The cleaning below does not depend on the training percentage, so it
    # is done once per smell rather than once per percentage.
    # NOTE(review): column '0' is presumably a stray column from the CSV
    # export - confirm against the data files.
    if '0' in smell_metrics_df.columns:
        smell_metrics_df = smell_metrics_df.drop(columns=['0'])

    # Replace NaN (and infinities) in floating-point columns with finite numbers.
    for c in smell_metrics_df.columns:
        smell_metrics_df[c] = np.nan_to_num(smell_metrics_df[c])

    X = smell_metrics_df.drop(columns=['Smell'])
    # Keep the target one-dimensional: a one-column DataFrame is a
    # column-vector y, which scikit-learn has to reshape with a warning.
    y = smell_metrics_df['Smell']

    effectivity_dict[smell] = {}

    for percent in train_percents:
        evaluate_model(X, y, smell_metrics_df, percent, effectivity_dict, smell, cv_splits=5)

# Writing results in CSV
dict_to_csv(OUTPUT_DIR + 'RQ1', effectivity_dict, 'train_percent')

output/RandomForest/RQ1


### Plot for RQ1

In [8]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set()
# Size of the plot
plt.rcParams["figure.figsize"] = [9, 6]
df = pd.read_csv(os.path.join(OUTPUT_DIR, 'RQ1.csv'))

# One PNG per smell goes under <OUTPUT_DIR>/RQ1/. savefig does not create
# missing folders, so make sure the target directory exists first.
plot_dir = os.path.join(OUTPUT_DIR, 'RQ1')
os.makedirs(plot_dir, exist_ok=True)

for smell in smell_folders:
    ax = sns.lineplot(x="train_percent", y=smell, data=df)
    ax.set(xlabel='Database train percent', ylabel='f1 score')
    fig = ax.get_figure()
    fig.savefig(os.path.join(plot_dir, smell + ".png"))
    # Clear the figure so the next smell is not drawn over this one.
    fig.clf()

<Figure size 648x432 with 0 Axes>