# Create Tables

In [15]:
# Packages
import pandas as pd
import dataframe_image as dfi # NOTE: YOU MUST HAVE GOOGLE CHROME INSTALLED FOR THIS TO WORK CORRECTLY

## Set Classifier Name

In [16]:
classifier_name = 'XGBoost'

## List of Rating Models and Most Complex Model

In [17]:
model_names = ['rating_model_1', 'rating_model_2', 'rating_model_3']
clean_model_names = ["Altman's Z", 'Financial Variables and Sector', 'Financial Variables, Sector, and NLP Features']
# set most_complex_model 
most_complex_model = 'rating_model_3'

## Variable Index

In [18]:
# Load variable index
variable_index = pd.read_excel('../../../../Variable Index.xlsx')
variable_index

Unnamed: 0,column_name,Clean Column Name,Variable Type,Data Type,Ratio?,Notes,Rating Model 1,Rating Model 2,Rating Model 3,Change Model 1,Change Model 2,Change Model 3
0,Altman_Z,Altman's Z Score,Altman's Z Score,Numeric,Y,,X,,,X,,
1,EBIT,EBIT,Constructed for Altman's Z,Numeric,,,,X,X,,X,X
2,common_plus_preferred_stock,Common Plus Preferred Stock,Constructed for Altman's Z,Numeric,,,,X,X,,X,X
3,workingCapital,Working Capital,Constructed for Altman's Z,Numeric,,,,X,X,,X,X
4,Ratio_A,Ratio A,Constructed for Altman's Z,Numeric,Y,,,X,X,,X,X
...,...,...,...,...,...,...,...,...,...,...,...,...
200,operatingCashFlowPerShare_diff,Difference in Operating Cash Flow Per Share fr...,Additional Change Ratios,Numeric,,"Primarily for changes models, but can be used ...",,,,,X,X
201,freeCashFlowPerShare_diff,Difference in Free Cash Flow Per Share from pr...,Additional Change Ratios,Numeric,,"Primarily for changes models, but can be used ...",,,,,X,X
202,cashPerShare_diff,Difference in Cash Per Share from prior fixed ...,Additional Change Ratios,Numeric,,"Primarily for changes models, but can be used ...",,,,,X,X
203,operatingCashFlowToSales_diff,Difference in Operating Cash Flow to Sales fro...,Additional Change Ratios,Numeric,,"Primarily for changes models, but can be used ...",,,,,X,X


## Model Comparison

In [19]:
def get_model_comparison_row(model_name, clean_model_name):
    '''
    Given the model name and clean model name, this function returns the model comparison row.
    '''

    # Load close_exact_dict
    close_exact_dict = pd.read_pickle('../../../../Output/Modelling/' + classifier_name + '/' + model_name + '/' + model_name + '_close_exact_dict.pkl')
    # Version with each item rounded to 4 decimal places
    close_exact_dict_rounded = {k: round(v, 4) for k, v in close_exact_dict.items()}
    # Unpack
    exact_predictions_share = close_exact_dict_rounded['exact_predictions_share']
    close_predictions_share = close_exact_dict_rounded['close_predictions_share']

    # Load acc_f1_majority
    acc_f1_majority = pd.read_pickle('../../../../Output/Modelling/' + classifier_name + '/' + model_name + '/' + model_name + '_acc_f1_majority.pkl')
    # Version with each item rounded to 2 decimal places
    acc_f1_majority_rounded = {k: round(v, 4) for k, v in acc_f1_majority.items()}
    # Unpack
    accuracy = acc_f1_majority_rounded['accuracy']
    f1 = acc_f1_majority_rounded['f1_score']
    majority_baseline = acc_f1_majority_rounded['majority_baseline']

    # Check exact_predictions_share == accuracy
    print('exact predictions share == accuracy:', exact_predictions_share == accuracy)

    # Get weighted average precision and recall from classification report
    classification_report = pd.read_pickle('../../../../Output/Modelling/' + classifier_name + '/' + model_name + '/' + model_name + '_classification_report.pkl')
    # Convert classification report string to dataframe
    classification_report_lines = classification_report.split('\n')
    # split on spaces within and drop blanks
    classification_report_data = [line.split() for line in classification_report_lines if line]
    # drop lists begining with 'precision', 'accuracy', 'macro', 'weighted'
    classification_report_data = [line for line in classification_report_data if line[0] in ['weighted']]
    # Unpack
    weighted_avg_precision = classification_report_data[0][2]
    weighted_avg_recall = classification_report_data[0][3]

    # Create dataframe row
    model_comparison_row = pd.DataFrame({
        'Model/Baseline': [clean_model_name],
        'Accuracy': [accuracy],
        'Weighted Average Precision': [weighted_avg_precision],
        'Weighted Average Recall': [weighted_avg_recall],
        'F1 Score': [f1],
        'Share 1 Rating Or Less From Actual': [close_predictions_share]
    })

    # Return row
    return model_comparison_row, majority_baseline

## Model Comparison Table - Include and Exclude Previous Rating Versions

In [20]:
# Iterate over include_exclude_previous
for include_exclude_previous in ['exclude_previous_', 'include_previous_']:

    # Create list of df rows
    model_comparison_rows = []
    majority_baselines = []
    for model_name, clean_model_name in zip(model_names, clean_model_names):
        model_comparison_row, majority_baseline = get_model_comparison_row(include_exclude_previous + model_name, clean_model_name)
        model_comparison_rows.append(model_comparison_row)
        majority_baselines.append(majority_baseline)

    # Concatenate rows
    model_comparison_df = pd.concat(model_comparison_rows)

    # Check majority baselines are the same
    print('Majority baselines are the same:', all([majority_baseline == majority_baselines[0] for majority_baseline in majority_baselines]))
    # Add row with Model/Baseline = 'Most Common Class Baseline' and Accuracy = majority_baseline[0]
    model_comparison_df = pd.concat([model_comparison_df, pd.DataFrame({
        'Model/Baseline': ['Most Common Class Baseline'],
        'Accuracy': [majority_baselines[0]],
        'Weighted Average Precision': [''],
        'Weighted Average Recall': [''],
        'F1 Score': [''],
        'Share 1 Rating Or Less From Actual': ['']
    })])

    # if include set include_exclude_previous_model_comparison_df
    if include_exclude_previous == 'include_previous_':
        include_previous_model_comparison_df = model_comparison_df
    else:
        exclude_previous_model_comparison_df = model_comparison_df

    # Export to Excel
    model_comparison_df.to_excel('../../../../Output/Modelling/' + classifier_name + '/Tables/' + include_exclude_previous + 'model_comparison_df.xlsx', index = False)

    # Export to LaTeX
    # Format columns
    for col in model_comparison_df.columns:
        model_comparison_df[col] = model_comparison_df[col].apply(lambda x: f'{x:.4f}' if isinstance(x, float) else x)
    # Rename 'Share 1 Rating Or Less From Actual' to 'Share $le$ Rating From Actual'
    model_comparison_df.rename(columns={'Share 1 Rating Or Less From Actual': 'Share $\\le$ 1 Rating From Actual'}, inplace=True)
    # Center all columns
    lt_string = model_comparison_df.to_latex(index=False, column_format='c' * len(model_comparison_df.columns), escape=False)
    latex_with_font_size = "\\footnotesize\n" + lt_string + "\n\\normalsize"
    with open('../../../../Output/Modelling/' + classifier_name + '/Tables/' + include_exclude_previous + 'model_comparison_df.tex', 'w') as f:
        f.write(latex_with_font_size)

    # Middle version latex - keep only columns 'Model/Baseline', 'Accuracy', 'Share $\\le$ 1 Rating From Actual'
    model_comparison_df_middle = model_comparison_df[['Model/Baseline', 'Accuracy', 'Share $\\le$ 1 Rating From Actual']]
    # Center all columns
    lt_string = model_comparison_df_middle.to_latex(index=False, column_format='c' * len(model_comparison_df_middle.columns), escape=False)
    latex_with_font_size = "\\footnotesize\n" + lt_string + "\n\\normalsize"
    with open('../../../../Output/Modelling/' + classifier_name + '/Tables/' + include_exclude_previous + 'model_comparison_df_middle.tex', 'w') as f:
        f.write(latex_with_font_size)

    # Smaller version latex - keep only columns 'Model/Baseline', 'Accuracy'
    model_comparison_df_smaller = model_comparison_df[['Model/Baseline', 'Accuracy']]
    # Center all columns
    lt_string = model_comparison_df_smaller.to_latex(index=False, column_format='c' * len(model_comparison_df_smaller.columns), escape=False)
    latex_with_font_size = "\\footnotesize\n" + lt_string + "\n\\normalsize"
    with open('../../../../Output/Modelling/' + classifier_name + '/Tables/' + include_exclude_previous + 'model_comparison_df_smaller.tex', 'w') as f:
        f.write(latex_with_font_size)

    print(model_comparison_df)

    # If include_previous, add a row that is the share of items with "Same" in test data
    if include_exclude_previous == 'include_previous_':
        test_data_example = pd.read_excel('../../../../Data/Predictions/Logistic Regression/rating_change_model_1/rating_change_model_1_predictions.xlsx')
        print(test_data_example)
        same_share = test_data_example[test_data_example['Change Direction Since Last Fixed Quarter Date'] == 'Same As Last Fixed Quarter Date'].shape[0] / test_data_example.shape[0]
        # round to 4 decimal places
        same_share = round(same_share, 4)
        print('same share')
        print(same_share)
        # Create dataframe row
        same_share_row = pd.DataFrame({
            'Model/Baseline': ['Previous Rating'],
            'Accuracy': [same_share],
            'Weighted Average Precision': [''],
            'Weighted Average Recall': [''],
            'F1 Score': [''],
        })
        # Concatenate rows
        model_comparison_df_ss = pd.concat([model_comparison_df, same_share_row])
        # Export to Excel
        model_comparison_df_ss.to_excel('../../../../Output/Modelling/' + classifier_name + '/Tables/' + include_exclude_previous + 'model_comparison_df_ss.xlsx', index = False)

exact predictions share == accuracy: True
exact predictions share == accuracy: True
exact predictions share == accuracy: True
Majority baselines are the same: True
                                  Model/Baseline Accuracy  \
0                                     Altman's Z   0.3855   
0                 Financial Variables and Sector   0.7630   
0  Financial Variables, Sector, and NLP Features   0.9034   
0                     Most Common Class Baseline   0.3247   

  Weighted Average Precision Weighted Average Recall F1 Score  \
0                     0.3468                  0.3855   0.3408   
0                     0.7679                  0.7630   0.7575   
0                     0.9040                  0.9034   0.9017   
0                                                               

  Share $\le$ 1 Rating From Actual  
0                           0.7657  
0                           0.9597  
0                           0.9857  
0                                   
exact predictions s

In [21]:
# Styling
#print(exclude_previous_model_comparison_df)
exclude_previous_model_comparison_sty = (exclude_previous_model_comparison_df.reset_index(drop=True)[['Model/Baseline', 'Accuracy']]
                            .style
                            .format(precision=2, thousands=",", decimal=".")
                            .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
                            .hide()
                            .set_properties(**{'text-align': 'center'}))

dfi.export(exclude_previous_model_comparison_sty, '../../../../Output/Modelling/' + classifier_name + '/Tables/' + 'exclude_previous_' + 'model_comparison_df.png')
#exclude_previous_model_comparison_sty

include_previous_model_comparison_sty = (include_previous_model_comparison_df.reset_index(drop=True)[['Model/Baseline', 'Accuracy']]
                            .style
                            .format(precision=2, thousands=",", decimal=".")
                            .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
                            .hide()
                            .set_properties(**{'text-align': 'center'}))

dfi.export(include_previous_model_comparison_sty, '../../../../Output/Modelling/' + classifier_name + '/Tables/' + 'include_previous_' + 'model_comparison_df.png')
#include_previous_model_comparison_sty

model_comparison_df_ss = (model_comparison_df_ss.reset_index(drop=True)[['Model/Baseline', 'Accuracy']]
                            .style
                            .format(precision=4, thousands=",", decimal=".")
                            .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
                            .hide()
                            .set_properties(**{'text-align': 'center'}))

dfi.export(model_comparison_df_ss, '../../../../Output/Modelling/' + classifier_name + '/Tables/' + 'include_previous_' + 'model_comparison_df_ss.png')
#model_comparison_df_ss

## Most Complex Classification Rep

In [22]:
# Iterate over include_exclude_previous
for include_exclude_previous in ['exclude_previous_', 'include_previous_']:

    # Load classificiation report from pickle
    classification_report = pd.read_pickle('../../../../Output/Modelling/' + classifier_name + '/' + include_exclude_previous + most_complex_model + '/' + include_exclude_previous + most_complex_model + '_classification_report.pkl')
    print(classification_report)

    # Convert classification report string to dataframe
    classification_report_lines = classification_report.split('\n')
    # split on spaces within and drop blanks
    classification_report_data = [line.split() for line in classification_report_lines if line]
    # drop lists begining with 'precision', 'accuracy', 'macro', 'weighted'
    classification_report_data = [line for line in classification_report_data if line[0] not in ['precision', 'accuracy', 'macro', 'weighted']]
    # Stack list of rows into dataframe
    classification_report_data = pd.DataFrame(classification_report_data)
    # Set columns to "Rating", "Precision", "Recall", "F1-Score", "Support"
    classification_report_data.columns = ['Rating', 'Precision', 'Recall', 'F1-Score', 'Support']
    # Sort by Rating in correct order: AAA, AA, A, BBB, BB, B, CCC, CC, C, D
    rating_map = {'AAA': 0, 'AA': 1, 'A': 2, 'BBB': 3, 'BB': 4, 'B': 5, 'CCC': 6, 'CC': 7, 'C': 8, 'D': 9}
    classification_report_data['Rating Num'] = classification_report_data['Rating'].map(rating_map)
    classification_report_data = classification_report_data.sort_values(by='Rating Num').drop(columns='Rating Num')
    print(classification_report_data)

    # Export to Excel
    classification_report_data.to_excel('../../../../Output/Modelling/' + classifier_name + '/Tables/' + include_exclude_previous + 'Most_Complex_Model_Classification_Report.xlsx', index=False)

    # Export to Latex
    #classification_report_data.to_latex('../../../../Output/Modelling/Logistic Regression/Tables/Most_Complex_Model_Classification_Report.tex', index=False)

    # Export to LaTeX
    # Format columns
    for col in classification_report_data.columns:
        classification_report_data[col] = classification_report_data[col].apply(lambda x: f'{x:.2f}' if isinstance(x, float) else x)
    # Center all columns
    lt_string = classification_report_data.to_latex(index=False, column_format='c' * 5, escape=False)
    latex_with_font_size = "\\footnotesize\n" + lt_string + "\n\\normalsize"
    with open('../../../../Output/Modelling/' + classifier_name + '/Tables/' + include_exclude_previous + 'Most_Complex_Model_Classification_Report.tex', 'w') as f:
        f.write(latex_with_font_size)

              precision    recall  f1-score   support

           A     0.8447    0.8894    0.8665       208
          AA     0.8824    0.5769    0.6977        52
         AAA     0.9048    0.7917    0.8444        24
           B     0.9272    0.9091    0.9180       154
          BB     0.9357    0.9225    0.9291       284
         BBB     0.9129    0.9532    0.9326       363
           C     1.0000    1.0000    1.0000         4
          CC     1.0000    0.5000    0.6667         2
         CCC     0.7857    0.8462    0.8148        26
           D     1.0000    1.0000    1.0000         1

    accuracy                         0.9034      1118
   macro avg     0.9193    0.8389    0.8670      1118
weighted avg     0.9040    0.9034    0.9017      1118

  Rating Precision  Recall F1-Score Support
2    AAA    0.9048  0.7917   0.8444      24
1     AA    0.8824  0.5769   0.6977      52
0      A    0.8447  0.8894   0.8665     208
5    BBB    0.9129  0.9532   0.9326     363
4     BB    0.9357  0

## Most Complex Hyperparameters

In [23]:
# List to store best parameters dfs
best_params_dfs = []

# Iterate over include_exclude_previous
for include_exclude_previous in ['exclude_previous_', 'include_previous_']:

    # Load pickle
    best_params = pd.read_pickle('../../../../Output/Modelling/' + classifier_name + '/' + include_exclude_previous + most_complex_model + '/' + include_exclude_previous + most_complex_model + '_best_params.pkl')
    print(best_params)

    # Convert to dataframe
    best_params = pd.DataFrame(best_params, index=[0])

    # Set columns
    print(classifier_name)
    if "include_previous_" in include_exclude_previous:
        best_params.columns = ['booster', 'learning_rate', 'max_depth', 'min_child_weight', 'n_estimators', 'objective']
    else:
        best_params.columns = ['booster', 'gamma','learning_rate', 'max_depth', 'min_child_weight', 'n_estimators', 'objective']
    # Replace 'Multi-Class Strategy' values
    #best_params['Multi-Class Strategy'] = best_params['Multi-Class Strategy'].replace({'ovr': 'One vs Rest', 'multinomial': 'Multinomial'})
    # Replace 'Penalty' values
    #best_params['Penalty'] = best_params['Penalty'].replace({'l1': 'L1', 'l2': 'L2', 'elasticnet': 'Elastic Net', 'none': 'None'})
    # Replace 'Solver' values
   # best_params['Solver'] = best_params['Solver'].replace({'newton-cg': 'Newton Conjugate Gradient', 'lbfgs': 'Limited Memory Broyden–Fletcher–Goldfarb–Shanno', 'liblinear': 'Library for Large Linear Classification', 'sag': 'Stochastic Average Gradient', 'saga': 'SAGA'})
    # Replace Class Weighting Strategy values
    #best_params['Class Weighting Strategy'] = best_params['Class Weighting Strategy'].replace({'balanced': 'Balanced', None: 'None'})
    
    # Column at the front for whether previous ratings are included or excluded
    best_params.insert(0, 'Previous Ratings', include_exclude_previous[:-1].replace('_', ' ').title())

    print(best_params)

    #best_params = best_params.reset_index(drop=True)

    # Rename booster to Booster, gamma to Gamma, learning_rate to Learning Rate, max_depth to Max Depth, min_child_weight to Min Child Weight, n_estimators to Number of Estimators, objective to Objective
    print(best_params.columns)
    best_params.rename(columns={'booster': 'Booster', 'gamma': 'Gamma', 'learning_rate': 'Learning Rate', 'max_depth': 'Max Depth', 'min_child_weight': 'Min Child Weight', 'n_estimators': 'Number of Estimators', 'objective': 'Objective'}, inplace=True)

    # Export smaller version to Excel
    # drop first col, Booster, Objective
    bp_small = best_params.drop(columns=['Booster', 'Objective', 'Previous Ratings'])
    bp_small.to_excel('../../../../Output/Modelling/' + classifier_name + '/Tables/' + include_exclude_previous + 'Most_Complex_Model_Best_Params.xlsx', index=False)

    # Export to LaTeX
    print(bp_small.columns)
    for col in ['Learning Rate']:
        bp_small[col] = bp_small[col].apply(lambda x: '{:,.2f}'.format(x))
    if 'Gamma' in bp_small.columns:
        bp_small['Gamma'] = bp_small['Gamma'].apply(lambda x: '{:,.2f}'.format(x))
    for col in ['Max Depth', 'Min Child Weight', 'Number of Estimators']:
        bp_small[col] = bp_small[col].apply(lambda x: '{:,.0f}'.format(x))
    # Center all columns
    lt_string = bp_small.to_latex(index=False, column_format='c' * len(bp_small.columns), escape=False)
    latex_with_font_size = "\\footnotesize\n" + lt_string + "\n\\normalsize"
    with open('../../../../Output/Modelling/' + classifier_name + '/Tables/' + include_exclude_previous + 'Most_Complex_Models_Best_Params.tex', 'w') as f:
        f.write(latex_with_font_size) 

    # Append to best_params_dfs
    best_params_dfs.append(best_params)

# Concatenate best_params_dfs
best_params = pd.concat(best_params_dfs)
print(best_params)

# Export to Excel
best_params.to_excel('../../../../Output/Modelling/' + classifier_name + '/Tables/Most_Complex_Models_Best_Params.xlsx', index=False)

# Export to Latex
#best_params.to_latex('../../../../Output/Modelling/Logistic Regression/Tables/Most_Complex_Model_Best_Params.tex', index=False)

# Export to LaTeX
for col in ['Gamma', 'Learning Rate']:
    best_params[col] = best_params[col].apply(lambda x: '{:,.2f}'.format(x))
for col in ['Max Depth', 'Min Child Weight', 'Number of Estimators']:
    best_params[col] = best_params[col].apply(lambda x: '{:,.0f}'.format(x))
# Center all columns
lt_string = best_params.to_latex(index=False, column_format='c' * len(best_params.columns), escape=False)
latex_with_font_size = "\\footnotesize\n" + lt_string + "\n\\normalsize"
with open('../../../../Output/Modelling/' + classifier_name + '/Tables/Most_Complex_Models_Best_Params.tex', 'w') as f:
    f.write(latex_with_font_size)

{'booster': 'gbtree', 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 20, 'min_child_weight': 5, 'n_estimators': 1000, 'objective': 'multi:softprob'}
XGBoost
   Previous Ratings booster  gamma  learning_rate  max_depth  \
0  Exclude Previous  gbtree    0.1            0.1         20   

   min_child_weight  n_estimators       objective  
0                 5          1000  multi:softprob  
Index(['Previous Ratings', 'booster', 'gamma', 'learning_rate', 'max_depth',
       'min_child_weight', 'n_estimators', 'objective'],
      dtype='object')
Index(['Gamma', 'Learning Rate', 'Max Depth', 'Min Child Weight',
       'Number of Estimators'],
      dtype='object')
{'booster': 'gbtree', 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100, 'objective': 'multi:softprob'}
XGBoost
   Previous Ratings booster  learning_rate  max_depth  min_child_weight  \
0  Include Previous  gbtree           0.01          3                 3   

   n_estimators       objective  
0  

## Most Complex Permutation Importance

In [24]:
# Iterate over include_exclude_previous
for include_exclude_previous in ['exclude_previous_', 'include_previous_']:

    # Load data
    permutation_importance = pd.read_parquet('../../../../Output/Modelling/' + classifier_name + '/' + include_exclude_previous + most_complex_model + '/' + include_exclude_previous + most_complex_model + '_permutation_importance.parquet')
    permutation_importance = permutation_importance.sort_values('mean',ascending=False)
    # Set columns to "Feature", "Mean", "Standard Deviation"
    permutation_importance.columns = ['Feature', 'Mean', 'Standard Deviation']
     # Strip 'cat__' and 'num__' from Feature
    permutation_importance['Feature'] = permutation_importance['Feature'].str.replace('cat__', '').str.replace('num__', '')
    # Use variable_index to get feature names
    permutation_importance = permutation_importance.merge(variable_index[['column_name', 'Clean Column Name']], left_on='Feature', right_on='column_name', how='left')
    # Set Clean_Column_Name to Feature if no match
    permutation_importance['Clean Column Name'] = permutation_importance['Clean Column Name'].fillna(permutation_importance['Feature'])
    # Drop Feature and column_name
    permutation_importance = permutation_importance.drop(columns=['Feature', 'column_name'])
    # Clean up names for categorical columns
    previous_rating_mapping = {'rating_on_previous_fixed_quarter_date_AAA': 'Rating on Previous Fixed Quarter Date AAA',
                                'rating_on_previous_fixed_quarter_date_AA': 'Rating on Previous Fixed Quarter Date AA',
                                'rating_on_previous_fixed_quarter_date_A': 'Rating on Previous Fixed Quarter Date A',
                                'rating_on_previous_fixed_quarter_date_BBB': 'Rating on Previous Fixed Quarter Date BBB',
                                'rating_on_previous_fixed_quarter_date_BB': 'Rating on Previous Fixed Quarter Date BB',
                                'rating_on_previous_fixed_quarter_date_B': 'Rating on Previous Fixed Quarter Date B',
                                'rating_on_previous_fixed_quarter_date_CCC': 'Rating on Previous Fixed Quarter Date CCC',
                                'rating_on_previous_fixed_quarter_date_CC': 'Rating on Previous Fixed Quarter Date CC',
                                'rating_on_previous_fixed_quarter_date_C': 'Rating on Previous Fixed Quarter Date C',
                                'rating_on_previous_fixed_quarter_date_D': 'Rating on Previous Fixed Quarter Date D'}
    permutation_importance['Clean Column Name'] = permutation_importance['Clean Column Name'].replace(previous_rating_mapping)
    # Replace 'Sector_' with 'Sector: '
    permutation_importance['Clean Column Name'] = permutation_importance['Clean Column Name'].str.replace('Sector_', 'Sector: ')
    # Rename Clean Column Name to Feature
    permutation_importance = permutation_importance.rename(columns={'Clean Column Name': 'Feature'})
    # Reorder columns to put Feature first
    permutation_importance = permutation_importance[['Feature', 'Mean', 'Standard Deviation']]
    # Rename Mean to 'Mean Accuracy Drop'
    permutation_importance = permutation_importance.rename(columns={'Mean': 'Mean Accuracy Drop'})
    # Rename Feature to 'Permuted Feature'
    permutation_importance = permutation_importance.rename(columns={'Feature': 'Permuted Feature'})
    # Get top 15
    pi_top_15 = permutation_importance.head(15)

    # If includ_previous set to include_previous_pi_top_15
    if include_exclude_previous == 'include_previous_':
        include_previous_pi_top_15 = pi_top_15
    else:
        exclude_previous_pi_top_15 = pi_top_15

    # Export to Excel
    pi_top_15.to_excel('../../../../Output/Modelling/' + classifier_name + '/Tables/' + include_exclude_previous + 'Most_Complex_Model_Permutation_Importance_Top_15.xlsx', index=False)

    # Export to Latex
    #pi_top_15.to_latex('../../../../Output/Modelling/Logistic Regression/Tables/Most_Complex_Model_Permutation_Importance_Top_15.tex', index=False)

    # Export to LaTeX
    # Center all columns
    lt_string = pi_top_15.to_latex(index=False, column_format='c' * len(pi_top_15.columns), escape=False)
    latex_with_font_size = "\\tiny\n" + lt_string + "\n\\normalsize"
    with open('../../../../Output/Modelling/' + classifier_name + '/Tables/' + include_exclude_previous + 'Most_Complex_Model_Permutation_Importance_Top_15.tex', 'w') as f:
        f.write(latex_with_font_size)

    print(pi_top_15)

                               Permuted Feature  Mean Accuracy Drop  \
0                             Retained Earnings            0.043819   
1                         Market Capitalization            0.035169   
2                                Dividends Paid            0.021455   
3                                    Debt Ratio            0.009987   
4                                  Common Stock            0.009693   
5                                       Ratio E            0.009535   
6              Other Total Stockholders' Equity            0.009288   
7                     Total Current Liabilities            0.006888   
8                     Inventory (Balance Sheet)            0.006802   
9                          Total Current Assets            0.006684   
10  Selling General and Administrative Expenses            0.006031   
11                             Interest Expense            0.005973   
12                 Net Property Plant Equipment            0.005729   
13    

In [25]:
include_previous_pi_top_15 = (include_previous_pi_top_15.reset_index(drop=True)
                            .style
                            .format(precision=6, thousands=",", decimal=".")
                            .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
                            .hide()
                            .set_properties(**{'text-align': 'center'}))

dfi.export(include_previous_pi_top_15, '../../../../Output/Modelling/' + classifier_name + '/Tables/' + 'include_previous_' + 'Most_Complex_Model_Permutation_Importance_Top_15.png')
#include_previous_pi_top_15

exclude_previous_pi_top_15 = (exclude_previous_pi_top_15.reset_index(drop=True)
                            .style
                            .format(precision=6, thousands=",", decimal=".")
                            .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
                            .hide()
                            .set_properties(**{'text-align': 'center'}))

dfi.export(exclude_previous_pi_top_15, '../../../../Output/Modelling/' + classifier_name + '/Tables/' + 'exclude_previous_' + 'Most_Complex_Model_Permutation_Importance_Top_15.png')
#exclude_previous_pi_top_15

## Change Model Comparison Tables

In [26]:
# Iterate over just models '_' versus _smote_
for spec in ['', 'smote_']:

    # Create list of df rows
    model_comparison_rows = []
    majority_baselines = []
    for mn, clean_model_name in zip(['rating_change_model_1', 'rating_change_model_2', 'rating_change_model_3'], ["Altman's Z", 'Financial Variables and Sector', 'Financial Variables, Sector, and NLP Features']):
        model_comparison_row, majority_baseline = get_model_comparison_row(spec + mn, clean_model_name)
        model_comparison_rows.append(model_comparison_row)
        majority_baselines.append(majority_baseline)

    # Concatenate rows
    model_comparison_df = pd.concat(model_comparison_rows)

    # Check majority baselines are the same
    print('Majority baselines are the same:', all([majority_baseline == majority_baselines[0] for majority_baseline in majority_baselines]))
    # Add row with Model/Baseline = 'Most Common Class Baseline' and Accuracy = majority_baseline[0]
    model_comparison_df = pd.concat([model_comparison_df, pd.DataFrame({
        'Model/Baseline': ['Most Common Class Baseline'],
        'Accuracy': [majority_baselines[0]],
        'Weighted Average Precision': [''],
        'Weighted Average Recall': [''],
        'F1 Score': [''],
        'Share 1 Rating Or Less From Actual': ['']
    })])
    # Drop 'Share 1 Rating Or Less From Actual' column
    model_comparison_df = model_comparison_df.drop(columns=['Share 1 Rating Or Less From Actual'])

    # if smote set as change_smote_model_comparison_df
    if spec == 'smote_':
        change_smote_model_comparison_df = model_comparison_df
    else:
        change_model_comparison_df = model_comparison_df

    # Export to Excel
    model_comparison_df.to_excel('../../../../Output/Modelling/' + classifier_name + '/Tables/change_' + spec + 'model_comparison_df.xlsx', index = False)

    # Export to LaTeX
    # Format columns
    for col in model_comparison_df.columns:
        model_comparison_df[col] = model_comparison_df[col].apply(lambda x: f'{x:.4f}' if isinstance(x, float) else x)
    # Rename 'Share 1 Rating Or Less From Actual' to 'Share $le$ Rating From Actual'
    #model_comparison_df.rename(columns={'Share 1 Rating Or Less From Actual': 'Share $\\le$ 1 Rating From Actual'}, inplace=True)
    # Center all columns
    lt_string = model_comparison_df.to_latex(index=False, column_format='c' * len(model_comparison_df.columns), escape=False)
    latex_with_font_size = "\\footnotesize\n" + lt_string + "\n\\normalsize"
    with open('../../../../Output/Modelling/' + classifier_name + '/Tables/change_' + spec + 'model_comparison_df.tex', 'w') as f:
        f.write(latex_with_font_size)

    # Smaller version latex - keep only columns 'Model/Baseline', 'Accuracy'
    model_comparison_df_smaller = model_comparison_df[['Model/Baseline', 'Accuracy']]
    # Center all columns
    lt_string = model_comparison_df_smaller.to_latex(index=False, column_format='c' * len(model_comparison_df_smaller.columns), escape=False)
    latex_with_font_size = "\\footnotesize\n" + lt_string + "\n\\normalsize"
    with open('../../../../Output/Modelling/' + classifier_name + '/Tables/change_' + spec + 'model_comparison_df_smaller.tex', 'w') as f:
        f.write(latex_with_font_size)

    print(model_comparison_df)

exact predictions share == accuracy: True
exact predictions share == accuracy: True
exact predictions share == accuracy: True
Majority baselines are the same: True
                                  Model/Baseline Accuracy  \
0                                     Altman's Z   0.9186   
0                 Financial Variables and Sector   0.9517   
0  Financial Variables, Sector, and NLP Features   0.9535   
0                     Most Common Class Baseline   0.9535   

  Weighted Average Precision Weighted Average Recall F1 Score  
0                     0.9111                  0.9186   0.9148  
0                     0.9091                  0.9517   0.9299  
0                     0.9091                  0.9535   0.9308  
0                                                              
exact predictions share == accuracy: True
exact predictions share == accuracy: True
exact predictions share == accuracy: True
Majority baselines are the same: True
                                  Model/Baseli

In [27]:
# Styling
#print(exclude_previous_model_comparison_df)
change_model_comparison_df = (change_model_comparison_df.reset_index(drop=True)[['Model/Baseline', 'Accuracy']]
                            .style
                            .format(precision=2, thousands=",", decimal=".")
                            .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
                            .hide()
                            .set_properties(**{'text-align': 'center'}))

dfi.export(change_model_comparison_df, '../../../../Output/Modelling/' + classifier_name + '/Tables/' + 'change_model_comparison_df.png')
#change_model_comparison_df

change_smote_model_comparison_df = (change_smote_model_comparison_df.reset_index(drop=True)[['Model/Baseline', 'Accuracy']]
                            .style
                            .format(precision=2, thousands=",", decimal=".")
                            .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
                            .hide()
                            .set_properties(**{'text-align': 'center'}))

dfi.export(change_smote_model_comparison_df, '../../../../Output/Modelling/' + classifier_name + '/Tables/' + 'change_smote_model_comparison_df.png')
#change_smote_model_comparison_df

## Change Model Classification Report

In [28]:
# Load classificiation report from pickle
classification_report = pd.read_pickle('../../../../Output/Modelling/' + classifier_name + '/smote_rating_change_model_3/smote_rating_change_model_3_classification_report.pkl')
print(classification_report)

# Convert classification report string to dataframe
classification_report_lines = classification_report.split('\n')
# split on spaces within and drop blanks
classification_report_data = [line.split() for line in classification_report_lines if line]
# drop lists begining with 'precision', 'accuracy', 'macro', 'weighted'
classification_report_data = [line for line in classification_report_data if line[0] not in ['precision', 'accuracy', 'macro', 'weighted']]
# Stack list of rows into dataframe
classification_report_data = pd.DataFrame(classification_report_data)
# Merge columns 0 to 5 into one column on spaces
classification_report_data[0] = classification_report_data[[0, 1, 2, 3, 4, 5]].apply(lambda x: ' '.join(x), axis=1)
# Drop columns 1 to 4
classification_report_data = classification_report_data.drop(columns=[1, 2, 3, 4, 5])
print(classification_report_data)
# Set columns to "Change", "Precision", "Recall", "F1-Score", "Support"
classification_report_data.columns = ['Change', 'Precision', 'Recall', 'F1-Score', 'Support']
# Recode change: 'Downgrade Since Last Fixed Quarter Date' as 'Downgrade', 'Same As Last Fixed Quarter Date' as 'Same', 'Upgrade Since Last Fixed Quarter Date' as 'Upgrade'
change_mapping = {'Downgrade Since Last Fixed Quarter Date': 'Downgrade', 'Same As Last Fixed Quarter Date': 'Same', 'Upgrade Since Last Fixed Quarter Date': 'Upgrade'}
classification_report_data['Change'] = classification_report_data['Change'].map(change_mapping)
# Sort by Change in correct order: Downgrade, Same, Upgrade
rating_map = {'Downgrade': 0, 'Same': 1, 'Upgrade': 2}
classification_report_data['Change Num'] = classification_report_data['Change'].map(rating_map)
classification_report_data = classification_report_data.sort_values(by='Change Num').drop(columns='Change Num')
# Add commas in Support
classification_report_data['Support'] = classification_report_data['Support'].apply(lambda x: '{:,}'.format(int(x)))
print(classification_report_data)

# Export to Excel
classification_report_data.to_excel('../../../../Output/Modelling/' + classifier_name + '/Tables/SMOTE_Most_Complex_Model_Classification_Report.xlsx', index=False)

# Export to LaTeX
# Format columns
for col in classification_report_data.columns:
    classification_report_data[col] = classification_report_data[col].apply(lambda x: f'{x:.2f}' if isinstance(x, float) else x)
# Center all columns
lt_string = classification_report_data.to_latex(index=False, column_format='c' * 5, escape=False)
latex_with_font_size = "\\footnotesize\n" + lt_string + "\n\\normalsize"
with open('../../../../Output/Modelling/' + classifier_name + '/Tables/SMOTE_Most_Complex_Model_Classification_Report.tex', 'w') as f:
    f.write(latex_with_font_size)

                                         precision    recall  f1-score   support

Downgrade Since Last Fixed Quarter Date     0.0000    0.0000    0.0000        20
        Same As Last Fixed Quarter Date     0.9533    0.9953    0.9738      1066
  Upgrade Since Last Fixed Quarter Date     0.0000    0.0000    0.0000        32

                               accuracy                         0.9490      1118
                              macro avg     0.3178    0.3318    0.3246      1118
                           weighted avg     0.9089    0.9490    0.9285      1118

                                         0       6       7       8     9
0  Downgrade Since Last Fixed Quarter Date  0.0000  0.0000  0.0000    20
1          Same As Last Fixed Quarter Date  0.9533  0.9953  0.9738  1066
2    Upgrade Since Last Fixed Quarter Date  0.0000  0.0000  0.0000    32
      Change Precision  Recall F1-Score Support
0  Downgrade    0.0000  0.0000   0.0000      20
1       Same    0.9533  0.9953   0.9738   