# Create Tables

In [51]:
# Packages
import pandas as pd

## Set Classifier Name

In [52]:
# Name of the classifier whose saved outputs are tabulated; it is used as the
# folder name under '../../../../Output/Modelling/' in the load and export paths built below
classifier_name = 'Logistic Regression'

## List of Rating Models and Most Complex Model

In [53]:
# Rating models keyed by their output folder / file-name stem, mapped to the
# label shown in the exported tables (insertion order = table row order)
rating_model_labels = {
    'rating_model_1': "Altman's Z",
    'rating_model_2': 'Financial Variables and Sector',
    'rating_model_3': 'Financial Variables, Sector, and NLP Features',
}
model_names = list(rating_model_labels.keys())
clean_model_names = list(rating_model_labels.values())
# Model whose classification report, hyperparameters and permutation importance are tabulated below
most_complex_model = 'rating_model_3'

## Variable Index

In [54]:
# Load variable index
# Its 'column_name' and 'Clean Column Name' columns are used further down to
# replace raw feature names with readable labels in the permutation importance table
variable_index = pd.read_excel('../../../../Variable Index.xlsx')
# Show the loaded frame as the cell output
variable_index

Unnamed: 0,column_name,Clean Column Name,Variable Type,Data Type,Ratio?,Notes,Rating Model 1,Rating Model 2,Rating Model 3,Change Model 1,Change Model 2,Change Model 3
0,Altman_Z,Altman's Z Score,Altman's Z Score,Numeric,Y,,X,,,X,,
1,EBIT,EBIT,Constructed for Altman's Z,Numeric,,,,X,X,,X,X
2,common_plus_preferred_stock,Common Plus Preferred Stock,Constructed for Altman's Z,Numeric,,,,X,X,,X,X
3,workingCapital,Working Capital,Constructed for Altman's Z,Numeric,,,,X,X,,X,X
4,Ratio_A,Ratio A,Constructed for Altman's Z,Numeric,Y,,,X,X,,X,X
...,...,...,...,...,...,...,...,...,...,...,...,...
167,grossProfitRatio_diff,Difference in Gross Profit Ratio from prior fi...,Change Ratios,Numeric,,"Primarily for changes models, but can be used ...",,,,,X,X
168,ebitdaratio_diff,Difference in EBITDA Ratio from prior fixed qu...,Change Ratios,Numeric,,"Primarily for changes models, but can be used ...",,,,,X,X
169,operatingIncomeRatio_diff,Difference in Operating Income Ratio from prio...,Change Ratios,Numeric,,"Primarily for changes models, but can be used ...",,,,,X,X
170,incomeBeforeTaxRatio_diff,Difference in Income Before Tax Ratio from pri...,Change Ratios,Numeric,,"Primarily for changes models, but can be used ...",,,,,X,X


## Model Comparison

In [55]:
def get_model_comparison_row(model_name, clean_model_name):
    '''
    Assemble the single-row model comparison entry for one fitted model.

    Three pickles are read from
    '../../../../Output/Modelling/<classifier_name>/<model_name>/', where
    classifier_name is the notebook-level setting:
    '<model_name>_close_exact_dict.pkl', '<model_name>_acc_f1_majority.pkl'
    and '<model_name>_classification_report.pkl'.

    Parameters
    ----------
    model_name : str
        Folder name and file-name stem of the model's saved outputs.
    clean_model_name : str
        Label written to the 'Model/Baseline' column.

    Returns
    -------
    tuple
        (model_comparison_row, majority_baseline): a one-row DataFrame with the
        columns 'Model/Baseline', 'Accuracy', 'Weighted Average Precision',
        'Weighted Average Recall', 'F1 Score' and
        'Share 1 Rating Or Less From Actual', plus the majority baseline
        rounded to 4 decimal places.

    Notes
    -----
    Accuracy, F1 score and the close-prediction share are rounded to 4 decimal
    places. The weighted average precision and recall are the text tokens taken
    from the classification report, so they are strings rather than floats.
    As a side effect, the function prints whether the rounded exact-prediction
    share equals the rounded accuracy.
    '''

    # Every artefact of a model lives in the same folder and shares the same file-name stem
    artefact_stem = '../../../../Output/Modelling/' + classifier_name + '/' + model_name + '/' + model_name

    # Exact / close prediction shares, each value rounded to 4 decimal places
    close_exact = pd.read_pickle(artefact_stem + '_close_exact_dict.pkl')
    close_exact = {key: round(value, 4) for key, value in close_exact.items()}
    exact_share = close_exact['exact_predictions_share']
    close_share = close_exact['close_predictions_share']

    # Accuracy, F1 score and majority baseline, each value rounded to 4 decimal places
    headline_metrics = pd.read_pickle(artefact_stem + '_acc_f1_majority.pkl')
    headline_metrics = {key: round(value, 4) for key, value in headline_metrics.items()}
    accuracy = headline_metrics['accuracy']
    f1 = headline_metrics['f1_score']
    majority_baseline = headline_metrics['majority_baseline']

    # Consistency check between the two pickles (printed, not enforced)
    print('exact predictions share == accuracy:', exact_share == accuracy)

    # The classification report is plain text: tokenise every non-empty line on whitespace
    report_text = pd.read_pickle(artefact_stem + '_classification_report.pkl')
    tokenised_lines = [line.split() for line in report_text.split('\n') if line]
    # Keep only the 'weighted avg' line(s); the tokens are
    # ['weighted', 'avg', precision, recall, f1-score, support]
    weighted_lines = [tokens for tokens in tokenised_lines if tokens[0] == 'weighted']
    weighted_tokens = weighted_lines[0]
    weighted_avg_precision = weighted_tokens[2]
    weighted_avg_recall = weighted_tokens[3]

    # One value per output column, in the order the columns appear in the table
    row_values = {
        'Model/Baseline': clean_model_name,
        'Accuracy': accuracy,
        'Weighted Average Precision': weighted_avg_precision,
        'Weighted Average Recall': weighted_avg_recall,
        'F1 Score': f1,
        'Share 1 Rating Or Less From Actual': close_share,
    }
    model_comparison_row = pd.DataFrame({column: [value] for column, value in row_values.items()})

    return model_comparison_row, majority_baseline

## Model Comparison Table - Include and Exclude Previous Rating Versions

In [56]:
# Build, export (Excel + LaTeX) and print the model comparison table once per
# variant: models fitted without and with the previous rating as a feature
for include_exclude_previous in ['exclude_previous_', 'include_previous_']:

    # Collect one comparison row and one majority baseline per rating model
    model_comparison_rows = []
    majority_baselines = []
    for model_name, clean_model_name in zip(model_names, clean_model_names):
        model_comparison_row, majority_baseline = get_model_comparison_row(include_exclude_previous + model_name, clean_model_name)
        model_comparison_rows.append(model_comparison_row)
        majority_baselines.append(majority_baseline)

    # Sanity check (printed, not enforced): every model should report the same majority baseline
    print('Majority baselines are the same:', all(baseline == majority_baselines[0] for baseline in majority_baselines))

    # Extra row for the majority baseline; only its accuracy is defined, the other cells stay blank
    majority_baseline_row = pd.DataFrame({
        'Model/Baseline': ['Majority Baseline'],
        'Accuracy': [majority_baselines[0]],
        'Weighted Average Precision': [''],
        'Weighted Average Recall': [''],
        'F1 Score': [''],
        'Share 1 Rating Or Less From Actual': ['']
    })

    # Stack model rows and the baseline row; ignore_index gives each row its own
    # label instead of repeating the label 0 carried by every one-row frame
    model_comparison_df = pd.concat(model_comparison_rows + [majority_baseline_row], ignore_index=True)

    # Both exports share the same folder and file-name stem
    output_stem = '../../../../Output/Modelling/' + classifier_name + '/Tables/' + include_exclude_previous + 'model_comparison_df'

    # Export to Excel (before the values are turned into display strings)
    model_comparison_df.to_excel(output_stem + '.xlsx', index=False)

    # Export to LaTeX
    # Render floats with 4 decimal places; strings (labels, blanks, report tokens) pass through unchanged
    for col in model_comparison_df.columns:
        model_comparison_df[col] = model_comparison_df[col].apply(lambda x: f'{x:.4f}' if isinstance(x, float) else x)
    # Rename 'Share 1 Rating Or Less From Actual' to 'Share $\le$ 1 Rating From Actual'
    model_comparison_df = model_comparison_df.rename(columns={'Share 1 Rating Or Less From Actual': 'Share $\\le$ 1 Rating From Actual'})
    # Center all columns; escape=False keeps the $\le$ in the header as LaTeX math
    lt_string = model_comparison_df.to_latex(index=False, column_format='c' * len(model_comparison_df.columns), escape=False)
    latex_with_font_size = "\\footnotesize\n" + lt_string + "\n\\normalsize"
    with open(output_stem + '.tex', 'w', encoding='utf-8') as f:
        f.write(latex_with_font_size)

    print(model_comparison_df)

exact predictions share == accuracy: True
exact predictions share == accuracy: True
exact predictions share == accuracy: True
Majority baselines are the same: True
                                  Model/Baseline Accuracy  \
0                                     Altman's Z   0.3507   
0                 Financial Variables and Sector   0.6484   
0  Financial Variables, Sector, and NLP Features   0.6557   
0                              Majority Baseline   0.3013   

  Weighted Average Precision Weighted Average Recall F1 Score  \
0                     0.2435                  0.3507   0.2751   
0                     0.6464                  0.6484   0.6457   
0                     0.6573                  0.6557   0.6537   
0                                                               

  Share $\le$ 1 Rating From Actual  
0                           0.8022  
0                           0.9432  
0                           0.9460  
0                                   
exact predictions s

## Most Complex Model Classification Report

In [57]:
# Turn the most complex model's text classification report into a per-rating
# table and export it (Excel + LaTeX), once per previous-rating variant
for include_exclude_previous in ['exclude_previous_', 'include_previous_']:

    # Folder name and file-name stem of the model's saved outputs
    model_stem = include_exclude_previous + most_complex_model

    # Load classification report (a text block) from pickle
    classification_report = pd.read_pickle('../../../../Output/Modelling/' + classifier_name + '/' + model_stem + '/' + model_stem + '_classification_report.pkl')
    print(classification_report)

    # Tokenise each line on whitespace, skipping empty and whitespace-only lines
    report_rows = [line.split() for line in classification_report.split('\n') if line.strip()]
    # Drop the header and the summary lines so only the per-rating rows remain
    report_rows = [row for row in report_rows if row[0] not in ['precision', 'accuracy', 'macro', 'weighted']]
    # Per-rating table; every cell is still the text token taken from the report
    classification_report_data = pd.DataFrame(report_rows, columns=['Rating', 'Precision', 'Recall', 'F1-Score', 'Support'])
    print(classification_report_data)

    # Both exports share the same folder and file-name stem
    output_stem = '../../../../Output/Modelling/' + classifier_name + '/Tables/' + include_exclude_previous + 'Most_Complex_Model_Classification_Report'

    # Export to Excel with real numbers instead of numbers stored as text
    classification_report_numeric = classification_report_data.copy()
    for col in ['Precision', 'Recall', 'F1-Score', 'Support']:
        classification_report_numeric[col] = pd.to_numeric(classification_report_numeric[col])
    classification_report_numeric.to_excel(output_stem + '.xlsx', index=False)

    # Export to LaTeX
    # The cells are written exactly as they appear in the report text. The earlier
    # float-formatting pass was removed because every cell is a str, so it never changed anything.
    # Center all columns
    lt_string = classification_report_data.to_latex(index=False, column_format='c' * len(classification_report_data.columns), escape=False)
    latex_with_font_size = "\\footnotesize\n" + lt_string + "\n\\normalsize"
    with open(output_stem + '.tex', 'w', encoding='utf-8') as f:
        f.write(latex_with_font_size)

              precision    recall  f1-score   support

           A     0.6793    0.5841    0.6281       214
          AA     0.5882    0.4762    0.5263        42
         AAA     0.7917    0.7037    0.7451        27
           B     0.5822    0.6159    0.5986       138
          BB     0.6399    0.6678    0.6535       298
         BBB     0.6759    0.7416    0.7072       329
           C     0.7778    1.0000    0.8750         7
          CC     0.0000    0.0000    0.0000         0
         CCC     0.8095    0.5000    0.6182        34
           D     0.0000    0.0000    0.0000         3

    accuracy                         0.6557      1092
   macro avg     0.5545    0.5289    0.5352      1092
weighted avg     0.6573    0.6557    0.6537      1092

  Rating Precision  Recall F1-Score Support
0      A    0.6793  0.5841   0.6281     214
1     AA    0.5882  0.4762   0.5263      42
2    AAA    0.7917  0.7037   0.7451      27
3      B    0.5822  0.6159   0.5986     138
4     BB    0.6399  0

## Most Complex Hyperparameters

In [58]:
# Table labels for the saved hyperparameter keys. Renaming by key (rather than
# assigning labels by position) keeps labels and values matched whatever the key order in the pickle.
best_params_column_labels = {
    'C': 'C',
    'class_weight': 'Class Weighting Strategy',
    'l1_ratio': 'L1 Ratio',
    'multi_class': 'Multi-Class Strategy',
    'penalty': 'Penalty',
    'solver': 'Solver',
}

# List to store best parameters dfs
best_params_dfs = []

# One row of best hyperparameters per previous-rating variant of the most complex model
for include_exclude_previous in ['exclude_previous_', 'include_previous_']:

    # Folder name and file-name stem of the model's saved outputs
    model_stem = include_exclude_previous + most_complex_model

    # Load pickle (a dict of hyperparameter name -> value)
    best_params_dict = pd.read_pickle('../../../../Output/Modelling/' + classifier_name + '/' + model_stem + '/' + model_stem + '_best_params.pkl')
    print(best_params_dict)

    # Convert to a one-row dataframe, label the columns and fix their order
    best_params_row = pd.DataFrame(best_params_dict, index=[0])
    best_params_row = best_params_row.rename(columns=best_params_column_labels)[list(best_params_column_labels.values())]
    # Replace 'Multi-Class Strategy' values
    best_params_row['Multi-Class Strategy'] = best_params_row['Multi-Class Strategy'].replace({'ovr': 'One vs Rest', 'multinomial': 'Multinomial'})
    # Replace 'Penalty' values
    best_params_row['Penalty'] = best_params_row['Penalty'].replace({'l1': 'L1', 'l2': 'L2', 'elasticnet': 'Elastic Net', 'none': 'None'})
    # Replace 'Solver' values
    best_params_row['Solver'] = best_params_row['Solver'].replace({'newton-cg': 'Newton Conjugate Gradient', 'lbfgs': 'Limited Memory Broyden–Fletcher–Goldfarb–Shanno', 'liblinear': 'Library for Large Linear Classification', 'sag': 'Stochastic Average Gradient', 'saga': 'SAGA'})
    # Replace Class Weighting Strategy values
    best_params_row['Class Weighting Strategy'] = best_params_row['Class Weighting Strategy'].replace({'balanced': 'Balanced', None: 'None'})

    # Column at the front for whether previous ratings are included or excluded
    # ('exclude_previous_' -> 'Exclude Previous')
    best_params_row.insert(0, 'Previous Ratings', include_exclude_previous[:-1].replace('_', ' ').title())

    # Append to best_params_dfs
    best_params_dfs.append(best_params_row)

# Concatenate best_params_dfs; ignore_index gives each row its own label instead of repeating 0
best_params = pd.concat(best_params_dfs, ignore_index=True)
print(best_params)

# Both exports share the same folder and file-name stem
best_params_output_stem = '../../../../Output/Modelling/' + classifier_name + '/Tables/Most_Complex_Models_Best_Params'

# Export to Excel (before the numeric columns are turned into display strings)
best_params.to_excel(best_params_output_stem + '.xlsx', index=False)

# Export to LaTeX
# Two-decimal display for the numeric hyperparameters; a missing value (e.g. an
# l1_ratio of None) becomes a blank cell instead of raising inside format()
# NOTE(review): two decimals would show a C below 0.005 as '0.00' — confirm the search grid
for col in ['C', 'L1 Ratio']:
    best_params[col] = best_params[col].apply(lambda x: '' if pd.isna(x) else '{:,.2f}'.format(x))
# Center all columns
lt_string = best_params.to_latex(index=False, column_format='c' * len(best_params.columns), escape=False)
latex_with_font_size = "\\footnotesize\n" + lt_string + "\n\\normalsize"
with open(best_params_output_stem + '.tex', 'w', encoding='utf-8') as f:
    f.write(latex_with_font_size)

{'C': 1, 'class_weight': None, 'l1_ratio': 0.0, 'multi_class': 'multinomial', 'penalty': 'elasticnet', 'solver': 'saga'}
{'C': 0.1, 'class_weight': 'balanced', 'l1_ratio': 1.0, 'multi_class': 'ovr', 'penalty': 'elasticnet', 'solver': 'saga'}
   Previous Ratings    C Class Weighting Strategy  L1 Ratio  \
0  Exclude Previous  1.0                     None       0.0   
0  Include Previous  0.1                 Balanced       1.0   

  Multi-Class Strategy      Penalty Solver  
0          Multinomial  Elastic Net   SAGA  
0          One vs Rest  Elastic Net   SAGA  


## Most Complex Permutation Importance

In [59]:
# Readable labels for the one-hot encoded previous-rating features. The mapping
# is the same for every variant, so it is built once, outside the loop.
previous_rating_mapping = {
    'rating_on_previous_fixed_quarter_date_' + rating: 'Rating on Previous Fixed Quarter Date ' + rating
    for rating in ['AAA', 'AA', 'A', 'BBB', 'BB', 'B', 'CCC', 'CC', 'C', 'D']
}

# Export the 15 largest permutation importances of the most complex model
# (Excel + LaTeX), once per previous-rating variant
for include_exclude_previous in ['exclude_previous_', 'include_previous_']:

    # Folder name and file-name stem of the model's saved outputs
    model_stem = include_exclude_previous + most_complex_model

    # Load data, largest mean importance first
    permutation_importance = pd.read_parquet('../../../../Output/Modelling/' + classifier_name + '/' + model_stem + '/' + model_stem + '_permutation_importance.parquet')
    permutation_importance = permutation_importance.sort_values('mean', ascending=False)
    # Set columns to "Feature", "Mean", "Standard Deviation"
    # NOTE(review): labels are assigned by position, which assumes the parquet holds exactly
    # (feature name, mean, standard deviation) in that order — TODO confirm against the writer
    permutation_importance.columns = ['Feature', 'Mean', 'Standard Deviation']
    # Strip a leading 'cat__' or 'num__' from Feature; anchoring to the start of
    # the name leaves any later occurrence inside a feature name untouched
    permutation_importance['Feature'] = permutation_importance['Feature'].str.replace(r'^(?:cat|num)__', '', regex=True)

    # Look up readable names in variable_index; the left join keeps every feature, in sorted order
    merged = permutation_importance.merge(variable_index[['column_name', 'Clean Column Name']], left_on='Feature', right_on='column_name', how='left')
    feature_labels = (
        merged['Clean Column Name']
        # Fall back to the raw feature name when variable_index has no match
        .fillna(merged['Feature'])
        # Clean up names for the previous-rating categorical columns
        .replace(previous_rating_mapping)
        # Replace 'Sector_' with 'Sector: '
        .str.replace('Sector_', 'Sector: ', regex=False)
    )

    # Final table with its display column names, label column first
    permutation_importance = pd.DataFrame({
        'Permuted Feature': feature_labels,
        'Mean Accuracy Drop': merged['Mean'],
        'Standard Deviation': merged['Standard Deviation'],
    })
    # Get top 15
    pi_top_15 = permutation_importance.head(15)

    # Both exports share the same folder and file-name stem
    output_stem = '../../../../Output/Modelling/' + classifier_name + '/Tables/' + include_exclude_previous + 'Most_Complex_Model_Permutation_Importance_Top_15'

    # Export to Excel
    pi_top_15.to_excel(output_stem + '.xlsx', index=False)

    # Export to LaTeX
    # NOTE(review): with escape=False a label that fell back to a raw feature name
    # containing '_' is written unescaped — confirm the .tex still compiles
    # Center all columns
    lt_string = pi_top_15.to_latex(index=False, column_format='c' * len(pi_top_15.columns), escape=False)
    latex_with_font_size = "\\tiny\n" + lt_string + "\n\\normalsize"
    with open(output_stem + '.tex', 'w', encoding='utf-8') as f:
        f.write(latex_with_font_size)

    print(pi_top_15)

                                    Permuted Feature  Mean Accuracy Drop  \
0                                            Ratio E            0.070625   
1                                       Passive Tone            0.056786   
2                                  Sector: Utilities            0.043208   
3                                   Interest Expense            0.043019   
4                                            Ratio D            0.041765   
5                                            Ratio C            0.040578   
6   Depreciation and Amortization (Income Statement)            0.038593   
7                                    Net Receivables            0.036435   
8                                         Word Count            0.035743   
9                                     Long-Term Debt            0.035463   
10                             Market Capitalization            0.031103   
11                    Goodwill and Intangible Assets            0.030084   
12          

## Changes Table

In [60]:
# The change model reuses the metrics row produced for the rating models
starter_row, maj_baseline = get_model_comparison_row('change_model', 'Change Model')
print(starter_row)
print(maj_baseline)

# Keep only the metric columns and append the majority baseline as a final column
changes_table = (
    starter_row
    .drop(columns=['Model/Baseline', 'Share 1 Rating Or Less From Actual'])
    .assign(**{'Majority Baseline': maj_baseline})
)

# The Excel and LaTeX exports share the same folder and file-name stem
changes_table_stem = '../../../../Output/Modelling/' + classifier_name + '/Tables/changes_table'

# Excel export comes first, while the values are still unformatted
changes_table.to_excel(changes_table_stem + '.xlsx', index=False)

# LaTeX export: the float-valued columns are rendered with two decimals
# (the weighted precision / recall columns already hold text taken from the report)
two_decimal = '{:,.2f}'.format
for metric_column in ['Accuracy', 'F1 Score', 'Majority Baseline']:
    changes_table[metric_column] = changes_table[metric_column].map(two_decimal)
# One centered column per table column, wrapped in a smaller font size
lt_string = changes_table.to_latex(index=False, column_format='c' * len(changes_table.columns), escape=False)
latex_with_font_size = '\n'.join(['\\footnotesize', lt_string, '\\normalsize'])
with open(changes_table_stem + '.tex', 'w') as tex_file:
    tex_file.write(latex_with_font_size)

# Show the formatted table as the cell output
changes_table


exact predictions share == accuracy: True
  Model/Baseline  Accuracy Weighted Average Precision Weighted Average Recall  \
0   Change Model    0.9556                       0.91                    0.96   

   F1 Score  Share 1 Rating Or Less From Actual  
0     0.934                                 1.0  
0.9556


Unnamed: 0,Accuracy,Weighted Average Precision,Weighted Average Recall,F1 Score,Majority Baseline
0,0.96,0.91,0.96,0.93,0.96
