# Create Tables

In [15]:
# Packages
import pandas as pd

## Variable Index

In [16]:
# Load variable index
variable_index = pd.read_excel('../../../../Variable Index.xlsx')
variable_index

Unnamed: 0,column_name,Clean Column Name,Variable Type,Data Type,Notes,Rating Model 1,Rating Model 2,Rating Model 3
0,Altman_Z,Altman's Z Score,Altman's Z Score,Numeric,,X,,
1,EBIT,EBIT,Constructed for Altman's Z,Numeric,,,X,X
2,common_plus_preferred_stock,Common Plus Preferred Stock,Constructed for Altman's Z,Numeric,,,X,X
3,workingCapital,Working Capital,Constructed for Altman's Z,Numeric,,,X,X
4,Ratio_A,Ratio A,Constructed for Altman's Z,Numeric,,,X,X
...,...,...,...,...,...,...,...,...
154,Investment_Grade,Investment Grade,Predicted - Rating,Categorical,Whether rating is BBB or above,,,
155,rating_on_previous_fixed_quarter_date,Rating on Previous Fixed Quarter Date,Previous Rating,Categorical,Useful as a predictor,X (Previous Rating Models),X (Previous Rating Models),X (Previous Rating Models)
156,Sector,Sector,Sector,Categorical,,,X,X
157,train_test_80_20,80-20% Train Test Split,Train-Test Split,Categorical,,,,


## List of Rating Models and Most Complex Model

In [17]:
model_names = ['rating_model_1', 'rating_model_2', 'rating_model_3']
clean_model_names = ["Altman's Z", 'Financial Variables and Sector', 'Financial Variables, Sector, and NLP Features']
# set most_complex_model 
most_complex_model = 'rating_model_3'

## Model Comparison

In [18]:
def get_model_comparison_row(model_name, clean_model_name):
    '''
    Given the model name and clean model name, this function returns the model comparison row.
    '''

    # Load close_exact_dict
    close_exact_dict = pd.read_pickle('../../../../Output/Modelling/Logistic Regression/' + model_name + '/' + model_name + '_close_exact_dict.pkl')
    # Version with each item rounded to 4 decimal places
    close_exact_dict_rounded = {k: round(v, 4) for k, v in close_exact_dict.items()}
    # Unpack
    exact_predictions_share = close_exact_dict_rounded['exact_predictions_share']
    close_predictions_share = close_exact_dict_rounded['close_predictions_share']

    # Load acc_f1_majority
    acc_f1_majority = pd.read_pickle('../../../../Output/Modelling/Logistic Regression/' + model_name + '/' + model_name + '_acc_f1_majority.pkl')
    # Version with each item rounded to 2 decimal places
    acc_f1_majority_rounded = {k: round(v, 4) for k, v in acc_f1_majority.items()}
    # Unpack
    accuracy = acc_f1_majority_rounded['accuracy']
    f1 = acc_f1_majority_rounded['f1_score']
    majority_baseline = acc_f1_majority_rounded['majority_baseline']

    # Check exact_predictions_share == accuracy
    print('exact predictions share == accuracy:', exact_predictions_share == accuracy)

    # Get weighted average precision and recall from classification report
    classification_report = pd.read_pickle('../../../../Output/Modelling/Logistic Regression/' + model_name + '/' + model_name + '_classification_report.pkl')
    # Convert classification report string to dataframe
    classification_report_lines = classification_report.split('\n')
    # split on spaces within and drop blanks
    classification_report_data = [line.split() for line in classification_report_lines if line]
    # drop lists begining with 'precision', 'accuracy', 'macro', 'weighted'
    classification_report_data = [line for line in classification_report_data if line[0] in ['weighted']]
    # Unpack
    weighted_avg_precision = classification_report_data[0][2]
    weighted_avg_recall = classification_report_data[0][3]

    # Create dataframe row
    model_comparison_row = pd.DataFrame({
        'Model/Baseline': [clean_model_name],
        'Accuracy': [accuracy],
        'Weighted Average Precision': [weighted_avg_precision],
        'Weighted Average Recall': [weighted_avg_recall],
        'F1 Score': [f1],
        'Share 1 Rating Or Less From Actual': [close_predictions_share]
    })

    # Return row
    return model_comparison_row, majority_baseline

## Model Comparison Table - Include and Exclude Previous Rating Versions

In [19]:
# Iterate over include_exclude_previous
for include_exclude_previous in ['exclude_previous_', 'include_previous_']:

    # Create list of df rows
    model_comparison_rows = []
    majority_baselines = []
    for model_name, clean_model_name in zip(model_names, clean_model_names):
        model_comparison_row, majority_baseline = get_model_comparison_row(include_exclude_previous + model_name, clean_model_name)
        model_comparison_rows.append(model_comparison_row)
        majority_baselines.append(majority_baseline)

    # Concatenate rows
    model_comparison_df = pd.concat(model_comparison_rows)

    # Check majority baselines are the same
    print('Majority baselines are the same:', all([majority_baseline == majority_baselines[0] for majority_baseline in majority_baselines]))
    # Add row with Model/Baseline = 'Majority Baseline' and Accuracy = majority_baseline[0]
    model_comparison_df = pd.concat([model_comparison_df, pd.DataFrame({
        'Model/Baseline': ['Majority Baseline'],
        'Accuracy': [majority_baselines[0]],
        'Weighted Average Precision': [''],
        'Weighted Average Recall': [''],
        'F1 Score': [''],
        'Share 1 Rating Or Less From Actual': ['']
    })])

    # Export to Excel
    model_comparison_df.to_excel('../../../../Output/Modelling/Logistic Regression/Tables/' + include_exclude_previous + 'model_comparison_df.xlsx', index = False)

    # Export to LaTeX
    # Format columns
    for col in model_comparison_df.columns:
        model_comparison_df[col] = model_comparison_df[col].apply(lambda x: f'{x:.4f}' if isinstance(x, float) else x)
    # Rename 'Share 1 Rating Or Less From Actual' to 'Share $le$ Rating From Actual'
    model_comparison_df.rename(columns={'Share 1 Rating Or Less From Actual': 'Share $\\le$ 1 Rating From Actual'}, inplace=True)
    # Center all columns
    lt_string = model_comparison_df.to_latex(index=False, column_format='c' * len(model_comparison_df.columns), escape=False)
    latex_with_font_size = "\\footnotesize\n" + lt_string + "\n\\normalsize"
    with open('../../../../Output/Modelling/Logistic Regression/Tables/' + include_exclude_previous + 'model_comparison_df.tex', 'w') as f:
        f.write(latex_with_font_size)

    print(model_comparison_df)

exact predictions share == accuracy: True
exact predictions share == accuracy: True
exact predictions share == accuracy: True
Majority baselines are the same: True
                                  Model/Baseline Accuracy  \
0                                     Altman's Z   0.3585   
0                 Financial Variables and Sector   0.6105   
0  Financial Variables, Sector, and NLP Features   0.6122   
0                              Majority Baseline   0.3159   

  Weighted Average Precision Weighted Average Recall F1 Score  \
0                       0.25                    0.36   0.2891   
0                       0.60                    0.61   0.6002   
0                       0.61                    0.61   0.6034   
0                                                               

  Share $\le$ 1 Rating From Actual  
0                           0.8119  
0                           0.9379  
0                           0.9388  
0                                   
exact predictions s

## Most Complex Classification Rep

In [20]:
# Load classificiation report from pickle
classification_report = pd.read_pickle('../../../../Output/Modelling/Logistic Regression/' + most_complex_model + '/' + most_complex_model + '_classification_report.pkl')
print(classification_report)

# Convert classification report string to dataframe
classification_report_lines = classification_report.split('\n')
# split on spaces within and drop blanks
classification_report_data = [line.split() for line in classification_report_lines if line]
# drop lists begining with 'precision', 'accuracy', 'macro', 'weighted'
classification_report_data = [line for line in classification_report_data if line[0] not in ['precision', 'accuracy', 'macro', 'weighted']]
# Stack list of rows into dataframe
classification_report_data = pd.DataFrame(classification_report_data)
# Set columns to "Rating", "Precision", "Recall", "F1-Score", "Support"
classification_report_data.columns = ['Rating', 'Precision', 'Recall', 'F1-Score', 'Support']
print(classification_report_data)

# Export to Excel
classification_report_data.to_excel('../../../../Output/Modelling/Logistic Regression/Tables/Most_Complex_Model_Classification_Report.xlsx', index=False)

# Export to Latex
#classification_report_data.to_latex('../../../../Output/Modelling/Logistic Regression/Tables/Most_Complex_Model_Classification_Report.tex', index=False)

# Export to LaTeX
# Format columns
for col in classification_report_data.columns:
    classification_report_data[col] = classification_report_data[col].apply(lambda x: f'{x:.2f}' if isinstance(x, float) else x)
# Center all columns
lt_string = classification_report_data.to_latex(index=False, column_format='c' * 5, escape=False)
latex_with_font_size = "\\footnotesize\n" + lt_string + "\n\\normalsize"
with open('../../../../Output/Modelling/Logistic Regression/Tables/Most_Complex_Model_Classification_Report.tex', 'w') as f:
    f.write(latex_with_font_size)

FileNotFoundError: [Errno 2] No such file or directory: '../../../../Output/Modelling/Logistic Regression/rating_model_3/rating_model_3_classification_report.pkl'

## Most Complex Hyperparameters

In [None]:
# Load pickle '../../../../Output/Modelling/Logistic Regression/' + most_complex_model + '/' + most_complex_model + '_best_params.pkl'
best_params = pd.read_pickle('../../../../Output/Modelling/Logistic Regression/' + most_complex_model + '/' + most_complex_model + '_best_params.pkl')
print(best_params)

# Convert to dataframe
best_params = pd.DataFrame(best_params, index=[0])
# Set columns to "C", "Class Weighting Strategy", "L1 Ratio", "Multi-Class Strategy", "Penalty", "Solver"
best_params.columns = ['C', 'Class Weighting Strategy', 'L1 Ratio', 'Multi-Class Strategy', 'Penalty', 'Solver']
# Replace 'Multi-Class Strategy' values
best_params['Multi-Class Strategy'] = best_params['Multi-Class Strategy'].replace({'ovr': 'One vs Rest', 'multinomial': 'Multinomial'})
# Replace 'Penalty' values
best_params['Penalty'] = best_params['Penalty'].replace({'l1': 'L1', 'l2': 'L2', 'elasticnet': 'Elastic Net', 'none': 'None'})
# Replace 'Solver' values
best_params['Solver'] = best_params['Solver'].replace({'newton-cg': 'Newton Conjugate Gradient', 'lbfgs': 'Limited Memory Broyden–Fletcher–Goldfarb–Shanno', 'liblinear': 'Library for Large Linear Classification', 'sag': 'Stochastic Average Gradient', 'saga': 'SAGA'})
# Replace Class Weighting Strategy values
best_params['Class Weighting Strategy'] = best_params['Class Weighting Strategy'].replace({'balanced': 'Balanced', None: 'None'})
print(best_params)

# Export to Excel
best_params.to_excel('../../../../Output/Modelling/Logistic Regression/Tables/Most_Complex_Model_Best_Params.xlsx', index=False)

# Export to Latex
#best_params.to_latex('../../../../Output/Modelling/Logistic Regression/Tables/Most_Complex_Model_Best_Params.tex', index=False)

# Export to LaTeX
for col in ['C', 'L1 Ratio']:
    best_params[col] = best_params[col].apply(lambda x: '{:,.2f}'.format(x))
# Center all columns
lt_string = best_params.to_latex(index=False, column_format='c' * len(best_params.columns), escape=False)
latex_with_font_size = "\\footnotesize\n" + lt_string + "\n\\normalsize"
with open('../../../../Output/Modelling/Logistic Regression/Tables/Most_Complex_Model_Best_Params.tex', 'w') as f:
    f.write(latex_with_font_size)

{'C': 0.1, 'class_weight': 'balanced', 'l1_ratio': 1.0, 'multi_class': 'ovr', 'penalty': 'elasticnet', 'solver': 'saga'}
     C Class Weighting Strategy  L1 Ratio Multi-Class Strategy      Penalty  \
0  0.1                 Balanced       1.0          One vs Rest  Elastic Net   

  Solver  
0   SAGA  


## Most Complex Permutation Importance

In [None]:
# Load '../../../../Output/Modelling/Logistic Regression/rating_model_4/rating_model_4_permutation_importance.parquet'
permutation_importance = pd.read_parquet('../../../../Output/Modelling/Logistic Regression/rating_model_4/rating_model_4_permutation_importance.parquet')
permutation_importance = permutation_importance.sort_values('mean',ascending=False)
# Set columns to "Feature", "Mean", "Standard Deviation"
permutation_importance.columns = ['Feature', 'Mean', 'Standard Deviation']
# Use variable_index to get feature names
permutation_importance = permutation_importance.merge(variable_index[['column_name', 'Clean Column Name']], left_on='Feature', right_on='column_name', how='left')
# Set Clean_Column_Name to Feature if no match
permutation_importance['Clean Column Name'] = permutation_importance['Clean Column Name'].fillna(permutation_importance['Feature'])
# Drop Feature and column_name
permutation_importance = permutation_importance.drop(columns=['Feature', 'column_name'])
# Clean up names for categorical columns
previous_rating_mapping = {'cat__rating_on_previous_fixed_quarter_date_AAA': 'Rating on Previous Fixed Quarter Date AAA',
                            'cat__rating_on_previous_fixed_quarter_date_AA': 'Rating on Previous Fixed Quarter Date AA',
                            'cat__rating_on_previous_fixed_quarter_date_A': 'Rating on Previous Fixed Quarter Date A',
                            'cat__rating_on_previous_fixed_quarter_date_BBB': 'Rating on Previous Fixed Quarter Date BBB',
                            'cat__rating_on_previous_fixed_quarter_date_BB': 'Rating on Previous Fixed Quarter Date BB',
                            'cat__rating_on_previous_fixed_quarter_date_B': 'Rating on Previous Fixed Quarter Date B',
                            'cat__rating_on_previous_fixed_quarter_date_CCC': 'Rating on Previous Fixed Quarter Date CCC',
                            'cat__rating_on_previous_fixed_quarter_date_CC': 'Rating on Previous Fixed Quarter Date CC',
                            'cat__rating_on_previous_fixed_quarter_date_C': 'Rating on Previous Fixed Quarter Date C',
                            'cat__rating_on_previous_fixed_quarter_date_D': 'Rating on Previous Fixed Quarter Date D'}
permutation_importance['Clean Column Name'] = permutation_importance['Clean Column Name'].replace(previous_rating_mapping)
# Rename Clean Column Name to Feature
permutation_importance = permutation_importance.rename(columns={'Clean Column Name': 'Feature'})
# Reorder columns to put Feature first
permutation_importance = permutation_importance[['Feature', 'Mean', 'Standard Deviation']]
# Rename Mean to 'Mean Accuracy Drop'
permutation_importance = permutation_importance.rename(columns={'Mean': 'Mean Accuracy Drop'})
# Rename Feature to 'Permuted Feature'
permutation_importance = permutation_importance.rename(columns={'Feature': 'Permuted Feature'})
# Get top 15
pi_top_15 = permutation_importance.head(15)

# Export to Excel
pi_top_15.to_excel('../../../../Output/Modelling/Logistic Regression/Tables/Most_Complex_Model_Permutation_Importance_Top_15.xlsx', index=False)

# Export to Latex
#pi_top_15.to_latex('../../../../Output/Modelling/Logistic Regression/Tables/Most_Complex_Model_Permutation_Importance_Top_15.tex', index=False)

# Export to LaTeX
# Center all columns
lt_string = pi_top_15.to_latex(index=False, column_format='c' * len(pi_top_15.columns), escape=False)
latex_with_font_size = "\\tiny\n" + lt_string + "\n\\normalsize"
with open('../../../../Output/Modelling/Logistic Regression/Tables/Most_Complex_Model_Permutation_Importance_Top_15.tex', 'w') as f:
    f.write(latex_with_font_size)

pi_top_15

Unnamed: 0,Permuted Feature,Mean Accuracy Drop,Standard Deviation
0,Rating on Previous Fixed Quarter Date BBB,0.281622,0.010257
1,Rating on Previous Fixed Quarter Date BB,0.230148,0.009061
2,Rating on Previous Fixed Quarter Date A,0.107111,0.005848
3,Rating on Previous Fixed Quarter Date B,0.079304,0.004862
4,Rating on Previous Fixed Quarter Date AA,0.013898,0.002093
5,Rating on Previous Fixed Quarter Date CCC,0.012949,0.001282
6,Total Non-Current Liabilities,0.000867,0.000612
7,Total Stockholders' Equity,0.000809,0.000366
8,Research and Development Expenses,0.000789,0.000284
9,Net Receivables,0.000779,0.000314


## Changes Table

In [None]:
# We can start with output of get_model_comparison_row
starter_row, maj_baseline = get_model_comparison_row('change_model', 'Change Model')
print(starter_row)
print(maj_baseline)

# Drop Model/Baseline, Share Less Than 1 Rating From Actual
changes_table = starter_row.drop(columns=['Model/Baseline', 'Share 1 Rating Or Less From Actual'])
# Add column for maj_baseline
changes_table['Majority Baseline'] = maj_baseline

# Output to Excel
changes_table.to_excel('../../../../Output/Modelling/Logistic Regression/Tables/changes_table.xlsx', index=False)

# Output to Latex
#changes_table.to_latex('../../../../Output/Modelling/Logistic Regression/Tables/changes_table.tex', index=False)

# Export to LaTeX
for col in ['Accuracy', 'F1 Score', 'Majority Baseline']:
    changes_table[col] = changes_table[col].apply(lambda x: '{:,.2f}'.format(x))
# Center all columns
lt_string = changes_table.to_latex(index=False, column_format='c' * len(changes_table.columns), escape=False)
latex_with_font_size = "\\footnotesize\n" + lt_string + "\n\\normalsize"
with open('../../../../Output/Modelling/Logistic Regression/Tables/changes_table.tex', 'w') as f:
    f.write(latex_with_font_size)

changes_table


exact predictions share == accuracy: True
  Model/Baseline  Accuracy Weighted Average Precision Weighted Average Recall  \
0   Change Model      0.96                       0.91                    0.96   

   F1 Score  Share 1 Rating Or Less From Actual  
0      0.93                                 1.0  
0.96


Unnamed: 0,Accuracy,Weighted Average Precision,Weighted Average Recall,F1 Score,Majority Baseline
0,0.96,0.91,0.96,0.93,0.96
