# Create Tables

In [113]:
# Packages
import pandas as pd

## List of Rating Models and Most Complex Model

In [114]:
# Folder names / file prefixes of the rating models to tabulate
model_names = [
    'rating_model_1',
    'rating_model_2',
    'rating_model_3',
    'rating_model_4',
]
# Table labels for the models: underscores become spaces and each word is capitalised
# (e.g. 'rating_model_1' -> 'Rating Model 1')
clean_model_names = [raw_name.replace('_', ' ').title() for raw_name in model_names]
# Model whose classification report and hyperparameters get their own tables
most_complex_model = 'rating_model_4'

## Model Comparison

In [115]:
def get_model_comparison_row(model_name, clean_model_name):
    '''
    Build the one-row model comparison table for a single model.

    Reads three pickles from
    '../../../../Output/Modelling/Logistic Regression/<model_name>/':
    '<model_name>_close_exact_dict.pkl', '<model_name>_acc_f1_majority.pkl' and
    '<model_name>_classification_report.pkl'.

    Parameters
    ----------
    model_name : str
        Folder name and file prefix of the model's saved outputs.
    clean_model_name : str
        Label written to the 'Model/Baseline' column.

    Returns
    -------
    tuple
        (model_comparison_row, majority_baseline): a one-row DataFrame with the columns
        'Model/Baseline', 'Accuracy', 'Weighted Average Precision', 'Weighted Average Recall',
        'F1 Score' and 'Share 1 Rating Or Less From Actual', and the majority baseline
        rounded to 2 decimal places.

    Raises
    ------
    ValueError
        If the classification report contains no line starting with 'weighted'.
    '''

    # All inputs of a model sit in one folder and share the model name as file prefix
    model_file_prefix = '../../../../Output/Modelling/Logistic Regression/' + model_name + '/' + model_name

    # Load close_exact_dict
    close_exact_dict = pd.read_pickle(model_file_prefix + '_close_exact_dict.pkl')
    # Version with each item rounded to 2 decimal places
    close_exact_dict_rounded = {k: round(v, 2) for k, v in close_exact_dict.items()}
    # Unpack
    exact_predictions_share = close_exact_dict_rounded['exact_predictions_share']
    close_predictions_share = close_exact_dict_rounded['close_predictions_share']

    # Load acc_f1_majority
    acc_f1_majority = pd.read_pickle(model_file_prefix + '_acc_f1_majority.pkl')
    # Version with each item rounded to 2 decimal places
    acc_f1_majority_rounded = {k: round(v, 2) for k, v in acc_f1_majority.items()}
    # Unpack
    accuracy = acc_f1_majority_rounded['accuracy']
    f1 = acc_f1_majority_rounded['f1_score']
    majority_baseline = acc_f1_majority_rounded['majority_baseline']

    # Sanity check (printed, not enforced): both values are compared after rounding
    print('exact predictions share == accuracy:', exact_predictions_share == accuracy)

    # Get weighted average precision and recall from the classification report,
    # which is stored as a plain-text table
    classification_report = pd.read_pickle(model_file_prefix + '_classification_report.pkl')
    # Tokenise every line on whitespace; blank and whitespace-only lines give no tokens
    report_rows = (line.split() for line in classification_report.split('\n'))
    # Keep only the first row whose first token is 'weighted' (the 'weighted avg' summary row)
    weighted_avg_row = next((row for row in report_rows if row and row[0] == 'weighted'), None)
    if weighted_avg_row is None:
        raise ValueError("No 'weighted avg' row found in the classification report of " + model_name)
    # Tokens are ['weighted', 'avg', precision, recall, ...]; the values are kept as the
    # strings found in the report (they are not converted to float)
    weighted_avg_precision = weighted_avg_row[2]
    weighted_avg_recall = weighted_avg_row[3]

    # Create dataframe row
    model_comparison_row = pd.DataFrame({
        'Model/Baseline': [clean_model_name],
        'Accuracy': [accuracy],
        'Weighted Average Precision': [weighted_avg_precision],
        'Weighted Average Recall': [weighted_avg_recall],
        'F1 Score': [f1],
        'Share 1 Rating Or Less From Actual': [close_predictions_share]
    })

    # Return row
    return model_comparison_row, majority_baseline

In [116]:
# Build the model comparison table: one row per rating model plus a majority-baseline row,
# exported to Excel and to a LaTeX fragment.

# zip() would silently drop models if the two name lists had different lengths
assert len(model_names) == len(clean_model_names), 'model_names and clean_model_names differ in length'

# Create list of df rows
model_comparison_rows = []
majority_baselines = []
for model_name, clean_model_name in zip(model_names, clean_model_names):
    model_comparison_row, majority_baseline = get_model_comparison_row(model_name, clean_model_name)
    model_comparison_rows.append(model_comparison_row)
    majority_baselines.append(majority_baseline)

# Check majority baselines are the same (printed, not enforced)
print('Majority baselines are the same:', all(majority_baseline == majority_baselines[0] for majority_baseline in majority_baselines))

# Row with Model/Baseline = 'Majority Baseline' and Accuracy = majority_baselines[0];
# the remaining metrics are left as empty strings
majority_baseline_row = pd.DataFrame({
    'Model/Baseline': ['Majority Baseline'],
    'Accuracy': [majority_baselines[0]],
    'Weighted Average Precision': [''],
    'Weighted Average Recall': [''],
    'F1 Score': [''],
    'Share 1 Rating Or Less From Actual': ['']
})

# Concatenate rows; ignore_index gives the table a fresh 0..n-1 index instead of
# repeating the index label 0 carried by every one-row frame
model_comparison_df = pd.concat(model_comparison_rows + [majority_baseline_row], ignore_index=True)

# Export to Excel
model_comparison_df.to_excel('../../../../Output/Modelling/Logistic Regression/Tables/model_comparison_df.xlsx', index = False)

# Export to LaTeX
# Format columns: floats become 2-decimal strings; strings (including the '' placeholders)
# pass through unchanged. This happens after the Excel export, so the spreadsheet is unaffected.
for col in model_comparison_df.columns:
    model_comparison_df[col] = model_comparison_df[col].apply(lambda x: f'{x:.2f}' if isinstance(x, float) else x)
# NOTE(review): 'x{0.75cm}' is not a built-in LaTeX column type; presumably it is defined
# with \newcolumntype in the document that includes this table - confirm
lt_string = model_comparison_df.to_latex(index=False, column_format='x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}', escape=False)
# Wrap the table so it is typeset in \small and the font size is restored afterwards
latex_with_font_size = "\\small\n" + lt_string + "\n\\normalsize"
with open('../../../../Output/Modelling/Logistic Regression/Tables/model_comparison_df.tex', 'w') as f:
    f.write(latex_with_font_size)

model_comparison_df

exact predictions share == accuracy: True
exact predictions share == accuracy: True
exact predictions share == accuracy: True
exact predictions share == accuracy: True
Majority baselines are the same: True


Unnamed: 0,Model/Baseline,Accuracy,Weighted Average Precision,Weighted Average Recall,F1 Score,Share 1 Rating Or Less From Actual
0,Rating Model 1,0.36,0.3,0.36,0.26,0.82
0,Rating Model 2,0.51,0.49,0.51,0.46,0.89
0,Rating Model 3,0.95,0.95,0.95,0.95,0.99
0,Rating Model 4,0.95,0.95,0.95,0.95,0.99
0,Majority Baseline,0.32,,,,


## Most Complex Model Classification Report

In [117]:
# Turn the most complex model's classification report into a per-rating table,
# exported to Excel and to a LaTeX fragment.

# Load classification report (a plain-text table) from pickle
classification_report = pd.read_pickle('../../../../Output/Modelling/Logistic Regression/' + most_complex_model + '/' + most_complex_model + '_classification_report.pkl')
print(classification_report)

# Convert classification report string to dataframe
classification_report_lines = classification_report.split('\n')
# Split on whitespace; blank and whitespace-only lines are dropped so every row has a first token
classification_report_data = [line.split() for line in classification_report_lines if line.strip()]
# Drop the header row and the summary rows, i.e. lists beginning with
# 'precision', 'accuracy', 'macro', 'weighted'; what remains is one row per rating
classification_report_data = [line for line in classification_report_data if line[0] not in ['precision', 'accuracy', 'macro', 'weighted']]
# Stack list of rows into dataframe
classification_report_data = pd.DataFrame(classification_report_data, columns=['Rating', 'Precision', 'Recall', 'F1-Score', 'Support'])
# The parsed tokens are strings; convert the metric columns to numbers so that Excel
# receives numeric cells and the 2-decimal formatting below actually applies
for col in ['Precision', 'Recall', 'F1-Score', 'Support']:
    classification_report_data[col] = pd.to_numeric(classification_report_data[col])
print(classification_report_data)

# Export to Excel
classification_report_data.to_excel('../../../../Output/Modelling/Logistic Regression/Tables/Most_Complex_Model_Classification_Report.xlsx', index=False)

# Export to LaTeX
# Format columns: floats become 2-decimal strings; other values (ratings, integer support)
# pass through unchanged. This happens after the Excel export, so the spreadsheet is unaffected.
for col in classification_report_data.columns:
    classification_report_data[col] = classification_report_data[col].apply(lambda x: f'{x:.2f}' if isinstance(x, float) else x)
# NOTE(review): 'x{0.75cm}' is not a built-in LaTeX column type; presumably it is defined
# with \newcolumntype in the document that includes this table - confirm
lt_string = classification_report_data.to_latex(index=False, column_format='x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}', escape=False)
# Wrap the table so it is typeset in \small and the font size is restored afterwards
latex_with_font_size = "\\small\n" + lt_string + "\n\\normalsize"
with open('../../../../Output/Modelling/Logistic Regression/Tables/Most_Complex_Model_Classification_Report.tex', 'w') as f:
    f.write(latex_with_font_size)

              precision    recall  f1-score   support

         AAA       0.80      0.84      0.82        19
          AA       0.86      0.88      0.87        43
           A       0.93      0.92      0.92       219
         BBB       0.96      0.97      0.97       356
          BB       0.98      0.98      0.98       313
           B       0.97      0.95      0.96       144
         CCC       0.93      0.93      0.93        27
          CC       0.50      1.00      0.67         1
           C       1.00      0.67      0.80         3
           D       1.00      1.00      1.00         2

    accuracy                           0.95      1127
   macro avg       0.89      0.91      0.89      1127
weighted avg       0.95      0.95      0.95      1127

  Rating Precision Recall F1-Score Support
0    AAA      0.80   0.84     0.82      19
1     AA      0.86   0.88     0.87      43
2      A      0.93   0.92     0.92     219
3    BBB      0.96   0.97     0.97     356
4     BB      0.98   0.98 

## Most Complex Hyperparameters

In [118]:
# Tabulate the best hyperparameters of the most complex model,
# exported to Excel and to a LaTeX fragment.

# Load pickle '../../../../Output/Modelling/Logistic Regression/' + most_complex_model + '/' + most_complex_model + '_best_params.pkl'
best_params = pd.read_pickle('../../../../Output/Modelling/Logistic Regression/' + most_complex_model + '/' + most_complex_model + '_best_params.pkl')
print(best_params)

# Table label for each hyperparameter key, listed in table column order
param_display_names = {
    'C': 'C',
    'class_weight': 'Class Weighting Strategy',
    'l1_ratio': 'L1 Ratio',
    'multi_class': 'Multi-Class Strategy',
    'penalty': 'Penalty',
    'solver': 'Solver',
}
# Fail loudly if the saved parameters are not exactly the ones this table is laid out for
if set(best_params) != set(param_display_names):
    raise ValueError(f'Unexpected hyperparameter keys: {sorted(best_params)}; expected {sorted(param_display_names)}')

# Convert to dataframe; columns are labelled by key (not by position) so that the labels
# stay correct whatever the key order of the pickled dict, then put in table order
best_params = pd.DataFrame(best_params, index=[0]).rename(columns=param_display_names)[list(param_display_names.values())]
# Replace 'Multi-Class Strategy' values
best_params['Multi-Class Strategy'] = best_params['Multi-Class Strategy'].replace({'ovr': 'One vs Rest', 'multinomial': 'Multinomial'})
# Replace 'Penalty' values
best_params['Penalty'] = best_params['Penalty'].replace({'l1': 'L1', 'l2': 'L2', 'elasticnet': 'Elastic Net', 'none': 'None'})
# Replace 'Solver' values
best_params['Solver'] = best_params['Solver'].replace({'newton-cg': 'Newton Conjugate Gradient', 'lbfgs': 'Limited Memory Broyden–Fletcher–Goldfarb–Shanno', 'liblinear': 'Library for Large Linear Classification', 'sag': 'Stochastic Average Gradient', 'saga': 'SAGA'})
# Replace Class Weighting Strategy values
best_params['Class Weighting Strategy'] = best_params['Class Weighting Strategy'].replace({'balanced': 'Balanced', None: 'None'})
print(best_params)

# Export to Excel
best_params.to_excel('../../../../Output/Modelling/Logistic Regression/Tables/Most_Complex_Model_Best_Params.xlsx', index=False)

# Export to LaTeX
# Format the numeric hyperparameters to 2 decimals; missing values (None/NaN) are left as
# they are instead of raising a TypeError in str.format
for col in ['C', 'L1 Ratio']:
    best_params[col] = best_params[col].apply(lambda x: x if pd.isna(x) else '{:,.2f}'.format(x))
# NOTE(review): 'x{0.75cm}' is not a built-in LaTeX column type; presumably it is defined
# with \newcolumntype in the document that includes this table - confirm
lt_string = best_params.to_latex(index=False, column_format='x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}', escape=False)
# Wrap the table so it is typeset in \small and the font size is restored afterwards
latex_with_font_size = "\\small\n" + lt_string + "\n\\normalsize"
with open('../../../../Output/Modelling/Logistic Regression/Tables/Most_Complex_Model_Best_Params.tex', 'w') as f:
    f.write(latex_with_font_size)

{'C': 0.1, 'class_weight': 'balanced', 'l1_ratio': 1.0, 'multi_class': 'ovr', 'penalty': 'elasticnet', 'solver': 'saga'}
     C Class Weighting Strategy  L1 Ratio Multi-Class Strategy      Penalty  \
0  0.1                 Balanced       1.0          One vs Rest  Elastic Net   

  Solver  
0   SAGA  


## Changes Table

In [119]:
# Changes table: the comparison row of the change model, reshaped into a single-row
# table that also carries the majority baseline.
starter_row, maj_baseline = get_model_comparison_row('change_model', 'Change Model')
print(starter_row)
print(maj_baseline)

# Remove the label and closeness columns, then append the majority baseline as a column
changes_table = (
    starter_row
    .drop(columns=['Model/Baseline', 'Share 1 Rating Or Less From Actual'])
    .assign(**{'Majority Baseline': maj_baseline})
)

# Spreadsheet export happens before any string formatting
changes_table.to_excel('../../../../Output/Modelling/Logistic Regression/Tables/changes_table.xlsx', index=False)

# LaTeX export: render the float-valued columns with two decimals
two_decimals = '{:,.2f}'.format
for float_col in ['Accuracy', 'F1 Score', 'Majority Baseline']:
    changes_table[float_col] = changes_table[float_col].map(two_decimals)
# Same fixed-width column spec for each of the five columns
lt_string = changes_table.to_latex(
    index=False,
    column_format='x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}',
    escape=False,
)
# Typeset the table in \small and restore the font size afterwards
latex_with_font_size = "\\small\n" + lt_string + "\n\\normalsize"
with open('../../../../Output/Modelling/Logistic Regression/Tables/changes_table.tex', 'w') as f:
    f.write(latex_with_font_size)

changes_table


exact predictions share == accuracy: True
  Model/Baseline  Accuracy Weighted Average Precision Weighted Average Recall  \
0   Change Model      0.96                       0.91                    0.96   

   F1 Score  Share 1 Rating Or Less From Actual  
0      0.93                                 1.0  
0.96


Unnamed: 0,Accuracy,Weighted Average Precision,Weighted Average Recall,F1 Score,Majority Baseline
0,0.96,0.91,0.96,0.93,0.96
