# Statistical Tests for Accuracy Differences

Run McNemar's test to see if accuracy differences are significant.

In [1]:
import pandas as pd
from statsmodels.stats.contingency_tables import mcnemar

## Function

In [2]:
def run_test(merged_data):
    '''
    Run McNemar's Test on two models' predictions.

    Parameters:
    - merged_data: a pandas DataFrame with columns 'ticker', 'fixed_quarter_date', and 'correct_model_1' and 'correct_model_2', which are boolean values indicating whether the prediction was correct for each model.
    '''

    print('model 1 accuracy: ' + str(merged_data['correct_model_1'].mean()))
    print('model 2 accuracy: ' + str(merged_data['correct_model_2'].mean()))

    # Fill contingency table cells
    m1_correct_m2_correct = merged_data[(merged_data['correct_model_1'] == True) & (merged_data['correct_model_2'] == True)].shape[0]
    m1_correct_m2_incorrect = merged_data[(merged_data['correct_model_1'] == True) & (merged_data['correct_model_2'] == False)].shape[0]
    m1_incorrect_m2_correct = merged_data[(merged_data['correct_model_1'] == False) & (merged_data['correct_model_2'] == True)].shape[0]
    m1_incorrect_m2_incorrect = merged_data[(merged_data['correct_model_1'] == False) & (merged_data['correct_model_2'] == False)].shape[0]
    
    # Create a contingency table
    # Rows for m1 correct and incorrect
    # Columns for m2 correct and incorrect
    contingency_table = [[m1_correct_m2_correct, m1_correct_m2_incorrect], 
                         [m1_incorrect_m2_correct, m1_incorrect_m2_incorrect]]
  
    print('McNemar\'s Test contingency table:')
    print(contingency_table)

    # McNemar's Test, non-exact, without any continuity correction 
    print('Test results, non-exact, no continuity correction:')
    print(mcnemar(contingency_table, exact=False, correction=False)) 

    # McNemar's Test, non-exact, with continuity correction
    print('Test results, non-exact, with continuity correction:')
    print(mcnemar(contingency_table, exact=False, correction=True))

    # McNemar's Test, exact, without any continuity correction
    print('Test results, exact, no continuity correction:')
    print(mcnemar(contingency_table, exact=True, correction=False))

    # ncc_result = mcnemar(contingency_table, exact=True, correction=False)
    # ncc_statistic = ncc_result.statistic
    # ncc_p_value = ncc_result.pvalue

def test_models(model_1_short_path, model_2_short_path):

    print('running for ' + model_1_short_path + ' and ' + model_2_short_path)

    # Load first model from '../../Data/Predictions/' + model_1_short_path
    model_1_data = pd.read_excel('../../Data/Predictions/' + model_1_short_path)
    print(model_1_data.head())
    # Third column is true, last is predicted, add column for 'correct'
    model_1_data['correct'] = model_1_data.iloc[:, 2] == model_1_data.iloc[:, 3]
    # Load second model from '../../Data/Predictions/' + model_2_short_path
    model_2_data = pd.read_excel('../../Data/Predictions/' + model_2_short_path)
    print(model_2_data.head())
    # Third column is true, last is predicted, add column for 'correct'
    model_2_data['correct'] = model_2_data.iloc[:, 2] == model_2_data.iloc[:, 3]
    # Merge the two datasets on 'ticker' and 'fixed_quarter_date'
    merged_data = pd.merge(model_1_data, model_2_data, on=['ticker', 'fixed_quarter_date'], suffixes=('_model_1', '_model_2'))

    # Run test
    run_test(merged_data)

## Run Tests

In [3]:
test_models('Logistic Regression/include_previous_rating_model_2/include_previous_rating_model_2_predictions.xlsx',
            'Logistic Regression/include_previous_rating_model_3/include_previous_rating_model_3_predictions.xlsx')

running for Logistic Regression/include_previous_rating_model_2/include_previous_rating_model_2_predictions.xlsx and Logistic Regression/include_previous_rating_model_3/include_previous_rating_model_3_predictions.xlsx
  ticker fixed_quarter_date Rating include_previous_rating_model_2_predictions
0   AAPL         2016-07-01     AA                                          AA
1   ABBV         2015-04-01      A                                           A
2   ABBV         2016-04-01      A                                           A
3    ABC         2012-04-01      A                                           A
4    ABC         2013-01-01      A                                           A
  ticker fixed_quarter_date Rating include_previous_rating_model_3_predictions
0   AAPL         2016-07-01     AA                                          AA
1   ABBV         2015-04-01      A                                           A
2   ABBV         2016-04-01      A                                     

  statistic = (np.abs(n1 - n2) - corr)**2 / (1. * (n1 + n2))
  statistic = (np.abs(n1 - n2) - corr)**2 / (1. * (n1 + n2))


In [4]:
test_models('Logistic Regression/exclude_previous_rating_model_2/exclude_previous_rating_model_2_predictions.xlsx',
            'Logistic Regression/exclude_previous_rating_model_3/exclude_previous_rating_model_3_predictions.xlsx')

running for Logistic Regression/exclude_previous_rating_model_2/exclude_previous_rating_model_2_predictions.xlsx and Logistic Regression/exclude_previous_rating_model_3/exclude_previous_rating_model_3_predictions.xlsx
  ticker fixed_quarter_date Rating exclude_previous_rating_model_2_predictions
0   AAPL         2016-07-01     AA                                          AA
1   ABBV         2015-04-01      A                                          AA
2   ABBV         2016-04-01      A                                          AA
3    ABC         2012-04-01      A                                           A
4    ABC         2013-01-01      A                                           A
  ticker fixed_quarter_date Rating exclude_previous_rating_model_3_predictions
0   AAPL         2016-07-01     AA                                          AA
1   ABBV         2015-04-01      A                                          AA
2   ABBV         2016-04-01      A                                     

In [6]:
test_models('Logistic Regression/smote_rating_change_model_2/smote_rating_change_model_2_predictions.xlsx',
            'Logistic Regression/smote_rating_change_model_3/smote_rating_change_model_3_predictions.xlsx')

running for Logistic Regression/smote_rating_change_model_2/smote_rating_change_model_2_predictions.xlsx and Logistic Regression/smote_rating_change_model_3/smote_rating_change_model_3_predictions.xlsx
  ticker fixed_quarter_date Change Direction Since Last Fixed Quarter Date  \
0   AAPL         2016-07-01                Same As Last Fixed Quarter Date   
1   ABBV         2015-04-01                Same As Last Fixed Quarter Date   
2   ABBV         2016-04-01                Same As Last Fixed Quarter Date   
3    ABC         2012-04-01                Same As Last Fixed Quarter Date   
4    ABC         2013-01-01                Same As Last Fixed Quarter Date   

   smote_rating_change_model_2_predictions  
0          Same As Last Fixed Quarter Date  
1  Downgrade Since Last Fixed Quarter Date  
2          Same As Last Fixed Quarter Date  
3          Same As Last Fixed Quarter Date  
4          Same As Last Fixed Quarter Date  
  ticker fixed_quarter_date Change Direction Since Last Fix

In [7]:
test_models('XGBoost/include_previous_rating_model_2/include_previous_rating_model_2_predictions.xlsx',
            'XGBoost/include_previous_rating_model_3/include_previous_rating_model_3_predictions.xlsx')

running for XGBoost/include_previous_rating_model_2/include_previous_rating_model_2_predictions.xlsx and XGBoost/include_previous_rating_model_3/include_previous_rating_model_3_predictions.xlsx
  ticker fixed_quarter_date Rating include_previous_rating_model_2_predictions
0   AAPL         2016-07-01     AA                                          AA
1   ABBV         2015-04-01      A                                           A
2   ABBV         2016-04-01      A                                           A
3    ABC         2012-04-01      A                                           A
4    ABC         2013-01-01      A                                           A
  ticker fixed_quarter_date Rating include_previous_rating_model_3_predictions
0   AAPL         2016-07-01     AA                                          AA
1   ABBV         2015-04-01      A                                           A
2   ABBV         2016-04-01      A                                           A
3    ABC        

  statistic = (np.abs(n1 - n2) - corr)**2 / (1. * (n1 + n2))
  statistic = (np.abs(n1 - n2) - corr)**2 / (1. * (n1 + n2))


In [8]:
test_models('XGBoost/exclude_previous_rating_model_2/exclude_previous_rating_model_2_predictions.xlsx',
            'XGBoost/exclude_previous_rating_model_3/exclude_previous_rating_model_3_predictions.xlsx')

running for XGBoost/exclude_previous_rating_model_2/exclude_previous_rating_model_2_predictions.xlsx and XGBoost/exclude_previous_rating_model_3/exclude_previous_rating_model_3_predictions.xlsx
  ticker fixed_quarter_date Rating exclude_previous_rating_model_2_predictions
0   AAPL         2016-07-01     AA                                          AA
1   ABBV         2015-04-01      A                                           A
2   ABBV         2016-04-01      A                                           A
3    ABC         2012-04-01      A                                          BB
4    ABC         2013-01-01      A                                           A
  ticker fixed_quarter_date Rating exclude_previous_rating_model_3_predictions
0   AAPL         2016-07-01     AA                                          AA
1   ABBV         2015-04-01      A                                           A
2   ABBV         2016-04-01      A                                           A
3    ABC        

In [10]:
test_models('XGBoost/smote_rating_change_model_2/smote_rating_change_model_2_predictions.xlsx',
            'XGBoost/smote_rating_change_model_3/smote_rating_change_model_3_predictions.xlsx')

running for XGBoost/smote_rating_change_model_2/smote_rating_change_model_2_predictions.xlsx and XGBoost/smote_rating_change_model_3/smote_rating_change_model_3_predictions.xlsx
  ticker fixed_quarter_date Change Direction Since Last Fixed Quarter Date  \
0   AAPL         2016-07-01                Same As Last Fixed Quarter Date   
1   ABBV         2015-04-01                Same As Last Fixed Quarter Date   
2   ABBV         2016-04-01                Same As Last Fixed Quarter Date   
3    ABC         2012-04-01                Same As Last Fixed Quarter Date   
4    ABC         2013-01-01                Same As Last Fixed Quarter Date   

  smote_rating_change_model_2_predictions  
0         Same As Last Fixed Quarter Date  
1         Same As Last Fixed Quarter Date  
2         Same As Last Fixed Quarter Date  
3         Same As Last Fixed Quarter Date  
4         Same As Last Fixed Quarter Date  
  ticker fixed_quarter_date Change Direction Since Last Fixed Quarter Date  \
0   AAPL   

### Graph NN Tests

In [11]:
# Reformatted GNN data

# Example
inductive_gnn_test = pd.read_excel('../../Data/Predictions/Graph Neural Network/Inductive/exclude_previous_rating_model_2_predictions.xlsx')
print(inductive_gnn_test.head())

def reformat_gnn_data(short_path):
    # Load GNN data
    gnn_df = pd.read_excel('../../Data/Predictions/Graph Neural Network/' + short_path)
    # Split node on ' : ' into 'ticker' and 'fixed_quarter_date'
    gnn_df[['ticker', 'fixed_quarter_date']] = gnn_df['node'].str.split(' : ', expand=True)
    # Keep only 'ticker', 'fixed_quarter_date', 'target', pred
    gnn_df = gnn_df[['ticker', 'fixed_quarter_date', 'target', 'pred']]
    # Save to new file
    # New path - replace / with /reformatted_
    new_path = short_path.replace('/', '/reformatted_')
    gnn_df.to_excel('../../Data/Predictions/Graph Neural Network/' + new_path, index=False)

reformat_gnn_data('Transductive/exclude_previous_rating_model_2_predictions.xlsx')
reformat_gnn_data('Transductive/exclude_previous_rating_model_3_predictions.xlsx')
reformat_gnn_data('Inductive/exclude_previous_rating_model_2_predictions.xlsx')
reformat_gnn_data('Inductive/exclude_previous_rating_model_3_predictions.xlsx')

  target pred               node
0     AA   AA  AAPL : 2016-07-01
1      A    A  ABBV : 2015-04-01
2      A    A  ABBV : 2016-04-01
3      A  BBB   ABC : 2013-04-01
4      A    A   ABC : 2013-07-01


#### Internal GNN Comparisons

In [12]:
# Test models
test_models('Graph Neural Network/Transductive/reformatted_exclude_previous_rating_model_2_predictions.xlsx',
            'Graph Neural Network/Transductive/reformatted_exclude_previous_rating_model_3_predictions.xlsx')

running for Graph Neural Network/Transductive/reformatted_exclude_previous_rating_model_2_predictions.xlsx and Graph Neural Network/Transductive/reformatted_exclude_previous_rating_model_3_predictions.xlsx
  ticker fixed_quarter_date target pred
0   AAPL         2016-07-01     AA   AA
1   ABBV         2015-04-01      A    A
2   ABBV         2016-04-01      A    A
3    ABC         2013-04-01      A  BBB
4    ABC         2013-07-01      A    A
  ticker fixed_quarter_date target pred
0   AAPL         2016-07-01     AA   AA
1   ABBV         2015-04-01      A    A
2   ABBV         2016-04-01      A    A
3    ABC         2013-04-01      A  BBB
4    ABC         2013-07-01      A    A
model 1 accuracy: 0.6184538653366584
model 2 accuracy: 0.6209476309226932
McNemar's Test contingency table:
[[215, 33], [34, 119]]
Test results, non-exact, no continuity correction:
pvalue      0.9027648250246224
statistic   0.014925373134328358
Test results, non-exact, with continuity correction:
pvalue      1.0

In [13]:
# Test models
test_models('Graph Neural Network/Inductive/reformatted_exclude_previous_rating_model_2_predictions.xlsx',
            'Graph Neural Network/Inductive/reformatted_exclude_previous_rating_model_3_predictions.xlsx')

running for Graph Neural Network/Inductive/reformatted_exclude_previous_rating_model_2_predictions.xlsx and Graph Neural Network/Inductive/reformatted_exclude_previous_rating_model_3_predictions.xlsx
  ticker fixed_quarter_date target pred
0   AAPL         2016-07-01     AA   AA
1   ABBV         2015-04-01      A    A
2   ABBV         2016-04-01      A    A
3    ABC         2013-04-01      A  BBB
4    ABC         2013-07-01      A    A
  ticker fixed_quarter_date target pred
0   AAPL         2016-07-01     AA   AA
1   ABBV         2015-04-01      A    A
2   ABBV         2016-04-01      A    A
3    ABC         2013-04-01      A  BBB
4    ABC         2013-07-01      A  BBB
model 1 accuracy: 0.6234413965087282
model 2 accuracy: 0.6084788029925187
McNemar's Test contingency table:
[[222, 28], [22, 129]]
Test results, non-exact, no continuity correction:
pvalue      0.3961439091520741
statistic   0.72
Test results, non-exact, with continuity correction:
pvalue      0.47950012218695337
stati

#### GNN and Other Classifiers

In [14]:
test_models('Graph Neural Network/Inductive/reformatted_exclude_previous_rating_model_3_predictions.xlsx',
            'Graph Neural Network/Other Classifiers on GNN Data/lr_retrain_predictions.xlsx')

running for Graph Neural Network/Inductive/reformatted_exclude_previous_rating_model_3_predictions.xlsx and Graph Neural Network/Other Classifiers on GNN Data/lr_retrain_predictions.xlsx


  ticker fixed_quarter_date target pred
0   AAPL         2016-07-01     AA   AA
1   ABBV         2015-04-01      A    A
2   ABBV         2016-04-01      A    A
3    ABC         2013-04-01      A  BBB
4    ABC         2013-07-01      A  BBB
  ticker fixed_quarter_date Rating prediction
0   AAPL         2016-07-01     AA         AA
1   ABBV         2015-04-01      A         AA
2   ABBV         2016-04-01      A          A
3    ABC         2013-04-01      A          A
4    ABC         2013-07-01      A          A
model 1 accuracy: 0.6084788029925187
model 2 accuracy: 0.6234413965087282
McNemar's Test contingency table:
[[195, 49], [55, 102]]
Test results, non-exact, no continuity correction:
pvalue      0.5562984612747348
statistic   0.34615384615384615
Test results, non-exact, with continuity correction:
pvalue      0.6239284632085247
statistic   0.2403846153846154
Test results, exact, no continuity correction:
pvalue      0.624143515844679
statistic   49.0


In [15]:
test_models('Graph Neural Network/Inductive/reformatted_exclude_previous_rating_model_3_predictions.xlsx',
            'Graph Neural Network/Other Classifiers on GNN Data/xgb_retrain_predictions.xlsx')

running for Graph Neural Network/Inductive/reformatted_exclude_previous_rating_model_3_predictions.xlsx and Graph Neural Network/Other Classifiers on GNN Data/xgb_retrain_predictions.xlsx
  ticker fixed_quarter_date target pred
0   AAPL         2016-07-01     AA   AA
1   ABBV         2015-04-01      A    A
2   ABBV         2016-04-01      A    A
3    ABC         2013-04-01      A  BBB
4    ABC         2013-07-01      A  BBB
  ticker fixed_quarter_date Rating prediction
0   AAPL         2016-07-01     AA         AA
1   ABBV         2015-04-01      A          A
2   ABBV         2016-04-01      A          A
3    ABC         2013-04-01      A          A
4    ABC         2013-07-01      A          A
model 1 accuracy: 0.6084788029925187
model 2 accuracy: 0.8229426433915212
McNemar's Test contingency table:
[[230, 14], [100, 57]]
Test results, non-exact, no continuity correction:
pvalue      7.971438787362077e-16
statistic   64.87719298245614
Test results, non-exact, with continuity correctio

In [16]:
test_models('Graph Neural Network/Transductive/reformatted_exclude_previous_rating_model_3_predictions.xlsx',
            'Graph Neural Network/Other Classifiers on GNN Data/lr_retrain_predictions.xlsx')

running for Graph Neural Network/Transductive/reformatted_exclude_previous_rating_model_3_predictions.xlsx and Graph Neural Network/Other Classifiers on GNN Data/lr_retrain_predictions.xlsx
  ticker fixed_quarter_date target pred
0   AAPL         2016-07-01     AA   AA
1   ABBV         2015-04-01      A    A
2   ABBV         2016-04-01      A    A
3    ABC         2013-04-01      A  BBB
4    ABC         2013-07-01      A    A
  ticker fixed_quarter_date Rating prediction
0   AAPL         2016-07-01     AA         AA
1   ABBV         2015-04-01      A         AA
2   ABBV         2016-04-01      A          A
3    ABC         2013-04-01      A          A
4    ABC         2013-07-01      A          A
model 1 accuracy: 0.6209476309226932
model 2 accuracy: 0.6234413965087282
McNemar's Test contingency table:
[[207, 42], [43, 109]]
Test results, non-exact, no continuity correction:
pvalue      0.9136266610628591
statistic   0.011764705882352941
Test results, non-exact, with continuity correct

In [17]:
test_models('Graph Neural Network/Transductive/reformatted_exclude_previous_rating_model_3_predictions.xlsx',
            'Graph Neural Network/Other Classifiers on GNN Data/xgb_retrain_predictions.xlsx')

running for Graph Neural Network/Transductive/reformatted_exclude_previous_rating_model_3_predictions.xlsx and Graph Neural Network/Other Classifiers on GNN Data/xgb_retrain_predictions.xlsx
  ticker fixed_quarter_date target pred
0   AAPL         2016-07-01     AA   AA
1   ABBV         2015-04-01      A    A
2   ABBV         2016-04-01      A    A
3    ABC         2013-04-01      A  BBB
4    ABC         2013-07-01      A    A
  ticker fixed_quarter_date Rating prediction
0   AAPL         2016-07-01     AA         AA
1   ABBV         2015-04-01      A          A
2   ABBV         2016-04-01      A          A
3    ABC         2013-04-01      A          A
4    ABC         2013-07-01      A          A
model 1 accuracy: 0.6209476309226932
model 2 accuracy: 0.8229426433915212
McNemar's Test contingency table:
[[230, 19], [100, 52]]
Test results, non-exact, no continuity correction:
pvalue      1.1256043864769986e-13
statistic   55.134453781512605
Test results, non-exact, with continuity corr

In [18]:
test_models('Graph Neural Network/Inductive/reformatted_exclude_previous_rating_model_3_predictions.xlsx',
            'Graph Neural Network/Other Classifiers on GNN Data/lr_pretrained_predictions.xlsx')

running for Graph Neural Network/Inductive/reformatted_exclude_previous_rating_model_3_predictions.xlsx and Graph Neural Network/Other Classifiers on GNN Data/lr_pretrained_predictions.xlsx
  ticker fixed_quarter_date target pred
0   AAPL         2016-07-01     AA   AA
1   ABBV         2015-04-01      A    A
2   ABBV         2016-04-01      A    A
3    ABC         2013-04-01      A  BBB
4    ABC         2013-07-01      A  BBB
  ticker fixed_quarter_date Rating exclude_previous_rating_model_3_predictions
0   AAPL         2016-07-01     AA                                          AA
1   ABBV         2015-04-01      A                                          AA
2   ABBV         2016-04-01      A                                         AAA
3    ABC         2013-04-01      A                                           A
4    ABC         2013-07-01      A                                           A
model 1 accuracy: 0.6084788029925187
model 2 accuracy: 0.6683291770573566
McNemar's Test conting

In [19]:
test_models('Graph Neural Network/Inductive/reformatted_exclude_previous_rating_model_3_predictions.xlsx',
            'Graph Neural Network/Other Classifiers on GNN Data/xgb_pretrained_predictions.xlsx')

running for Graph Neural Network/Inductive/reformatted_exclude_previous_rating_model_3_predictions.xlsx and Graph Neural Network/Other Classifiers on GNN Data/xgb_pretrained_predictions.xlsx
  ticker fixed_quarter_date target pred
0   AAPL         2016-07-01     AA   AA
1   ABBV         2015-04-01      A    A
2   ABBV         2016-04-01      A    A
3    ABC         2013-04-01      A  BBB
4    ABC         2013-07-01      A  BBB
  ticker fixed_quarter_date Rating exclude_previous_rating_model_3_predictions
0   AAPL         2016-07-01     AA                                          AA
1   ABBV         2015-04-01      A                                           A
2   ABBV         2016-04-01      A                                           A
3    ABC         2013-04-01      A                                           A
4    ABC         2013-07-01      A                                           A
model 1 accuracy: 0.6084788029925187
model 2 accuracy: 0.8927680798004988
McNemar's Test contin

In [20]:
test_models('Graph Neural Network/Transductive/reformatted_exclude_previous_rating_model_3_predictions.xlsx',
            'Graph Neural Network/Other Classifiers on GNN Data/lr_pretrained_predictions.xlsx')

running for Graph Neural Network/Transductive/reformatted_exclude_previous_rating_model_3_predictions.xlsx and Graph Neural Network/Other Classifiers on GNN Data/lr_pretrained_predictions.xlsx
  ticker fixed_quarter_date target pred
0   AAPL         2016-07-01     AA   AA
1   ABBV         2015-04-01      A    A
2   ABBV         2016-04-01      A    A
3    ABC         2013-04-01      A  BBB
4    ABC         2013-07-01      A    A
  ticker fixed_quarter_date Rating exclude_previous_rating_model_3_predictions
0   AAPL         2016-07-01     AA                                          AA
1   ABBV         2015-04-01      A                                          AA
2   ABBV         2016-04-01      A                                         AAA
3    ABC         2013-04-01      A                                           A
4    ABC         2013-07-01      A                                           A
model 1 accuracy: 0.6209476309226932
model 2 accuracy: 0.6683291770573566
McNemar's Test cont

In [21]:
test_models('Graph Neural Network/Transductive/reformatted_exclude_previous_rating_model_3_predictions.xlsx',
            'Graph Neural Network/Other Classifiers on GNN Data/xgb_pretrained_predictions.xlsx')

running for Graph Neural Network/Transductive/reformatted_exclude_previous_rating_model_3_predictions.xlsx and Graph Neural Network/Other Classifiers on GNN Data/xgb_pretrained_predictions.xlsx
  ticker fixed_quarter_date target pred
0   AAPL         2016-07-01     AA   AA
1   ABBV         2015-04-01      A    A
2   ABBV         2016-04-01      A    A
3    ABC         2013-04-01      A  BBB
4    ABC         2013-07-01      A    A
  ticker fixed_quarter_date Rating exclude_previous_rating_model_3_predictions
0   AAPL         2016-07-01     AA                                          AA
1   ABBV         2015-04-01      A                                           A
2   ABBV         2016-04-01      A                                           A
3    ABC         2013-04-01      A                                           A
4    ABC         2013-07-01      A                                           A
model 1 accuracy: 0.6209476309226932
model 2 accuracy: 0.8927680798004988
McNemar's Test con