In [1]:
### This Notebook runs experiments for the full char extraction pipeline

In [2]:
from sklearn import svm
import numpy as np
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import precision_score, accuracy_score, f1_score, recall_score
from sklearn.model_selection import KFold
import re

import sys
sys.path.append('../src')

from SVM_functions import param_selection, train_and_evaluate_model, combine_features
from misc import save_dict, get_file_names, open_dict, open_list
from eval_functions import combine_corefs, get_ref_exps_from_coref_dict, get_all_variations, score_output, process_predictions


#### Results - 5 fold cross validation, repeated 20 times.
F-Score claculated based on outputted character list & ground truth characted list

In [4]:
useGoldForTraining = True

for tryNum in range(5):

    settings = open_dict('../results/ablation_testing/settings.p')

    settingsFromAllen = {
            'ProppLearner': {
                'normal':settings['ProppLearner_from_allen'],
                'goldStandard':settings['ProppLearner_from_gold']
            },
            # 'CEN': {
            #     'normal':settings['CEN_from_allen'],
            # },
            'LitBank': {
                'normal':settings['LitBank_from_allen'],
                'goldStandard':settings['LitBank_from_gold']
            },
        }

    numFolds = 5
    numRepeats = 20
    numSettings = len(settingsFromAllen)

    results = np.zeros((numSettings, 4, 4, numFolds, numRepeats))
    # test = np.object0((numSettings, numFolds, numRepeats))

    for setNum, (settingName, setting) in enumerate(settingsFromAllen.items()):
        
        # get directories, parameters & file names for the dataset from settings

        if useGoldForTraining:
            trainSetting = 'goldStandard'
        else:
            trainSetting = 'normal'
            

        featuresDirTest = setting['normal']['featuresDir']
        featuresDirTrain = setting[trainSetting]['featuresDir']
        corefsDir = setting['normal']['corefDir']
        fileNames = get_file_names(corefsDir, '.p')
        params = setting['normal']['parameters']['additional features']

        # get list of features to use
        featureNamesTest = ['CL', 'DP', 'NE', 'SS', 'TP', 'WN', 'CD', 'QU', 'CP']
        featureNamesTest.append(setting['normal']['animacy labels dir extention'])
        featureNamesTest.append(setting['normal']['character labels dir extention']) 

        featureNamesTrain = ['CL', 'DP', 'NE', 'SS', 'TP', 'WN', 'CD', 'QU', 'CP']
        featureNamesTrain.append(setting[trainSetting]['animacy labels dir extention'])
        featureNamesTrain.append(setting[trainSetting]['character labels dir extention']) 

        # turn list of file names into an array and shuffle
        fileNames = np.array(fileNames)
        fileNames = np.random.permutation(fileNames)

        # set up k fold cross validation
        kf = KFold(n_splits = numFolds)

        for repeatNum in range(numRepeats):

            for foldNum, (train_idx, test_idx) in enumerate(kf.split(fileNames)):

                # get file names for test, and file names for training
                fileNamesTrain = fileNames[train_idx]
                fileNamesTest = fileNames[test_idx]

                # get training features & labels (over and under sampling)
                featuresTrain = combine_features(featuresDirTrain, featureNamesTrain, list(fileNamesTrain)).transpose()
                X_train = featuresTrain[:, :-1]
                y_train = featuresTrain[:, -1]

                over = SMOTE(sampling_strategy = 0.8)
                under = RandomUnderSampler(sampling_strategy = 1.0)
                X_train, y_train = over.fit_resample(X_train, y_train)
                X_train, y_train = under.fit_resample(X_train, y_train)


                # train model
                model = svm.SVC(kernel = params['kernel'], C = params['C'], gamma = params['gamma'])
                model.fit(X_train, y_train)

                ### scoring works for CEN and ProppLearner
                testResults = np.zeros((len(fileNamesTest), 4, 4))

                for testFileNum, fileName in enumerate(fileNamesTest):
                    
                    # get features for test story, and get predications from model
                    featuresTest = combine_features(featuresDirTest, featureNamesTest, [fileName]).transpose()
                    X_test = featuresTest[:, :-1]
                    _ = featuresTest[:, -1]

                    y_pred = model.predict(X_test)

                    # get output coref chains
                    corefDict = open_dict(corefsDir + fileName + '.p')
                    outputChains = get_ref_exps_from_coref_dict(corefDict)

                    # post process y_pred
                    ### get rid of 'None'
                    y_pred_none = y_pred.copy()
                    for k, (pred, corefChain) in enumerate(zip(y_pred_none, outputChains)):
                    
                        if corefChain[0] == None:
                            y_pred_none[k] = 0

                        elif corefChain[0] == "'s":
                            y_pred_none[k] = 0

                    ### get rid of 'and's
                    y_pred_and = y_pred.copy()
                    for k, (pred, corefChain) in enumerate(zip(y_pred_and, outputChains)):
                    
                        if corefChain[0] != None and ' and ' in corefChain[0]:
                            y_pred_and[k] = 0

                    ### combine both
                    y_pred_both = y_pred_and * y_pred_none

                    # get ground truth character coref chains

                    
                    if settingName == 'ProppLearner':
                        fileNameChars = fileName[0].upper() + fileName[1:]
                    else:
                        fileNameChars = fileName

                    ####### PropLearner and CEN
                    if settingName in ['ProppLearner', 'CEN']:
                        charChains = open_list('../data/'+settingName+'/char_list_gold_full_chains/'+fileNameChars+'.json')
                    

                    else:
                        charNames = open_list('../data/LitBank/char_list_gold/' + fileNameChars + '.json')
                        charChains = get_all_variations(charNames)

                    # calculate prec, recall, accuracy and f1 for each version of y_pred
                    # precision, recall, accuracy, f1
                    testResults[testFileNum, 0, : ] = score_output(charChains, y_pred, outputChains)[:4]
                    testResults[testFileNum, 1, : ] = score_output(charChains, y_pred_both, outputChains)[:4]
                    testResults[testFileNum, 2, : ] = score_output(charChains, y_pred_none, outputChains)[:4]
                    testResults[testFileNum, 3, : ] = score_output(charChains, y_pred_and, outputChains)[:4]

                    # Put together human readable output (only saved for the first fold in k-fold cross validation) 
                    resultsFileText = settingName

                    if repeatNum == 0:
                        for fileName in fileNamesTest:
                            featuresTest = combine_features(featuresDirTest, featureNamesTest, [fileName]).transpose()

                            X_test = featuresTest[:, :-1]
                            y_test = featuresTest[:, -1]
                            
                            y_pred = model.predict(X_test)

                            corefDict = open_dict(corefsDir + fileName + '.p')
                            outputChains = get_ref_exps_from_coref_dict(corefDict)

                            # get y_pred_both
                            y_pred_none = y_pred.copy()
                            for k, (pred, corefChain) in enumerate(zip(y_pred_none, outputChains)):
                            
                                if corefChain[0] == None:
                                    y_pred_none[k] = 0

                                elif corefChain[0] == "'s":
                                    y_pred_none[k] = 0

                            y_pred_and = y_pred.copy()
                            for k, (pred, corefChain) in enumerate(zip(y_pred_and, outputChains)):
                            
                                if corefChain[0] != None and ' and ' in corefChain[0]:
                                    y_pred_and[k] = 0

                            y_pred_both = y_pred_and * y_pred_none

                            #
                            if settingName == 'ProppLearner':
                                fileNameChars = fileName[0].upper() + fileName[1:]
                            else:
                                fileNameChars = fileName

                            resultsFileText += '\n\n' + fileName + '\n'

                            # orig Jahan character coref chains
                            if not 'LitBank' in settingName:
                                charChains = open_list('../data/'+settingName+'/char_list_gold_full_chains/'+fileNameChars+'.json')
                                charLabels, charVotes = score_output(charChains, y_pred_both, outputChains)[-2:]
                                
                                for chainNum, chain in enumerate(charChains):
                                    resultsFileText += str(charVotes[chainNum]) + ' | ' +  ' | '.join(chain) + '\n'
                                
                            else:
                                charNames = open_list('../data/LitBank/char_list_gold/' + fileNameChars + '.json')
                                charChains = get_all_variations(charNames)
                                charLabels, charVotes = score_output(charChains, y_pred_both, outputChains)[-2:]
                                for chainNum, chain in enumerate(charChains):
                                    resultsFileText += str(charVotes[chainNum]) + ' | ' + ' | '.join(chain) + '\n'
                            

                            
                            for i, chain in enumerate(outputChains):
                                if y_pred_both[i] == 1.:
                                    # if chain[0] == None:
                                    #     continue
                                    refsPrint = str(charLabels[i]) + ' | ' +  str(chain[0]) + ' |'
                                    for j in range(1, len(chain)):
                                        refsPrint += '| ' + chain[j] + ' ' 

                                    resultsFileText += '\n' + str(y_pred_both[i]) + ' | ' + refsPrint            

                        with open('../results/full_pipeline/' + settingName + str(foldNum) + '.txt', 'w') as f:
                            f.write(resultsFileText)


                testResults = np.mean(testResults, axis = 0)
                results[setNum, :, :, foldNum, repeatNum] = testResults

    results = np.mean(results, axis = 4)
    results = np.mean(results, axis = 3)

    np.save('../results/full_pipeline/results_gold_for_labelling' + str(tryNum), results)


FileNotFoundError: [Errno 2] No such file or directory: '../intermediate/ProppLearner/from_gold_corefs/CL/story32.npy'

#### Compare results old char labelling vs new char labelling

In [14]:
def get_ave_results(fileNameBase):
    
    results = np.load('../results/full_pipeline/' + fileNameBase + '0.npy')

    for i in range(1, 4):
        results += np.load('../results/full_pipeline/' + fileNameBase + str(i) + '.npy')

    return results / 5

In [16]:
print('new labelling', get_ave_results('results_new_char_labelling'))
print()
print('old labelling', get_ave_results('results'))


new labelling [[[0.41242984 0.54006123 0.41242984 0.45390619]
  [0.46155634 0.53483573 0.46155634 0.48155781]
  [0.43661723 0.54006123 0.43661723 0.46890208]
  [0.43504659 0.53483573 0.43504659 0.4657462 ]]

 [[0.41420114 0.38324808 0.41420114 0.37722727]
  [0.44467015 0.37360103 0.44467015 0.38652053]
  [0.41853352 0.38324808 0.41853352 0.37981575]
  [0.43947051 0.37360103 0.43947051 0.38356567]]]

old labelling [[[0.41377981 0.54160233 0.41377981 0.45512876]
  [0.46269896 0.53665517 0.46269896 0.48259645]
  [0.4377362  0.54160233 0.4377362  0.46994896]
  [0.43617776 0.53665517 0.43617776 0.46692263]]

 [[0.4051202  0.38023847 0.4051202  0.37147997]
  [0.43613201 0.37099011 0.43613201 0.38161959]
  [0.40963106 0.38023847 0.40963106 0.37417137]
  [0.43075687 0.37099011 0.43075687 0.37856104]]

 [[0.20250394 0.16658848 0.20250394 0.1516068 ]
  [0.22873628 0.16658848 0.22873628 0.15922965]
  [0.2114833  0.16658848 0.2114833  0.15282152]
  [0.21974263 0.16658848 0.21974263 0.15799996]]]
