In [1]:
from sklearn import svm

import sys
sys.path.append('../src')

from SVM_functions import param_selection, train_and_evaluate_model, combine_features
from misc import save_dict, get_file_names, open_dict

from settings import settings

#### Features to test 

In [3]:
# Feature sets compared in the ablation experiments.
# Each entry maps a feature-set label to:
#   'features'                - names of the feature sub-directories that get combined
#   'OrigOnly'                - experiments whose own 'OrigOnly' flag is True skip
#                               every feature set where this flag is False
#   'performFeatureSelection' - flag kept with the set (not read by the cells in this notebook)
featureSettings = {
    'original features': {
        'features': ['CL', 'OrigDP', 'OrigNE', 'OrigSS', 'OrigTP', 'OrigWN', 'OrigCN'],
        'OrigOnly': True,
        'performFeatureSelection': False,
    },

    'index features': {
        'features': ['CL', 'DP', 'NE', 'SS', 'TP', 'WN', 'CN'],
        'OrigOnly': False,
        'performFeatureSelection': False,
    },

    'CoreNLP': {
        'features': ['CL', 'OrigDP_coreNLP', 'OrigNE_coreNLP', 'OrigSS', 'OrigTP_coreNLP', 'OrigWN', 'OrigCN'],
        'OrigOnly': True,
        'performFeatureSelection': False,
    },

    # Each label may appear only once: a repeated key in a dict literal silently
    # replaces the earlier value. This set has no 'CN'; the variant with 'CN'
    # is the separate 'additional features plus CN' entry.
    'additional features': {
        'features': ['CL', 'DP', 'NE', 'SS', 'TP', 'WN', 'CD', 'QU', 'CP'],
        'OrigOnly': False,
        'performFeatureSelection': True,
    },

    'additional features plus CN': {
        'features': ['CL', 'DP', 'NE', 'SS', 'TP', 'WN', 'CD', 'CN', 'QU', 'CP'],
        'OrigOnly': False,
        'performFeatureSelection': True,
    },
}


#### Fix params

In [None]:
# Evaluate every applicable feature set with one fixed RBF-SVM configuration
# (the grid search via param_selection is not used in this cell), record the
# results and the parameters per feature set, then save the settings dict once.
fixedParams = {'kernel': 'rbf', 'C': 0.5, 'gamma': 1}

for expName, expSettings in settings.items():
    expSettings['results'] = {}
    expSettings['parameters'] = {}

    for featureExpName, featureExp in featureSettings.items():
        # Experiments restricted to the original features skip all other feature sets.
        if expSettings['OrigOnly'] and not featureExp['OrigOnly']:
            continue

        # Feature directories first, then the two label directories.
        featureNames = featureExp['features'] + [
            expSettings['animacy labels dir extention'],
            expSettings['character labels dir extention'],
        ]

        featuresDir = expSettings['featuresDir']
        # The file list is taken from the directory of the first feature.
        fileNames = get_file_names(featuresDir + featureNames[0] + '/', '.npy')

        featuresAll = combine_features(featuresDir, featureNames, fileNames).transpose()

        # The last column is used as the target, everything before it as input.
        X, y = featuresAll[:, :-1], featuresAll[:, -1]

        model = svm.SVC(**fixedParams)
        results = train_and_evaluate_model(model, X, y)

        print(expName, ',', featureExpName)
        for key, value in results.items():
            print(key, value['avg'])
        print()

        # Keep the outcome and an independent copy of the parameters that produced it.
        expSettings['results'][featureExpName] = results
        expSettings['parameters'][featureExpName] = dict(fixedParams)

# Persist the settings dict, now including results and parameters.
save_dict(settings, '../results/ablation_testing/settings_2.p')


#### Select params for each experiment & sampling method

In [4]:
# Select SVM parameters per experiment, feature set and sampling strategy
# ('normal', 'over', 'over_under'), evaluate one model per strategy, and save
# the whole settings dict after every experiment (one file per experiment name).
for expName, expSettings in settings.items():

    expSettings['results'] = {}
    # NOTE(review): these three placeholder keys end up next to the per-feature-set
    # entries written below and are never filled in; they are kept so the layout of
    # the saved dict does not change. Confirm whether any reader relies on them.
    expSettings['parameters'] = {
        'normal': {},
        'over': {},
        'over_under': {},
    }

    for featureExpName, featureExp in featureSettings.items():

        # Experiments restricted to the original features skip all other feature sets.
        if expSettings['OrigOnly'] and not featureExp['OrigOnly']:
            continue

        # Feature directories first, then the two label directories.
        featureNames = featureExp['features'].copy()
        featureNames.append(expSettings['animacy labels dir extention'])
        featureNames.append(expSettings['character labels dir extention'])

        featuresDir = expSettings['featuresDir']
        fileNames = get_file_names(featuresDir + featureNames[0] + '/', '.npy')

        featuresAll = combine_features(featuresDir, featureNames, fileNames).transpose()

        # The last column is used as the target, everything before it as input.
        X = featuresAll[:, :-1]
        y = featuresAll[:, -1]

        # Parameter search, once per sampling strategy (no explicit grid is passed).
        params, result, _ = param_selection(X, y, setting = 'normal')
        params_over, result, _ = param_selection(X, y, setting = 'over')
        params_over_under, result, _ = param_selection(X, y, setting = 'over_under')

        # The selected parameters carry a 'model__' prefix.
        model = svm.SVC(kernel = params['model__kernel'], C = params['model__C'], gamma = params['model__gamma'])
        model_over = svm.SVC(kernel = params_over['model__kernel'], C = params_over['model__C'], gamma = params_over['model__gamma'])
        model_over_under = svm.SVC(kernel = params_over_under['model__kernel'], C = params_over_under['model__C'], gamma = params_over_under['model__gamma'])

        results = train_and_evaluate_model(model, X, y, setting = 'normal')
        results_over = train_and_evaluate_model(model_over, X, y, setting = 'over')
        results_over_under = train_and_evaluate_model(model_over_under, X, y, setting = 'over_under')

        resultsAll = {
            'normal': results,
            'over': results_over,
            'over_under': results_over_under,
        }

        print(expName, ',', featureExpName)
        for key, value in resultsAll.items():
            print(key, value['avg'])
        print()

        # Store the results of all three sampling strategies. Storing only `results`
        # would drop the 'over' and 'over_under' evaluations computed above.
        expSettings['results'][featureExpName] = resultsAll

        expSettings['parameters'][featureExpName] = {
            'normal': params,
            'over': params_over,
            'over_under': params_over_under,
        }

    # Save after each experiment so finished experiments survive an interrupted run.
    save_dict(settings, '../results/ablation_testing/settings_check_params_each_time' + expName + '.p')

ProppLearner_from_gold , original features
normal 0.82
over 0.83
over_under 0.83

ProppLearner_from_gold , index features
normal 0.81
over 0.85
over_under 0.84

ProppLearner_from_gold , CoreNLP
normal 0.78
over 0.8
over_under 0.8

ProppLearner_from_gold , additional features
normal 0.84
over 0.85
over_under 0.84

ProppLearner_from_allen , original features
normal 0.72
over 0.73
over_under 0.72

ProppLearner_from_allen , index features
normal 0.71
over 0.72
over_under 0.72

ProppLearner_from_allen , CoreNLP
normal 0.71
over 0.73
over_under 0.72

ProppLearner_from_allen , additional features
normal 0.71
over 0.72
over_under 0.72

ProppLearner_from_heads_only , original features
normal 0.8
over 0.8
over_under 0.79

ProppLearner_from_heads_only , CoreNLP
normal 0.8
over 0.8
over_under 0.8

LitBank_from_gold , original features
normal 0.28
over 0.41
over_under 0.42

LitBank_from_gold , index features
normal 0.34
over 0.43
over_under 0.43

LitBank_from_gold , CoreNLP
normal 0.18
over 0.41
ov

#### Reuse params from a previous calculation

In [5]:
# Load previously selected parameters. The file name carries the suffix of the
# 'ProppLearner_from_allen_short' experiment; the parameter-selection cell saves the
# whole settings dict under each experiment's name, so this file presumably holds the
# parameters of every experiment (TODO confirm it was written by a complete run).
paramSettings = open_dict('../results/ablation_testing/settings_check_params_each_timeProppLearner_from_allen_short.p')


In [11]:
# Re-evaluate every experiment / feature set with the parameters loaded into
# paramSettings (no new parameter search), for all three sampling strategies.
for expName, expSettings in settings.items():

    print('starting', expName)

    expSettings['results'] = {}
    expSettings['parameters'] = {'normal': {}, 'over': {}, 'over_under': {}}

    for featureExpName, featureExp in featureSettings.items():

        # Experiments restricted to the original features skip all other feature sets.
        if expSettings['OrigOnly'] and not featureExp['OrigOnly']:
            continue

        # Feature directories first, then the two label directories.
        featureNames = featureExp['features'] + [
            expSettings['animacy labels dir extention'],
            expSettings['character labels dir extention'],
        ]

        featuresDir = expSettings['featuresDir']
        fileNames = get_file_names(featuresDir + featureNames[0] + '/', '.npy')

        featuresAll = combine_features(featuresDir, featureNames, fileNames).transpose()

        # The last column is used as the target, everything before it as input.
        X, y = featuresAll[:, :-1], featuresAll[:, -1]

        # 'additional features plus CN' borrows the parameters stored for
        # 'additional features'; every other set uses the entry under its own name.
        if featureExpName == 'additional features plus CN':
            storedParams = paramSettings[expName]['parameters']['additional features']
        else:
            storedParams = paramSettings[expName]['parameters'][featureExpName]

        params = storedParams['normal']
        params_over = storedParams['over']
        params_over_under = storedParams['over_under']

        # One SVC per sampling strategy, built from the 'model__'-prefixed entries.
        model = svm.SVC(
            kernel=params['model__kernel'],
            C=params['model__C'],
            gamma=params['model__gamma'],
        )
        model_over = svm.SVC(
            kernel=params_over['model__kernel'],
            C=params_over['model__C'],
            gamma=params_over['model__gamma'],
        )
        model_over_under = svm.SVC(
            kernel=params_over_under['model__kernel'],
            C=params_over_under['model__C'],
            gamma=params_over_under['model__gamma'],
        )

        results = train_and_evaluate_model(model, X, y, setting='normal')
        results_over = train_and_evaluate_model(model_over, X, y, setting='over')
        results_over_under = train_and_evaluate_model(model_over_under, X, y, setting='over_under')

        resultsAll = {
            'normal': results,
            'over': results_over,
            'over_under': results_over_under,
        }

        # Results of all three strategies are kept per feature set (nothing is printed here).
        expSettings['results'][featureExpName] = resultsAll

    # Save the whole settings dict after each experiment, one file per experiment name.
    save_dict(settings, '../results/ablation_testing/settings_params_from_chosen_each_time'+ expName + '.p')

starting ProppLearner_from_gold
starting ProppLearner_from_allen
starting ProppLearner_from_heads_only
starting LitBank_from_gold
starting LitBank_from_allen
starting CEN_from_allen
starting CEN_from_heads_only
starting ProppLearner_from_allen_short


In [10]:
# Display the `results` dict left in the kernel by the most recently executed
# evaluation loop (which loop that was depends on execution order).
results

{'accuracy': {'avg': 0.84,
  'all': array([0.83700441, 0.86343612, 0.82819383, 0.85903084, 0.84581498,
         0.81938326, 0.83185841, 0.82743363, 0.82743363, 0.85840708,
         0.84581498, 0.83259912, 0.84581498, 0.84140969, 0.88546256,
         0.85462555, 0.7920354 , 0.92035398, 0.81858407, 0.85840708,
         0.81938326, 0.83700441, 0.86343612, 0.81057269, 0.82378855,
         0.86784141, 0.8539823 , 0.88053097, 0.7920354 , 0.82743363,
         0.82819383, 0.85903084, 0.82819383, 0.83700441, 0.83700441,
         0.85462555, 0.83628319, 0.7920354 , 0.80530973, 0.87610619,
         0.83259912, 0.85462555, 0.81497797, 0.80176211, 0.77092511,
         0.82378855, 0.88495575, 0.88495575, 0.84513274, 0.86283186,
         0.85903084, 0.80176211, 0.81938326, 0.83259912, 0.85903084,
         0.85462555, 0.84070796, 0.84513274, 0.87610619, 0.84955752,
         0.80176211, 0.81938326, 0.81057269, 0.8722467 , 0.88105727,
         0.83259912, 0.86283186, 0.82743363, 0.86725664, 0.88938053,


#### First 15 stories only: ProppLearner from Gold

In [4]:
# Repeat parameter selection and evaluation ten times for the
# 'ProppLearner_from_allen_short' experiment only, saving the settings dict
# after each repetition under a file name that ends in the repetition number.
for tryNum in range(10):

    for expName, expSettings in settings.items():

        # Every other experiment is ignored in this cell.
        if expName != 'ProppLearner_from_allen_short':
            continue

        expSettings['results'] = {}
        expSettings['parameters'] = {}

        for featureExpName, featureExp in featureSettings.items():

            # Experiments restricted to the original features skip all other feature sets.
            if expSettings['OrigOnly'] and not featureExp['OrigOnly']:
                continue

            # Feature directories first, then the two label directories.
            featureNames = featureExp['features'] + [
                expSettings['animacy labels dir extention'],
                expSettings['character labels dir extention'],
            ]

            featuresDir = expSettings['featuresDir']
            # Here the file list comes from a separate directory of '.p' files,
            # not from the first feature directory.
            fileNamesDir = expSettings['fileNamesDir']
            fileNames = get_file_names(fileNamesDir, '.p')

            featuresAll = combine_features(featuresDir, featureNames, fileNames).transpose()

            # The last column is used as the target, everything before it as input.
            X, y = featuresAll[:, :-1], featuresAll[:, -1]

            # Grid handed to param_selection: RBF kernel over C and gamma.
            param_grid = [
                {
                    'C': [0.001, 0.01, 0.1, 1.0, 0.5, 10.0, 100.0, 1000.0, 10000.0, 100000.0],
                    'gamma': [0.01, 0.1, 1.0, 10.0, 100.0],
                    'kernel': ['rbf'],
                },
            ]
            params, result, _ = param_selection(X, y, param_grid)

            # Build the model from the selected ('model__'-prefixed) parameters.
            model = svm.SVC(
                kernel=params['model__kernel'],
                C=params['model__C'],
                gamma=params['model__gamma'],
            )

            results = train_and_evaluate_model(model, X, y)

            print(expName, ',', featureExpName)
            for key, value in results.items():
                # The selected parameters are printed after every result entry.
                print(key, value['avg'])
                print(params)
            print()

            # Record results and parameters for this feature set.
            expSettings['results'][featureExpName] = results
            expSettings['parameters'][featureExpName] = params

    # One file per repetition.
    save_dict(settings, '../results/ablation_testing/settings_ProppLearner_allen_15_stories' + str(tryNum) + '.p')

ProppLearner_from_allen_short , original features
normal 0.75
{'model__C': 100.0, 'model__gamma': 0.01, 'model__kernel': 'rbf'}
over 0.77
{'model__C': 100.0, 'model__gamma': 0.01, 'model__kernel': 'rbf'}
over_under 0.76
{'model__C': 100.0, 'model__gamma': 0.01, 'model__kernel': 'rbf'}

ProppLearner_from_allen_short , index features
normal 0.77
{'model__C': 0.5, 'model__gamma': 10.0, 'model__kernel': 'rbf'}
over 0.79
{'model__C': 0.5, 'model__gamma': 10.0, 'model__kernel': 'rbf'}
over_under 0.78
{'model__C': 0.5, 'model__gamma': 10.0, 'model__kernel': 'rbf'}

ProppLearner_from_allen_short , CoreNLP
normal 0.74
{'model__C': 1.0, 'model__gamma': 0.1, 'model__kernel': 'rbf'}
over 0.79
{'model__C': 1.0, 'model__gamma': 0.1, 'model__kernel': 'rbf'}
over_under 0.79
{'model__C': 1.0, 'model__gamma': 0.1, 'model__kernel': 'rbf'}

ProppLearner_from_allen_short , additional features
normal 0.79
{'model__C': 0.5, 'model__gamma': 10.0, 'model__kernel': 'rbf'}
over 0.79
{'model__C': 0.5, 'model__gam

fileNames

#### Testing

In [5]:
# Names of all experiments and of all feature sets, as plain lists.
keys = list(settings)
featKeys = list(featureSettings)

In [9]:
import numpy as np

# Sanity check: for every applicable experiment / feature-set pair, report each
# file whose per-feature '.npy' arrays do not all have the same shape.
for key in keys:
    for featKey in featKeys:

        # Same skip rule as the experiment loops: experiments restricted to the
        # original features ignore all other feature sets.
        if settings[key]['OrigOnly'] and not featureSettings[featKey]['OrigOnly']:
            continue

        expSettings = settings[key]

        # Feature directories first, then the two label directories.
        featureNames = featureSettings[featKey]['features'].copy()
        featureNames.append(expSettings['animacy labels dir extention'])
        featureNames.append(expSettings['character labels dir extention'])

        featuresDir = expSettings['featuresDir']
        fileNames = get_file_names(featuresDir + featureNames[0] + '/', '.npy')

        for fileName in fileNames:
            # Shape of this file's array in every feature / label directory.
            shapes = [
                np.load(featuresDir + feature + '/' + fileName + '.npy').shape
                for feature in featureNames
            ]

            # More than one distinct shape means the arrays disagree.
            if len(set(shapes)) > 1:
                print(key, featKey, fileName)
                for feature, shape in zip(featureNames, shapes):
                    print('    ', feature, shape)


'../intermediate/CEN/from_original_jahan_heads_only/'

In [None]:
# Parameter search on a finer grid for the 'additional features' set, limited to
# the experiments whose name contains neither 'ProppLearner' nor 'LitBank'.
# Results are only printed; nothing is stored in settings or saved by this cell.
for expName, expSettings in settings.items():
    if 'ProppLearner' in expName or 'LitBank' in expName:
        continue

    for featureExpName, featureExp in featureSettings.items():

        if featureExpName != 'additional features':
            continue

        # Experiments restricted to the original features skip all other feature sets.
        if expSettings['OrigOnly'] and not featureExp['OrigOnly']:
            continue

        # Feature directories first, then the two label directories.
        featureNames = featureExp['features'].copy()
        featureNames.append(expSettings['animacy labels dir extention'])
        featureNames.append(expSettings['character labels dir extention'])

        featuresDir = expSettings['featuresDir']
        fileNames = get_file_names(featuresDir + featureNames[0] + '/', '.npy')

        # Pass the file list explicitly, as every other cell in this notebook does.
        featuresAll = combine_features(featuresDir, featureNames, fileNames).transpose()

        # The last column is used as the target, everything before it as input.
        X = featuresAll[:, :-1]
        y = featuresAll[:, -1]

        # Finer grid than in the other cells.
        param_grid = [
            {'C': [1, 5, 10, 20, 30, 50, 75, 100], 'gamma': [1, 2, 5, 7.5, 10, 50], 'kernel': ['rbf']},
        ]
        params, result, _ = param_selection(X, y, param_grid)

        # param_selection reports the chosen values under 'model__'-prefixed keys
        # (see the printed parameter dicts earlier in this notebook).
        model = svm.SVC(kernel = params['model__kernel'], C = params['model__C'], gamma = params['model__gamma'])

        results = train_and_evaluate_model(model, X, y, k = 4, n=10)

        # TODO: feature importance

        print(expName, ',', featureExpName)
        for key, value in results.items():
            print(key, value['avg'])
        print()

#### Testing

In [6]:
# Load one gold coreference file for manual inspection. Forward slashes are used,
# as in the rest of this notebook, so the path works on every OS and the string
# literal contains no backslash escape sequences.
a = open_dict('../data/ProppLearner/corefs_gold_new_format/story4.p')

In [9]:
# Show the entry at index 2 of the loaded story's 'clusters' list.
a['clusters'][2]

{'name': 'a king who had three sons',
 'mentions': [{'position': [10, 15],
   'position_StoryWorkbench': [13, 18],
   'text': 'a king who had three sons'},
  {'position': [22, 22], 'position_StoryWorkbench': [25, 25], 'text': 'him'},
  {'position': [25, 29],
   'position_StoryWorkbench': [28, 32],
   'text': 'Father , our gracious sovereign'},
  {'position': [33, 33], 'position_StoryWorkbench': [36, 36], 'text': 'your'},
  {'position': [43, 44],
   'position_StoryWorkbench': [46, 47],
   'text': 'The father'},
  {'position': [47, 47], 'position_StoryWorkbench': [50, 50], 'text': 'his'}]}

In [5]:
# Collect the '.json' entries of the LitBank gold character-list directory.
# Note: this rebinds `a`, which held a loaded story dict in the cells above.
a = get_file_names('../data/LitBank/char_list_gold/','.json')

In [6]:
# Number of entries returned by get_file_names for the LitBank directory.
len(a)

38