In [1]:
import pickle
import pandas as pd
# Allow up to 500 characters per cell when rendering DataFrames
# (pandas truncates longer cell values with "...").
pd.options.display.max_colwidth = 500

def format_feature_names(x):
    """Build a human-readable label for the feature combination of one result row.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide ``x['features']``, an iterable of dicts that each have a
        ``'name'`` key (and a ``'comb_name'`` key for ``'ngram'`` and
        ``'type_dependency'`` entries), and ``x['best_params']``, a mapping
        containing ``features__<comb_name>__feature_extraction__ngram_range``
        for every such entry.

    Returns
    -------
    str
        One label per feature, joined with ``' + '`` (empty string when there
        are no features).

    Raises
    ------
    KeyError
        If an ``'ngram'`` / ``'type_dependency'`` feature has no matching
        ``ngram_range`` entry in ``x['best_params']``.
    """
    # Display prefix for the features whose label carries the selected n-gram range.
    ngram_prefixes = {
        'ngram': 'ngram words',
        'type_dependency': 'ngram typed dependency',
    }
    best_params = x['best_params']

    labels = []
    for feature in x['features']:
        name = feature['name']
        if name in ngram_prefixes:
            key = ''.join(('features__', feature['comb_name'], '__feature_extraction__ngram_range'))
            # tuple() so that a list-valued range is printed as "(1, 4)" rather than "[1, 4]".
            ngram_range = tuple(best_params[key])
            # Single-space separator: the previous implementation produced
            # "ngram  words" (two spaces), which was only hidden by HTML rendering.
            labels.append(' '.join((ngram_prefixes[name], str(ngram_range))))
        elif name == 'bert_doc':
            labels.append('bert document emb.')
        else:
            # Any other feature is shown under its raw name.
            labels.append(name)
    return ' + '.join(labels)

def format_feature_dimensions(x, name='dimension'):
    """Collect the ``name`` entry of each feature's dimension record.

    For every feature in ``x['features']`` the matching record is looked up
    in ``x['feature_dimensions']`` by the feature's name; ``'bert_doc'`` is
    looked up under ``'_bert_doc'``. Records that do not contain ``name`` are
    skipped.

    Returns a list of the collected values, in feature order.
    """
    sizes_by_feature = x['feature_dimensions']
    # The 'bert_doc' feature is stored with a leading underscore.
    lookup_keys = [
        '_bert_doc' if feat['name'] == 'bert_doc' else feat['name']
        for feat in x['features']
    ]
    return [
        sizes_by_feature[key][name]
        for key in lookup_keys
        if name in sizes_by_feature[key]
    ]

def format_param_grid_k(x, name='param_grid'):
    """Collect the feature-selection ``k`` entries of a parameter mapping.

    ``x[name]`` is searched for one key per feature in ``x['features']``,
    of the form ``features__<comb_name>__feature_selection__k``. Values of
    the keys that are present are returned as a list, in feature order.
    """
    params = x[name]
    wanted_keys = (
        '__'.join(('features', feat['comb_name'], 'feature_selection__k'))
        for feat in x['features']
    )
    return [params[key] for key in wanted_keys if key in params]

def format_param_grid_sentiment(x, name='param_grid'):
    """Collect the ``score_names`` entries of the sentiment features.

    Only features in ``x['features']`` whose name is ``'sentiment'`` are
    considered. For each of them the key
    ``features__<comb_name>__feature_extraction__score_names`` is looked up
    in ``x[name]``; values of the keys that are present are returned as a
    list, in feature order.
    """
    params = x[name]
    sentiment_keys = [
        '__'.join(('features', feat['comb_name'], 'feature_extraction__score_names'))
        for feat in x['features']
        if feat['name'] == 'sentiment'
    ]
    return [params[key] for key in sentiment_keys if key in params]


def get_param_grid_k_table(file_name):
    """Load pickled grid-search results and return the feature-selection summary.

    Parameters
    ----------
    file_name : str or path-like
        Path of a pickle file. The unpickled object is used as a pandas
        DataFrame and must provide the columns read by the ``format_*``
        helpers (``features``, ``best_params``, ``feature_dimensions``,
        ``param_grid``) plus ``train_domain``, ``test_domain``,
        ``model_name`` and ``macro avg f1-score``.

    Returns
    -------
    pandas.DataFrame
        The rows whose formatted feature label is not exactly ``'sentiment'``,
        restricted to the summary columns listed at the end of this function.
    """
    # SECURITY: pickle.load can execute arbitrary code while deserialising.
    # Only point this at result files produced by your own experiments.
    with open(file_name, 'rb') as f:
        df = pickle.load(f)

    # Materialise the rows once; the helpers only read columns that exist
    # in the loaded frame, so the derived columns added below do not matter.
    rows = [row for _, row in df.iterrows()]

    df['features_name'] = [format_feature_names(row) for row in rows]
    df['dimension'] = [format_feature_dimensions(row, 'dimension') for row in rows]
    df['reduced_dimension'] = [format_feature_dimensions(row, 'reduced') for row in rows]
    df['param_grid_k'] = [format_param_grid_k(row, 'param_grid') for row in rows]
    # NOTE: 'best_params_k' is computed but is not among the returned columns.
    df['best_params_k'] = [format_param_grid_k(row, 'best_params') for row in rows]

    summary_columns = [
        'train_domain', 'test_domain',
        'model_name', 'features_name', 'dimension', 'reduced_dimension',
        'param_grid_k', 'macro avg f1-score',
    ]
    # Single .loc selection instead of chained df[mask][cols] indexing.
    return df.loc[df['features_name'] != 'sentiment', summary_columns]

### Table 9: Reduction of the size of the feature space when the Logit model is trained on the BHOCS data.

In [2]:
# Summarise the grid search stored in results_select_k_bhocs.pkl;
# the returned DataFrame is rendered as this cell's output.
file_name='../experiments/1_grid_search_k/results_select_k_bhocs.pkl'
get_param_grid_k_table(file_name)

Unnamed: 0,train_domain,test_domain,model_name,features_name,dimension,reduced_dimension,param_grid_k,macro avg f1-score
1,BHOCS,BHOCS,logistic_regression,"ngram words (1, 1)",[3429],[2500],"[[500, 1000, 2500]]",0.73065
2,BHOCS,BHOCS,logistic_regression,"ngram typed dependency (1, 1)",[17601],[13000],"[[1000, 5000, 13000]]",0.686027
3,BHOCS,BHOCS,logistic_regression,"ngram words (1, 4)",[33655],[25000],"[[5000, 10000, 25000]]",0.738413
4,BHOCS,BHOCS,logistic_regression,"ngram typed dependency (1, 4)",[77613],[50000],"[[5000, 25000, 50000]]",0.683382
5,BHOCS,BHOCS,logistic_regression,bert document emb.,[768],[600],"[[100, 350, 600]]",0.711299


In [4]:
# Summarise the grid search stored in results_select_k_bho_bhom.pkl;
# the returned DataFrame is rendered as this cell's output.
file_name='../experiments/1_grid_search_k/results_select_k_bho_bhom.pkl'
get_param_grid_k_table(file_name)

Unnamed: 0,train_domain,test_domain,model_name,features_name,dimension,reduced_dimension,param_grid_k,macro avg f1-score
0,BHO,BHO,logistic_regression,"ngram words (1, 1)",[1974],[1500],"[[500, 1000, 1500]]",0.836748
1,BHO,BHO,logistic_regression,"ngram typed dependency (1, 1)",[7612],[5000],"[[1000, 2500, 5000]]",0.779743
2,BHO,BHO,logistic_regression,"ngram words (1, 4)",[15486],[10000],"[[5000, 7500, 10000]]",0.833079
3,BHO,BHO,logistic_regression,"ngram typed dependency (1, 4)",[31425],[25000],"[[5000, 15000, 25000]]",0.775962
4,BHO,BHO,logistic_regression,bert document emb.,[768],[350],"[[100, 350, 600]]",0.78362


In [5]:
# Summarise the grid search stored in results_select_k_c.pkl;
# the returned DataFrame is rendered as this cell's output.
file_name='../experiments/1_grid_search_k/results_select_k_c.pkl'
get_param_grid_k_table(file_name)

Unnamed: 0,train_domain,test_domain,model_name,features_name,dimension,reduced_dimension,param_grid_k,macro avg f1-score
0,C,C,logistic_regression,"ngram words (1, 1)",[2001],[500],"[[500, 1000, 1500]]",0.743243
1,C,C,logistic_regression,"ngram typed dependency (1, 1)",[7814],[1000],"[[1000, 2500, 5000]]",0.584168
2,C,C,logistic_regression,"ngram words (1, 4)",[14367],[7500],"[[5000, 7500, 10000]]",0.707973
3,C,C,logistic_regression,"ngram typed dependency (1, 4)",[31130],[23000],"[[5000, 15000, 23000]]",0.609799
4,C,C,logistic_regression,bert document emb.,[768],[600],"[[100, 350, 600]]",0.687415


In [6]:
# Summarise the grid search stored in results_select_k_cm.pkl;
# the returned DataFrame is rendered as this cell's output.
file_name='../experiments/1_grid_search_k/results_select_k_cm.pkl'
get_param_grid_k_table(file_name)

Unnamed: 0,train_domain,test_domain,model_name,features_name,dimension,reduced_dimension,param_grid_k,macro avg f1-score
0,CM,CM,logistic_regression,"ngram words (1, 1)",[1634],[500],"[[500, 1000, 1300]]",0.85178
1,CM,CM,logistic_regression,"ngram typed dependency (1, 1)",[6550],[1000],"[[1000, 2500, 5000]]",0.601918
2,CM,CM,logistic_regression,"ngram words (1, 4)",[12500],[5000],"[[5000, 7500, 10000]]",0.816359
3,CM,CM,logistic_regression,"ngram typed dependency (1, 4)",[27071],[5000],"[[5000, 15000, 20000]]",0.616213
4,CM,CM,logistic_regression,bert document emb.,[768],[350],"[[100, 350, 600]]",0.756568


In [7]:
# Summarise the grid search stored in results_select_k_bhocsm.pkl;
# the returned DataFrame is rendered as this cell's output.
file_name='../experiments/1_grid_search_k/results_select_k_bhocsm.pkl'
get_param_grid_k_table(file_name)

Unnamed: 0,train_domain,test_domain,model_name,features_name,dimension,reduced_dimension,param_grid_k,macro avg f1-score
1,BHOCSM,BHOCSM,logistic_regression,"ngram words (1, 1)",[2911],[500],"[[500, 1000, 2500]]",0.810764
2,BHOCSM,BHOCSM,logistic_regression,"ngram typed dependency (1, 1)",[15353],[1000],"[[1000, 5000, 12000]]",0.676786
3,BHOCSM,BHOCSM,logistic_regression,"ngram words (1, 4)",[30051],[5000],"[[5000, 10000, 22000]]",0.805776
4,BHOCSM,BHOCSM,logistic_regression,"ngram typed dependency (1, 4)",[67571],[5000],"[[5000, 25000, 50000]]",0.693869
5,BHOCSM,BHOCSM,logistic_regression,bert document emb.,[768],[600],"[[100, 350, 600]]",0.745699
