In [1]:
import csv
import numpy as np
import pandas as pd
import pickle as pkl


from collections import Counter
from matplotlib import pyplot as plt
from scipy import stats
from sklearn.metrics import roc_auc_score

In [2]:
def get_one_letter_diff(word):
    violations = []
    letters = ["A", "D", "N"]
    for letter_1 in letters:
        for letter_2 in letters:
            if letter_1 != letter_2:
                violations.append(word.replace(letter_1, letter_2))                
    return violations   


def get_lower_bound(logprobs):
    return (int(min(logprobs))-1)    


def get_upper_bound(logprobs):
    return (int(max(logprobs))+1)


def pickle_load(path):
    with open(path, "rb") as fp:
        loaded_file = pkl.load(fp)
    return loaded_file

In [3]:
def plot_hist(logprobs: list, bins_number: int, x_label: str, y_label: str, plot_title: str):
    
    upper_bound = get_upper_bound(logprobs)
    lower_bound = get_lower_bound(logprobs)

    bins = np.linspace(lower_bound, upper_bound, bins_number)

    plt.hist(logprobs, bins, alpha=0.5, color="orange")

    plt.legend(loc='upper right', prop={'size': 9})
    plt.xlabel(x_label, fontsize = 12, fontname = "Times")
    plt.ylabel(y_label, fontsize = 12, fontname = "Times")
    plt.title(plot_title)
    plt.show()

In [84]:
LSTM_probs = pickle_load('results/LSTM_dataset2020_probs.pkl')
LSTM_logprobs = np.log(LSTM_probs)

pd.options.display.max_colwidth= 100

data = pd.read_csv("data/dataset.csv", index_col=0)
data = data.reset_index(0)
data['LSTM_logprobs'] = LSTM_logprobs
data.head()

Unnamed: 0,index,animated,cases,grammatical,positions,sentence,template,LSTM_logprobs
0,0,False,NDA,True,123,"Es steht fest, dass der Polizist dem Fahrer den Führerschein abnimmt.",0,-140.672142
1,1,False,NAD,True,123,"Es steht fest, dass der Polizist den Fahrer dem Führerschein abnimmt.",0,-140.706445
2,2,False,DNA,True,123,"Es steht fest, dass dem Polizisten der Fahrer den Führerschein abnimmt.",0,-140.685578
3,3,False,DAN,True,123,"Es steht fest, dass dem Polizisten den Fahrer der Führerschein abnimmt.",0,-140.672084
4,4,False,AND,True,123,"Es steht fest, dass den Polizisten der Fahrer dem Führerschein abnimmt.",0,-140.684289


In [14]:
data[data['index']==6025]

Unnamed: 0,index,animated,cases,grammatical,positions,sentence,template,LSTM_logprobs,1vs6
6025,6025,True,DDA,False,312,"Wir machen uns Sorgen, weil dem Assistenten dem Manager den Controller schickt.",41,-151.383515,


In [15]:
LSTM_logprobs[6025]

-151.38351535797119

In [6]:
is_gram = data['grammatical']==True
is_ungram = data['grammatical']==False
is_nom = data['cases'].isin(['NNA','NND','NAN','NDN','ANN','DNN'])
is_acc = data['cases'].isin(['AAN','AAD','ANA','ADA','NAA','DAA'])
is_dat = data['cases'].isin(['DDN','DDA','DND','DAD','NDD','ADD'])
is_NDA = data['cases']=='NDA'
is_NAD = data['cases']=='NAD'
is_AND = data['cases']=='AND'
is_ADN = data['cases']=='ADN'
is_DNA = data['cases']=='DNA'
is_DAN = data['cases']=='DAN'
is_animate = data['animated']==True
is_inanimate = data['animated']==False
is_123 = data['positions']==123
is_132 = data['positions']==132
is_213 = data['positions']==213
is_231 = data['positions']==231
is_312 = data['positions']==312
is_321 = data['positions']==321

In [49]:
for i in range(50):
    logprobs = data[data['template']==i].LSTM_logprobs 
    plot_hist(logprobs, 100, 'logprobs', 'sentences', f'LSTM predictions for template {i}')

<class 'pandas.core.series.Series'> 144


In [7]:
print('nom',round(data[is_nom]["LSTM_logprobs"].mean(),2),
      'acc',round(data[is_acc]["LSTM_logprobs"].mean(),2),
      'dat',round(data[is_dat]["LSTM_logprobs"].mean(),2))

nom -147.43 acc -147.42 dat -147.43


In [81]:
# AUC per template
aucs = []
nom_aucs = []
acc_aucs = []
dat_aucs = []

for i in range(50):

    is_temp = data["template"]==i
    gram_resp = is_temp & is_gram
    ungram_resp = is_temp & is_ungram
    nom_resp = is_temp & is_nom
    acc_resp = is_temp & is_acc
    dat_resp = is_temp & is_dat
   
    labels = [1]*len(data[gram_resp]) + [0]*len(data[ungram_resp])
    nom_labels = [1]*len(data[gram_resp]) + [0]*len(data[nom_resp])
    acc_labels = [1]*len(data[gram_resp]) + [0]*len(data[acc_resp])
    dat_labels = [1]*len(data[gram_resp]) + [0]*len(data[dat_resp])

    scores = pd.concat([data[gram_resp]["LSTM_logprobs"], data[ungram_resp]["LSTM_logprobs"]])
    nom_scores = pd.concat([data[gram_resp]["LSTM_logprobs"], data[nom_resp]["LSTM_logprobs"]])
    acc_scores = pd.concat([data[gram_resp]["LSTM_logprobs"], data[acc_resp]["LSTM_logprobs"]])
    dat_scores = pd.concat([data[gram_resp]["LSTM_logprobs"], data[dat_resp]["LSTM_logprobs"]])

    auc = roc_auc_score(labels, scores)
    nom_auc = roc_auc_score(nom_labels, nom_scores)
    acc_auc = roc_auc_score(acc_labels, acc_scores)
    dat_auc = roc_auc_score(dat_labels, dat_scores)
        
    aucs.append(auc)
    nom_aucs.append(nom_auc)
    acc_aucs.append(acc_auc)
    dat_aucs.append(dat_auc)  
    
    #print('template ',i,'AUC: %.3f' % auc)
    
print(f"auc over templates: {round(np.mean(aucs),2)}, previous:0.56")
print(f"auc nom over templates: {round(np.mean(nom_aucs),2)}, previous:0.42")
print(f"auc acc over templates: {round(np.mean(acc_aucs),2)}, previous:0.55")
print(f"dat dat over templates: {round(np.mean(dat_aucs),2)}, previous:0.71")

auc over templates: 0.48, previous:0.56
auc nom over templates: 0.58, previous:0.42
auc acc over templates: 0.37, previous:0.55
dat dat over templates: 0.49, previous:0.71


In [9]:
# 1 vs 6 analysis
for index, row in data.iterrows():
    if row.grammatical == True:
        is_templ = data['template'] == row.template
        is_pos = data['positions'] == row.positions
        violations = get_one_letter_diff(row.cases)

        scores = [row['LSTM_logprobs'],*((data[is_templ & is_pos & (data['cases'] == violation)]['LSTM_logprobs'])
                             for violation in violations)]

        labels = [1] + [0]*(len(scores)-1)
        auc = roc_auc_score(labels, scores)
        data.loc[index,'1vs6'] = auc

In [10]:
gb = data[is_gram].groupby(["cases", "positions"])["1vs6"].mean().round(2).reset_index().pivot(index="cases",
                                                                                 columns="positions",
                                                                                 values="1vs6")
gb.index.name = None
gb.columns.name = "Case\Pos"
print('AUC 1 vs 6')
gb

AUC 1 vs 6


Case\Pos,123,132,213,231,312,321
ADN,0.51,0.51,0.47,0.51,0.44,0.45
AND,0.51,0.49,0.53,0.5,0.47,0.44
DAN,0.68,0.59,0.69,0.59,0.65,0.64
DNA,0.38,0.55,0.36,0.54,0.39,0.39
NAD,0.27,0.2,0.26,0.2,0.31,0.3
NDA,0.43,0.47,0.46,0.5,0.59,0.56


In [55]:
gramdata = pd.read_csv("data/grammatical_sentences.csv", sep='.\n')
ungramdata = pd.read_csv("data/ungrammatical_sentences.csv", sep='.\n')

  if __name__ == '__main__':
  from ipykernel import kernelapp as app


In [87]:
gram_temps = []
for i in range(50):
    gram_temps.extend([i]*36)
gramdata['template'] = gram_temps
gramdata['grammatical'] = True

ungram_temps = []
for i in range(50):
    ungram_temps.extend([i]*108)
ungramdata['template'] = ungram_temps
ungramdata['grammatical'] = False

old_dataset = pd.concat([gramdata, ungramdata],ignore_index=True)

old_dataset[['sentence']].to_csv('data/dataset2019.txt', index=False, header=False,
                             escapechar="\\", sep="\t", quoting=csv.QUOTE_NONE)
old_dataset.to_csv("data/dataset2019.csv")

In [93]:
LSTM_probs_2019 = pickle_load('results/LSTM_dataset2019_probs.pkl')
LSTM_logprobs_2019 = np.log(LSTM_probs_2019)

pd.options.display.max_colwidth= 100

data2019 = pd.read_csv("data/dataset2019.csv", index_col=0)
data2019 = data2019.reset_index(0)
data2019['LSTM_logprobs'] = LSTM_logprobs_2019

In [98]:
data2019[data2019['grammatical']==False]["LSTM_logprobs"].mean()

-140.67446241837962

In [101]:
# AUC per template
is_gram2019 = data2019['grammatical']==True
is_ungram2019 = data2019['grammatical']==False

aucs = []

for i in range(50):

    is_temp = data2019["template"]==i
    gram_resp = is_temp & is_gram2019
    ungram_resp = is_temp & is_ungram2019
   
    labels = [1]*len(data[gram_resp]) + [0]*len(data[ungram_resp])

    scores = pd.concat([data[gram_resp]["LSTM_logprobs"], data[ungram_resp]["LSTM_logprobs"]])

    auc = roc_auc_score(labels, scores)
    
    aucs.append(auc)
    
    #print('template ',i,'AUC: %.3f' % auc)
    
print(f"auc 2019 over templates: {round(np.mean(aucs),2)}, previous:0.56")

auc 2019 over templates: 0.49, previous:0.56


In [103]:
gram_probs_2019 = pickle_load('results/LSTM_results/LSTM_gram_probs.pkl')
ungram_probs_2019 = pickle_load('results/LSTM_results/LSTM_ungram_probs.pkl')


In [121]:
print(len(gram_probs_2019))
print(len(ungram_probs_2019))
print(np.log(gram_probs_2019))

1800
5400
[-60.76771688 -61.35128212 -69.50529194 ..., -74.88793373 -67.95974731
 -68.56414175]


In [116]:
print(LSTM_probs_2019)


[8.0047959561474573e-62, 8.1922019290461977e-62, 7.9119296097592606e-62, 7.8161889663862215e-62, 8.0342177703969829e-62, 7.9440702285579689e-62, 7.9441232611251956e-62, 8.1602946894931217e-62, 7.7888431020191681e-62, 7.7481846216090151e-62, 8.017952449598475e-62, 7.9987902771245013e-62, 7.7799642550970428e-62, 8.1922800563444836e-62, 7.7974495001242697e-62, 7.8416413401295153e-62, 8.1982120932203745e-62, 8.2376489552483356e-62, 8.0508462295306288e-62, 8.2583756369225752e-62, 8.0188624365566949e-62, 7.8781006372505045e-62, 8.0591349316080355e-62, 8.0898750830680861e-62, 8.1645682707883552e-62, 8.3073344018961538e-62, 7.8218039185386201e-62, 7.8174488105837748e-62, 8.1697166634580323e-62, 7.9545017241613606e-62, 8.3025110054190669e-62, 8.3106466634743981e-62, 8.048412707761938e-62, 7.9962275988063216e-62, 8.2926748973727121e-62, 8.0143364634766781e-62, 7.9680162787760232e-62, 7.9311559858385009e-62, 7.7902248362134056e-62, 7.8021580530957822e-62, 7.7459533910413337e-62, 7.578080205171644