## Setup environment

In [None]:
# Mount Google Drive at /content/drive (Colab-only API); the relative
# 'drive/MyDrive/...' paths used by the rest of the notebook resolve against it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!git clone -b legacy https://github.com/OpenNMT/OpenNMT-py
!pip install OpenNMT-py
import os
outdir = 'drive/MyDrive/Rule_Learning_Thesis/output'
if not os.path.exists(outdir):
    os.makedirs(outdir)

!pip install scipy
!pip install torch==1.6.0+cu101 torchvision==0.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html

Cloning into 'OpenNMT-py'...
remote: Enumerating objects: 17355, done.[K
remote: Counting objects: 100% (311/311), done.[K
remote: Compressing objects: 100% (218/218), done.[K
remote: Total 17355 (delta 189), reused 156 (delta 90), pack-reused 17044[K
Receiving objects: 100% (17355/17355), 273.55 MiB | 36.68 MiB/s, done.
Resolving deltas: 100% (12489/12489), done.
Collecting OpenNMT-py
[?25l  Downloading https://files.pythonhosted.org/packages/e9/23/c565e03ddffb57db1b79bd9a97c8f56895eea094d9314ba5b12ce1282593/OpenNMT_py-2.1.2-py3-none-any.whl (212kB)
[K     |████████████████████████████████| 215kB 18.6MB/s 
[?25hCollecting torch==1.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/5d/5e/35140615fc1f925023f489e71086a9ecc188053d263d3594237281284d82/torch-1.6.0-cp37-cp37m-manylinux1_x86_64.whl (748.8MB)
[K     |████████████████████████████████| 748.8MB 23kB/s 
[?25hCollecting pyonmttok<2,>=1.23; platform_system == "Linux" or platform_system == "Darwin"
[?25l  Downl

# German

## Generate Datasets

In [None]:
# Run the German data-generation script from inside its own folder
# (the `cd` only lasts for this one shell line).
!cd drive/MyDrive/Rule_Learning_Thesis/German; python generateGermanRNNData.py

Token freq data type count: 1486


## Parameter Setup

In [None]:
# RNN setup
# The step count is the number of batches in `epochs` passes over
# `n_examples` items; it is kept as a string because it is spliced into
# command-line arguments.
epochs, n_examples, batchsize = 100, 4000, 20
steps = str(int(epochs * n_examples / batchsize))

data_type = 'celex'
# data_type = 'celex_token_freq'

datadir = f'drive/MyDrive/Rule_Learning_Thesis/German/{data_type}'


def _translate_args(model_file, src_file, out_file):
  """Return the translate.py argument string shared by all German decoding runs.

  Only the checkpoint, the source file and the output file differ between
  runs; the unknown-token replacement, verbosity and beam size are fixed.
  """
  return ' '.join([
    f'-model {model_file}',
    f'-src {src_file}',
    f'-output {out_file}',
    '-replace_unk -verbose',
    '-beam_size 12',
  ])


rnn_modelpath = f'{outdir}/german_rnn_model_{data_type}'
rnn_train_args = ' '.join([
  f'-data {datadir}/processed',
  f'-save_model {rnn_modelpath}',
  '-enc_layers 2',
  '-dec_layers 2',
  '-rnn_size 100',
  # Tied to `batchsize` so the step computation above cannot drift from the
  # batch size actually used for training.
  f'-batch_size {batchsize}',
  '-word_vec_size 300',
  '-gpu_ranks 0',
  f'-train_steps {steps}',
  # Only one checkpoint is requested: the one at the final step.
  f'-save_checkpoint_steps {steps}',
])

# The three RNN decoding runs: held-out test set, wug items, gendered wug items.
rnn_trans_args = _translate_args(
  f'{rnn_modelpath}_step_{steps}.pt',
  f'{datadir}/german-src-test.txt',
  f'{outdir}/german-rnn-{data_type}-pred.txt')

rnn_trans_args_wug = _translate_args(
  f'{rnn_modelpath}_step_{steps}.pt',
  f'{datadir}/german-wug.txt',
  f'{outdir}/german-rnn-{data_type}-pred-wug.txt')

rnn_trans_args_wug_gendered = _translate_args(
  f'{rnn_modelpath}_step_{steps}.pt',
  f'{datadir}/german-wug-gendered.txt',
  f'{outdir}/german-rnn-{data_type}-pred-wug-gendered.txt')

# Transformer setup
transformer_modelpath = f'{outdir}/german_transformer_model_{data_type}'
# From here on `steps` is the transformer's step budget (an int, not a string).
steps = 20000
# split() + join() collapses the multi-line literal into one space-separated
# argument string.
transformer_train_args = ' '.join(f'''-data {datadir}/processed -gpu_ranks 0
-save_model {transformer_modelpath} -layers 2 -rnn_size 312
-word_vec_size 312 -transformer_ff 512 -heads 8
-encoder_type transformer -decoder_type transformer -position_encoding
-train_steps {steps} -max_generator_batches 2 -dropout 0.1
-batch_size 20 -batch_type tokens -normalization tokens
-accum_count 2 -optim adam -adam_beta2 0.998
-decay_method noam -warmup_steps 1000 -learning_rate 0.05 -max_grad_norm 0
-param_init 0 -param_init_glorot -label_smoothing 0.1
-valid_steps 1000 -save_checkpoint_steps 5000 -world_size 1'''.split())

# The same three decoding runs for the transformer checkpoint.
transformer_trans_args = _translate_args(
  f'{transformer_modelpath}_step_{steps}.pt',
  f'{datadir}/german-src-test.txt',
  f'{outdir}/german-transformer-{data_type}-pred.txt')

transformer_trans_args_wug = _translate_args(
  f'{transformer_modelpath}_step_{steps}.pt',
  f'{datadir}/german-wug.txt',
  f'{outdir}/german-transformer-{data_type}-pred-wug.txt')

transformer_trans_args_wug_gendered = _translate_args(
  f'{transformer_modelpath}_step_{steps}.pt',
  f'{datadir}/german-wug-gendered.txt',
  f'{outdir}/german-transformer-{data_type}-pred-wug-gendered.txt')

## Preprocess German data

In [None]:
# Run OpenNMT's preprocess script on the German train/validation pairs.
# IPython expands $datadir from the parameter cell; results are saved under the
# `$datadir/processed` prefix that the training arguments point at.
!python OpenNMT-py/preprocess.py -train_src $datadir/german-src-train.txt -train_tgt $datadir/german-tgt-train.txt -valid_src $datadir/german-src-val.txt -valid_tgt $datadir/german-tgt-val.txt -save_data $datadir/processed


[2021-02-02 23:39:21,960 INFO] Extracting features...
[2021-02-02 23:39:21,962 INFO]  * number of source features: 0.
[2021-02-02 23:39:21,962 INFO]  * number of target features: 0.
[2021-02-02 23:39:21,962 INFO] Building `Fields` object...
[2021-02-02 23:39:21,962 INFO] Building & saving training data...
[2021-02-02 23:39:21,976 INFO] Building shard 0.
[2021-02-02 23:39:22,081 INFO]  * saving 0th train data shard to drive/MyDrive/Rule_Learning_Thesis/German/celex/processed.train.0.pt.
[2021-02-02 23:39:22,275 INFO]  * tgt vocab size: 41.
[2021-02-02 23:39:22,275 INFO]  * src vocab size: 43.
[2021-02-02 23:39:22,281 INFO] Building & saving validation data...
[2021-02-02 23:39:22,294 INFO] Building shard 0.
[2021-02-02 23:39:22,302 INFO]  * saving 0th valid data shard to drive/MyDrive/Rule_Learning_Thesis/German/celex/processed.valid.0.pt.


## German RNN

In [None]:
# Train the German RNN using the argument string built in the parameter cell.
!python OpenNMT-py/train.py $rnn_train_args

[2021-02-02 23:39:23,242 INFO]  * src vocab size = 43
[2021-02-02 23:39:23,242 INFO]  * tgt vocab size = 41
[2021-02-02 23:39:23,242 INFO] Building model...
[2021-02-02 23:39:27,402 INFO] NMTModel(
  (encoder): RNNEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(43, 300, padding_idx=1)
        )
      )
    )
    (rnn): LSTM(300, 100, num_layers=2, dropout=0.3)
  )
  (decoder): InputFeedRNNDecoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(41, 300, padding_idx=1)
        )
      )
    )
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): StackedLSTM(
      (dropout): Dropout(p=0.3, inplace=False)
      (layers): ModuleList(
        (0): LSTMCell(400, 100)
        (1): LSTMCell(100, 100)
      )
    )
    (attn): GlobalAttention(
      (linear_in): Linear(in_features=100, out_features=100, bias=False)
      (linear_out)

In [None]:
# Decode with the RNN checkpoint three times: the test sources, the wug file,
# and the gendered wug file (see the rnn_trans_args* strings in the parameter cell).
!python OpenNMT-py/translate.py $rnn_trans_args
!python OpenNMT-py/translate.py $rnn_trans_args_wug
!python OpenNMT-py/translate.py $rnn_trans_args_wug_gendered

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
PRED SCORE: -2.2013

[2021-02-02 23:48:47,434 INFO] 
SENT 2: ['b', 'r', 'i', ':', 'g', 'a', 'd', 'i', ':', 'e', ':']
PRED 2: a d i : g a d i : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e : e
PRED SCORE: -0.1683

[2021-02-02 23:48:47,434 INFO] 
SENT 3: ['t', 'O', 'n', 'a', ':', 'Z', '@']
PRED 3: t O n a n a : Z @ n
PRED SCORE: -0.2810

[2021-02-02 23:48:47,435 INFO] 
SENT 4: ['k', 'r', 'O', 'y', 't', 's', 't', 's', 'u', ':', 'k']
PRED 4: s t s O y t r O y t s t s u : k k r O : g @ k r
PRED SCORE: -2.5384

[2021-02-02 23:48:47,435 INFO] 
SENT 5: ['t', 'o', ':', 'r']
PRED 5: t o : t o : r t o : r t o : r t o : r t o : r t o : r t o : r t o : r t o : r t o : r t o : r t o : r t o : r t o : r t o : r t o : r t o : r t o : r t o : r t o : r t o : r t o : r t o : r t o : r t
PRED SCORE: -0.0019

[

## German Transformer

In [None]:
# Train the German transformer using the argument string built in the parameter cell.
!python OpenNMT-py/train.py $transformer_train_args

[2021-02-02 23:49:05,406 INFO]  * src vocab size = 43
[2021-02-02 23:49:05,406 INFO]  * tgt vocab size = 41
[2021-02-02 23:49:05,406 INFO] Building model...
[2021-02-02 23:49:09,744 INFO] NMTModel(
  (encoder): TransformerEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(43, 312, padding_idx=1)
        )
        (pe): PositionalEncoding(
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (transformer): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linear_keys): Linear(in_features=312, out_features=312, bias=True)
          (linear_values): Linear(in_features=312, out_features=312, bias=True)
          (linear_query): Linear(in_features=312, out_features=312, bias=True)
          (softmax): Softmax(dim=-1)
          (dropout): Dropout(p=0.1, inplace=False)
          (final_linear): Linear(in_features=312, out_features=312,

In [None]:
# Decode with the transformer checkpoint three times: the test sources, the wug
# file, and the gendered wug file (see the transformer_trans_args* strings).
!python OpenNMT-py/translate.py $transformer_trans_args
!python OpenNMT-py/translate.py $transformer_trans_args_wug
!python OpenNMT-py/translate.py $transformer_trans_args_wug_gendered

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
PRED SCORE: -4.2707

[2021-02-03 00:07:34,858 INFO] 
SENT 2: ['b', 'r', 'i', ':', 'g', 'a', 'd', 'i', ':', 'e', ':']
PRED 2: r i : g a d i : e : b i : r @ n
PRED SCORE: -6.2172

[2021-02-03 00:07:34,858 INFO] 
SENT 3: ['t', 'O', 'n', 'a', ':', 'Z', '@']
PRED 3: n a : Z @ n
PRED SCORE: -3.5034

[2021-02-03 00:07:34,858 INFO] 
SENT 4: ['k', 'r', 'O', 'y', 't', 's', 't', 's', 'u', ':', 'k']
PRED 4: r O y t s t s u : k @
PRED SCORE: -4.4751

[2021-02-03 00:07:34,858 INFO] 
SENT 5: ['t', 'o', ':', 'r']
PRED 5: t r o : r o : t o : t @
PRED SCORE: -4.4189

[2021-02-03 00:07:34,858 INFO] 
SENT 6: ['b', 'I', 'r', 'k', '@']
PRED 6: I r k @ n
PRED SCORE: -1.3158

[2021-02-03 00:07:34,859 INFO] 
SENT 7: ['g', 'a', 'i', 'g', '@']
PRED 7: a i g @ n
PRED SCORE: -3.9143

[2021-02-03 00:07:34,859 INFO] 
SENT 8: ['g', 'U', 'n', 's', 't']
PRED 8: Y n s t @
PRED SCORE: -3.7269

[2021-02-03 00:07:34,859 INFO] 
SENT 9: ['r', 'u', ':', 'r']
PRE

## Evaluate German

In [None]:
from collections import Counter, defaultdict
from scipy.stats.stats import spearmanr   
def get_pearson(f1, f2):
  """Compare the final characters of two parallel lists of strings.

  Despite the name, the correlation computed here is Spearman's rank
  correlation (scipy's `spearmanr`), taken over the code points of each
  string's last character.

  Args:
    f1, f2: equally long sequences of strings (e.g. two models' predictions).

  Returns:
    A pair `(spearmanr_result, agreement)` where `agreement` is the fraction
    of positions at which both strings end in the same character.
  """
  # An empty string has no last character; fall back to '-', the placeholder
  # the evaluation code already uses for missing predictions, instead of
  # raising IndexError.
  a = [ord(s[-1]) if s else ord('-') for s in f1]
  b = [ord(s[-1]) if s else ord('-') for s in f2]
  return spearmanr(a,b), sum(1 for x,y in zip(a,b) if x == y) / len(a)

# print('Data Type:', data_type)

def analyze_by_inflection(preds):
  """Print and return the share of predictions ending in each tracked symbol.

  The final character of every prediction is taken as its inflection;
  predictions of length 0 or 1 are counted as '-'.

  Args:
    preds: iterable of prediction strings (a generator is fine, it is
      consumed exactly once).

  Returns:
    `(s_rate, e_rate, r_rate, n_rate, other_rate, other_most_common)` where
    the rates are fractions of all predictions ending in 's', '@', 'r', 'n'
    or anything else, and `other_most_common` lists up to five
    `(symbol, count)` pairs among the "anything else" endings. All rates are
    0.0 when `preds` is empty.
  """
  inflections = [p[-1] if len(p) > 1 else '-' for p in preds]
  total = len(inflections)
  counts = Counter(inflections)

  def rate(count):
    # Guard against an empty prediction list instead of dividing by zero.
    return count/total if total else 0.0

  s_r = rate(counts['s'])
  print('-s', s_r)
  e_r = rate(counts['@'])
  print('-e', e_r)
  r_r = rate(counts['r'])
  print('-r', r_r)
  n_r = rate(counts['n'])
  print('-n', n_r)
  # Kept as an ordered list so ties in most_common() resolve by first occurrence.
  other = [t for t in inflections if t not in ('n','@','r','s')]
  o_r = rate(len(other))
  other_most_common = Counter(other).most_common(5)
  print('Other:', o_r, other_most_common)
  return s_r, e_r, r_r, n_r, o_r, other_most_common


def frequency_test(train_pairs, srclines, predlines, c=1):
  """Measure how often a prediction uses the training set's most frequent ending.

  For every source item, its last `c` characters are looked up among the
  training sources; the most frequent last-`c`-characters of the matching
  training targets is the "popular" inflection. The item counts as a match
  when the prediction ends in that popular inflection.

  Args:
    train_pairs: iterable of `(source, target)` strings without spaces.
    srclines: test sources, symbols separated by spaces.
    predlines: predictions aligned with `srclines`, symbols separated by
      spaces; predictions of length 0 or 1 are treated as '-'.
    c: number of trailing characters compared.

  Returns:
    Matches divided by `len(srclines)` (items whose ending never occurs in
    training still count in the denominator); 0.0 when `srclines` is empty.
  """
  # Tally target endings per source ending once, rather than rescanning the
  # whole training set for every test item.
  endings_seen = {}
  for s, t in train_pairs:
    endings_seen.setdefault(s[-c:], Counter())[t[-c:]] += 1

  nouns_with_inflections_matching_train = []
  for i,noun in enumerate(srclines):
    noun = noun.replace(' ','')
    predicted_inflection = predlines[i].replace(' ','')[-c:] if len(predlines[i]) > 1 else '-'
    inflections = endings_seen.get(noun[-c:])
    # No training item shares this ending: nothing to compare against.
    if not inflections: continue
    popular_inflection = inflections.most_common(1)[0][0]
    if predicted_inflection == popular_inflection:
      nouns_with_inflections_matching_train.append((noun, predicted_inflection, popular_inflection))

  # Avoid ZeroDivisionError on an empty test set.
  result = len(nouns_with_inflections_matching_train)/len(srclines) if srclines else 0.0
  print(f'% of inflections (len {c}) that match most popular in training:', result)
  return result

def _read_lines(path):
  """Return the lines of a text file without line endings, closing the file afterwards."""
  with open(path, 'r') as handle:
    return handle.read().splitlines()


results = []
# for data_type in ['celex', 'celex_token_freq']:
for data_type in ['celex']:
  print(f'\nData type: {data_type}')
  datadir = f'drive/MyDrive/Rule_Learning_Thesis/German/{data_type}'

  train_tgt_lines = _read_lines(f'{datadir}/german-tgt-train.txt')
  train_src_lines = _read_lines(f'{datadir}/german-src-train.txt')
  # Training pairs with the symbol-separating spaces removed.
  train_pairs = list(zip([t.replace(' ','') for t in train_src_lines],
                         [t.replace(' ','') for t in train_tgt_lines]))

  # cor_lists[condition][subset][model] -> predictions, compared across models below.
  cor_lists = defaultdict(lambda: defaultdict(defaultdict))

  for model in ['rnn','transformer']:
    srclines = _read_lines(f'{datadir}/german-src-test.txt')
    # NOTE(review): for the two wug conditions the "target" file is the wug
    # source file itself, while srclines (and therefore the gender split)
    # always come from german-src-test.txt - confirm this pairing is intended.
    conditions = [
      ('regular_test_wug_with_gender', f'{datadir}/german-tgt-test.txt', f'{outdir}/german-{model}-{data_type}-pred.txt'),
      ('wug-no-gender', f'{datadir}/german-wug.txt', f'{outdir}/german-{model}-{data_type}-pred-wug.txt'),
      ('wug-all-neutral', f'{datadir}/german-wug-gendered.txt', f'{outdir}/german-{model}-{data_type}-pred-wug-gendered.txt'),
    ]
    for condition, tgt_path, pred_path in conditions:
      tgtlines = _read_lines(tgt_path)
      predlines = _read_lines(pred_path)
      print(f'\n\n{model} {condition} results')
      # Predictions of length 0 or 1 are replaced by the '-' placeholder.
      predlines = [p if len(p) > 1 else '-' for p in predlines]
      tups = list(zip(srclines,tgtlines,predlines))

      # Split the items by the gender tag that opens each source line.
      by_gender = {'MAS': [], 'FEM': [], 'NTR': []}
      for src,tgt,pred in tups:
        src,tgt,pred = src.strip(), tgt.strip(), pred.strip()
        gender = src.split()[0]
        if gender in by_gender:
          by_gender[gender].append((src,tgt,pred))

      # Accuracy compares only the final character of target and prediction.
      accuracy = sum([t[1][-1]==t[2][-1] for t in tups])/len(tups)
      freq = frequency_test(train_pairs, srclines, predlines)
      print('Test accuracy:', accuracy)
      ans = analyze_by_inflection(predlines)
      results.append(('ger_'+data_type, model.upper(), condition, 'All', accuracy, freq, *ans))
      cor_lists[condition]['all'][model] = predlines

      # Same report for each gender subset, in the order M, F, N.
      for label, tag in [('M', 'MAS'), ('F', 'FEM'), ('N', 'NTR')]:
        subset = by_gender[tag]
        if not subset:
          # Nothing to score; avoids a division by zero on an empty subset.
          print(f'{label}: no test items, skipping')
          continue
        subset_acc = sum([t[1][-1]==t[2][-1] for t in subset])/len(subset)
        l1,_, l2 = zip(*subset)
        freq = frequency_test(train_pairs, l1, l2)
        print(f'{label} Accuracy:', subset_acc)
        ans = analyze_by_inflection([t[2] for t in subset])
        results.append(('ger_'+data_type, model.upper(), condition, label, subset_acc, freq, *ans))
        cor_lists[condition][label][model] = [s[-1] for s in subset]

  print('\n\nCorrelations')
  for k,v in cor_lists.items():
    for k2, v2 in v.items():
      lists = list(v2.values())
      # A comparison needs predictions from both models.
      if len(lists) < 2: continue
      print(k, k2, get_pearson(lists[0],lists[1]))

print()
print('data_type, model.upper(), condition, test_subset, acc, freq, s, e, r, n, others, others_most_common')
for r in results:
  # Commas inside a field (e.g. the most-common list) are dropped to keep the row CSV-like.
  print(','.join(str(x).replace(',','') for x in r))




Data type: celex


rnn regular_test_wug_with_gender results
% of inflections (len 1) that match most popular in training: 0.75
Test accuracy: 0.838
-s 0.086
-e 0.346
-r 0.03
-n 0.536
Other: 0.002 [(':', 1)]
% of inflections (len 1) that match most popular in training: 0.6871165644171779
M Accuracy: 0.7361963190184049
-s 0.12883435582822086
-e 0.6871165644171779
-r 0.012269938650306749
-n 0.1656441717791411
Other: 0.006134969325153374 [(':', 1)]
% of inflections (len 1) that match most popular in training: 0.8218623481781376
F Accuracy: 0.9554655870445344
-s 0.020242914979757085
-e 0.032388663967611336
-r 0.0
-n 0.9473684210526315
Other: 0.0 []
% of inflections (len 1) that match most popular in training: 0.6666666666666666
N Accuracy: 0.7
-s 0.18888888888888888
-e 0.5888888888888889
-r 0.14444444444444443
-n 0.07777777777777778
Other: 0.0 []


rnn wug-no-gender results
% of inflections (len 1) that match most popular in training: 0.146
Test accuracy: 0.264
-s 0.102
-e 0.148
-r 0.056
-

<br>
<br>

# Russian

## Generate Datasets

In [None]:
# Dataset variants to build and evaluate. Each entry is '<verb class>_<seed>';
# the part after '_' is later parsed as the integer shuffle seed.
# Alternative selections kept for reference:
# datatypes = [f'{d}_{i}'for d in ['all-verbs', '1st-or-2nd-Conj','1st-Conj','dental','dental-2nd-Conj','2nd-Conj'] for i in range(5)]
# datatypes.extend([f'{d}_100'for d in ['all-verbs', '1st-or-2nd-Conj','1st-Conj','dental','dental-2nd-Conj','2nd-Conj']])
# datatypes = [f'{d}_{i}'for d in ['all-verbs'] for i in range(5)]
datatypes = [
  verb_class + '_100'
  for verb_class in ('all-verbs', '1st-or-2nd-Conj', '1st-Conj', 'dental-2nd-Conj', '2nd-Conj')
]

In [None]:
import numpy as np
import random
import operator
import math
import re

# Verbs left out of the defective list (the original note attributes them to a
# 2016 paper).
excluded_verbs = ['кучить', 'ладить', 'отчудить', 'переубедить','погалдеть','потчудить','предубедить','разубедить']

with open('drive/MyDrive/Rule_Learning_Thesis/Russian/defective-verbs.txt', 'r') as inp:
  lines = inp.readlines()

defectives = []
# Only every fourth line, starting with the second line of the file, is read;
# the verb is the first comma-separated field on it.
for line in lines[1::4]:
  verb = line.strip().split(',')[0]
  if verb in excluded_verbs:
    continue
  defectives.append(verb)

# 61 defective verbs
print(len(defectives))

# Stem-final consonants accepted by the 'dental*' verb classes.
_DENTALS = ('т', 'д', 'с', 'з')

# Verb class -> (regex the bare infinitive must match, whether the character
# captured before the theme vowel must be one of _DENTALS). A pattern of None
# accepts every verb.
_VERB_CLASS_RULES = {
  'dental-2nd-Conj': ('(.)[еи]ть(ся)?$', True),
  'dental': ('(.)[аеи]ть(ся)?$', True),
  '2nd-Conj': ('(.)[еи]ть(ся)?$', False),
  '1st-Conj': ('(.)[а]ть(ся)?$', False),
  '1st-or-2nd-Conj': ('(.)[аеи]ть(ся)?$', False),
  'all-verbs': (None, False),
}


def is_defective(k):
  """Return True when any defective verb occurs as a substring of `k`."""
  return any(d in k for d in defectives)


def _write_pairs(src_path, tgt_path, pairs):
  """Write parallel source/target files, one space-separated symbol sequence per line."""
  # OpenNMT decodes its input files as UTF-8, so do not rely on the locale default.
  with open(src_path, 'w', encoding='utf-8') as src, open(tgt_path, 'w', encoding='utf-8') as tgt:
    for inf, fsg in pairs:
      src.write(' '.join(inf)+'\n')
      tgt.write(' '.join(fsg)+'\n')


# The verb table does not depend on the dataset variant, so it is read and
# parsed once instead of once per datatype.
with open('drive/MyDrive/Rule_Learning_Thesis/Russian/verbs.csv', 'r', encoding='utf-8') as inp:
  verb_rows = inp.readlines()[1:]  # the first line is skipped (presumably a header)

parsed_verbs = []
for row in verb_rows:
  tokens = row.split('\t')
  # Columns used: 0 = bare infinitive, 1 = accented form, 12 = the form used as
  # the target (named `fsg` here; the original comments call it "1sg").
  bare, accented, fsg = tokens[0], tokens[1], tokens[12]

  # Split the target form into the prefix it shares with the accented
  # infinitive and the remaining inflection.
  j = 0
  while j < len(accented) and j < len(fsg) and accented[j] == fsg[j]: j += 1
  common, inflection = fsg[:j], fsg[j:]
  # Rows whose target adds nothing beyond the shared prefix are dropped.
  if inflection == '': continue
  parsed_verbs.append((bare, accented, fsg, common, inflection))

for datatype in datatypes:
  print(datatype)
  new_data_path = f'drive/MyDrive/Rule_Learning_Thesis/Russian/data/{datatype}/'
  os.makedirs(new_data_path, exist_ok=True)

  # bare infinitive -> (accented form, target form, common part, inflection)
  verb2sing = {}
  rule = _VERB_CLASS_RULES.get(datatype.split('_')[0])
  # An unrecognised verb class selects no verbs.
  if rule is not None:
    pattern, dental_only = rule
    for bare, accented, fsg, common, inflection in parsed_verbs:
      if pattern is not None:
        search = re.search(pattern, bare)
        if not search: continue
        if dental_only and search.group(1) not in _DENTALS: continue
      verb2sing[bare] = (accented, fsg, common, inflection)

  # verbs = [(k,v[1]) for k,v in verb2sing.items() if k not in defectives and k in unimorph_verbs]
  # Keep non-defective verbs as (infinitive, target) pairs; count the rest.
  verbs = []
  removed = 0
  for k, v in verb2sing.items():
    if is_defective(k):
      removed += 1
    else:
      verbs.append((k, v[1]))

  print(len(verbs))
  print('Removed:', removed)
  random.seed(1)
  random.shuffle(verbs)
  # if 'all-verbs' != datatype.split('_')[0]:
  #   size = 1200
  #   verbs = verbs[:size]
  # Second shuffle uses the seed encoded after '_' in the datatype name.
  random.seed(int(datatype.split('_')[1]))
  random.shuffle(verbs)

  # 80% train / 10% validation / 10% test.
  train_end = int(0.8*len(verbs))
  val_end = int(0.9*len(verbs))
  _write_pairs(f'{new_data_path}russian-src-train.txt', f'{new_data_path}russian-tgt-train.txt', verbs[0:train_end])
  _write_pairs(f'{new_data_path}russian-src-val.txt', f'{new_data_path}russian-tgt-val.txt', verbs[train_end:val_end])
  _write_pairs(f'{new_data_path}russian-src-test.txt', f'{new_data_path}russian-tgt-test.txt', verbs[val_end:])

  # The defective verbs themselves become an extra source-only decoding set.
  with open(f'{new_data_path}russian-defectives.txt','w', encoding='utf-8') as f:
    for d in defectives:
      f.write(' '.join(d)+'\n')



61
all-verbs_100
13380
Removed: 84
1st-or-2nd-Conj_100
11592
Removed: 84
1st-Conj_100
7493
Removed: 0
dental-2nd-Conj_100
1166
Removed: 83
2nd-Conj_100
4099
Removed: 84


## Preprocess Russian Data

In [None]:
# Run OpenNMT's preprocess script once per Russian dataset variant. IPython
# expands $datadir in the shell line from the variable assigned just above it.
for datatype in datatypes:
  datadir = f'drive/MyDrive/Rule_Learning_Thesis/Russian/data/{datatype}'
  !python OpenNMT-py/preprocess.py -train_src $datadir/russian-src-train.txt -train_tgt $datadir/russian-tgt-train.txt -valid_src $datadir/russian-src-val.txt -valid_tgt $datadir/russian-tgt-val.txt -save_data $datadir/processed

## Russian RNN

In [None]:
# For every Russian dataset variant: build the RNN train/translate argument
# strings and decode the test set and the defective-verb list.
for datatype in datatypes:
  # if 'all-verbs' not in datatype: continue
  print(datatype)
  epochs, batchsize,  = 100, 20
  # The training-set size is read from disk so the step count scales with each dataset.
  with open(f'drive/MyDrive/Rule_Learning_Thesis/Russian/data/{datatype}/russian-src-train.txt') as inp:
    n_examples = len(inp.readlines())
    print(n_examples)
  # Number of batches in `epochs` passes over the training data; kept as a
  # string because it is concatenated into the argument strings below.
  steps = str(int(epochs * n_examples / batchsize))
  datadir = f'drive/MyDrive/Rule_Learning_Thesis/Russian/data/{datatype}'

  rnn_modelpath = f'{outdir}/russian_{datatype}_rnn_model'
  # NOTE: the literal '-batch_size 20' duplicates `batchsize` above; keep the two in sync.
  rnn_train_args = ' '.join([
    f'-data {datadir}/processed',
    '-save_model '+rnn_modelpath,
    '-enc_layers 2',
    '-dec_layers 2',
    '-rnn_size 100',
    '-batch_size 20',
    '-word_vec_size 300',
    '-gpu_ranks 0',
    '-train_steps '+steps,
    '-save_checkpoint_steps '+steps
    ])

  # Decode the held-out test sources with the final-step checkpoint.
  rnn_trans_args = ' '.join([
    '-model '+rnn_modelpath+'_step_'+steps+'.pt',
    f'-src {datadir}/russian-src-test.txt',
    f'-output {outdir}/russian_{datatype}-rnn-pred.txt',
    f'-log_file {outdir}/russian_{datatype}-rnn-pred.log',
    '-replace_unk -verbose',
    '-beam_size 12'
    ])

  # Decode the defective verbs with the same checkpoint.
  rnn_trans_args_def = ' '.join([
    '-model '+rnn_modelpath+'_step_'+steps+'.pt',
    f'-src {datadir}/russian-defectives.txt',
    f'-output {outdir}/russian_{datatype}-rnn-pred-defectives.txt',
    f'-log_file {outdir}/russian_{datatype}-rnn-pred-defectives.log',
    '-replace_unk -verbose',
    '-beam_size 12'
    ])

  # Training is commented out, so the translate calls below require the
  # checkpoint named in '-model' to already exist under `outdir`.
  # !python OpenNMT-py/train.py $rnn_train_args
  !python OpenNMT-py/translate.py $rnn_trans_args
  !python OpenNMT-py/translate.py $rnn_trans_args_def

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[2021-05-01 02:32:16,880 INFO] 
SENT 464: ['з', 'а', 'б', 'и', 'н', 'т', 'о', 'в', 'а', 'т', 'ь']
PRED 464: з а б и н т у ' ю
PRED SCORE: -0.0001

[2021-05-01 02:32:16,880 INFO] 
SENT 465: ['д', 'о', 'и', 'с', 'к', 'и', 'в', 'а', 'т', 'ь', 'с', 'я']
PRED 465: д о и ' с к и в а ю с ь
PRED SCORE: -0.0002

[2021-05-01 02:32:16,880 INFO] 
SENT 466: ['з', 'а', 'р', 'а', 'б', 'а', 'т', 'ы', 'в', 'а', 'т', 'ь', 'с', 'я']
PRED 466: з а р а б а ' т ы в а ю с ь
PRED SCORE: -0.0006

[2021-05-01 02:32:16,880 INFO] 
SENT 467: ['в', 'з', 'б', 'и', 'р', 'а', 'т', 'ь', 'с', 'я']
PRED 467: в з б и р а ' ю с ь
PRED SCORE: -0.0001

[2021-05-01 02:32:16,881 INFO] 
SENT 468: ['о', 'т', 'к', 'у', 'с', 'ы', 'в', 'а', 'т', 'ь']
PRED 468: о т к у ' с ы в а ю
PRED SCORE: -0.0001

[2021-05-01 02:32:16,881 INFO] 
SENT 469: ['о', 'к', 'а', 'п', 'ы', 'в', 'а', 'т', 'ь', 'с', 'я']
PRED 469: о к а ' п ы в а ю с ь
PRED SCORE: -0.0001

[2021-05-01 02:32:1

## Russian Transformer

In [None]:
import math

# For every Russian dataset variant: build the transformer train/translate
# argument strings and decode the test set and the defective-verb list.
for datatype in datatypes:
  # if 'all-verbs' not in datatype: continue
  print(datatype)
  datadir = f'drive/MyDrive/Rule_Learning_Thesis/Russian/data/{datatype}'
  # Transformer Setup
  transformer_modelpath = f'{outdir}/russian_{datatype}_transformer_model'
  with open(f'drive/MyDrive/Rule_Learning_Thesis/Russian/data/{datatype}/russian-src-train.txt') as inp:
    n_examples = len(inp.readlines())
  # Scale the step budget linearly with dataset size: 20000 steps per 4000 training examples.
  steps = int(20000 * n_examples/4000)
  # NOTE(review): checkpoints are requested every 5000 steps, but `steps` need
  # not be a multiple of 5000 while the translate arguments below expect a
  # checkpoint named for the final step - confirm train.py writes one at the end.
  # split() + join() collapses the multi-line literal into one space-separated string.
  transformer_train_args = ' '.join(f'''-data {datadir}/processed -gpu_ranks 0
  -save_model {transformer_modelpath} -layers 2 -rnn_size 312
  -word_vec_size 312 -transformer_ff 512 -heads 8 
  -encoder_type transformer -decoder_type transformer -position_encoding 
  -train_steps {steps} -max_generator_batches 2 -dropout 0.1 
  -batch_size 20 -batch_type tokens -normalization tokens 
  -accum_count 2 -optim adam -adam_beta2 0.998 
  -decay_method noam -warmup_steps 1000 -learning_rate 0.05 -max_grad_norm 0 
  -param_init 0 -param_init_glorot -label_smoothing 0.1 
  -valid_steps 1000 -save_checkpoint_steps 5000 -world_size 1'''.split())

  # Decode the held-out test sources with the final-step checkpoint.
  transformer_trans_args = ' '.join([
    '-model '+transformer_modelpath+f'_step_{steps}.pt',
    f'-src {datadir}/russian-src-test.txt',
    f'-output {outdir}/russian_{datatype}-transformer-pred.txt',
    f'-log_file {outdir}/russian_{datatype}-transformer-pred.log',
    '-replace_unk -verbose',
    '-beam_size 12'
    ])

  # Decode the defective verbs with the same checkpoint.
  transformer_trans_args_def = ' '.join([
    '-model '+transformer_modelpath+f'_step_{steps}.pt',
    f'-src {datadir}/russian-defectives.txt',
    f'-output {outdir}/russian_{datatype}-transformer-pred-defectives.txt',
    f'-log_file {outdir}/russian_{datatype}-transformer-pred-defectives.log',
    '-replace_unk -verbose',
    '-beam_size 12'
    ])
    
  # Training is commented out, so the translate calls below require the
  # checkpoint named in '-model' to already exist under `outdir`.
  # !python OpenNMT-py/train.py $transformer_train_args
  !python OpenNMT-py/translate.py $transformer_trans_args
  !python OpenNMT-py/translate.py $transformer_trans_args_def

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
PRED SCORE: -0.5687

[2021-05-01 02:31:31,086 INFO] 
SENT 464: ['з', 'а', 'б', 'и', 'н', 'т', 'о', 'в', 'а', 'т', 'ь']
PRED 464: з а б и н т у ' ю
PRED SCORE: -0.4748

[2021-05-01 02:31:31,086 INFO] 
SENT 465: ['д', 'о', 'и', 'с', 'к', 'и', 'в', 'а', 'т', 'ь', 'с', 'я']
PRED 465: д о и ' с к и в а ю с ь
PRED SCORE: -0.8618

[2021-05-01 02:31:31,087 INFO] 
SENT 466: ['з', 'а', 'р', 'а', 'б', 'а', 'т', 'ы', 'в', 'а', 'т', 'ь', 'с', 'я']
PRED 466: з а р а б а ' т ы в а ю с ь
PRED SCORE: -0.7867

[2021-05-01 02:31:31,087 INFO] 
SENT 467: ['в', 'з', 'б', 'и', 'р', 'а', 'т', 'ь', 'с', 'я']
PRED 467: в з б и р а ' ю с ь
PRED SCORE: -0.6043

[2021-05-01 02:31:31,087 INFO] 
SENT 468: ['о', 'т', 'к', 'у', 'с', 'ы', 'в', 'а', 'т', 'ь']
PRED 468: о т к у ' с ы в а ю
PRED SCORE: -0.5425

[2021-05-01 02:31:31,087 INFO] 
SENT 469: ['о', 'к', 'а', 'п', 'ы', 'в', 'а', 'т', 'ь', 'с', 'я']
PRED 469: о к а ' п ы в а ю с ь
PRED SCORE: -0.5945

## **Evaluate** Russian

In [None]:
from collections import Counter, defaultdict
from scipy.stats.stats import spearmanr   
def get_pearson(f1, f2):
  """Compare the last two characters of two parallel lists of strings.

  Despite the name, the correlation comes from scipy's `spearmanr`, which is
  handed the two-character endings themselves (strings, not numbers).

  Returns a pair: the `spearmanr` result and the fraction of positions at
  which both endings are identical.
  """
  tails_a = [s[-2:] for s in f1]
  tails_b = [s[-2:] for s in f2]

  matches = 0
  for left, right in zip(tails_a, tails_b):
    if left == right:
      matches += 1

  return spearmanr(tails_a, tails_b), matches / len(tails_a)

def frequency_test(train_pairs, srclines, predlines, c=1):
  """Measure how often a prediction uses the training set's most frequent ending.

  For every source item, its last `c` characters are looked up among the
  training sources; the most frequent last-`c`-characters of the matching
  training targets is the "popular" inflection. The item counts as a match
  when the prediction ends in that popular inflection.

  Args:
    train_pairs: iterable of `(source, target)` strings without spaces.
    srclines: test sources, symbols separated by spaces.
    predlines: predictions aligned with `srclines`, symbols separated by
      spaces; predictions of length 0 or 1 are treated as '-'.
    c: number of trailing characters compared.

  Returns:
    Matches divided by `len(srclines)` (items whose ending never occurs in
    training still count in the denominator); 0.0 when `srclines` is empty.
  """
  # Tally target endings per source ending once, rather than rescanning the
  # whole training set for every test item.
  endings_seen = {}
  for s, t in train_pairs:
    endings_seen.setdefault(s[-c:], Counter())[t[-c:]] += 1

  nouns_with_inflections_matching_train = []
  for i,noun in enumerate(srclines):
    noun = noun.replace(' ','')
    predicted_inflection = predlines[i].replace(' ','')[-c:] if len(predlines[i]) > 1 else '-'
    inflections = endings_seen.get(noun[-c:])
    # No training item shares this ending: nothing to compare against.
    if not inflections: continue
    popular_inflection = inflections.most_common(1)[0][0]
    if predicted_inflection == popular_inflection:
      nouns_with_inflections_matching_train.append((noun, predicted_inflection, popular_inflection))

  # Avoid ZeroDivisionError on an empty test set.
  result = len(nouns_with_inflections_matching_train)/len(srclines) if srclines else 0.0
  print(f'% of inflections (len {c}) that match most popular in training:', result)
  return result

def _read_utf8_lines(path):
  """Return the lines of a UTF-8 text file without line endings, closing the file afterwards."""
  with open(path, 'r', encoding='utf-8') as handle:
    return handle.read().splitlines()


# cor_lists[datatype][model] -> test predictions, compared across models below.
cor_lists = defaultdict(defaultdict)

results = []
for datatype in datatypes:
  print('\n\n',datatype)
  datadir = f'drive/MyDrive/Rule_Learning_Thesis/Russian/data/{datatype}'
  train_tgt_lines = _read_utf8_lines(f'{datadir}/russian-tgt-train.txt')
  train_src_lines = _read_utf8_lines(f'{datadir}/russian-src-train.txt')
  # Training pairs with the symbol-separating spaces removed.
  train_pairs = list(zip([t.replace(' ','') for t in train_src_lines],
                         [t.replace(' ','') for t in train_tgt_lines]))

  for model in ['rnn','transformer']:
    print('\n',model, 'results')
    predlines = _read_utf8_lines(f'{outdir}/russian_{datatype}-{model}-pred.txt')
    tgtlines = _read_utf8_lines(f'{datadir}/russian-tgt-test.txt')
    srclines = _read_utf8_lines(f'{datadir}/russian-src-test.txt')

    # Accuracy is exact match of the whole (whitespace-stripped) line.
    scored = [(tgt.strip(), pred.strip()) for tgt, pred in zip(tgtlines, predlines)]
    test_accuracy = sum(gold == learned for gold, learned in scored)/len(scored) if scored else 0.0
    print('Test accuracy:', test_accuracy)
    freq_match = frequency_test(train_pairs, srclines, predlines)

    ##### Gap Test
    # Same frequency test on the defective verbs; 'D TM' is the original label
    # (presumably "defectives, training match" - confirm).
    deflines = _read_utf8_lines(f'{datadir}/russian-defectives.txt')
    defpredlines = _read_utf8_lines(f'{outdir}/russian_{datatype}-{model}-pred-defectives.txt')
    print('D TM', frequency_test(train_pairs, deflines, defpredlines))

    results.append(f'{datatype}, {model.upper()}, {test_accuracy}, {freq_match}')
    cor_lists[datatype][model] = predlines

print('\n\nCorrelations')
for k,v in cor_lists.items():
  lists = list(v.values())
  print(k, get_pearson(lists[0],lists[1]))

print()
print('datatype, model, test_accuracy, freq_match')
for row in results:
  print(row)



 all-verbs_100

 rnn results
Test accuracy: 0.8849028400597907
% of inflections (len 1) that match most popular in training: 0.7638266068759342
% of inflections (len 1) that match most popular in training: 0.09836065573770492
D TM 0.09836065573770492

 transformer results
Test accuracy: 0.8213751868460388
% of inflections (len 1) that match most popular in training: 0.7802690582959642
% of inflections (len 1) that match most popular in training: 0.09836065573770492
D TM 0.09836065573770492


 1st-or-2nd-Conj_100

 rnn results
Test accuracy: 0.821551724137931
% of inflections (len 1) that match most popular in training: 0.7698275862068965
% of inflections (len 1) that match most popular in training: 0.13114754098360656
D TM 0.13114754098360656

 transformer results
Test accuracy: 0.8310344827586207
% of inflections (len 1) that match most popular in training: 0.7810344827586206
% of inflections (len 1) that match most popular in training: 0.11475409836065574
D TM 0.11475409836065574



<br>
<br>

# English

## Generate Datasets

In [None]:
!cd drive/MyDrive/Rule_Learning_Thesis/English; python parse_celex.py

266
4734
8760
Token freq data type count: 1908
Done!


## Parameter Setup

In [None]:
# ---- Shared settings ----
# '' selects the default English data set; switch to '_token_freq' for the
# token-frequency variant (the evaluation cell iterates over both suffixes).
data_type = ''
# data_type = '_token_freq'

datadir = f'drive/MyDrive/Rule_Learning_Thesis/English/data{data_type}'


def _translate_args(checkpoint, src_name, pred_name):
  """Return the translate.py argument string for one checkpoint and test file.

  checkpoint: path of the saved model (.pt) to load.
  src_name:   test source file name, looked up inside `datadir`.
  pred_name:  prediction file name, written inside `outdir`.
  """
  return ' '.join([
    f'-model {checkpoint}',
    f'-src {datadir}/{src_name}',
    f'-output {outdir}/{pred_name}',
    '-replace_unk -verbose',
    '-beam_size 12'
  ])


# ---- RNN setup ----
epochs, n_examples, batchsize = 100, 4000, 20
# Training steps = epochs * n_examples / batchsize; kept as a string because
# it is spliced into the command-line arguments below.
rnn_steps = str(epochs * n_examples // batchsize)

rnn_modelpath = f'{outdir}/english_rnn_model{data_type}'
# Only one checkpoint is saved (save_checkpoint_steps == train_steps), so this
# is the file translate.py has to load.
rnn_checkpoint = f'{rnn_modelpath}_step_{rnn_steps}.pt'
rnn_train_args = ' '.join([
  f'-data {datadir}/processed',
  f'-save_model {rnn_modelpath}',
  '-enc_layers 2',
  '-dec_layers 2',
  '-rnn_size 100',
  # Same value the step count above is derived from (was a hard-coded 20).
  f'-batch_size {batchsize}',
  '-word_vec_size 300',
  '-gpu_ranks 0',
  f'-train_steps {rnn_steps}',
  f'-save_checkpoint_steps {rnn_steps}'
])

# I.e. the wug test
rnn_trans_args = _translate_args(
  rnn_checkpoint, 'english-src-test.txt', f'english-rnn{data_type}-pred.txt')

# I.e. the wug test for irregulars
rnn_trans_args_irr = _translate_args(
  rnn_checkpoint, 'english-irr-src-test.txt', f'english-rnn{data_type}-pred-irr.txt')


# ---- Transformer setup ----
transformer_modelpath = f'{outdir}/english_transformer_model{data_type}'
transformer_steps = 20000
transformer_checkpoint = f'{transformer_modelpath}_step_{transformer_steps}.pt'
# -rnn_size is transformer model size (d_model), t_ff is feedforward layer
# The multi-line template is split and re-joined so the line breaks collapse
# into single spaces.
transformer_train_args = ' '.join(f'''-data {datadir}/processed -gpu_ranks 0
-save_model {transformer_modelpath} -layers 2 -rnn_size 312
-word_vec_size 312 -transformer_ff 512 -heads 8
-encoder_type transformer -decoder_type transformer -position_encoding
-train_steps {transformer_steps} -max_generator_batches 2 -dropout 0.1
-batch_size 20 -batch_type tokens -normalization tokens
-accum_count 2 -optim adam -adam_beta2 0.998
-decay_method noam -warmup_steps 1000 -learning_rate 0.05 -max_grad_norm 0
-param_init 0 -param_init_glorot -label_smoothing 0.1
-valid_steps 1000 -save_checkpoint_steps 5000 -world_size 1'''.split())

transformer_trans_args = _translate_args(
  transformer_checkpoint, 'english-src-test.txt',
  f'english-transformer{data_type}-pred.txt')

transformer_trans_args_irr = _translate_args(
  transformer_checkpoint, 'english-irr-src-test.txt',
  f'english-transformer{data_type}-pred-irr.txt')

# Legacy name: the original cell left `steps` bound to the transformer step
# count (an int) at this point; kept so nothing else that reads it changes.
steps = transformer_steps


In [None]:
# Preprocess the English train/validation source and target files with
# OpenNMT's preprocess.py. The output prefix $datadir/processed is the path
# the training arguments pass as -data.
!python OpenNMT-py/preprocess.py -train_src $datadir/english-src-train.txt -train_tgt $datadir/english-tgt-train.txt -valid_src $datadir/english-src-val.txt -valid_tgt $datadir/english-tgt-val.txt -save_data $datadir/processed

[2021-02-02 17:35:10,961 INFO] Extracting features...
[2021-02-02 17:35:10,963 INFO]  * number of source features: 0.
[2021-02-02 17:35:10,963 INFO]  * number of target features: 0.
[2021-02-02 17:35:10,963 INFO] Building `Fields` object...
[2021-02-02 17:35:10,964 INFO] Building & saving training data...
[2021-02-02 17:35:10,965 INFO] Building & saving validation data...


## Train RNN

In [None]:
# Train the RNN with the options assembled in rnn_train_args (Parameter Setup cell).
!python OpenNMT-py/train.py $rnn_train_args

[2021-02-02 17:35:17,764 INFO]  * src vocab size = 40
[2021-02-02 17:35:17,764 INFO]  * tgt vocab size = 42
[2021-02-02 17:35:17,764 INFO] Building model...
[2021-02-02 17:35:21,806 INFO] NMTModel(
  (encoder): RNNEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(40, 300, padding_idx=1)
        )
      )
    )
    (rnn): LSTM(300, 100, num_layers=2, dropout=0.3)
  )
  (decoder): InputFeedRNNDecoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(42, 300, padding_idx=1)
        )
      )
    )
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): StackedLSTM(
      (dropout): Dropout(p=0.3, inplace=False)
      (layers): ModuleList(
        (0): LSTMCell(400, 100)
        (1): LSTMCell(100, 100)
      )
    )
    (attn): GlobalAttention(
      (linear_in): Linear(in_features=100, out_features=100, bias=False)
      (linear_out)

In [None]:
# Run the saved RNN checkpoint on the regular test set, then on the
# irregular-verb test set; predictions go to the -output files named in
# rnn_trans_args / rnn_trans_args_irr.
!python OpenNMT-py/translate.py $rnn_trans_args
!python OpenNMT-py/translate.py $rnn_trans_args_irr

[2021-01-31 23:03:08,434 INFO] Translating shard 0.
[2021-01-31 23:03:08,541 INFO] 
SENT 1: ['p', 'I', 'l', 'f', '@', 'r', '*']
PRED 1: p I l f @ d
PRED SCORE: -0.0042

[2021-01-31 23:03:08,542 INFO] 
SENT 2: ['f', 'r', 'V', 's', 't', 'r', 'e', 'I', 't']
PRED 2: f r V s t r e I t I d
PRED SCORE: -0.0006

[2021-01-31 23:03:08,542 INFO] 
SENT 3: ['s', 't', 'V', 'm', 'b', 'l', ',']
PRED 3: s t V m b l , d
PRED SCORE: -0.0000

[2021-01-31 23:03:08,542 INFO] 
SENT 4: ['p', '&', 'k', 'I', 'd', 'Z']
PRED 4: p & k I d Z d
PRED SCORE: -0.0000

[2021-01-31 23:03:08,542 INFO] 
SENT 5: ['E', 'd', 'Z']
PRED 5: E d Z d
PRED SCORE: -0.0002

[2021-01-31 23:03:08,543 INFO] 
SENT 6: ['s', 'p', '&', 'N', 'k']
PRED 6: s p & N k t
PRED SCORE: -0.0001

[2021-01-31 23:03:08,543 INFO] 
SENT 7: ['s', 't', 'I', 'N', 'k']
PRED 7: s t I N k t
PRED SCORE: -0.0178

[2021-01-31 23:03:08,543 INFO] 
SENT 8: ['k', 'O', ':', 't', '@', 'r', 'a', 'I', 'z']
PRED 8: k O : t @ r a I z d
PRED SCORE: -0.6131

[2021-01-31 23:03

## Train Transformer

In [None]:
# Train the transformer with the options assembled in transformer_train_args (Parameter Setup cell).
!python OpenNMT-py/train.py $transformer_train_args

[2021-01-31 23:03:13,283 INFO]  * src vocab size = 40
[2021-01-31 23:03:13,283 INFO]  * tgt vocab size = 42
[2021-01-31 23:03:13,283 INFO] Building model...
[2021-01-31 23:03:17,740 INFO] NMTModel(
  (encoder): TransformerEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(40, 312, padding_idx=1)
        )
        (pe): PositionalEncoding(
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (transformer): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linear_keys): Linear(in_features=312, out_features=312, bias=True)
          (linear_values): Linear(in_features=312, out_features=312, bias=True)
          (linear_query): Linear(in_features=312, out_features=312, bias=True)
          (softmax): Softmax(dim=-1)
          (dropout): Dropout(p=0.1, inplace=False)
          (final_linear): Linear(in_features=312, out_features=312,

In [None]:
# Run the saved transformer checkpoint on the regular test set, then on the
# irregular-verb test set; predictions go to the -output files named in
# transformer_trans_args / transformer_trans_args_irr.
!python OpenNMT-py/translate.py $transformer_trans_args
!python OpenNMT-py/translate.py $transformer_trans_args_irr

[2021-01-31 23:20:04,903 INFO] Translating shard 0.
[2021-01-31 23:20:05,504 INFO] 
SENT 1: ['p', 'I', 'l', 'f', '@', 'r', '*']
PRED 1: p I l f @ d
PRED SCORE: -0.3981

[2021-01-31 23:20:05,504 INFO] 
SENT 2: ['f', 'r', 'V', 's', 't', 'r', 'e', 'I', 't']
PRED 2: f r V s t r e I t I d
PRED SCORE: -0.6776

[2021-01-31 23:20:05,505 INFO] 
SENT 3: ['s', 't', 'V', 'm', 'b', 'l', ',']
PRED 3: s t V m b l , d
PRED SCORE: -0.3809

[2021-01-31 23:20:05,505 INFO] 
SENT 4: ['p', '&', 'k', 'I', 'd', 'Z']
PRED 4: p & k I d Z d
PRED SCORE: -0.4162

[2021-01-31 23:20:05,505 INFO] 
SENT 5: ['E', 'd', 'Z']
PRED 5: E d Z d
PRED SCORE: -0.3304

[2021-01-31 23:20:05,505 INFO] 
SENT 6: ['s', 'p', '&', 'N', 'k']
PRED 6: s p & N k t
PRED SCORE: -0.2809

[2021-01-31 23:20:05,505 INFO] 
SENT 7: ['s', 't', 'I', 'N', 'k']
PRED 7: s t I N k t
PRED SCORE: -0.4531

[2021-01-31 23:20:05,506 INFO] 
SENT 8: ['k', 'O', ':', 't', '@', 'r', 'a', 'I', 'z']
PRED 8: k O : t @ r a I z d
PRED SCORE: -0.5293

[2021-01-31 23:20

## Evaluate English

In [None]:
from collections import Counter, defaultdict
# Public import path; `scipy.stats.stats` is a deprecated private module.
from scipy.stats import spearmanr

def get_pearson(f1, f2):
  """Compare two prediction lists by the final character of each entry.

  NOTE: despite its name, this computes a Spearman rank correlation (the
  name is kept because other cells call it).

  f1, f2: equal-length sequences of strings (e.g. predicted forms).

  Returns a tuple (spearmanr_result, agreement): the scipy result for the
  code points of the last characters, and the fraction of positions whose
  last characters are identical. An empty string is encoded as 0 so that an
  empty prediction line does not raise IndexError.
  """
  a = [ord(s[-1]) if s else 0 for s in f1]
  b = [ord(s[-1]) if s else 0 for s in f2]
  return spearmanr(a,b), sum(1 for x,y in zip(a,b) if x == y) / len(a)

def _read_lines(path):
  """Return the lines of the text file at `path`; the file is closed on return."""
  with open(path, 'r') as f:
    return f.read().splitlines()


def _popular_inflections(pairs, c):
  """Map each source ending (last `c` chars) to its most frequent target ending.

  `pairs` is an iterable of (source, target) strings. Built in one pass so the
  training data is not rescanned for every test verb. Ties resolve to the
  target ending encountered first, as Counter.most_common does.
  """
  counts = defaultdict(Counter)
  for src, tgt in pairs:
    counts[src[-c:]][tgt[-c:]] += 1
  return {ending: ctr.most_common(1)[0][0] for ending, ctr in counts.items()}


# Number of trailing characters compared in the frequency test.
c = 1

results = []
# NOTE: this loop rebinds the notebook-level `data_type` and `datadir`.
for data_type in ['','_token_freq']:
  # cor_lists[test set][model] -> list of that model's predictions
  cor_lists = defaultdict(dict)

  print('\nData type', data_type)
  datadir = f'drive/MyDrive/Rule_Learning_Thesis/English/data{data_type}'

  train_tgt_lines = _read_lines(f'{datadir}/english-tgt-train.txt')
  train_src_lines = _read_lines(f'{datadir}/english-src-train.txt')

  # (source, target) training forms with the token-separating spaces removed
  train_pairs = list(zip([t.replace(' ','') for t in train_src_lines],
                         [t.replace(' ','') for t in train_tgt_lines]))
  popular_by_ending = _popular_inflections(train_pairs, c)

  for model in ['rnn','transformer']:
    print(f'\n{model.upper()}')

    for case in ['','-irr']:
      print(f'{model.upper()} {case} test results')
      predlines = _read_lines(f'{outdir}/english-{model}{data_type}-pred{case}.txt')
      test_src_lines = _read_lines(f'{datadir}/english{case}-src-test.txt')
      test_tgt_lines = _read_lines(f'{datadir}/english{case}-tgt-test.txt')

      # One (gold, predicted, source) triple per test item, outer whitespace trimmed.
      r = [(tgt.strip(), pred.strip(), tst.strip())
           for tst, tgt, pred in zip(test_src_lines, test_tgt_lines, predlines)]

      test_accuracy = sum(t[0] == t[1] for t in r)/len(r)
      print('Test accuracy:', test_accuracy)
      # Share of predictions that end in d/t and differ from the source form.
      # endswith() is False for an empty prediction instead of raising IndexError.
      ed_predicted = sum(t[1].endswith(('d','t')) and t[1] != t[2] for t in r)/len(r)
      print('-ed predicted:', ed_predicted)
      cor_lists['eng'+case][model] = [t[1] for t in r]

      # Frequency test: for each test verb, take its ending and check whether
      # the predicted ending equals the most frequent target ending among the
      # training verbs that share that source ending.
      verbs_with_inflections_matching_train = []
      for i,verb in enumerate(test_src_lines):
        verb = verb.replace(' ','')
        ending = verb[-c:]
        predicted_inflection = predlines[i].replace(' ','')[-c:]
        popular_inflection = popular_by_ending.get(ending)
        if popular_inflection is None: continue  # ending never seen in training
        if predicted_inflection == popular_inflection:
          verbs_with_inflections_matching_train.append((verb, predicted_inflection, popular_inflection))

      train_match = len(verbs_with_inflections_matching_train)/len(test_src_lines)
      print(f'% of inflections (len {c}) that match most popular in training', train_match)
      results.append(('eng'+data_type, model.upper(), case, test_accuracy, ed_predicted, train_match))

  # Agreement between the two models' predictions, per test set.
  print('\n\nCorrelations')
  for k,v in cor_lists.items():
    lists = list(v.values())
    print(k, get_pearson(lists[0],lists[1]))

# Summary table: header line, then one comma-separated row per result tuple.
print()
print('data_type, model.upper(), case, test_accuracy, ed_predicted, train_match')
for r in results:
  print(','.join(str(x) for x in r))


Data type 

RNN
RNN  test results
Test accuracy: 0.802
-ed predicted: 0.976
% of inflections (len 1) that match most popular in training 0.944
RNN -irr test results
Test accuracy: 0.18518518518518517
-ed predicted: 0.9629629629629629
% of inflections (len 1) that match most popular in training 0.8518518518518519

TRANSFORMER
TRANSFORMER  test results
Test accuracy: 0.94
-ed predicted: 0.996
% of inflections (len 1) that match most popular in training 0.998
TRANSFORMER -irr test results
Test accuracy: 0.07407407407407407
-ed predicted: 0.9259259259259259
% of inflections (len 1) that match most popular in training 0.9629629629629629


Correlations
eng (SpearmanrResult(correlation=0.8814518753853616, pvalue=2.2710281892877805e-164), 0.946)
eng-irr (SpearmanrResult(correlation=0.7372828281515739, pvalue=1.1490292771105438e-05), 0.8888888888888888)

Data type _token_freq

RNN
RNN  test results
Test accuracy: 0.79
-ed predicted: 0.894
% of inflections (len 1) that match most popular in tra