In [113]:
import numpy as np
import os
from matplotlib import pyplot as plt
import glob
from collections import Counter
from sklearn.linear_model import LinearRegression
# from lmfit.models import Model
import re
from sklearn.model_selection import train_test_split

import pyconll

from tqdm.notebook import tqdm

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

from sklearn.metrics import confusion_matrix

import pandas as pd

from numba import jit, njit
import re

import joblib

In [11]:
# take only 10% of hy wikipedia
with open('hy_wiki_10%.txt','w',encoding='utf-8') as f:
    file=open('eastern/dump_wiki_hy.txt','r',encoding='utf-8').readlines()
    normed_txt, x_test= train_test_split(file, test_size=0.9, random_state=42)
    f.writelines(normed_txt)

FileNotFoundError: [Errno 2] No such file or directory: 'eastern/dump_wiki_hy.txt'

In [None]:
# convert UD to txt
my_conll_file_location = 'hy_bsut-ud-train.conllu'
train = pyconll.load_from_file(my_conll_file_location)

with open('ud_eastern.txt', 'w') as f:
    for sentence in train:
        # Do work within loops
        f.write(sentence.text + '\n')

## Stop-words

In [6]:
def preprocess(text):
    # punctuation=['.','-',',','!','?','(','—',')','՞','՛','։','՝','՜','’','«','»','*','\n','=',':','[',']','/',';','․','`','\t','%','$','\xa0','\r','_','●','0','1','2','3','4','5','6','7','8','9']
    punctuation = ['՜', ',', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '-', '—', '։','՞']
    text=text.lower()

    for spaced in punctuation:
        text = text.replace(spaced, '')

    text = re.sub(" +", " ", text)

    txt = text.replace('\n', '')
    txt = txt.split(' ')
    txt = [t for t in txt if t != '']
    return txt

In [4]:
%%time
folders = ['eastern', 'western', 'grabar']
worlds_list_dict = {}

for folder in tqdm(folders):
    files_paths = glob.glob(folder + '/*.txt')
    files = [' '.join(open(path, 'r', encoding='utf8').readlines()) for path in files_paths]

    words_list = []
    for text in files:
        words_list.extend(preprocess(text))

    print(words_list.__len__())
    worlds_list_dict[folder] = Counter(words_list)
    
    with open(f'stop_{folder}.txt', 'w', encoding="utf-8") as f:
        for i, line in enumerate(worlds_list_dict[folder].most_common()):
            if line[1] >= 150:
                f.write(str(line[0]) + '\n')
            else:
                break
                

  0%|          | 0/3 [00:00<?, ?it/s]

8286090
4648167
785606
CPU times: total: 13.2 s
Wall time: 13.2 s


## Load and split data

In [130]:
%%time
folders = ['eastern', 'western', 'grabar']
files_dict = {}

for folder in folders:
    files_paths = glob.glob(folder + '/*.txt')
    files = [' '.join(open(path, 'r', encoding='utf8').readlines()) for path in files_paths]
    files_dict[folder] = files

CPU times: total: 1.05 s
Wall time: 1.03 s


In [135]:
%%time

preprocess_files_dict = {}
batch_len = 20

folders = ['eastern', 'western', 'grabar']
tokens_number = np.array([8286090, 4648167, 785605])

tokens_fraction=np.min(tokens_number)/tokens_number

for i,folder in tqdm(enumerate(folders)):
    files = files_dict[folder]
    batches_list = []

    for file in tqdm(files):
        words = preprocess(file)
        batches = np.array_split(words, np.ceil(words.__len__() / batch_len))
        if i!=2:
            normed_batches, x_test= train_test_split(batches, test_size=1-tokens_fraction[i], random_state=42)
        else:
            normed_batches=batches
        batches_list.extend(normed_batches)
        
    preprocess_files_dict[folder] = batches_list

dataset_text = []
dataset_labels = []
    
for i, folder in enumerate(folders):
    text_batches = preprocess_files_dict[folder]
    dataset_text.extend(text_batches)
    print(len(text_batches))
    dataset_labels.extend(np.full(len(text_batches), i))
    
df=pd.DataFrame({'text':dataset_text, 'labels':dataset_labels})
df.to_csv(f'mwa_mea_grabar_{batch_len}.csv',index=False)


0it [00:00, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

39277
39277
39282
CPU times: total: 35.9 s
Wall time: 37.4 s


## Lexical (stop-words) descriptors

In [136]:
western_stop = ['ենք', 'էի', 'թ', 'ին', 'մենք', 'որոնք', 'պիտի', 'և', 'որպեսզի', 'վրայ', 'կ՚', 'կը', 'մը', 'մըն',
                'անոր', 'ալ', 'ան', 'քեզ', 'եթէ', 'թէ', 'որպէս']

grabar_stop = ['դու', 'եք', 'ըստ', 'նա', 'պիտի', 'վրայ', 'զի', 'ընդ', 'քո', 'քեզ', 'եթէ', 'թէ', 'որպէս']

eastern_stop = ['դու', 'ենք', 'եք', 'էի', 'ըստ', 'ին', 'հետո', 'մենք', 'մեջ', 'նա', 'նաև', 'նրա', 'նրանք', 'որը',
                'որոնք', 'որպես', 'ում', 'վրա', 'և', 'որպեսզի']

western_stop = set(western_stop)
grabar_stop = set(grabar_stop)
eastern_stop = set(eastern_stop)

def get_lexical_desc(words):
    intersect_western = len(set(words) & western_stop) / len(western_stop)
    intersect_grabar = len(set(words) & grabar_stop) / len(grabar_stop)
    intersect_eastern = len(set(words) & eastern_stop) // len(eastern_stop)

    return intersect_western, intersect_grabar, intersect_eastern

## Morphemic descriptors

In [137]:
grabar_suffixes = ['աւք', 'եալ', 'եան', 'իւք', 'ոյց', 'ովք', 'ուց', 'ուցան']
grabar_prefixes = ['ապա', 'արտ', 'բաղ', 'բաղա', 'դեր', 'ենթ', 'ենթա', 'ընթա', ' համ', 'համա', 'հան', 'հոմ', 'հոմա',
                   'տար', 'տարա']

eastern_suffixes = ['աբար', 'ագին', 'ագույն', 'ածո', 'ածու', 'ական', 'ակերտ', 'ային', 'անակ', 'անի', 'անոց', 'անք',
                    'ապան', 'ապանակ', 'ապատ', 'ապես', 'աստան', 'ավետ', 'ավուն', 'արան', 'արար', 'արեն', 'արք', 'ացի',
                    'ացն-', 'ացու', 'բան', 'բար', 'գին', 'գույն', 'եղեն', 'ենի', 'երեն', 'երորդ', 'եցն-', 'լիկ', 'կերտ',
                    'կոտ', 'մունք ', 'յալ', 'յակ', 'յան', 'յանց', 'յուն նախա-', 'ներ', 'նոց', 'ոնք', 'ովին', 'որդ',
                    'որեն', 'ոցի', 'ուք', 'պան', 'պանակ', 'ստան', 'ված', 'վածք', 'ավոր', 'վոր', 'ություն', 'ուլ', 'ուկ',
                    'ուհի', 'ում', 'ույթ', 'ույր', 'ուն', 'ուտ', 'ուրդ', 'ուց']

eastern_prefixes = ['ամենա', 'այսր', 'անդր', 'ապա', 'ավտո', 'արտ', 'արտա', 'բենզա', ', գեր', 'գերա', 'դեր', 'ենթա',
                    'եվրա', ' էլեկտրա', 'թեր', 'թերա', 'կենս', 'կինո', 'հակ', 'հակա', 'համ', 'համա', 'հար', 'հարա',
                    'հեռա', 'հեռուստա', 'հոմա', 'մակ', 'մակրո', 'միկրո', 'միջ', 'նախ', 'ներ', 'ստոր', 'վեր', 'վերա',
                    'տար', 'տարա', 'փոխ', 'քառ', 'քառա']

western_reform = ['իլ', 'իուն', 'եան', 'յ', 'օ', 'է', 'յ', 'վո', 'ոյ', 'եա', 'եօ', 'իւ', 'ու', 'ւ,' 'յե', 'եյ', 'զի',
                  'եւ', 'ել', 'յուն', 'յան', 'ում', 'ո', 'ե', 'հ', 'ո', 'ույ', 'յա', 'յո', 'յու', 'վ', 'ե', ]

morphems=[]
morphems.extend(grabar_suffixes)
morphems.extend(grabar_prefixes)
morphems.extend(eastern_suffixes)
morphems.extend(eastern_prefixes)
morphems.extend(western_reform)

# @jit(nopython=True)
# @njit
def get_morphemic_desc(words,morphems ):
    res=[]
    for morphema in morphems:
        positions = []
        for word in words:
            pos = word.find(morphema)
            if pos != -1:
                positions.append((pos+1)/len(word))
        # positions=np.array(positions, dtype=np.float32)
        if len(positions)==0:

            res.append(0)
        else:
            # std=np.std(positions)
            res.append(np.mean(positions))
            # res.append(sum(positions)/len(positions))

    return res

## Create dataset

In [138]:
dataset_desc_morphemic=[get_morphemic_desc(words,morphems) for words in tqdm(dataset_text)]

  0%|          | 0/117836 [00:00<?, ?it/s]

In [139]:
dataset_desc_lexical=[get_lexical_desc(words) for words in tqdm(dataset_text)]

  0%|          | 0/117836 [00:00<?, ?it/s]

In [142]:
dataset_desc_morphemic=np.array(dataset_desc_morphemic)
dataset_desc_lexical=np.array(dataset_desc_lexical)
dataset_desc_morphemic=np.swapaxes(dataset_desc_morphemic,0,1)

In [143]:
dataset=np.concatenate([dataset_desc_morphemic,dataset_desc_lexical],axis=1)
dataset.shape

(117836, 168)

In [149]:
dataset.shape

(117836, 168)

In [144]:
x_train, x_test, y_train, y_test = train_test_split(dataset, dataset_labels, test_size=0.2, random_state=42)

## Create RTF model

In [145]:
clf = RandomForestClassifier(max_depth=30, random_state=51,verbose=1)
clf.fit(x_train, y_train)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   17.2s


In [147]:
joblib.dump(clf, "rfc_20_0.935.joblib")

['rfc_20_0.935.joblib']

In [120]:
loaded_rf = joblib.load("rfc_20_0.935.joblib")

In [129]:
loaded_rf.predict_proba([x_test[5]])

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


array([[0.97, 0.03, 0.  ]])

In [146]:
loaded_rf.score(x_test,y_test)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s


0.9357179226069247

In [118]:
predicted=clf.predict(x_test)
conf_mat = confusion_matrix(y_test, predicted)
print(conf_mat/sum(conf_mat))

[[0.97302697 0.0443686  0.01400778]
 [0.01978022 0.92293875 0.01712062]
 [0.00719281 0.03269265 0.9688716 ]]


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


In [119]:
clf.feature_importances_

array([3.85156165e-04, 1.22521750e-02, 1.95855626e-02, 1.16515122e-04,
       5.58428702e-04, 4.16945950e-04, 1.29901160e-03, 6.92958449e-04,
       1.61657398e-03, 1.46503604e-03, 3.13636769e-04, 9.30969163e-05,
       5.68056394e-04, 3.13974088e-04, 1.94115784e-04, 4.63171870e-04,
       0.00000000e+00, 3.68465296e-03, 1.90624137e-03, 3.60803302e-05,
       1.44705798e-05, 3.68776233e-03, 8.73370447e-04, 5.28721733e-04,
       2.50605285e-04, 2.93469436e-04, 5.86140814e-04, 5.95447092e-04,
       3.84228841e-02, 2.11506352e-04, 8.80316984e-03, 1.53016510e-03,
       7.58882390e-03, 2.34312016e-04, 2.72877264e-03, 3.01668240e-04,
       7.55720969e-05, 4.37645080e-04, 1.57633905e-04, 8.73035339e-04,
       1.80579657e-05, 7.74734238e-05, 1.42635939e-03, 1.04836610e-03,
       2.07046440e-04, 1.20510112e-03, 1.51162181e-03, 0.00000000e+00,
       1.08352480e-03, 1.62247986e-03, 2.42009723e-03, 7.99467919e-04,
       3.79555983e-04, 1.01005713e-04, 5.34025745e-04, 3.38787733e-04,
      

In [112]:
np.argsort(clf.feature_importances_)

array([167, 147, 100, 101,  47, 106,  57,  66,  16,  61, 121,  97, 105,
       122,  20, 119,  40, 135,  19,  88, 118,  79, 108,  36,  41, 110,
        11,  53, 102,  76,   3, 126,  87,  58, 133,  38, 132,  94,  14,
        44,  72,  29, 104,  33,  24, 116,  59,  65,  95, 112,  25,  35,
        70,  10,  13,  55,  60,  73,  52, 117,   0,   5, 144,  63,  37,
       111,  15,  85,  23,  54,   4,  83,  12, 103,  26,  27, 109, 107,
        84,   7, 131,  99,  91, 148,  90,  51,  62, 124,  39,  22, 130,
       123,  75, 120,  43,  48,  45,  74,  92,  77,   6,  98,  42, 115,
         9,  46,  31,   8,  49,  96,  18,  56,  80, 127,  69,  50,  68,
        71,  34, 129, 134, 159,  17,  21, 113, 114, 128, 149,  81, 157,
       146, 155, 158, 137,  32, 140,  30, 151,  78, 156, 164, 141,  93,
       163,  89, 142,   1,  64, 139, 153,  82,   2, 136, 152, 138, 160,
       162, 161,  86, 166, 145, 165, 154,  28, 143,  67, 125, 150],
      dtype=int64)