In [None]:
import numpy as np
import os
from matplotlib import pyplot as plt
import glob
from collections import Counter
from sklearn.linear_model import LinearRegression
# from lmfit.models import Model
import re
from sklearn.model_selection import train_test_split

import pyconll

from tqdm.notebook import tqdm

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

from sklearn.metrics import confusion_matrix

import pandas as pd

In [None]:
# Keep a random 10% of the lines of the wiki dump and write them to a new file.
# The input is read (and closed) first so that a failed read cannot leave a
# truncated, empty output file behind.
with open('eastern/dump_wiki_hy.txt', 'r', encoding='utf-8') as src:
    wiki_lines = src.readlines()

# test_size=0.9 puts 90% of the lines in the discarded part; the remaining 10%
# ("train" part) is what gets written. random_state pins the selection.
normed_txt, x_test = train_test_split(wiki_lines, test_size=0.9, random_state=42)

with open('hy_wiki_10%.txt', 'w', encoding='utf-8') as f:
    f.writelines(normed_txt)

## UD treebank (CoNLL-U) to plain text

In [None]:
# Relative path of the CoNLL-U training file (resolved against the working directory).
my_conll_file_location = 'hy_bsut-ud-train.conllu'
# Parse the file with pyconll; `train` is iterated sentence by sentence in the next cell.
train = pyconll.load_from_file(my_conll_file_location)

In [None]:
# Write the raw text of every treebank sentence, one sentence per line.
# The encoding is set explicitly: without it the platform default is used,
# which may be unable to encode Armenian text (the other cells use UTF-8 too).
with open('ud_eastern.txt', 'w', encoding='utf-8') as f:
    for sentence in train:
        f.write(sentence.text + '\n')

## Stop-words

In [None]:
def preprocess(sentence):
    """Turn a raw text into a list of lower-cased tokens.

    The listed punctuation marks and ASCII digits are deleted (not replaced by
    a space), newlines are deleted, the text is lower-cased and then split on
    the space character. Empty tokens produced by runs of spaces are dropped.

    Note: because newlines are deleted rather than replaced by a space, words
    on consecutive lines are merged unless a space separates the lines.

    Args:
        sentence: text to tokenize (str).

    Returns:
        list of str tokens, possibly empty.
    """
    # punctuation=['.','-',',','!','?','(','—',')','՞','՛','։','՝','՜','’','«','»','*','\n','=',':','[',']','/',';','․','`','\t','%','$','\xa0','\r','_','●','0','1','2','3','4','5','6','7','8','9']
    punctuation = ['՜', ',', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '-', '—', '։']

    # One translate pass deletes every listed character; lower-casing happens
    # once instead of once per punctuation mark over the whole document.
    delete_table = str.maketrans('', '', ''.join(punctuation))
    cleaned = sentence.translate(delete_table).lower().replace('\n', '')

    # Runs of spaces need no collapsing: they only yield empty tokens, which
    # are filtered out here.
    return [token for token in cleaned.split(' ') if token != '']

In [None]:
%%time
folders = ['eastern', 'western', 'grabar']
worlds_list_dict = {}

for folder in folders:
    files_paths = glob.glob(folder + '/*.txt')
    names = [path.replace('/', ' ')[:15] for path in files_paths]
    files = [' '.join(open(path, 'r', encoding='utf8').readlines()) for path in files_paths]

    words_list = []
    for sentence in files:
        words_list.extend(preprocess(sentence))

    print(words_list.__len__())
    worlds_list_dict[folder] = Counter(words_list)


In [None]:
# Write every 'eastern' token that occurs at least 150 times, one per line,
# most frequent first. UTF-8 is requested explicitly so the Armenian tokens are
# written the same way regardless of the platform's default encoding.
with open('stop_eastern.txt', 'w', encoding='utf-8') as f:
    for word, count in worlds_list_dict['eastern'].most_common():
        # most_common() is sorted by descending count, so the first token below
        # the threshold ends the list.
        if count < 150:
            break
        f.write(str(word) + '\n')

## Load and split data

In [None]:
%%time
folders = ['eastern', 'western', 'grabar']
files_dict = {}

for folder in folders:
    files_paths = glob.glob(folder + '/*.txt')
    files = [' '.join(open(path, 'r', encoding='utf8').readlines()) for path in files_paths]
    files_dict[folder] = files

In [None]:
# Length of the first document in `files`. NOTE(review): `files` is whatever a
# previous cell left bound last — presumably the last folder's documents; confirm.
len(files[0])

In [None]:
%%time

preprocess_files_dict = {}
batch_len = 30

folders = ['eastern', 'western', 'grabar']
tokens_number = np.array([8286090, 4648167, 785605])

tokens_fraction=785606/tokens_number

for i,folder in enumerate(folders):
    files = files_dict[folder]
    batches_list = []

    for file in tqdm(files):
        words = preprocess(file)
        batches = np.array_split(words, np.ceil(words.__len__() / batch_len))
        if i!=2:
            normed_batches, x_test= train_test_split(batches, test_size=1-tokens_fraction[i], random_state=42)
        else:
            normed_batches=batches
        batches_list.extend(normed_batches)

    preprocess_files_dict[folder] = batches_list


In [None]:
# Flatten the per-folder batches into one list of samples plus a parallel list
# of integer labels; the label is the folder's position in `folders`.
dataset_text, dataset_labels = [], []

for label, folder in enumerate(folders):
    folder_batches = preprocess_files_dict[folder]
    n_batches = len(folder_batches)
    print(n_batches)
    dataset_text.extend(folder_batches)
    dataset_labels.extend(np.full(n_batches, label))

In [None]:
# One row per batch: the batch's tokens and its integer label.
df = pd.DataFrame(
    {
        'text': dataset_text,
        'labels': dataset_labels,
    }
)

In [None]:
# Save the batch/label table as CSV; index=False leaves the row index out of the file.
df.to_csv('mwa_mea_grabar_30.csv',index=False)

## Lexical (stop-words) descriptors

In [None]:
# Stop-word sets used by the lexical descriptors, one per class. They are
# written directly as sets; some words appear in more than one set.
western_stop = {
    'ենք', 'էի', 'թ', 'ին', 'մենք', 'որոնք', 'պիտի', 'և', 'որպեսզի', 'վրայ', 'կ՚', 'կը', 'մը', 'մըն',
    'անոր', 'ալ', 'ան', 'քեզ', 'եթէ', 'թէ', 'որպէս',
}

grabar_stop = {
    'դու', 'եք', 'ըստ', 'նա', 'պիտի', 'վրայ', 'զի', 'ընդ', 'քո', 'քեզ', 'եթէ', 'թէ', 'որպէս',
}

eastern_stop = {
    'դու', 'ենք', 'եք', 'էի', 'ըստ', 'ին', 'հետո', 'մենք', 'մեջ', 'նա', 'նաև', 'նրա', 'նրանք', 'որը',
    'որոնք', 'որպես', 'ում', 'վրա', 'և', 'որպեսզի',
}

In [None]:
def get_lexical_desc(words, stop_sets=None):
    """Compute the share of each stop-word set that occurs in `words`.

    For every stop-word set the value is
    ``len(set(words) & stop_set) / len(stop_set)``, a float between 0 and 1.
    True division is used for all sets; floor division would give 0 unless
    every stop word of the set is present.

    Args:
        words: iterable of tokens.
        stop_sets: optional iterable of stop-word collections to use instead of
            the module-level ``western_stop``, ``grabar_stop`` and
            ``eastern_stop`` (in that order).

    Returns:
        tuple of floats, one per stop-word set; with the default sets this is
        ``(intersect_western, intersect_grabar, intersect_eastern)``.
    """
    if stop_sets is None:
        stop_sets = (western_stop, grabar_stop, eastern_stop)

    # Build the vocabulary of the sample once instead of once per stop-word set.
    vocabulary = set(words)

    shares = []
    for stop_words in stop_sets:
        stop_words = set(stop_words)
        shares.append(len(vocabulary & stop_words) / len(stop_words))
    return tuple(shares)

## Morphemic descriptors

In [None]:
# Morpheme inventories used by the morphemic descriptors. Each entry is matched
# as a plain substring of a token, so an entry must not contain characters that
# a token cannot contain: entries carrying stray spaces, commas or hyphens were
# cleaned up because they could never match.
grabar_suffixes = ['աւք', 'եալ', 'եան', 'իւք', 'ոյց', 'ովք', 'ուց', 'ուցան']
grabar_prefixes = ['ապա', 'արտ', 'բաղ', 'բաղա', 'դեր', 'ենթ', 'ենթա', 'ընթա', 'համ', 'համա', 'հան', 'հոմ', 'հոմա',
                   'տար', 'տարա']

eastern_suffixes = ['աբար', 'ագին', 'ագույն', 'ածո', 'ածու', 'ական', 'ակերտ', 'ային', 'անակ', 'անի', 'անոց', 'անք',
                    'ապան', 'ապանակ', 'ապատ', 'ապես', 'աստան', 'ավետ', 'ավուն', 'արան', 'արար', 'արեն', 'արք', 'ացի',
                    'ացն', 'ացու', 'բան', 'բար', 'գին', 'գույն', 'եղեն', 'ենի', 'երեն', 'երորդ', 'եցն', 'լիկ', 'կերտ',
                    'կոտ', 'մունք', 'յալ', 'յակ', 'յան', 'յանց', 'յուն', 'ներ', 'նոց', 'ոնք', 'ովին', 'որդ',
                    'որեն', 'ոցի', 'ուք', 'պան', 'պանակ', 'ստան', 'ված', 'վածք', 'ավոր', 'վոր', 'ություն', 'ուլ', 'ուկ',
                    'ուհի', 'ում', 'ույթ', 'ույր', 'ուն', 'ուտ', 'ուրդ', 'ուց']
# NOTE(review): 'նախա' was fused into the suffix entry 'յուն նախա-'; it is
# listed next to 'նախ' here on the assumption that it is a prefix — confirm.
eastern_prefixes = ['ամենա', 'այսր', 'անդր', 'ապա', 'ավտո', 'արտ', 'արտա', 'բենզա', 'գեր', 'գերա', 'դեր', 'ենթա',
                    'եվրա', 'էլեկտրա', 'թեր', 'թերա', 'կենս', 'կինո', 'հակ', 'հակա', 'համ', 'համա', 'հար', 'հարա',
                    'հեռա', 'հեռուստա', 'հոմա', 'մակ', 'մակրո', 'միկրո', 'միջ', 'նախ', 'նախա', 'ներ', 'ստոր', 'վեր',
                    'վերա', 'տար', 'տարա', 'փոխ', 'քառ', 'քառա']

# 'ւ' and 'յե' are separate entries; a misplaced comma used to concatenate
# them into the single string 'ւ,յե'.
western_reform = ['իլ', 'իուն', 'եան', 'յ', 'օ', 'է', 'յ', 'վո', 'ոյ', 'եա', 'եօ', 'իւ', 'ու', 'ւ', 'յե', 'եյ', 'զի',
                  'եւ', 'ել', 'յուն', 'յան', 'ում', 'ո', 'ե', 'հ', 'ո', 'ույ', 'յա', 'յո', 'յու', 'վ', 'ե', ]

# Merge all inventories into the feature list. Several morphemes occur in more
# than one list (or twice in the same list); dict.fromkeys keeps the first
# occurrence of each, in order, so no feature column is computed twice.
morphems = list(dict.fromkeys(
    grabar_suffixes
    + grabar_prefixes
    + eastern_suffixes
    + eastern_prefixes
    + western_reform
))


In [None]:
def get_morphemic_desc(words, morphema):
    """Standard deviation of the relative positions of `morphema` in `words`.

    For each word that contains `morphema`, the relative position is
    ``(index of first occurrence + 1) / len(word)``. Words that do not contain
    it are ignored. If no word contains it, the result is the standard
    deviation of ``[0, 0, 0]``, i.e. 0.

    Args:
        words: iterable of str tokens.
        morphema: substring to look for.

    Returns:
        numpy float: ``np.std`` of the collected relative positions.
    """
    # (first-occurrence index, word length) per word; index is -1 on a miss.
    hits = ((word.find(morphema), len(word)) for word in words)
    relative_positions = [(index + 1) / size for index, size in hits if index != -1]

    if not relative_positions:
        relative_positions = [0, 0, 0]

    # Alternative descriptor left unused: np.mean(relative_positions).
    return np.std(relative_positions)

## Create dataset

In [None]:
# Lexical descriptors: one tuple per sample.
dataset_desc_lexical = [get_lexical_desc(sample) for sample in tqdm(dataset_text)]

# Morphemic descriptors: one inner list per morpheme holding that morpheme's
# value for every sample (morpheme-major layout).
dataset_desc_morphemic = [
    [get_morphemic_desc(sample, morph) for sample in dataset_text]
    for morph in tqdm(morphems)
]

In [None]:
# Convert both descriptor collections to arrays. The morphemic one is built
# morpheme-major, so its first two axes are swapped to put samples first.
dataset_desc_lexical = np.array(dataset_desc_lexical)
dataset_desc_morphemic = np.array(dataset_desc_morphemic).swapaxes(0, 1)

In [None]:
# Feature matrix: morphemic columns first, lexical columns after them.
feature_blocks = (dataset_desc_morphemic, dataset_desc_lexical)
dataset = np.concatenate(feature_blocks, axis=1)
dataset.shape

In [None]:
# Hold out 20% of the samples for evaluation; random_state pins the split.
x_train, x_test, y_train, y_test = train_test_split(
    dataset,
    dataset_labels,
    test_size=0.2,
    random_state=42,
)

## Create Random Forest model

In [None]:
# Random forest with depth-limited trees; random_state pins the result and
# verbose=1 makes scikit-learn report progress.
forest_params = dict(max_depth=15, random_state=51, verbose=1)
clf = RandomForestClassifier(**forest_params)
clf.fit(x_train, y_train)

In [None]:
# Score of the fitted classifier on the held-out split.
clf.score(x_test,y_test)

In [None]:
# Confusion matrix of the held-out split, printed with every column scaled by
# its own total (summing over axis 0 gives one total per column).
predicted = clf.predict(x_test)
conf_mat = confusion_matrix(y_test, predicted)
column_totals = conf_mat.sum(axis=0)
print(conf_mat / column_totals)

In [None]:
# Per-feature importances of the fitted forest, in the column order of `dataset`.
clf.feature_importances_