In [None]:
import numpy as np
import os
from matplotlib import pyplot as plt
import glob
from collections import Counter
from sklearn.linear_model import LinearRegression
from lmfit.models import Model
import re

import pyconll


## Stop-words

In [None]:
def preprocess(sentence):
    """
    Lower-case a text, strip punctuation/digits and split it into words.

    :param sentence: str, raw text (may span several lines)
    :return: list of non-empty lower-cased tokens
    """
    # Characters that are deleted outright (Armenian and Latin punctuation, digits).
    punctuation=['՜',',','.','0','1','2','3','4','5','6','7','8','9',':','-','—','։']
    drop_table = str.maketrans({char: None for char in punctuation})

    text = sentence.lower().translate(drop_table)

    # Treat line breaks as word separators. Simply deleting them (as before)
    # left doubled spaces behind for blank lines / trailing spaces, which then
    # surfaced as empty-string "words" after splitting.
    text = text.replace('\n', ' ')

    # Splitting on a single space and discarding empties collapses any run of
    # spaces and ignores leading/trailing spaces.
    return [token for token in text.split(' ') if token]

In [None]:
# Load the CoNLL-U file with pyconll; `train` is iterated sentence by sentence
# (and token by token) in the following cell.
my_conll_file_location = 'western/hyw_armtdp-ud-train.conllu'
train = pyconll.load_from_file(my_conll_file_location)

In [None]:
# Write every token's surface form from `train` to a plain-text file, one per line.
# The encoding is set explicitly: the text is Armenian and every reader in this
# notebook opens its files as utf8, so the platform default must not be relied on.
with open('ud_western.txt','w',encoding='utf8') as f:

    for sentence in train:
        for token in sentence:
            # presumably pyconll represents an empty FORM field as None - skip
            # such tokens instead of failing on `None + '\n'`.
            if token.form is None:
                continue
            f.write(token.form+'\n')

In [None]:
%%time

# Build a word-frequency Counter for each corpus folder.
# Both folders are processed: worlds_list_dict is reset here on every run and
# later cells read worlds_list_dict['western'] as well as ['eastern'].
# 'eastern' stays last so the loop leftovers (names, files, words_list) are unchanged.
folders=['western','eastern']
worlds_list_dict={}


for folder in folders:
    files_paths=glob.glob(folder+'/*.txt')
    names=[path.replace('/',' ')[:15] for path in files_paths]

    # Read each file inside a `with` block so the handle is closed right away.
    # Lines are joined with a space, as preprocess() expects.
    files=[]
    for path in files_paths:
        with open(path,'r',encoding='utf8') as fh:
            files.append(' '.join(fh.readlines()))

    words_list=[]
    for sentence in files:
        words_list.extend(preprocess(sentence))

    # Total number of tokens in this folder.
    print(len(words_list))
    worlds_list_dict[folder]=Counter(words_list)


In [None]:
# Save the eastern stop-word list: every word seen at least 150 times, one per line.
# utf8 is set explicitly because the words are Armenian and the corpus files
# are read as utf8 elsewhere in the notebook.
with open('stop_eastern.txt','w',encoding='utf8') as f:
    # most_common() is sorted by descending count, so the first word below
    # the threshold ends the scan. `line` is a (word, count) pair.
    for line in worlds_list_dict['eastern'].most_common():
        word, count = line
        if count < 150:
            break
        f.write(str(word)+'\n')

In [None]:
# Show the western word frequencies, most frequent first.
# NOTE(review): raises KeyError unless the counting cell was run with 'western'
# in `folders` (that cell rebuilds worlds_list_dict from scratch on each run).
# NOTE(review): this displays the whole vocabulary; consider most_common(N).
worlds_list_dict['western'].most_common()

In [None]:
# NOTE(review): debugging leftover. `line` is the loop variable left over from
# the stop-word cell (a (word, count) pair from Counter.most_common()), so this
# shows the count at which that loop stopped; NameError if the loop never ran.
line[1]

In [None]:
# Load the three corpora (western, eastern, grabar) and tokenise every text.
# Results are parallel lists ordered west -> east -> grabar:
#   names       - short label per file (used as plot titles)
#   words_list  - token list per file
#   words_count - number of tokens per file
west_folder='western/'
east_folder='eastern/'
grabar_folder='grabar/'


def _read_joined(path):
    """Return the file's lines joined by a single space; the handle is closed on exit."""
    with open(path,'r',encoding='utf8') as fh:
        return ' '.join(fh.readlines())


# glob returns paths in arbitrary (file-system dependent) order; sort them so
# that the panel order of the plots is reproducible between runs and machines.
west_files_paths=sorted(glob.glob(west_folder+'/*.txt'))
east_files_paths=sorted(glob.glob(east_folder+'/*.txt'))
grabar_files_paths=sorted(glob.glob(grabar_folder+'/*.txt'))

all_files_paths=west_files_paths+east_files_paths+grabar_files_paths
names=[path.replace('/',' ')[:15] for path in all_files_paths]

west_files=[_read_joined(path) for path in west_files_paths]
east_files=[_read_joined(path) for path in east_files_paths]
grabar_files=[_read_joined(path) for path in grabar_files_paths]

# Tokenise each text once and derive the counts from the result, instead of
# running preprocess() twice per file.
words_list=[preprocess(text) for text in west_files+east_files+grabar_files]
words_count=[len(words) for words in words_list]

# west_files

In [None]:
def lin_regr_approx(x, y):
    """
    Approximate the points (x, y) with a straight line y = k*x + b.

    :param x: list (n,)
    :param y: list (n,)
    :return: (x_pred, y_pred), k, b, angle, score
        x_pred, y_pred - 50 evenly spaced points on the fitted line, spanning
                         [min(x), max(x)], each of shape (50, 1)
        k, b           - slope and intercept
        angle          - slope angle in degrees
        score          - value returned by LinearRegression.score on (x, y)
    """
    # scikit-learn expects column vectors.
    x = np.asarray(x).reshape((-1, 1))
    y = np.asarray(y).reshape((-1, 1))

    # Empty input: fall back to a single point at the origin. Both arrays are
    # patched - replacing only x (as before) left y empty, so fit() still
    # failed on the mismatching sample counts.
    if x.size == 0:
        x = np.zeros((1, 1))
    if y.size == 0:
        y = np.zeros((1, 1))

    x_pred = np.linspace(x.min(axis=0), x.max(axis=0), 50)

    reg = LinearRegression().fit(x, y)
    y_pred = reg.predict(x_pred)

    k = reg.coef_[0][0]
    # The intercept is the model's prediction at x = 0.
    b = reg.intercept_[0]

    angle = np.rad2deg(np.arctan(k))
    score = reg.score(x, y)

    return (x_pred, y_pred), k, b, angle, score

In [None]:
def gaussian(x, mu, sigma, amp=1):
    """
    Evaluate a normal (Gaussian) curve with the given parameters.

    :param x: list or array (n,)
    :param mu: float, centre of the curve
    :param sigma: float, standard deviation
    :param amp: float, area under the curve (1 gives the normal pdf)
    :return: numpy array (n,)
    """
    # Convert up front so plain Python lists work too; `x - mu` raises
    # TypeError for a list.
    x = np.asarray(x, dtype=float)
    return np.array((amp / (np.sqrt(2 * np.pi) * sigma)) * np.exp(-(x - mu) ** 2 / (2 * sigma ** 2)))

In [None]:
def gaussian_fit(x, y, mu=5, sigma=1, amp=1):
    """
    Fit the `gaussian` curve to the given points with lmfit.

    :param x: list (n,)
    :param y: list (n,)
    :param mu: float, initial guess for the centre
    :param sigma: float, initial guess for the standard deviation
    :param amp: float, initial guess for the amplitude
    :return: mu, sigma, amp - the fitted parameter values
    """
    fit_result = Model(gaussian).fit(y, x=x, mu=mu, sigma=sigma, amp=amp)
    fitted = fit_result.params

    return fitted['mu'].value, fitted['sigma'].value, fitted['amp'].value

In [None]:
def find_positions(words,find_values):
    """
    Collect the 1-based position of the first occurrence of each pattern in each word.

    :param words: iterable of str
    :param find_values: iterable of substrings to look for (a plain str is
                        iterated character by character)
    :return: list of int, one entry per (word, pattern) pair where the pattern
             occurs; positions are absolute, not normalised by word length
    """
    # str.find gives a 0-based index, or -1 when the pattern is absent.
    hits = (word.find(value) for word in words for value in find_values)
    return [index + 1 for index in hits if index != -1]

In [None]:
def _position_histogram(pos, min_probab):
    """Return (edges, hist): probability of each integer position, cut at the first rare one."""
    pos = np.asarray(pos)
    if len(pos) == 0:
        return np.array([]), np.array([])

    low, high = np.floor(pos.min()), np.ceil(pos.max())

    # One unit-wide bin per integer position: edges low, low+1, ..., high+1.
    # np.histogram closes only the last bin on the right, so ending the edges
    # at `high` would count positions high-1 and high together in one bin.
    bins = np.arange(low, high + 2)
    hist, edges = np.histogram(pos, bins=bins, density=True)
    edges = edges[:-1]  # keep the left edge of each bin, i.e. the position itself

    # Keep only the leading run of positions whose probability reaches min_probab.
    rare = np.flatnonzero(hist < min_probab)
    max_chars = rare[0] if len(rare) else len(hist)
    hist = hist[:max_chars]
    edges = edges[:max_chars]

    # Replace interior zeros by the mean of their neighbours.
    for idx in range(1, len(hist) - 1):
        if hist[idx] == 0:
            hist[idx] = np.mean([hist[idx - 1], hist[idx + 1]])

    return edges, hist


def print_pos(positions,find_value,log=False,linear=False,min_probab=0.01, gauss=False):
    """
    Plot, on a 3x3 grid, the distribution of pattern positions for up to nine texts.

    Reads the notebook globals `names` (panel titles) and `words_count`
    (tokens per text); both must be ordered like `positions`.

    :param positions: list of position lists, one per text (see find_positions)
    :param find_value: searched patterns; the first one is shown in the title
    :param log: plot (and fit) the natural log of the probabilities
    :param linear: overlay a linear-regression fit (lin_regr_approx)
    :param min_probab: the histogram is cut at the first position whose
                       probability is below this value
    :param gauss: overlay a Gaussian fit (gaussian_fit)
    :return: None; the figure is shown with plt.show()
    """
    fig,axes=plt.subplots(3,3,figsize=(20,12))

    # Draw only as many panels as there is data for; indexing past the end of
    # positions / names / words_count used to raise IndexError.
    n_panels=min(len(positions),len(names),len(words_count))

    for step,ax in enumerate(axes.flat):
        if step>=n_panels:
            ax.set_visible(False)
            continue

        pos=positions[step]
        edges,hist=_position_histogram(pos,min_probab)
        has_points=len(edges)>0

        sum_scatter=sum(hist)
        legend=f'all words={len(pos)}\n words count={words_count[step]}\nscatter sum={round(sum_scatter,2)}'
        if log:
            hist=np.log(hist)

        # Fits are skipped when the histogram is empty - there is nothing to fit.
        if linear and has_points:
            (x_pred, y_pred), k, _b, _angle, score=lin_regr_approx(edges,hist)
            ax.plot(x_pred,y_pred)
            legend+=f'\nk={round(k,2)}\nscore={round(score,2)}'

        if gauss and has_points:
            mu, sigma, amp=gaussian_fit(edges, hist)
            x_gauss=np.linspace(1,max(edges))
            y_gauss=gaussian(x_gauss, mu=mu, sigma=sigma, amp=amp)
            ax.plot(x_gauss,y_gauss)
            legend+=f'\nmu={round(mu,2)}\nsigma={round(sigma,2)}\namp={round(amp,2)}'

        if not has_points:
            # Draw a single point at the origin so the panel is not blank.
            edges=[0]
            hist=[0]

        ax.plot(edges,hist)
        ax.scatter(edges,hist,c='green')
        ax.set_title(str(step)+' '+find_value[0]+' '+names[step])

        ax.legend([legend])
        ax.set_ylabel('p (x) ')
        ax.set_xlabel('char position in word')

    fig.subplots_adjust(wspace=0.15,hspace=0.45)
    plt.show()

In [None]:
# 'է': plain position histogram, no log scale and no fitted curve.
find_value = ['է']
log = linear = gauss = False
min_probab = 0.000001

positions = [find_positions(tokens, find_value) for tokens in words_list]

print_pos(positions, find_value, log=log, linear=linear, min_probab=min_probab, gauss=gauss)

In [None]:
# 'ել' / 'եւ': position histogram with a Gaussian fit overlaid.
find_value = ['ել', 'եւ']
log = linear = False
gauss = True
min_probab = 0.000001

positions = [find_positions(tokens, find_value) for tokens in words_list]

print_pos(positions, find_value, log=log, linear=linear, min_probab=min_probab, gauss=gauss)

In [None]:
# 'օ' / 'o': position histogram with a linear fit overlaid.
find_value = ['օ', 'o']
log = False
linear = True
min_probab = 0.0001

positions = [find_positions(tokens, find_value) for tokens in words_list]

print_pos(positions, find_value, log=log, linear=linear, min_probab=min_probab)

In [None]:
# 'եան': log-probabilities with a linear fit overlaid.
find_value = ['եան']
log = linear = True
min_probab = 0.001

positions = [find_positions(tokens, find_value) for tokens in words_list]

print_pos(positions, find_value, log=log, linear=linear, min_probab=min_probab)

In [None]:
# 'իլ' / 'իւ': plain position histogram, no log scale and no fitted curve.
find_value = ['իլ', 'իւ']
log = linear = gauss = False
min_probab = 0.000001

positions = [find_positions(tokens, find_value) for tokens in words_list]

print_pos(positions, find_value, log=log, linear=linear, min_probab=min_probab, gauss=gauss)

In [None]:
# 'ել' / 'եւ': log-probabilities with a linear fit overlaid.
find_value = ['ել', 'եւ']
log = linear = True
min_probab = 0.00001

positions = [find_positions(tokens, find_value) for tokens in words_list]

print_pos(positions, find_value, log=log, linear=linear, min_probab=min_probab)

In [None]:
# 'կը': plain position histogram with print_pos defaults.
find_value = ['կը']
log = False

positions = [find_positions(tokens, find_value) for tokens in words_list]

print_pos(positions, find_value, log=log)

In [None]:
# 'մը': plain position histogram with print_pos defaults.
find_value = ['մը']
log = False

positions = [find_positions(tokens, find_value) for tokens in words_list]

print_pos(positions, find_value, log=log)

In [None]:
# Ratio plot on a 2x2 grid for the first four entries of `positions`.
# NOTE(review): find_value is a plain str here, so find_positions iterates over
# its characters and searches each letter separately; the other cells pass a
# list such as ['ել'] - confirm which is intended.
find_value='ել'

positions=[find_positions(words,find_value) for words in words_list]

fig,axes=plt.subplots(2,2,figsize=(7,7))
# NOTE(review): this overwrites the global `names` (which print_pos reads) with
# only four labels. The labels assume steps 0-1 are western and 2-3 eastern
# files, but `positions` follows words_list (west + east + grabar), so that
# holds only when there are exactly two western files.
names=[f'west_pos, {west_files_paths[0][8:18]}',f'west_pos, {west_files_paths[1][8:18]}',
       f'east_pos, {east_files_paths[0][8:18]}',f'east_pos, {east_files_paths[1][8:18]}']

step=0
bins=10
for i,axes_list in enumerate(axes):
    for j,ax in enumerate(axes_list):
        # ax.hist(positions[step],bins=bins,density=True)
        # NOTE(review): norm is the SUM of the positions, not their count.
        norm=sum(positions[step])
        # NOTE(review): the bin edges span [0, 1], but find_positions returns
        # absolute 1-based positions, so only values equal to 1 can fall inside
        # (the normalised variant is commented out in find_positions).
        y, x = np.histogram(positions[step], bins=np.linspace(0,1,bins))
        x=x[:-1]  # left edge of every bin
        y=y/norm
        # y=y/y_il_list[step]
        # NOTE(review): y_il_list is not defined anywhere in the visible part of
        # the notebook (NameError on a fresh kernel unless defined elsewhere);
        # empty bins in y also make this a division by zero.
        y=y_il_list[step]/y
        ax.plot(x,y)
        ax.scatter(x,y,c='green')
        ax.set_title(names[step])
        step+=1


plt.show()