In [368]:
import requests
import json
import codecs
import pandas as pd
from pandasql import sqldf
import csv
import numpy as np

In [322]:
class Texter:
    def __init__(self, text):
        self.left = text[:text.find("<")]
        self.right = text[text.find(">")+1:]
        self.match = text[text.find("<")+1:text.find(">")]

In [323]:
def findincorpus(link, query):
    queryfull = {"query":query,"corpus":"main","meta":"","init":False,"contextSize":1}
    res = requests.post(link, data = json.dumps(queryfull))
    res  = res.text
    return res

In [324]:
def parseCols(item):
    ans = {}
    while(True):
        ans[item[1:item.find(" ")]] = item[item.find(" ") + 1:item.find(">")]
        item = item[item.find(">") + 1:]
        if(len(item) == 0):
            return ans

In [325]:
def parseRes(lines):
    res = []
    for line in lines:
        num = line[:line.find(":")].split()[0]
        text = Texter(line[line.find(">:") + 4:])
        colsAndValues = line[line.find("<"):line.find(">:") + 1]
        cmap = parseCols(colsAndValues)
        cmap['num'] = num
        cmap['Left context'] = text.left
        cmap['Match'] = text.match
        cmap['Right context'] = text.right
        cmap['Context'] = text.left + text.match + text.right
        res.append(cmap)
    return res

In [326]:
def readContext(data):
    res = []
    for item in data:
        symbol = item.find('/')
        string = ''
        while (symbol != -1):
            spaceIndex = item[symbol:].find(' ')
            if spaceIndex == -1:
                break
            string += item[:symbol]
            item = item[symbol+spaceIndex:]
            symbol = item.find('/')
        res.append(string)
    return res

In [327]:
def getAccentDict():
    a = []
    with open('accent.csv', newline='\n', encoding = 'utf-8') as f:
        reader = csv.reader(f)
        for row in reader:
            if row[0] not in ("м", "ж", "мо", "м//мо", "мо//м"):
                help_str = ""
                for s in row:
                    help_str += str(s)
                a.append(help_str)
    d = dict()
    for i in a:
        if i[0] not in ("1", "2", "3", "4", "5", "6") and len(i)>1:
            key = i
        else:
            value = ''
            for s in i:
                if s in ('c', 'd', 'e'):
                    value +=s
            if d.get(key) != None:
                d.update({key: d.get(key) + value})
            else:
                d.update({key: value})
            
            
    return d

In [328]:
def cleanTable(data):
    lemas = []
    anim = []
    gender = []
    finale = []
    nsyll = []
    prep = []
    context = []
    i = 0
    for cont in data['Left context']:
        c = str(cont).split()
        p = ''
        for w in c[:-5:-1]:
            if 'PREP' in w:
                p = w[0:w.find("/")].lower()
                break
        if p in ['на', 'в', 'во', 'о', 'об', 'при']:
            prep.append(p)
        else:
            data = data.drop([i])
        i += 1
    for word in data['Match']:
        word = str(word)
        tags = word[word.find("/")+1:].split(',')
        lemas.append(word[word.rfind("/")+1:]) #лемма
        anim.append(tags[1]) #одушевленность
        gender.append(tags[2]) #род
        if lemas[-1][-1] not in 'уеыаоэяиюёь':
            finale.append(lemas[-1][-1])
        elif lemas[-1][-1] in 'уыаоэ':
            finale.append(lemas[-1][-2])
        elif lemas[-1][-2] in 'уеыаоэяиюёь':
            finale.append('й')
        else:
            finale.append(lemas[-1][-2]+"'") # исход основы

        n = 0
        for letter in word[word.rfind("/")+1:-1]:
            if letter in 'уеыаоэяиюё':
                n += 1
        nsyll.append(n)# количество слогов в основе
        words = []
    for item in data['Match']:
        words.append(item[:item.find('/')])
    context = readContext(data['Context'])
    return pd.DataFrame({'word': words,
                       'lema': lemas,
                     'animacy': anim,
                       'gender': gender,
                       'finale': finale,
                       'n_syll': nsyll,
                       'preposition': prep,
                        'context': context
                      })

In [364]:
def isFinaleSoft(table):
    a = np.array(table['finale'].values.tolist())
    soft = []
    for i in a:
        if "'" in i:
            soft.append(0)
        else:
            soft.append(1)
    table['soft0'] = soft
    return table

In [329]:
link = 'http://lingconlab.ru/khislavichi/backend/get_results.php'
query_y = "[(word='.*(у|ю)'%c)& (tag = 'NOUN,(anim|inan),(masc|neut),sing,(loct|loc2|datv).*'%c)]"
query_e = "[(word='.*е'%c)& (tag = 'NOUN,(anim|inan),(masc|neut),sing,loct.*'%c)]"

In [330]:
lines_y = findincorpus(link, query_y)
lines_e = findincorpus(link, query_e)

In [331]:
data_y = cleanTable(pd.DataFrame(parseRes(lines_y.splitlines())))
data_e = cleanTable(pd.DataFrame(parseRes(lines_e.splitlines())))

In [332]:
data_e.to_csv('khislavichi_data_e.csv')
data_y.to_csv('khislavichi_data_y.csv')

In [333]:
data_e = pd.read_csv ("khislavichi_data_e_clean.csv", sep = ';')
data_y = pd.read_csv ("khislavichi_data_y_clean.csv", sep = ';')

In [334]:
d = getAccentDict()
lemas = data_e['lema'].tolist()
accent = []
for lema in lemas:
    if lema in d.keys():
        accent.append(1)
    else:
        accent.append(0)
data_e['accent'] = accent

lemas = data_y['lema'].tolist()
accent = []
for lema in lemas:
    if lema in d.keys():
        accent.append(1)
    else:
        accent.append('0')
data_y['accent'] = accent

In [335]:
qwe = data_e.value_counts('lema').to_frame().join(data_y.value_counts('lema').to_frame(), on='lema', how='outer', lsuffix='e_num', rsuffix='y_num')

In [336]:
qwe['0e_num'] = qwe['0e_num'].fillna(0)
qwe['0y_num'] = qwe['0y_num'].fillna(0)

In [337]:
qwe['perc'] = qwe['0y_num']/(qwe['0e_num']+qwe['0y_num'])

In [338]:
feat_e = data_e[['lema', 'animacy', 'gender', 'finale', 'n_syll', 'accent']]
feat_y = data_y[['lema', 'animacy', 'gender', 'finale', 'n_syll', 'accent']]

In [365]:
feat_e.loc[feat_e['animacy']=='inan', 'animacy'] = 1
feat_e.loc[feat_e['animacy']=='anim', 'animacy'] = 0
feat_y.loc[feat_y['animacy']=='inan', 'animacy'] = 1
feat_y.loc[feat_y['animacy']=='anim', 'animacy'] = 0

feat_e.loc[feat_e['gender']=='masc', 'gender'] = 1
feat_e.loc[feat_e['gender']=='neut', 'gender'] = 0
feat_y.loc[feat_y['gender']=='masc', 'gender'] = 1
feat_y.loc[feat_y['gender']=='neut', 'gender'] = 0

feat_e = isFinaleSoft(feat_e)
feat_y = isFinaleSoft(feat_y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table['soft0'] = soft


In [367]:
feat_y.to_csv('khislavichi_feat_y.csv')
feat_e.to_csv('khislavichi_feat_e.csv')

In [339]:
sqldf("select count(word) from data_e where data_e.animacy == \'inan\' and data_e.gender == \'masc\' and data_e.n_syll == 1 and data_e.accent == 1")

Unnamed: 0,count(word)
0,78


In [340]:
sqldf("select count(word) from data_y where data_y.animacy == \'inan\' and data_y.gender == \'masc\' and data_y.n_syll == 1 and data_y.accent == 1")

Unnamed: 0,count(word)
0,132


In [341]:
132/(132+78)

0.6285714285714286

In [342]:
pd.set_option('display.max_colwidth', None)

In [343]:
data_y['preposition'].value_counts()

в     146
на     41
во      4
Name: preposition, dtype: int64

In [371]:
data_e['preposition'].value_counts()

в      453
на     178
во      25
при     12
Name: preposition, dtype: int64

In [344]:
print(data_y[data_y['gender']=='neut'][['lema', 'context']])

         lema  \
85  канаршино   

                                                                    context  
85  Там же сейчас , говорили , два дворы тоже осталось , в Канаршину ходили  


In [345]:
print(data_y[data_y['animacy']=='anim'][['lema', 'context']])

      lema  \
45   кабан   
108    бык   

                                                                             context  
45   Ну , к = чтоб кабан этот ( этый ) , он ( ён ) ж на сало , сало на кабану растёт  
108                                      Двенадцать лет . Я на быку бороновала землю  


In [346]:
print(len(data_y[data_y['n_syll']>1]['lema']))

33


In [347]:
feat = pd.concat([feat_e.drop_duplicates(),feat_y.drop_duplicates()], axis=0)

In [348]:
final_table = sqldf("select qwe.*, feat.animacy, feat.gender, feat.finale, feat.n_syll from qwe join feat on qwe.lema = feat.lema")

In [349]:
import numpy as np
a = np.array(final_table['finale'].values.tolist())
soft = []
for i in a:
    if "'" in i:
        soft.append(0)
    else:
        soft.append(1)
final_table['soft0'] = soft
final_table = final_table.drop_duplicates()

In [350]:
final_table[final_table['lema'] == 'лес']

Unnamed: 0,lema,0e_num,0y_num,perc,animacy,gender,finale,n_syll,soft0
188,лес,0.0,29.0,1.0,inan,masc,с,1,1


In [351]:
final_table.loc[final_table['animacy']=='inan', 'animacy'] = 1

In [352]:
final_table.loc[final_table['animacy']=='anim', 'animacy'] = 0

In [353]:
final_table.loc[final_table['gender']=='masc', 'gender'] = 1
final_table.loc[final_table['gender']=='neut', 'gender'] = 0

In [354]:
final_table[final_table['lema']=='рот']

Unnamed: 0,lema,0e_num,0y_num,perc,animacy,gender,finale,n_syll,soft0
73,рот,2.0,1.0,0.333333,1,1,т,1,1


In [355]:
final_table.to_csv('khislavichi_final_table.csv')

http://lingconlab.ru/malinino/?#!/
http://lingconlab.ru/vaduga/?#!/
http://lingconlab.ru/opochka/?#!/
http://lingconlab.ru/lukhteza/?#!/
http://lingconlab.ru/nekhochi/?#!/
ustja [(word = ".+(у|ю)") & (tag = "N.(m|n)s(d|l).")]

In [356]:
def print_summary(num):
    sum_data = {'counter' : final_table[num],'animacy' : final_table[num] * final_table['animacy'], 'soft' : final_table['soft0'] * final_table[num], 'gender' : final_table[num] * final_table['gender'], 'n_syll' : final_table[num] * final_table['n_syll']}
    sum = pd.DataFrame(sum_data)
    sum = sum[sum_data['counter'] > 0]
    animcay_perc = sum['animacy'].sum() / sum['counter'].sum()
    gender_perc = sum['gender'].sum() / sum['counter'].sum()
    n_syll_mean = sum['n_syll'].sum() / sum['counter'].sum()
    soft = sum['soft'].sum() / sum['counter'].sum()
    forms_perc = final_table[num].sum() / (final_table['0e_num'].sum() + final_table['0y_num'].sum())
    forms = final_table[num].sum()
    print("animacy (inan): ", animcay_perc)
    print("gender (masc): ", gender_perc)
    print("n_syll: ", n_syll_mean)
    print("forms: {0}, forms_perc: {1}".format(forms, forms_perc))
    print("soft: ", soft)

In [357]:
print_summary('0e_num')

animacy (inan):  0.9671150971599403
gender (masc):  0.796711509715994
n_syll:  1.7608370702541105
forms: 669.0, forms_perc: 0.7770034843205574
soft:  0.8565022421524664


In [358]:
print_summary('0y_num')

animacy (inan):  0.984375
gender (masc):  0.9947916666666666
n_syll:  1.2395833333333333
forms: 192.0, forms_perc: 0.2229965156794425
soft:  0.9947916666666666


In [359]:
a = 'лоб, ров, лёд, рот, горб, пруд, полк, сук, скит, плот, пост, штифт, кол, торг, ад, чад, бред, под, рай, мел, мол, пыл, плен, жар, мир, яр, цвет, рант, быт, вольт, пах, пух, плац, хмель, ток, порт, мозг, долг, луг, круг, зад, сад, след, год, род, баз, глаз, паз, низ, воз, бой, строй, бок, ток, бал, вал, пол, тыл, кон, пар, бор, лес, нос, пот, борт, форт, мост, шкаф, верх, хлев, зоб, гроб, дуб, шаг, снег, лог, стог, мёд, ход, ряд, таз, пай, край, чай, слой, рой, шёлк, корм, дым, чан, суп, жир, пир, хор, смотр, сыр, час, грунт, спирт, мех, цех, газ, клей, гной, сок, стан, мыс, свет, тиф, шлях, мох крюк, перёд, холод, отпуск, терем, ветер, уголок, мысок, забытьё, берег, повод, бережок, бочок, угол, аэропорт, бег, век, вес, вид, ворот, день, дом, дух, корень, лад, лёт, плав, род, скак, счёт, ход, юр'
dicts = a.split(sep = ', ')
words = final_table[final_table['0y_num'] > 0]['lema']
diff = set(words).difference(set(dicts))
diff

{'банк',
 'большак',
 'бугор',
 'бык',
 'бычок',
 'взгорок',
 'горлач',
 'гребень',
 'двор',
 'кабан',
 'канаршино',
 'куст',
 'кут',
 'малинник',
 'мешок',
 'особняк',
 'плуг',
 'полик',
 'полушубок',
 'посёлок',
 'потолок',
 'свинарник',
 'собняк',
 'станок',
 'столб',
 'сундук',
 'танк',
 'язык'}

In [360]:
words_e = final_table[final_table['0e_num'] > 0]['lema']
diffe = set(words_e).intersection(set(dicts))
diffe

{'вид',
 'день',
 'дом',
 'дух',
 'круг',
 'лоб',
 'повод',
 'рот',
 'суп',
 'угол',
 'уголок'}

In [361]:
len(diff)

28

In [369]:
lemas_y = final_table[final_table['0y_num'] > 0]['lema']
len(set(lemas_y))

50

In [370]:
both = final_table[final_table['0y_num'] * final_table['0e_num'] > 0]['lema']
only_y = (set(lemas_y)).difference(both)
lemas_e = final_table[final_table['0e_num'] > 0]['lema']
only_e = (set(lemas_e)).difference(both)
print('only -e: ', len(only_e))
print('only -y: ', len(only_y))
print('both -y and -e: ', len(both))

only -e:  159
only -y:  36
both -y and -e:  15


In [363]:
map16 = {'counter': final_table['0y_num'] * final_table['animacy'] * final_table['gender'] * (final_table['n_syll'] > 1), 'lema': final_table['lema']}
map16 = pd.DataFrame(map16)
map16.loc[map16['counter'] == False, 'counter'] = 0
map16.loc[map16['counter'] == True, 'counter'] = 1
map16[map16['counter'] == 1]
map16.to_csv('map16.csv')