In [1]:
import glob
import os
import pandas as pd
import re
import string
from pymystem3 import Mystem
import regex
from tqdm import tqdm_notebook as tqdm
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected = True)
from IPython.display import display, Markdown
pd.options.mode.chained_assignment = None


In [2]:
# Read every file under root_path into memory, one string per book.
root_path = './data/randombooks/'
files = os.listdir(root_path)
texts = []
files_names = []

for i, file in enumerate(files, start=1):
    path_to_file = os.path.join(root_path, file)
    try:
        # UTF-8 is requested explicitly so the result does not depend on the
        # platform's default encoding.
        with open(path_to_file, 'r', encoding='utf-8') as book:
            text = book.read()
    except UnicodeDecodeError:
        # Fall back to cp1251. Bytes that cp1251 cannot decode are replaced
        # with U+FFFD, so these files still need a manual check.
        # ('coerce' is not a valid codec error handler; 'replace' is.)
        print('check encoding for index ' + str(i - 1))
        with open(path_to_file, 'r', encoding='cp1251', errors='replace') as book:
            text = book.read()
    # Newlines become spaces in both branches so that the last word of a line
    # is not glued to the first word of the next one.
    texts.append(text.replace('\n', ' '))
    files_names.append(file)
    if i % 100 == 0:
        print(str(i) + ' files are read!')

check encoding for index 3
check encoding for index 4
check encoding for index 29


In [3]:
# One row per book: the raw text and the name of the file it came from.
texts_df = pd.DataFrame(dict(texts=texts, files_names=files_names))

In [4]:
# Parse the idiom tables from the Russian Wiktionary list of phraseologisms.
wiki_dfs = pd.read_html('https://ru.wiktionary.org/wiki/%D0%9F%D1%80%D0%B8%D0%BB%D0%BE%D0%B6%D0%B5%D0%BD%D0%B8%D0%B5:%D0%A1%D0%BF%D0%B8%D1%81%D0%BE%D0%BA_%D1%84%D1%80%D0%B0%D0%B7%D0%B5%D0%BE%D0%BB%D0%BE%D0%B3%D0%B8%D0%B7%D0%BC%D0%BE%D0%B2_%D1%80%D1%83%D1%81%D1%81%D0%BA%D0%BE%D0%B3%D0%BE_%D1%8F%D0%B7%D1%8B%D0%BA%D0%B0', header=0)
# The first parsed table is skipped; every later table contributes only its
# 'Фразеологизм' column.
wiki_idioms = pd.concat([table[['Фразеологизм']] for table in wiki_dfs[1:]])
wiki_idioms.columns = ['idioms']

In [156]:
# Read the list of Russian idioms from file and extend it with the idioms
# parsed from Wiktionary.
idioms_dict = pd.read_csv('./data/idioms_dict.csv')
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
idioms_dict = pd.concat([idioms_dict, wiki_idioms], sort=False)
# Rows without an idiom cannot be lower-cased or joined into one string later.
idioms_dict = idioms_dict.dropna(subset=['idioms'])
idioms_dict['idioms'] = idioms_dict['idioms'].str.lower()

In [6]:
def preprocess_texts(text, lemmatizer=None):
    '''Lemmatize a text and reduce it to lower-case words separated by single spaces.

    The lemmas are joined with spaces and lower-cased. Latin letters, digits,
    a fixed set of separators (slash, newline, plus, em dash, ellipsis,
    guillemets, ``<...>`` tags) and ASCII punctuation are then removed, and
    every run of whitespace is collapsed into one space.

    Parameters
    ----------
    text : str
        Raw input text.
    lemmatizer : object, optional
        Anything with a ``lemmatize(str)`` method returning a list of strings.
        Defaults to a new ``Mystem()``; pass an existing instance to avoid
        creating a new one on every call.

    Returns
    -------
    str
        The cleaned text. Leading/trailing whitespace is collapsed to a single
        space but not stripped.
    '''
    if lemmatizer is None:
        lemmatizer = Mystem()
    text = ' '.join(lemmatizer.lemmatize(text)).lower()
    # Drop Latin letters (one regex pass instead of a regex call per character).
    text = re.sub(r'[A-Z]', '', text, flags=re.I)
    text = ''.join(ch for ch in text if not ch.isdigit())
    text = re.sub(r'/|\n|\+|—|…|<.*?>|«|»', ' ', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Whitespace is collapsed last: removing punctuation that was surrounded by
    # spaces would otherwise leave double spaces in the result.
    return re.sub(r'\s+', ' ', text)

In [211]:
# 'следующийтекст' is a delimiter: preprocessing works faster on one large
# text than on many small ones, so all idioms are glued into a single string.
string_idioms = ' следующийтекст '.join(idioms_dict['idioms'].tolist())

In [212]:
# Preprocess all idioms in one pass, then cut the result back into one piece
# per idiom at the delimiter.
string_idioms = preprocess_texts(string_idioms)
lem_idioms = string_idioms.split(" следующийтекст ")

In [157]:
# Attach the lemmatized form of every idiom and turn it into a regex pattern.
if len(lem_idioms) != len(idioms_dict):
    raise ValueError(
        'got {} lemmatized idioms for {} dictionary rows; the delimiter was '
        'probably altered by preprocessing'.format(len(lem_idioms), len(idioms_dict))
    )
idioms_dict['lem_idioms'] = lem_idioms
# strip() rather than rstrip(): a leading space would otherwise become a
# leading \W+ that the pattern requires before the first word.
idioms_dict['lem_idioms'] = idioms_dict['lem_idioms'].str.strip()
# regex=True is explicit because the pandas default is not the same in every
# version. The replacement is a re.sub template, where a literal backslash
# must be written as a doubled backslash; the resulting text is \W+ .
idioms_dict['lem_idioms'] = idioms_dict['lem_idioms'].str.replace(r'\s+', r'\\W+', regex=True)
idioms_dict = idioms_dict.reset_index(drop=True)

In [158]:
# Body-part words from file, extended with "язык" (tongue).
body_parts_table = pd.read_csv('./data/body.parts.complete.csv')
body_parts = [*body_parts_table['body_parts'], "язык"]

In [159]:
# Keep only the idioms whose lemmatized form contains at least one body-part
# word. This is a plain regex alternation, i.e. a substring match without
# word boundaries.
body_parts_pattern = '|'.join(body_parts)
mentions_body_part = idioms_dict['lem_idioms'].str.contains(body_parts_pattern)
idioms_dict = idioms_dict[mentions_body_part]

In [160]:
# Remove rows that are exact duplicates across all columns, keeping the first.
idioms_dict = idioms_dict.drop_duplicates(keep='first')

In [161]:
# Persist the filtered idiom dictionary without the index column.
idioms_dict.to_csv(path_or_buf='./data/lem_idioms_dict.csv', index=False)

In [213]:
# Preprocess every book, showing a progress bar over the texts.
clean_texts = [preprocess_texts(raw_text) for raw_text in tqdm(texts)]

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))

In [214]:
# Store the cleaned texts next to the raw ones and write the table to disk.
texts_df = texts_df.assign(clean_texts=clean_texts)
texts_df.to_csv(path_or_buf='./data/sample_texts_df_detcorpus.csv', index=False)

In [215]:
def get_matches(files_names, clean_texts, patterns, show_progress=True):
    '''Find which regex patterns occur in which texts.

    Parameters
    ----------
    files_names : iterable of str
        Name of the file each text came from; paired with ``clean_texts``
        position by position.
    clean_texts : iterable of str
        Texts to search in.
    patterns : iterable of str
        Regular expressions, searched with the standard ``re`` module.
    show_progress : bool, optional
        Wrap the loop over texts in a tqdm progress bar (default True).

    Returns
    -------
    (list, list)
        ``matched_files`` and ``matched_idioms``: two parallel lists with one
        entry per (text, pattern) pair in which the pattern was found at least
        once. ``matched_idioms`` holds the pattern strings as passed in.

    Raises
    ------
    re.error
        If any pattern is not a valid regular expression.
    '''
    # Compile once up front: the set of patterns is the same for every text,
    # and materializing it also makes a generator argument safe to reuse.
    # The stdlib re module is used on purpose: per the original author's note
    # it is faster than the third-party regex package for exact matching.
    compiled_patterns = [(pattern, re.compile(pattern)) for pattern in patterns]

    pairs = zip(files_names, clean_texts)
    if show_progress:
        # zip() has no length, so give tqdm the total explicitly when known.
        try:
            total = min(len(files_names), len(clean_texts))
        except TypeError:
            total = None
        pairs = tqdm(pairs, total=total)

    matched_files = []
    matched_idioms = []
    for file_name, clean_text in pairs:
        for pattern, compiled in compiled_patterns:
            if compiled.search(clean_text):
                matched_idioms.append(pattern)
                matched_files.append(file_name)
    return matched_files, matched_idioms

In [216]:
# Search every cleaned text for every lemmatized idiom pattern.
matched_files, matched_idioms = get_matches(
    files_names=files_names,
    clean_texts=clean_texts,
    patterns=list(idioms_dict['lem_idioms']),
)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [217]:
# Same search, with the bare body-part words used as the patterns.
matched_files_body, matched_body_parts = get_matches(
    files_names=files_names,
    clean_texts=clean_texts,
    patterns=body_parts,
)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [227]:
# One row per (file, idiom pattern) match.
matched_idioms_df = pd.DataFrame(dict(files=matched_files, lem_idioms=matched_idioms))
# Number of files each idiom pattern was found in, most frequent first.
files_per_idiom = matched_idioms_df.groupby('lem_idioms')['files'].count()
matched_idioms_stat = files_per_idiom.reset_index().sort_values('files', ascending=False)
# Take the ten most frequent patterns, re-sort them in ascending order and
# bring back the original idiom text from the dictionary.
matched_idioms_top = (
    matched_idioms_stat
    .head(10)
    .sort_values('files')
    .merge(idioms_dict, on='lem_idioms', how='left')
)

In [228]:
def plot_bar_chart(x, y):
    '''Draw a horizontal plotly bar chart in the notebook.

    ``x`` is passed to ``go.Bar`` as the bar values and ``y`` as the bar
    labels. The figure has a fixed 800x500 size and a wide left margin.
    Returns whatever ``iplot`` returns.
    '''
    bar_trace = go.Bar(x=x, y=y, orientation='h')
    margins = dict(l=300, r=20, b=100, t=100, pad=4)
    chart_layout = go.Layout(autosize=False, width=800, height=500, margin=margins)
    chart = go.Figure(data=[bar_trace], layout=chart_layout)
    return iplot(chart, filename='horizontal-bar')

In [229]:
# Bar chart of the top idioms: number of files per idiom.
plot_bar_chart(x=matched_idioms_top['files'], y=matched_idioms_top['idioms'])

In [192]:
# Number of matched idiom patterns per text. Texts without any match do not
# appear in matched_idioms_df and are therefore not part of these statistics.
idioms_per_text = matched_idioms_df.groupby('files')['lem_idioms'].count()

mean_text_out = 'Average num of idioms per text:'
max_text_out = 'Maximum num of idioms in text:'

if idioms_per_text.empty:
    # No matches at all: mean()/max() would be NaN and int() would fail.
    mean_out = '0'
    max_out = '0'
else:
    mean_out = str(int(round(idioms_per_text.mean(), 0)))
    # The original passed 0 as a second argument to str(), which raises
    # TypeError; the maximum is already an integer.
    max_out = str(int(idioms_per_text.max()))

display(Markdown('{} **{}**'.format(mean_text_out, mean_out)))
display(Markdown('{} **{}**'.format(max_text_out, max_out)))

Average num of idioms per text: **10**

In [225]:
# One row per (file, body-part word) match.
matched_body_parts_df = pd.DataFrame(dict(files=matched_files_body, body_part=matched_body_parts))
# Number of files each body-part word was found in, most frequent first.
files_per_body_part = matched_body_parts_df.groupby('body_part')['files'].count()
matched_body_parts_stat = files_per_body_part.reset_index().sort_values('files', ascending=False)
# Ten most frequent words, re-sorted in ascending order.
matched_body_parts_top = matched_body_parts_stat.head(10).sort_values('files')

In [231]:
# Display the table of the top body-part words and their file counts.
matched_body_parts_top

Unnamed: 0,body_part,files
32,нога,30
45,рот,30
24,ладонь,30
25,лицо,30
26,лоб,30
35,нос,30
36,палец,30
38,плечо,30
46,рука,30
21,кожа,30
