In [1]:
import os
import pandas as pd
import numpy as np

import gensim
from sklearn.decomposition import PCA

import opencorpora
from pymystem3 import Mystem
from many_stop_words import get_stop_words

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


In [2]:
def lemm_and_filter(text, min_len=3, stopwords=get_stop_words('ru'), stem=Mystem(entire_input=False)):
    """Lemmatize ``text`` and return the surviving lemmas as one space-separated string.

    A lemma is kept only when it is absent from ``stopwords`` and is strictly
    longer than ``min_len`` characters (with the default of 3, a lemma needs
    at least 4 characters to survive).

    The ``stopwords`` and ``stem`` defaults are evaluated once, when this
    function is defined, so every call relying on them reuses the same
    stop-word collection and the same ``Mystem`` instance.
    """
    kept = (
        lemma
        for lemma in stem.lemmatize(text)
        if lemma not in stopwords and len(lemma) > min_len
    )
    return ' '.join(kept)

## OpenCorpora parsing

In [None]:
# Build one row per OpenCorpora document: title, raw text, categories and the
# lemmatized/filtered text produced by lemm_and_filter.
reader = opencorpora.CorpusReader('tmp/annot.opcorpora.xml')

# Documents whose raw text is shorter than this many characters are skipped.
MIN_RAW_TEXT_LEN = 100

records = []
for doc in reader.iter_documents():
    # Fetch the raw text once: it is needed for the length filter, the 'text'
    # column and the lemmatization step.
    raw_text = doc.raw()
    if len(raw_text) < MIN_RAW_TEXT_LEN:
        continue
    records.append({
        'title': doc.title(),
        'text': raw_text,
        'categories': doc.categories(),
        'lemmas': lemm_and_filter(raw_text),
    })

oc = pd.DataFrame(records)

In [None]:
# Learn phrase (collocation) statistics from the tokenized lemmas, freeze them
# into a Phraser, and store the phrase-merged text in a new 'phrased' column.
lemma_tokens = oc['lemmas'].str.split()
phrase_stats = gensim.models.phrases.Phrases(sentences=lemma_tokens)
phraser = gensim.models.phrases.Phraser(phrase_stats)
oc['phrased'] = [' '.join(phraser[tokens]) for tokens in oc['lemmas'].str.split()]

In [None]:
# Cache the parsed corpus as a gzip-compressed CSV (row index not written) so
# the later cells can reload it without re-parsing the XML.
oc.to_csv(
    path_or_buf='tmp/opencorpora.csv.gz',
    index=False,
    compression='gzip',
)

## Calculate vectors

In [4]:
# Reload the cached OpenCorpora frame, then the four legal corpora, each from
# its own gzip-compressed CSV.
oc = pd.read_csv('tmp/opencorpora.csv.gz', compression='gzip')
criminal_code, criminal_court_orders, civil_code, civil_court_orders = (
    pd.read_csv(csv_path, compression='gzip')
    for csv_path in (
        'tmp/vectors/criminal_code.csv.gz',
        'tmp/vectors/criminal_court_orders.csv.gz',
        'tmp/vectors/civil_code.csv.gz',
        'tmp/vectors/civil_court_orders.csv.gz',
    )
)

In [5]:
# Label every row of each corpus with the corpus it came from, so the origin
# survives once the frames are concatenated.
for corpus_label, corpus_frame in (
    ('opencorpora', oc),
    ('criminal_code', criminal_code),
    ('criminal_court_orders', criminal_court_orders),
    ('civil_code', civil_code),
    ('civil_court_orders', civil_court_orders),
):
    corpus_frame['source'] = corpus_label

In [6]:
# Give every article of both codes a display name built from its number and
# title, stored in a new 'name' column.
for code_frame in (criminal_code, civil_code):
    code_frame['name'] = code_frame.apply(
        lambda article: "Ст.{} {}".format(article['article_number'], article['article_name']),
        axis=1,
    )

In [7]:
# Civil court orders get a synthetic display name: a fixed prefix followed by
# the row's zero-based position.
civil_court_orders['name'] = list(
    map("Гражд. дело {}".format, range(len(civil_court_orders)))
)

In [8]:
# Stack all corpora into one frame with the columns index / name / phrased / source.
# 'index' is each row's position inside its own corpus; frames whose display
# name lives in 'title' have that column renamed to 'name' (the rename is a
# no-op for the others). The combined frame gets a fresh 0..n-1 index.
df = pd.concat(
    [
        corpus_frame
        .reset_index()[['index', label_column, 'phrased', 'source']]
        .rename(columns={'title': 'name'})
        for corpus_frame, label_column in (
            (oc, 'title'),
            (criminal_code, 'name'),
            (criminal_court_orders, 'title'),
            (civil_code, 'name'),
            (civil_court_orders, 'name'),
        )
    ],
    ignore_index=True,
)

In [9]:
def td(row):
    """Wrap one row of ``df`` as a TaggedDocument.

    The words are the whitespace-split 'phrased' text and the single tag is
    the row's index label.
    """
    return gensim.models.doc2vec.TaggedDocument(
        words=row['phrased'].split(),
        tags=[row.name],
    )


# One TaggedDocument per row of df, in row order.
documents = list(df.apply(td, axis=1))

In [10]:
# Dump the tokenized 'phrased' texts to a line-sentence text file, in df row
# order, so the Doc2Vec training cell can stream them from disk.
gensim.utils.save_as_line_sentence(
    corpus=df['phrased'].str.split(),
    filename='tmp/phrased.txt',
)

In [None]:
# Train Doc2Vec on the line-sentence file written from df['phrased'];
# TaggedLineDocument streams that file as the training corpus.
model = gensim.models.doc2vec.Doc2Vec(
    documents=gensim.models.doc2vec.TaggedLineDocument('tmp/phrased.txt'),
    vector_size=300,  # dimensionality of each document vector
    epochs=500,       # training passes over the corpus
    workers=4,        # worker threads
)

In [None]:
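# %%time
# model = gensim.models.doc2vec.Doc2Vec()
# model.build_vocab(documents)
# model.train(documents, total_examples=model.corpus_count, epochs=500)
#
# Disabled alternative: trains a default-configured Doc2Vec on the in-memory
# `documents` list (explicit build_vocab + train) instead of streaming
# 'tmp/phrased.txt' as the active training cell does.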

In [17]:
# Store each document's learned vector as a plain Python list in a new 'vectors' column.
# NOTE(review): this assumes the model's document-vector order matches df's row order
# (the training file was written from df['phrased'] in row order) — confirm.
df['vectors'] = model.docvecs.vectors_docs.tolist()

In [24]:
# Persist the trained Doc2Vec model.
# makedirs(exist_ok=True) creates the target directory (and any missing parent)
# in one step and is a no-op when it already exists, avoiding the
# check-then-create race of `os.path.exists` followed by `os.mkdir`.
os.makedirs('tmp/doc2vec', exist_ok=True)
model.save('tmp/doc2vec/all_texts.model')

In [25]:
# Persist the combined frame (including the 'vectors' column) as gzip-compressed CSV.
# makedirs(exist_ok=True) creates the target directory (and any missing parent)
# in one step and is a no-op when it already exists, avoiding the
# check-then-create race of `os.path.exists` followed by `os.mkdir`.
os.makedirs('tmp/vectors', exist_ok=True)
df.to_csv('tmp/vectors/all.csv.gz', index=False, compression='gzip')

In [None]:
# Project the document vectors onto two principal components and keep each row's
# 2-D coordinates as a list in a new 'pca' column.
# NOTE(review): no random_state is passed to PCA; if a randomized solver ends up
# being selected, the coordinates may differ between runs — confirm.
df['pca'] = PCA(n_components=2).fit_transform(df['vectors'].tolist()).tolist()

In [None]:
# Side-by-side overview of the 2-D document embedding: a scatter plot per source
# on the left, a bivariate KDE per source on the right (axes are shared).
METHOD = 'pca'  # df column holding the 2-D coordinates; 'tsne' was the noted alternative
DISPLAY = ['civil_court_orders', 'criminal_court_orders', 'opencorpora', 'criminal_code', 'civil_code']

fig, (ax1, ax2) = plt.subplots(
    nrows=1, ncols=2,
    figsize=(11, 4), dpi=300,
    sharex='all', sharey='all',
)

for corpus in df['source'].unique():
    if corpus not in DISPLAY:
        continue
    # (n_docs, 2) array of coordinates for this source
    coords = np.array(df.loc[df['source'] == corpus, METHOD].tolist())
    xs, ys = coords[:, 0], coords[:, 1]
    ax1.scatter(xs, ys, marker='.', label=corpus, alpha=.4)
    sns.kdeplot(xs, ys, ax=ax2, legend=False, shade_lowest=False)
    # Disabled annotation: ax2.text(-5, 8, "Court Orders", size=10, color='red')

ax1.legend()
# The coordinates carry no interpretable units, so hide ticks and tick labels.
for axis in (ax1, ax2):
    axis.tick_params(axis='both', which='both', left=False, bottom=False, labelleft=False, labelbottom=False)

plt.tight_layout()
plt.autoscale()
plt.show()

In [None]:
# 2x2 figure of the 2-D document embedding, one colour per source:
#   ax1 - scatter of a random sample of documents per source
#   ax2 - bivariate KDE contours over all documents of each source
#   ax3 - univariate KDE of the first coordinate
#   ax4 - univariate KDE of the second coordinate
METHOD = 'pca'#'tsne'
DISPLAY = ['civil_court_orders', 'criminal_court_orders', 'opencorpora', 'criminal_code', 'civil_code']
SAMPLE_SIZE = 200
SAMPLE_SEED = 0  # fixed seed so the scatter sample is the same on every run

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(15, 12), dpi=300)
disable_ticks = lambda x: x.tick_params(axis='both', which='both', left=False, bottom=False, 
                                        labelleft=False, labelbottom=False)

# (line/scatter colour, matching contour colormap) handed out to sources in order.
colors_iter = iter([
    ('red', 'Reds'),
    ('orange', 'Oranges'),
    ('green', 'Greens'), 
    ('gray', 'Greys'), 
    ('purple', 'Purples'), 
    ('blue', 'Blues'), 
])

for source_name in df['source'].unique():
    if source_name not in DISPLAY:
        continue

    # Filter once and reuse for both the full and the sampled coordinates.
    source_rows = df[df['source'] == source_name]
    coords = np.array(source_rows[METHOD].tolist())

    # Clamp the sample size: DataFrame.sample raises if asked for more rows
    # than the source has (sampling is without replacement).
    sampled_rows = source_rows.sample(
        min(SAMPLE_SIZE, len(source_rows)),
        random_state=SAMPLE_SEED,
    )
    coords_sample = np.array(sampled_rows[METHOD].tolist())

    color, colormap = next(colors_iter)
    ax1.scatter(coords_sample[:, 0], coords_sample[:, 1], label=source_name, c=color)
    sns.kdeplot(coords[:, 0], coords[:, 1], ax=ax2, shade=False, shade_lowest=False, cmap=colormap)
    # Label the curve so that ax3.legend() below has entries to show.
    sns.kdeplot(coords[:, 0], ax=ax3, color=color, label=source_name)
    sns.kdeplot(coords[:, 1], ax=ax4, color=color)

ax1.legend()
ax3.legend()
disable_ticks(ax1)
disable_ticks(ax2)
plt.tight_layout()
plt.autoscale()
plt.show()