In [1]:
!pip install stanza
!pip install pymorphy2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stanza
  Downloading stanza-1.5.0-py3-none-any.whl (802 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m802.5/802.5 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting emoji (from stanza)
  Downloading emoji-2.5.0.tar.gz (355 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m355.8/355.8 kB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.5.0-py2.py3-none-any.whl size=351211 sha256=aa7ec17259a860a58ef8f1a0251052918fc0fd079e9c736098aac2c7e6665f63
  Stored in directory: /root/.cache/pip/wheels/2a/0c/2c/07b5af72b120503fe24590691d24c462a25e5e530db8700a96
Successfully built emoji
Installing collected packages: emoji, stanza
Successfully in

In [2]:
import warnings
warnings.filterwarnings("ignore")
%config Completer.use_jedi = False

import pandas as pd
from tqdm import tqdm
import stanza
import json
import pymorphy2
from nltk.tokenize import sent_tokenize

In [3]:
def chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [4]:
def load_stop_words():
    stopwords = []
    path_to_file = "/content/Stopwords.txt"
    with open(path_to_file, "r", encoding="utf-8") as fl:
        for line in fl:
            stopwords.append(line.strip("\n"))
    return stopwords

In [5]:
def norm_form(morph, word):
    return morph.parse(word)[0].normal_form

## Load preprocessors

In [6]:
morph = pymorphy2.MorphAnalyzer(lang="ru")

In [7]:
nlp = stanza.Pipeline(lang='ru', processors='tokenize,pos,lemma,ner,depparse', use_gpu = True)

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

Downloading https://huggingface.co/stanfordnlp/stanza-ru/resolve/v1.5.0/models/tokenize/syntagrus.pt:   0%|   …

Downloading https://huggingface.co/stanfordnlp/stanza-ru/resolve/v1.5.0/models/pos/syntagrus.pt:   0%|        …

Downloading https://huggingface.co/stanfordnlp/stanza-ru/resolve/v1.5.0/models/lemma/syntagrus.pt:   0%|      …

Downloading https://huggingface.co/stanfordnlp/stanza-ru/resolve/v1.5.0/models/depparse/syntagrus.pt:   0%|   …

Downloading https://huggingface.co/stanfordnlp/stanza-ru/resolve/v1.5.0/models/ner/wikiner.pt:   0%|          …

Downloading https://huggingface.co/stanfordnlp/stanza-ru/resolve/v1.5.0/models/forward_charlm/newswiki.pt:   0…

Downloading https://huggingface.co/stanfordnlp/stanza-ru/resolve/v1.5.0/models/pretrain/fasttextwiki.pt:   0%|…

Downloading https://huggingface.co/stanfordnlp/stanza-ru/resolve/v1.5.0/models/backward_charlm/newswiki.pt:   …

Downloading https://huggingface.co/stanfordnlp/stanza-ru/resolve/v1.5.0/models/pretrain/syntagrus.pt:   0%|   …

INFO:stanza:Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |
| pos       | syntagrus |
| lemma     | syntagrus |
| depparse  | syntagrus |
| ner       | wikiner   |

INFO:stanza:Using device: cuda
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


## Load data

In [8]:
stopwords = load_stop_words()

## Data was loaded from source:   https://gorod.mos.ru/

In [10]:
df = pd.read_csv('/content/temp.csv', sep="$")

### Clearing & Filtering data

In [11]:
df["lens"] = df["message"].apply(lambda x: len(str(x)))

In [12]:
df_filtered = df[df["lens"] > 50]

In [13]:
df_filtered.shape

(56698, 3)

In [14]:
df_filtered["theme_value"].value_counts()

2    24021
0    16878
1    15799
Name: theme_value, dtype: int64

In [15]:
df_filtered.head()

Unnamed: 0,message,theme_value,lens
1,По адресу Снежная д24 расположена музыкальная ...,0,358
2,После проведения работ на кабельной канализаци...,0,127
3,Очистите опору освещения. Приведите в надлежащ...,0,59
4,Более двух недель лежит куча грунта в перемешк...,1,58
5,"Осколки бордюрного камня спрятаны за дерево, в...",1,68


In [16]:
df_0 = df_filtered[df_filtered["theme_value"]==0]
df_1 = df_filtered[df_filtered["theme_value"]==1]
df_2 = df_filtered[df_filtered["theme_value"]==2]

## Tokenizing

In [17]:
full_corpus = df_0["message"].values

In [18]:
try:
    sentences = [sent for corp in full_corpus for sent in sent_tokenize(corp, language="russian")]
except:
    import nltk
    nltk.download('punkt')
    sentences = [sent for corp in full_corpus for sent in sent_tokenize(corp, language="russian")]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [19]:
long_sents = [i for i in sentences if len(i) > 20]

In [20]:
len(long_sents), len(sentences)

(32713, 36960)

## Get Triplets

In [21]:
%%time
triplets = []
for s in tqdm(long_sents):
    doc = nlp(s)
    for sent in doc.sentences:
            entities = [ent.text for ent in sent.ents]
            res_d = dict()
            temp_d = dict()
            for word in sent.words:
                temp_d[word.text] = {"head": sent.words[word.head-1].text, "dep": word.deprel, "id": word.id}
            for k in temp_d.keys():
                nmod_1 = ""
                nmod_2 = ""
                if (temp_d[k]["dep"] in ["nsubj", "nsubj:pass"]) & (k in entities):
                    res_d[k] = {"head": temp_d[k]["head"]}

                    for k_0 in temp_d.keys():
                        if (temp_d[k_0]["dep"] in ["obj", "obl"]) &\
                           (temp_d[k_0]["head"] == res_d[k]["head"]) &\
                            (temp_d[k_0]["id"] > temp_d[res_d[k]["head"]]["id"]):
                            res_d[k]["obj"] = k_0
                            break

                    for k_1 in temp_d.keys():
                        if (temp_d[k_1]["head"] == res_d[k]["head"]) & (k_1 == "не"):
                            res_d[k]["head"] = "не "+res_d[k]["head"]

                    if "obj" in res_d[k].keys():
                        for k_4 in temp_d.keys():
                            if (temp_d[k_4]["dep"] =="nmod") &\
                               (temp_d[k_4]["head"] == res_d[k]["obj"]):
                                nmod_1 = k_4
                                break

                        for k_5 in temp_d.keys():
                            if (temp_d[k_5]["dep"] =="nummod") &\
                               (temp_d[k_5]["head"] == nmod_1):
                                nmod_2 = k_5
                                break
                        res_d[k]["obj"] = res_d[k]["obj"]+" "+nmod_2+" "+nmod_1

            if len(res_d) > 0:
                triplets.append([s, res_d])

100%|██████████| 32713/32713 [56:05<00:00,  9.72it/s]

CPU times: user 55min 15s, sys: 13.5 s, total: 55min 29s
Wall time: 56min 5s





In [22]:
clear_text = lambda x: "".join(i if (i.isdigit()) | (i.isalpha()) | (i in [" "]) else " " for i in x )

clear_triplets = dict()
for tr in triplets:
    for k in tr[1].keys():
        if "obj" in tr[1][k].keys():
            ## clear_text убрать, если не нужна очистка предложений
            clear_triplets[clear_text(tr[0])] =  [k, tr[1][k]['head'], tr[1][k]['obj']]

In [23]:
for_df = []
for k in clear_triplets.keys():
    for_df.append([k]+clear_triplets[k])

## Create DF for prepare

In [24]:
df_triplets = pd.DataFrame(for_df, columns=["full_sent", "subject", "verb", "object"])
df_triplets.shape

(590, 4)

In [25]:
df_triplets["subj_n_f"] = df_triplets["subject"].apply(lambda x: norm_form(morph, x))
df_triplets["obj_n_f"] = df_triplets["object"].apply(lambda x: norm_form(morph, x))

In [26]:
df_triplets.head(5)

Unnamed: 0,full_sent,subject,verb,object,subj_n_f,obj_n_f
0,МОЭСК во время проведения аварийно восстановит...,МОЭСК,вскрыло,покрытие,моэск,покрытие
1,Электрики всячески оттягивают выполнение работ,Электрики,оттягивают,выполнение работ,электрик,выполнение работа
2,Ямы находятся на траектории движения во двор,Ямы,находятся,траектории движения,ям,траектории движение
3,Волга очень давно стоит на одном месте тем са...,Волга,стоит,месте,волга,месте
4,Четыре недели назад прокладывая коммуникации в...,Стрелецкая,засыпали,щебней,стрелецкий,щебней


In [27]:
df_filtered = df_triplets[(~df_triplets["subj_n_f"].isin(stopwords)) &\
                          (~df_triplets["obj_n_f"].isin(stopwords))].sort_values(by="obj_n_f", ascending=False, ignore_index=True)

In [28]:
df_filtered.shape

(569, 6)

In [29]:
df_filtered.head(3)

Unnamed: 0,full_sent,subject,verb,object,subj_n_f,obj_n_f
0,На парковке магазина Мираторг впервые размести...,Мираторг,разместили,Ярмарку дня,мираторг,ярмарку дний
1,Мусор сваливается в ямы потом его засыпают зе...,Мусор,сваливается,ямы,мусор,ямы
2,Асфальт вокруг трашеи в дырах и вмятинах nВ...,Красноказарменная,не закапывают,ямы,красноказарменный,ямы


## Split data on chunks

In [30]:
groups = list(chunks(df_filtered["obj_n_f"].unique(), 100))
len(groups)

5

In [31]:
gr_num = 0
df_for_draw = df_filtered[df_filtered["obj_n_f"].isin(groups[gr_num])]

In [32]:
nodes = pd.unique(df_for_draw[["subj_n_f", "obj_n_f"]].values.ravel("K"))

In [33]:
nodes.shape

(198,)

## Get edges & edges info

In [34]:
df_d_d = df_for_draw.drop_duplicates(subset=["subj_n_f", "obj_n_f", "verb"])[["subj_n_f", "obj_n_f", "verb", "full_sent"]]

In [35]:
df_d_d.shape, df_for_draw.shape

((139, 4), (140, 6))

In [36]:
info_dict = dict()
label_dict = dict()
for cc, raw in enumerate(df_d_d.values):
    info_dict[(raw[0], raw[1])] = {f"sent_{cc}": raw[3]}
    label_dict[(raw[0], raw[1])] = raw[2]

In [37]:
word_num = dict()
for c, word in enumerate(nodes):
    word_num[word] = c+1

## Draw Graph

In [38]:
header_text = r"""<!doctype html>
                  <html lang="ru">
                    <head>
                    <meta charset="utf-8"/>
                    <title>Interractive graph</title>
                    <style>
                      .parent > div {
                        background: #eee;
                        float: left;
                        margin-right: 2%;
                        padding: 10px;
                        border: 1px solid #ccc;
                        -webkit-box-sizing: border-box;
                        -moz-box-sizing: border-box;
                        box-sizing: border-box;
                      }
                      #all_information{
                        overflow-y:scroll;
                        overflow-x:scroll;
                      }
                      #mynetwork {
                          width: 1600px;
                          height: 800px;
                          border: 1px solid lightgray;
                      }
                      table, td,th {
                        border: 1px solid black;
                        border-collapse: collapse;
                      }
                    </style>
                    </head>
                    <body>

                    <div class="parent">
                        <div id="mynetwork"></div>
                        <div id="all_information"></div>
                    </div>

                    <link rel="stylesheet" type="text/css" href="https://cdnjs.cloudflare.com/ajax/libs/vis/4.21.0/vis.min.css">
                    <script src="https://visjs.github.io/vis-network/standalone/umd/vis-network.min.js"></script>
                    <script>"""

tail_text = """
      var container = document.getElementById('mynetwork');
      var data = {
        nodes: nodes,
        edges: edges
      };
      var options = {
        "physics":{
          "barnesHut":{
            "gravitationalConstant": -4000,
            "springConstant": 0.006,
            "damping":0.2
          },
          "repulsion":{"nodeDistance":300}
        }
      };

    var network = new vis.Network(container, data, options);
    var res_container = document.getElementById('all_information');

    network.on( 'click', function(properties) {
        var clickedNodes = nodes.get(properties.nodes);
        var clickedEdges = edges.get(properties.edges);
        var clickedObject;

        if (clickedNodes.length==0 & clickedEdges.length==0){
            console.log("")
            } else {
                    clickedObject = clickedEdges[0]
                    var total_sum_by_doctype = clickedObject['info'];
                    var tsbd_labels = Object.keys(total_sum_by_doctype);
                    var tsbd_values = Object.values(total_sum_by_doctype);
                    createTable(tsbd_labels, tsbd_values);
                    }
    });

    function createTable(labels, values){
      var table = document.getElementById('text_information');
      if (table != null)
      {
        table.remove(table);
        table = document.createElement('table');
        table.setAttribute('id', 'text_information');
        res_container.appendChild(table);
      }
      else{
        table = document.createElement('table');
        table.setAttribute('id', 'text_information');
        res_container.appendChild(table);
      }
      var tr_header = document.createElement('tr');
      var th_col_label = document.createElement('th');
      th_col_label.innerHTML = "Номер";
      var th_col_value = document.createElement('th');
      th_col_value.innerHTML = "Предложение, которое содержит ребро";
      tr_header.appendChild(th_col_label);
      tr_header.appendChild(th_col_value);
      table.appendChild(tr_header);

      for (var i=0; i < labels.length; i++){
        var tr = document.createElement('tr');
        let td1 = document.createElement('td');
        td1.innerHTML = labels[i]
        let td2 = document.createElement('td');
        td2.innerHTML = values[i]
        tr.appendChild(td1);
        tr.appendChild(td2);
        table.appendChild(tr);
      }
    }


    </script>
    <style>
      #all_information {
        overflow-y:scroll;
      }
    </style>
    </body>
    </html>"""

In [39]:
#import script_for_graph
#import importlib
#importlib.reload(script_for_graph)
#from script_for_graph import header_text, tail_text

header_text += """\nvar nodes = new vis.DataSet([\n"""
for w in nodes:
    header_text += "{"
    header_text += f"""         id: {word_num[w]},
                                label: "{w}"\n"""
    header_text += "},"
header_text += "   ]);\n"

header_text += """var edges = new vis.DataSet(["""
for k in info_dict.keys():
    header_text += "{"
    header_text += f"""       from: {word_num[k[0]]},
                    to: {word_num[k[1]]},
                    arrows: "to",
                    label: "{label_dict[k]}",
                    info: {info_dict[k]}\n"""
    header_text +="},"
header_text += "   ]);\n"

full_text = ""
full_text += header_text
full_text += tail_text

with open(f"Graph_for_group_{gr_num}.html", "w", encoding="utf-8") as f:
    f.write(full_text)