In [5]:
# --- Imports: standard library first, then third-party (each imported once) ---
import os
import re
import warnings

import joblib
import nltk
import pandas as pd
import pymorphy2
import umap
# Imported under an alias so the module is not shadowed by the `stopwords`
# set defined below.
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.notebook import tqdm

# --- Session setup (side effects grouped after the imports) ---
# Suppress all warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# Russian stop words as a set; read by lemmatize().
stopwords = set(nltk_stopwords.words('russian'))

# Morphological analyzer; read by lemmatize().
morph = pymorphy2.MorphAnalyzer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:

def lemmatize(text):
    """Return ``text`` with stop words removed and every kept word lemmatized.

    The text is split on whitespace. A word is kept only when it is not in the
    module-level ``stopwords`` collection and is longer than one character.
    Each kept word is replaced by the ``normal_form`` of the first parse
    returned by the module-level ``morph`` analyzer.

    Args:
        text: string to process.

    Returns:
        The lemmas joined by single spaces ("" when no word is kept).
    """
    lemmas = [
        morph.parse(token)[0].normal_form
        for token in text.split()
        if token not in stopwords and len(token) > 1
    ]
    return " ".join(lemmas)

# Patterns used by text_cleaner_for_tfidf, compiled once instead of per call.
_HTML_TAG_RE = re.compile('<.*?>')
_URL_RE = re.compile(r'http\S+')
# Two lowercase ASCII letters, digits, two lowercase ASCII letters.
_ALNUM_CODE_RE = re.compile(r'([a-z]{2}\d+[a-z]{2})')
_DIGITS_RE = re.compile('[0-9]+')
_PUNCT_RE = re.compile(r"[-—()\"#/@;:<>{}=~|€«»$\+'_–\*°“”\\√&×•ó÷≈„()‽\+.,!]+")
# Non-greedy: each [...] group is matched separately, so text between two
# bracketed groups on the same line is preserved.
_BRACKETED_RE = re.compile(r"\[.+?\]")
_SPACES_RE = re.compile(r' +')
_EMOJI_RE = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U0001F1F2-\U0001F1F4"  # Macau flag
    u"\U0001F1E6-\U0001F1FF"  # flags
    u"\U0001F600-\U0001F64F"
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u"\U0001F1F2"
    u"\U0001F1F4"
    u"\U0001F620"
    u"\u200d"
    u"\u2640-\u2642"
    "]+", flags=re.UNICODE)


def text_cleaner_for_tfidf(input_text: str) -> str:
    """Normalize raw text before lemmatization / TF-IDF vectorization.

    Steps, in order:
      1. remove ``<...>`` tags and ``http...`` URLs;
      2. replace letter-digit-letter codes (matched before lower-casing, so
         only lowercase ASCII codes are affected) with a space;
      3. remove digits;
      4. replace runs of punctuation / special symbols with a single space;
      5. remove ``[...]`` groups, each group separately;
      6. remove emoji and the other code points in the emoji pattern;
      7. lower-case, collapse runs of spaces and strip the ends.

    Args:
        input_text: text to clean. Any non-string value (for example the NaN
            pandas uses for a missing cell, or None) is treated as empty text.

    Returns:
        The cleaned, lower-cased text; "" for non-string input.
    """
    if not isinstance(input_text, str):
        return ""

    text = _HTML_TAG_RE.sub('', input_text)
    text = _URL_RE.sub('', text)
    text = _ALNUM_CODE_RE.sub(' ', text)
    text = _DIGITS_RE.sub('', text)
    # "!" and "." are both in the punctuation class, so no separate
    # "!" -> "." replacement is needed before or after this step.
    text = _PUNCT_RE.sub(' ', text)
    text = _BRACKETED_RE.sub('', text)
    text = _EMOJI_RE.sub('', text)
    text = text.lower()
    # Collapse spaces last so gaps left by the removals above are normalized too.
    text = _SPACES_RE.sub(' ', text).strip()
    return text


def dataset_preprocessor_vectorizers(data: pd.DataFrame, text_column: str = "Текст инцидента") -> pd.DataFrame:
    """Return a copy of ``data`` whose text column is cleaned and lemmatized.

    Every value of ``text_column`` is passed through ``text_cleaner_for_tfidf``
    and then ``lemmatize`` (which also removes stop words). The input frame is
    not modified.

    Args:
        data: source frame; must contain ``text_column``.
        text_column: name of the column holding the raw text.

    Returns:
        A new DataFrame with the same index and columns as ``data`` and the
        processed text in ``text_column``.
    """
    data_frame = data.copy()

    # Build the whole column in one pass and assign it positionally. This
    # avoids a per-row ``.loc`` write and does not depend on index labels
    # being unique. tqdm takes the total from the Series length.
    data_frame[text_column] = [
        lemmatize(text_cleaner_for_tfidf(raw_text))
        for raw_text in tqdm(data_frame[text_column], desc="clean + lemmatize")
    ]
    return data_frame




# TF IDF


class Model_tfidf_umap:
    """Build TF-IDF features for a text column, optionally reduced with UMAP.

    With ``presaved=False`` the vectorizer (and the UMAP reducer) are fitted on
    the given data and dumped with joblib under ``vectorizers/`` and ``umap/``.
    With ``presaved=True`` previously dumped objects are loaded and only
    ``transform`` is applied.

    Args:
        max_features: forwarded to ``TfidfVectorizer(max_features=...)``; also
            part of the vectorizer file name.
        presaved: load fitted objects from disk instead of fitting new ones.
        use_umap: when true, ``create_dataset`` appends the UMAP columns;
            otherwise it appends the TF-IDF columns.
        vectorizer_umap_size: number of UMAP components; also part of the UMAP
            file name.
    """

    def __init__(self, max_features=None, presaved=False, use_umap=True, vectorizer_umap_size=3):
        self.max_features = max_features
        self.vectorizer = TfidfVectorizer(max_features=self.max_features)
        self.vectorizer_name = "tfidf"
        self.vectorizer_umap_size = vectorizer_umap_size
        self.presaved = presaved
        self.use_umap = use_umap
        # Output directories for the dumped objects.
        os.makedirs("./vectorizers/", exist_ok=True)
        os.makedirs("./umap/", exist_ok=True)

    def _vectorizer_path(self):
        """Path of the dumped vectorizer for the current name / max_features."""
        return f'vectorizers/{self.vectorizer_name}_vectorizer_{self.max_features}.pkl'

    def _umap_path(self):
        """Path of the dumped UMAP reducer for the current component count."""
        # NOTE(review): the name depends only on vectorizer_umap_size, so
        # reducers fitted on different vectorizers share one file.
        return f'umap/umap{self.vectorizer_umap_size}.sav'

    @staticmethod
    def _load_artifact(path, label):
        """Load a dumped object, failing loudly when the file is missing.

        Raises:
            FileNotFoundError: ``path`` does not exist.
        """
        if not os.path.exists(path):
            raise FileNotFoundError(
                f"{label} file '{path}' was not found; "
                f"run with presaved=False on the training data first"
            )
        # NOTE: joblib.load unpickles the file; only load files you created.
        return joblib.load(path)

    def create_tfidf(self, data):
        """Return the TF-IDF matrix of ``data`` as a dense DataFrame.

        Args:
            data: iterable of text documents.

        Returns:
            DataFrame with one row per document and one column per vocabulary
            term, on a default integer index.

        Raises:
            FileNotFoundError: ``presaved`` is true and the vectorizer file is
                missing. Other load / transform errors propagate unchanged
                instead of being printed and turned into a ``None`` result.
        """
        path = self._vectorizer_path()
        if self.presaved:
            self.vectorizer = self._load_artifact(path, "Vectorizer")
            print("\n[LOG] Vectorizer loaded successfully")
            tf = self.vectorizer.transform(data)
        else:
            print(f"\n[LOG] Vectorizer for params {self.vectorizer_name} and {self.max_features} is not found\n ")
            print("\n[LOG] Building new vectorizer")
            tf = self.vectorizer.fit_transform(data)
            joblib.dump(self.vectorizer, path)

        vocab = self.vectorizer.vocabulary_
        # Label the columns by the column index stored in the vocabulary
        # rather than relying on alphabetical order matching it.
        terms = sorted(vocab, key=vocab.get)
        # toarray() materializes a dense (documents x terms) matrix.
        return pd.DataFrame(tf.toarray(), columns=terms)

    def create_umap(self, data):
        """Return the UMAP embedding of ``data`` as a DataFrame.

        Args:
            data: DataFrame of features; its ``.values`` are embedded.

        Returns:
            DataFrame with columns ``umap_0 .. umap_{n-1}`` on a default
            integer index.

        Raises:
            FileNotFoundError: ``presaved`` is true and the UMAP file is
                missing. Other load / transform errors propagate unchanged.
        """
        path = self._umap_path()
        column_names = [f"umap_{i}" for i in range(self.vectorizer_umap_size)]
        if self.presaved:
            ump = self._load_artifact(path, "UMAP")
            print("\n[LOG] UMAP loaded successfully")
            embedding = ump.transform(data.values)
        else:
            print(f"\n[LOG] UMAP for param {self.vectorizer_umap_size}  is not found\n ")
            print("\n[LOG] Building new UMAP")
            ump = umap.UMAP(n_components=self.vectorizer_umap_size, random_state=42)
            embedding = ump.fit_transform(data.values)
            joblib.dump(ump, path)

        return pd.DataFrame(embedding, columns=column_names)

    def create_dataset(self, data, text_column="Текст инцидента"):
        """Return a copy of ``data`` with the feature columns appended.

        Args:
            data: DataFrame containing ``text_column``.
            text_column: name of the column with the (preprocessed) text.

        Returns:
            A new DataFrame with the index of ``data``: the original columns
            followed by the UMAP columns (``use_umap`` true) or the TF-IDF
            columns (``use_umap`` false).
        """
        data_ = data.copy()

        features = self.create_tfidf(data_[text_column].to_list())
        if self.use_umap:
            features = self.create_umap(features)

        # The features are row-aligned with ``data`` by position but carry a
        # fresh 0..n-1 index. Give them the input's index so the column-wise
        # concat cannot misalign rows or pad with NaN.
        features.index = data_.index
        return pd.concat([data_, features], axis=1)

# Use presaved=False for the train dataset (fits and saves the vectorizer / UMAP) and presaved=True for the test dataset (loads the saved ones).

In [17]:
# Load the training data from final_full.csv.
train = pd.read_csv("final_full.csv")


In [18]:
# Clean and lemmatize the text column of the training frame.
cleaned_train_dataset_tfidf = dataset_preprocessor_vectorizers(train)

# presaved=False: fit the vectorizer and the UMAP reducer on this data and save them.
mtf = Model_tfidf_umap(
    max_features=15000,
    presaved=False,
    use_umap=True,
    vectorizer_umap_size=500,
)
train_tf = mtf.create_dataset(cleaned_train_dataset_tfidf)

# Replace the processed text with the original text from `train`.
train_tf["Текст инцидента"] = train["Текст инцидента"]

0it [00:00, ?it/s]

0it [00:00, ?it/s]


[LOG] Vectorizer for params tfidf and 15000 is not found
 

[LOG] Building new vectorizer

[LOG] UMAP for param 500  is not found
 

[LOG] Building new UMAP


In [19]:
# Save the training features to CSV without the index column.
train_tf.to_csv("final_full_train_tfidf_umap.csv", index=False)

In [26]:
# Display the training feature frame as the cell output.
train_tf

Unnamed: 0,Исполнитель,Группа тем,Текст инцидента,Тема,umap_0,umap_1,umap_2,umap_3,umap_4,umap_5,...,umap_490,umap_491,umap_492,umap_493,umap_494,umap_495,umap_496,umap_497,umap_498,umap_499
0,Город Пермь,Погребение и похоронное дело,Погребения - это серьезная проблема в нашей ст...,Погребение и похоронное дело,9.632434,5.843328,4.825921,5.267428,4.838480,5.585184,...,3.986301,4.832767,4.518127,4.606430,5.905228,2.230520,5.136640,4.247634,4.912774,5.024465
1,Город Пермь,Погребение и похоронное дело,"Я также считаю, что мы должны делать все возмо...",Погребение и похоронное дело,9.632818,5.844398,4.826225,5.267657,4.838474,5.582731,...,3.985582,4.832904,4.513411,4.609343,5.899348,2.233718,5.137221,4.247513,4.914371,5.026628
2,Город Пермь,Погребение и похоронное дело,"Я думаю, что многие люди уже готовы переходить...",Погребение и похоронное дело,9.633292,5.845773,4.826517,5.267705,4.838565,5.579551,...,3.984196,4.833523,4.506223,4.613117,5.891297,2.238264,5.137679,4.247270,4.916842,5.029698
3,Город Пермь,Погребение и похоронное дело,"Я считаю, что строительство новых крематориев ...",Погребение и похоронное дело,9.633298,5.846795,4.826769,5.267921,4.838515,5.577320,...,3.983566,4.833809,4.501260,4.615781,5.885362,2.241799,5.138334,4.247189,4.918561,5.031999
4,Город Пермь,Погребение и похоронное дело,"Я думаю, что мы должны обсуждать этот вопрос о...",Погребение и похоронное дело,9.636014,5.848168,4.826854,5.268915,4.838577,5.572088,...,3.981524,4.835217,4.489472,4.621041,5.874436,2.250617,5.139074,4.248012,4.922754,5.036750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24282,Министерство социального развития ПК,Социальное обслуживание и защита,"Здравствуйте, срок исполнения справки по мало...",Оказание гос. соц. помощи,9.544932,5.869118,4.962122,5.260111,4.794646,4.931410,...,4.131944,4.792159,4.409505,4.861681,5.340394,3.047543,5.294498,4.399575,4.997032,5.105596
24283,Город Пермь,Дороги,"Хлебозаводская всегда была такая ,а то ещё и ...",Ямы и выбоины на дороге,9.643231,5.821224,4.825473,5.295761,4.824869,5.613949,...,4.037890,4.797796,4.759922,4.533082,6.096114,2.092103,5.138359,4.238524,4.839138,4.939486
24284,Министерство здравоохранения,Здравоохранение/Медицина,Какая это забота о пациентах? Послали на дисп...,★ Оказание медицинской помощи не в полном объе...,9.643499,5.821293,4.825494,5.295804,4.824960,5.613823,...,4.037611,4.798004,4.758650,4.533296,6.095498,2.092844,5.138390,4.238791,4.839511,4.939777
24285,Министерство здравоохранения,Здравоохранение/Медицина,"Проблема в том, что при оптимизации в сфере з...",Нехватка материально-технического обеспечения,9.649219,5.822397,4.827133,5.298790,4.824096,5.611185,...,4.042978,4.791676,4.782164,4.534188,6.106478,2.077349,5.141673,4.239016,4.831694,4.931755


In [21]:
# Load the test data from test.csv, using ";" as the field delimiter.
test = pd.read_csv("test.csv", delimiter=";")

In [23]:
# Clean and lemmatize the text column of the test frame.
cleaned_test_dataset_tfidf = dataset_preprocessor_vectorizers(test)

# presaved=True: load the saved vectorizer and UMAP reducer and only transform.
mtf = Model_tfidf_umap(
    max_features=15000,
    presaved=True,
    use_umap=True,
    vectorizer_umap_size=500,
)
test_tf = mtf.create_dataset(cleaned_test_dataset_tfidf)

# Drop the text and id columns from the result.
test_tf = test_tf.drop(columns=["Текст инцидента", "id"])

0it [00:00, ?it/s]

0it [00:00, ?it/s]


[LOG] Vectorizer loaded successfully

[LOG] UMAP loaded successfully


Unnamed: 0,id,umap_0,umap_1,umap_2,umap_3,umap_4,umap_5,umap_6,umap_7,umap_8,...,umap_490,umap_491,umap_492,umap_493,umap_494,umap_495,umap_496,umap_497,umap_498,umap_499
0,0,10.083567,6.052629,4.945964,5.381650,4.906913,5.192399,4.162589,5.961731,4.791072,...,4.010097,4.616440,4.523618,5.061816,5.600125,2.019810,5.342229,4.419183,4.883100,4.952904
1,1,10.108295,6.058468,4.943386,5.404594,4.903411,5.165740,4.165943,5.962642,4.785913,...,4.014353,4.603743,4.538673,5.063688,5.622144,1.998834,5.346689,4.433655,4.885974,4.950933
2,2,10.089270,6.052933,4.946071,5.380126,4.906502,5.189994,4.164612,5.964696,4.792790,...,4.007171,4.614929,4.525910,5.060621,5.601055,2.020223,5.342227,4.422795,4.883383,4.952407
3,3,10.087297,6.051794,4.947175,5.376278,4.907317,5.194271,4.165539,5.963653,4.799147,...,4.003916,4.617456,4.522944,5.059034,5.596773,2.025775,5.341194,4.423060,4.883178,4.952647
4,4,9.865716,6.045697,4.981990,5.085854,4.946228,5.325814,4.209527,6.007402,4.964777,...,3.914927,4.702093,4.504532,4.998707,5.359671,2.277333,5.299929,4.362450,4.851777,4.959312
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9738,9738,10.087441,6.054111,4.945757,5.379447,4.907123,5.192849,4.164847,5.963195,4.792313,...,4.006066,4.613571,4.522827,5.061826,5.601105,2.016875,5.341963,4.421988,4.882900,4.952485
9739,9739,10.086587,6.052201,4.946909,5.379521,4.906817,5.192990,4.165640,5.964203,4.793992,...,4.008029,4.616182,4.524624,5.060285,5.599408,2.022835,5.341575,4.423395,4.882793,4.952936
9740,9740,10.107924,6.059246,4.940022,5.389874,4.911315,5.185436,4.149291,5.955206,4.784469,...,4.015766,4.608834,4.506399,5.083268,5.593884,2.020000,5.348573,4.424423,4.886580,4.951234
9741,9741,10.091175,6.053362,4.946520,5.376593,4.908082,5.193096,4.165525,5.962625,4.792506,...,4.007244,4.616652,4.523172,5.062506,5.598155,2.022788,5.341797,4.421463,4.882349,4.952578


In [24]:
# The preceding cell's source already drops "id", so an unconditional drop here
# raises KeyError on a fresh Restart & Run All. errors="ignore" makes this cell
# safe whether or not the column is still present.
test_tf = test_tf.drop(columns=["id"], errors="ignore")
test_tf

Unnamed: 0,umap_0,umap_1,umap_2,umap_3,umap_4,umap_5,umap_6,umap_7,umap_8,umap_9,...,umap_490,umap_491,umap_492,umap_493,umap_494,umap_495,umap_496,umap_497,umap_498,umap_499
0,10.083567,6.052629,4.945964,5.381650,4.906913,5.192399,4.162589,5.961731,4.791072,5.024939,...,4.010097,4.616440,4.523618,5.061816,5.600125,2.019810,5.342229,4.419183,4.883100,4.952904
1,10.108295,6.058468,4.943386,5.404594,4.903411,5.165740,4.165943,5.962642,4.785913,5.028081,...,4.014353,4.603743,4.538673,5.063688,5.622144,1.998834,5.346689,4.433655,4.885974,4.950933
2,10.089270,6.052933,4.946071,5.380126,4.906502,5.189994,4.164612,5.964696,4.792790,5.024231,...,4.007171,4.614929,4.525910,5.060621,5.601055,2.020223,5.342227,4.422795,4.883383,4.952407
3,10.087297,6.051794,4.947175,5.376278,4.907317,5.194271,4.165539,5.963653,4.799147,5.023820,...,4.003916,4.617456,4.522944,5.059034,5.596773,2.025775,5.341194,4.423060,4.883178,4.952647
4,9.865716,6.045697,4.981990,5.085854,4.946228,5.325814,4.209527,6.007402,4.964777,5.004519,...,3.914927,4.702093,4.504532,4.998707,5.359671,2.277333,5.299929,4.362450,4.851777,4.959312
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9738,10.087441,6.054111,4.945757,5.379447,4.907123,5.192849,4.164847,5.963195,4.792313,5.023930,...,4.006066,4.613571,4.522827,5.061826,5.601105,2.016875,5.341963,4.421988,4.882900,4.952485
9739,10.086587,6.052201,4.946909,5.379521,4.906817,5.192990,4.165640,5.964203,4.793992,5.024412,...,4.008029,4.616182,4.524624,5.060285,5.599408,2.022835,5.341575,4.423395,4.882793,4.952936
9740,10.107924,6.059246,4.940022,5.389874,4.911315,5.185436,4.149291,5.955206,4.784469,5.021521,...,4.015766,4.608834,4.506399,5.083268,5.593884,2.020000,5.348573,4.424423,4.886580,4.951234
9741,10.091175,6.053362,4.946520,5.376593,4.908082,5.193096,4.165525,5.962625,4.792506,5.024310,...,4.007244,4.616652,4.523172,5.062506,5.598155,2.022788,5.341797,4.421463,4.882349,4.952578


In [25]:
# Save the test features to CSV without the index column.
test_tf.to_csv("final_full_test_tfidf_umap.csv", index=False)