In [4]:
import pandas as pd
import numpy as np
from itertools import product
import re, string

import pymorphy2
morph = pymorphy2.MorphAnalyzer()

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cotangentofzero/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/cotangentofzero/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
def read_data(filepath_data, filepath_test, target_col=None):
    """Load the train and test TSV files for one target column.

    Parameters
    ----------
    filepath_data : str
        Path to the tab-separated training file. It must contain a ``text``
        column and the target column.
    filepath_test : str
        Path to the tab-separated test file. It must contain a ``text`` column.
    target_col : str, optional
        Name of the label column. When omitted, the name is built from the
        module-level globals ``class_name`` and ``problem_name`` (the original
        behaviour), so existing two-argument callers keep working.

    Returns
    -------
    tuple
        ``(data, test)`` where ``data`` is a DataFrame with the ``text`` and
        target columns (labels -1, 0, 1, 2 remapped to 0, 1, 2, 3) and ``test``
        is the ``text`` Series of the test file.
    """
    if target_col is None:
        target_col = f'{class_name}_{problem_name}'

    #train
    data = pd.read_csv(filepath_data, sep='\t')
    # Explicit copy: the column selection is otherwise a slice of the full frame.
    data = data[['text', target_col]].copy()
    # Assign the result back instead of `inplace=True` on a selected column;
    # the chained in-place form does not update `data` under Copy-on-Write.
    data[target_col] = data[target_col].replace([-1, 0, 1, 2], [0, 1, 2, 3])

    #test
    test = pd.read_csv(filepath_test, sep='\t')['text']

    return data, test

def preprocess_s1(text):
    """Lower-case a string and reduce it to words separated by single spaces.

    Steps, in order: lower-case; replace every ``string.punctuation`` character
    with a space; replace any remaining character that is neither a word
    character nor whitespace with a space; collapse whitespace runs to one
    space; trim both ends.
    """
    ascii_punctuation = re.compile('[%s]' % re.escape(string.punctuation))

    lowered = text.lower()
    without_ascii_punct = ascii_punctuation.sub(' ', lowered)
    # Second pass catches non-ASCII symbols (dashes, ellipsis, quotes, ...).
    words_only = re.sub(r'[^\w\s]', ' ', str(without_ascii_punct).lower().strip())
    single_spaced = re.sub(r'\s+', ' ', words_only)

    return single_spaced.strip()
 
def lematize(text):
    """Replace every token of ``text`` with its normal form.

    The text is split with ``word_tokenize``; for each token the first parse
    returned by the module-level ``morph`` analyzer is used. The normal forms
    are joined back with single spaces.
    """
    normal_forms = []
    for token in word_tokenize(text):
        first_parse = morph.parse(token)[0]
        normal_forms.append(first_parse.normal_form)

    return ' '.join(normal_forms)

def _stopword_set():
    """Return the effective stop-word set, building it on first use only."""
    cached = getattr(_stopword_set, '_cache', None)
    if cached is None:
        extra_stopwords_ = {'че','ниче','изза','user','такой','привет','всё','проклятый'}
        # Words from the NLTK list that must be kept in the text.
        drop_stopwords_ = {'им','не','по','про','без','надо','было'}
        cached = frozenset((set(stopwords.words('russian')) | extra_stopwords_) - drop_stopwords_)
        _stopword_set._cache = cached
    return cached


def stopword(text):
    """Remove stop words from a space-separated string.

    The stop-word set is the NLTK Russian list plus a few extra tokens, minus
    a handful of words that are deliberately kept. It is built once and
    reused, and membership is tested against a set rather than a list; the
    returned string is the same as before.

    The text is split on single spaces, so empty tokens produced by repeated
    spaces are preserved, exactly as in the original implementation.
    """
    stopwords_ = _stopword_set()
    kept_words = [word for word in text.split(' ') if word not in stopwords_]

    return ' '.join(kept_words)

# Ordered (pattern, replacement) rules used by preprocess_s2. Order matters:
# later rules see the output of earlier ones.
_S2_RULES = (
    (r'%', ' процент '),
    (r'счёт', ' счет '),
    (r'за([ёе])м', ' займ '),
    (r'юридическ[а-я]{2,3}\s*лиц([а-я])?', ' юл '),
    (r'физическ[а-я]{2,3}\s*лиц([а-я])?', ' физлицо '),
    (r'э/э', ' электроэнергия '),
    (r'(?<![а-яёa-z])(согл(\.)?|([сc])[\-\s+]но)(?![а-яёa-z])', ' согласно '),
    (r'(?<![а-яёa-z])вып(олн)?(\.)', ' выполнение '),
    (r'(?<![а-яёa-z])соглаш(\.|[а-я]*)(?![а-яёa-z])', ' соглашение '),
    (r'опл\.дз', ' оплата договор займа '),
    (r'(?<![а-яёa-z])(о|o|у|до)пл(\.|[а-я]*)?(?![а-яёa-z])', ' оплата '),
    (r'(?<![а-яёa-z])плата([а-я]*)?(?![а-яёa-z])', ' оплата '),
    (r'(?<![а-яёa-z])выплата', ' выплата '),
    (r'(?<![а-яёa-z])выпл(\.|[а-яёa-z]*)?(?![а-яёa-z])', ' выплата '),
    (r'(?<![а-яёa-z])у( )?слов([а-яёa-z]*)?(?![а-яёa-z])', ' условия '),
    (r'(?<![а-яёa-z])(у( )?сл(\.)?|(у)?слуг([а-яёa-z]*)?|услу( )ги)(?![а-яёa-z])', ' услуги '),
    (r'улуги|усуги|усгуги|улуг|усулги|усоуг(и)?|улсуги|усдуги|([кц])слуги|учлуг', ' услуги '),
    (r'(?<![а-яёa-z])дог(\s+)овор([а-яёa-z]*)?(?![а-яёa-z])', ' договор '),
    (r'(?<![а-яёa-z])д( )?о( )?г(\.|[а-яёa-z]*)?(?![а-яёa-z])', ' договор '),
    (r'(?<![а-яёa-z])[длп]( )?(о(р)?)?( )?(г)?о( )?в([а-яёa-z]*)?(?![а-яёa-z])', ' договор '),
    (r'(?<![а-яёa-z])(в)?( )?о(з)?мещ([а-я]*)?(?![а-яёa-z])', ' возмещение '),
    (r'(?<![а-яёa-z])возм(\.)', ' возмещение '),
    (r'(?<![а-яёa-z])откр(\.)?(?![а-яёa-z])', ' открытый '),
    (r'(?<![а-яёa-z])соц(\.)?(?![а-яёa-z])', ' социальный '),
    (r'(?<![а-яёa-z])стр( )?(а)?х([а-я]*)?(?![а-яёa-z])', ' страхование '),
    (r'(?<![а-яёa-z])(влзврат|возварат)(?![а-яёa-z])', ' возврат '),
    (r'(?<![а-яёa-z])больнич([а-я]*)?(?![а-яёa-z])', ' больничныйлист больничный '),
    (r'(?<![а-яёa-z])б/л([а-я]*)?(?![а-яёa-z])', ' больничныйлист больничный '),
    (r'(?<![а-яёa-z])пособ(\.|[а-яёa-z]*)?(?![а-яёa-z])', ' пособие '),
    (r'мат(\.)(\s+)?помо([щш])[а-я](?![а-яёa-z])', ' матпомощь '),
    (r'(?<![а-яёa-z])матер[а-я]*(\s+)?пом[а-я]*(?![а-яёa-z])', ' матпомощь '),
    (r'(?<![а-яёa-z])расч(\.)?(?![а-яёa-z])', ' расчет '),
    (r'(?<![а-яёa-z])ра(\s+)?счет(?![а-яёa-z])', ' расчет '),
    (r'(?<![а-яёa-z])поступ[а-яёa-z]*(?![а-яёa-z])', ' поступление '),
    (r'расх(\.)', ' расходы '),
    (r'(?<![а-яёa-z])расход([а-я]*)?(?![а-яёa-z])', ' расходы '),
    (r'(?<![а-яёa-z])размещ(\.)', ' размещение '),
    (r'(?<![а-яёa-z])размещ[а-я]*(?![а-яёa-z])', ' размещение '),
    (r'(?<![а-яёa-z])заявл(\.|[а-я]*)?(?![а-яёa-z])', ' заявление '),
    (r'(?<![а-яёa-z])([сc])умм[аеуыоймих]*(?![а-яёa-z])', ' сумма '),
    (r'(?<![а-яёa-z])([а-яёa-z]*)?сумм[аеуыоймих]*(?![а-яёa-z])', ' сумма '),
    (r'(?<![а-яёa-z])сумм([а-яёa-z]*)?(?![а-яёa-z])', ' сумма '),
    (r'(?<![а-яёa-z])[а-яёa-z]{2}куп([а-яёa-z]*)?(?![а-яёa-z])', ' покупка '),
    (r'(?<![а-яёa-z])валют[аыеу](?![а-яёa-z])', ' валюта '),
    (r'(?<![а-яёa-z])ср[\-\s]*в(а)?(?![а-яёa-z])', ' средства '),
    (r'(?<![а-яёa-z])сотрудник[а-я]*(?![а-яёa-z])', ' сотрудник '),
    (r'(?<![а-яёa-z])сотр(уд(н)?)?(\.)?(?![а-яёa-z])', ' сотрудник '),
    (r'(?<![а-яёa-z])работник[а-я]*(?![а-яёa-z])', ' работник '),
    (r'раб(отн)?(\.)', ' работник '),
    (r'(?<![а-яёa-z])раб(\s+)?([-])(\s+)?[а-я]*(?![а-яёa-z])', ' работник '),
    (r'(?<![а-яёa-z])насел(\.|[а-я]*)?(?![а-яёa-z])', ' население '),
    (r'имущ\.', ' имущество '),
    (r'(?<![а-яёa-z])(мобиль[а-яёa-z.]*|модуль[а-яёa-z.]*)(\s+)?(здан[а-яёa-z]*|сооруж[а-яёa-z.]*)(?![а-яёa-z])',
     ' оборудование средство '),
    (r'пост\.(\s+)?обор\.', ' поставка оборудование '),
    (r'локдаун', ' карантин '),
    (r'намордник', ' маска '),
    # Accept both the Cyrillic letter and its Latin look-alike as the first
    # character, matching the [сc] convention used by the rules above.
    (r'[сc]путник', ' вакцина '),
    (r'прививка', ' вакцина '),
    (r'вакцинация', ' вакцина '),
    # Collapse the padding spaces inserted by the replacements above.
    (r'\s+', ' '),
)


def preprocess_s2(text):
    """Normalise abbreviations, misspellings and synonyms in a lower-case string.

    Every rule in ``_S2_RULES`` is applied with ``re.sub`` in table order; each
    replacement is padded with spaces so it becomes a separate token. The last
    rule collapses whitespace runs, and the result is trimmed.

    Returns the normalised string without leading or trailing whitespace.
    (The original computed the stripped value but returned the unstripped
    one, so padding spaces leaked into the output.)
    """
    for pattern, replacement in _S2_RULES:
        text = re.sub(pattern, replacement, text)

    return text.strip()

def _build_date_regexes():
    """Build and compile the month-name regex and the full-date regex.

    Returns ``(month_regex, date_regex)``. The pattern strings are the same
    ones the function used to rebuild on every call.
    """
    text_months = ['январ[ьяюе]', 'феврал[ьяюе]', 'март[аеу]{0,1}', 'апрел[ьяюе]', 'ма[йеюя]', 'июн[ьяюе]',
                   'июл[ьяюе]', 'август[аеу]{0,1}', 'сентябр[ьяюе]', 'октябр[ьяюе]', 'ноябр[ьяюе]', 'декабр[ьяюе]']
    text_month_pattern = '|'.join(['(?<![а-я]){}(?![а-я])'.format(text_month) for text_month in text_months])

    possible_years1 = ['0[0-9]', '1[0-9]', '[8-9][0-9]', '2[01]']   # last two digits of the year
    possible_years2 = ['20', '19', '']                              # optional century prefix
    possible_years3 = ['20', '19']                                  # mandatory century prefix
    possible_months = ['0[1-9]', '1[0-2]']
    possible_days = ['[1-2][0-9]', '3[0-1]', '0{0,1}[1-9]']
    possible_seps = [r'\.', r'\-', r'\,', r'\/']

    # day <sep1> month <sep2> [century] year, not adjacent to other digits.
    numeric_date_patterns = [r'(?<![0-9]){0}{4}{1}{5}{3}{2}(?![0-9])'.format(d, m, y1, y2, sep1, sep2)
                             for d, m, y1, y2, sep1, sep2 in
                             product(possible_days, possible_months, possible_years1, possible_years2, possible_seps,
                                     possible_seps)]
    # [day] <month placeholder> four-digit year; relies on months having been
    # replaced by the placeholder token first.
    text_date_patterns = [r'(?<![0-9])({0})?\s+месяцмесяц\s+{2}{1}(?![0-9])'.format(d, y1, y3)
                          for d, y1, y3 in product(possible_days, possible_years1, possible_years3)]
    date_pattern = '|'.join(numeric_date_patterns + text_date_patterns)

    return re.compile(text_month_pattern), re.compile(date_pattern)


# Built once when the cell runs instead of on every call: replace_dates is
# applied to every row, and the patterns never change.
_MONTH_REGEX, _DATE_REGEX = _build_date_regexes()


def replace_dates(text):
    """Replace month names and dates with placeholder tokens.

    First every standalone Russian month name is replaced by ' месяцмесяц '.
    Then numeric dates (day, month, year joined by '.', '-', ',' or '/') and
    textual dates (optional day, month placeholder, four-digit year) are
    replaced by ' датадата '. Whitespace is not collapsed afterwards.

    Returns the string with the placeholders inserted.
    """
    text = _MONTH_REGEX.sub(' месяцмесяц ', text)
    text = _DATE_REGEX.sub(' датадата ', text)

    return text

def _clean_text_series(texts):
    """Run the five cleaning steps, in pipeline order, over a Series of strings."""
    for step in (preprocess_s1, lematize, stopword, preprocess_s2, replace_dates):
        texts = texts.apply(step)
    return texts


def process_data(data,test):
    """Clean the ``text`` column of the training frame and the test Series.

    Parameters
    ----------
    data : DataFrame with a ``text`` column (other columns are passed through).
    test : Series of raw test texts.

    Returns
    -------
    tuple
        ``(data, test)`` with cleaned text. ``data`` is a new DataFrame; the
        frame passed in is no longer modified in place.

    NOTE(review): ``preprocess_s1`` replaces all punctuation with spaces before
    ``preprocess_s2`` and ``replace_dates`` run, so their rules that depend on
    '.', '/', '%', '-' or ',' (abbreviations such as 'э/э', numeric dates)
    cannot match in this order. Confirm whether that is intended before
    reordering, since it changes the features the model sees.
    """
    #train
    data = data.copy()
    data['text'] = _clean_text_series(data['text'])

    #test
    test = _clean_text_series(test)

    return data,test

def vercorize_data(data,test):
    """Turn cleaned texts into TF-IDF feature frames.

    The vectorizer (at most 2000 features) is fitted on the training texts
    only and then applied to the test texts.

    Parameters
    ----------
    data : DataFrame with a ``text`` column and the target column named by the
        module-level globals ``class_name`` and ``problem_name``.
    test : Series of cleaned test texts.

    Returns
    -------
    tuple
        ``(data, test)`` where ``data`` is the sparse training feature frame
        with the target column appended, and ``test`` is the sparse test
        feature frame. Both keep the row index of their input.
    """
    target_col = f'{class_name}_{problem_name}'

    #data
    train_texts = data['text']
    y_train = data[target_col]

    #vectorizer fit
    vectorizer = TfidfVectorizer(use_idf=True, max_features = 2000)
    train_matrix = vectorizer.fit_transform(train_texts)
    test_matrix = vectorizer.transform(test)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the new
    # accessor and fall back for installations that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Carry the original index so the axis=1 concat below (and any later
    # index-based lookup) stays aligned even when the index is not 0..n-1.
    X_train = pd.DataFrame.sparse.from_spmatrix(train_matrix, index = data.index, columns = feature_names)
    X_test = pd.DataFrame.sparse.from_spmatrix(test_matrix, index = test.index, columns = feature_names)

    #result
    data = pd.concat([X_train,y_train], axis = 1)
    test = X_test

    return data,test
        
def train_model(data_init,data,test_init,test):
    """Fit a logistic regression on the training features and predict the test set.

    Parameters
    ----------
    data_init : not used by this function.
    data : feature frame that also holds the target column named by the
        module-level globals ``class_name`` and ``problem_name``.
    test_init : Series looked up with the test feature index; the values it
        returns become the index of the result.
    test : test feature frame.

    Returns
    -------
    DataFrame with a single column (the target name) of predicted labels.
    """
    target_col = f'{class_name}_{problem_name}'

    # Split the training frame into features and labels, then fit.
    features = data.drop(columns=target_col)
    labels = data[target_col]
    model = LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2', random_state=42)
    model.fit(features, labels)

    predicted_labels = model.predict(test)

    # Index the predictions by the test_init values at the same row labels.
    result_index = test_init.loc[test.index]
    return pd.DataFrame({target_col: predicted_labels}, index = result_index)

def make_prediction(data_init,data,test_init,test):
    """Vectorize the cleaned texts, then train and predict.

    Passes ``data`` and ``test`` through ``vercorize_data`` and hands the
    resulting feature frames, together with the untouched ``data_init`` and
    ``test_init``, to ``train_model``. Returns whatever ``train_model`` returns.
    """
    train_features, test_features = vercorize_data(data, test)
    return train_model(data_init, train_features, test_init, test_features)

def pipeline(filepath_data,filepath_test):
    """Run read -> clean -> vectorize/train/predict for the current target.

    Parameters are the paths of the training and test TSV files. Returns the
    prediction frame produced by ``make_prediction``.
    """
    data_init, test_init = read_data(filepath_data, filepath_test)
    data_clean, test_clean = process_data(data_init, test_init)

    return make_prediction(data_init, data_clean, test_init, test_clean)

In [7]:
%%time

# Input files and the grid of (topic, task) pairs; each pair names one label column.
filepath_data = 'train.tsv'
filepath_test = 'test.tsv'
class_names = ['masks','quarantine','vaccines']
problem_names = ['stance','argument']

# Run the whole pipeline once per pair. class_name and problem_name are
# module-level loop variables on purpose: the functions above read them as
# globals to build the target column name.
results_test_container = []
for class_name, problem_name in product(class_names, problem_names):
    results_test_container.append(pipeline(filepath_data, filepath_test))

# One column per target, side by side; map labels 0..3 back to -1..2.
results_test_final = pd.concat(results_test_container, axis = 1)
results_test_final = results_test_final.replace([0,1,2,3],[-1,0,1,2])

# Save the predictions as a tab-separated file.
results_test_final.to_csv('result.tsv', sep='\t')

CPU times: user 5min 45s, sys: 2.85 s, total: 5min 47s
Wall time: 5min 51s
