In [1]:
import os
import re
import numpy as np
import pandas as pd
import pymorphy2
from gensim.models import Word2Vec, KeyedVectors
from functools import lru_cache
import nltk

import warnings

warnings.filterwarnings('ignore')

## Загружаем w2v и все необходимые анализаторы

In [2]:
# Russian stop-word list from the NLTK stopwords corpus; normalize_text filters tokens against it.
# NOTE(review): this is a list, so each membership test is a linear scan — a set would be cheaper.
ru_stopwords = nltk.corpus.stopwords.words('russian')

In [3]:
# Single pymorphy2 analyzer instance shared by the lemmatisation and grammatical-tag helpers below.
morph = pymorphy2.MorphAnalyzer()

In [4]:
# Load the saved gensim KeyedVectors from the relative path 'model/my_model'.
# NOTE(review): w2v_columns and vectorise() both hard-code 300 dimensions — confirm this model matches.
w2v = KeyedVectors.load('model/my_model')

In [5]:
# Names of the embedding feature columns: w2v_1 through w2v_300.
w2v_columns = [f'w2v_{component}' for component in range(1, 300 + 1)]

In [6]:
@lru_cache(maxsize=1000000)  # cached: repeated tokens are analysed only once
def get_normal_form(i):
    """Return the first normal form that ``morph`` reports for the token ``i``."""
    candidates = morph.normal_forms(i)
    return candidates[0]

def normalize_text(x):
    """Lemmatise the word tokens of ``x`` and join them with single spaces.

    Tokens are the runs of word characters found in ``x``. Tokens present in
    ``ru_stopwords`` are dropped; every remaining token is replaced by
    ``get_normal_form(token)``. If no token survives, an empty string is returned.
    """
    # Raw string: a plain '\w' literal is an invalid escape sequence.
    tokens = re.findall(r'\w+', x)
    # The stop-word test is applied to the token exactly as it appears in the text
    # (before lemmatisation, no case folding).
    return ' '.join(get_normal_form(token) for token in tokens if token not in ru_stopwords)

In [7]:
def vectorise(phrase):
    """Look up the embedding of ``phrase`` in ``w2v``.

    If the lookup raises ``KeyError``, a vector of 300 samples drawn from a normal
    distribution with mean 0 and standard deviation 1 is returned instead.
    """
    try:
        embedding = w2v[phrase]
    except KeyError:
        # No seed is set in this function, so the fallback vector differs on every call.
        embedding = np.random.normal(loc=0, scale=1, size=300)
    return embedding

In [8]:
@lru_cache(maxsize=1000000)
def is_capital(word):
    """Return 1 if the first character of ``word`` is uppercase, otherwise 0.

    An empty string yields 0 instead of raising ``IndexError``.
    """
    # Slicing (unlike indexing) tolerates the empty string: ''.isupper() is False.
    return int(word[:1].isupper())

In [9]:
@lru_cache(maxsize=100000)
def get_pos(word):
    """Return the part-of-speech grammeme of the first parse of ``word`` as a string.

    Returns ``'other'`` when the tag's ``POS`` attribute is empty.
    """
    pos = morph.parse(word)[0].tag.POS
    # The fallback must be applied before str(): str(None) is the truthy string 'None',
    # so `str(value) or 'other'` could never produce 'other'.
    return str(pos or 'other')

@lru_cache(maxsize=100000)
def get_gender(word):
    """Return the gender grammeme of the first parse of ``word`` as a string.

    Returns ``'other'`` when the tag's ``gender`` attribute is empty.
    """
    gender = morph.parse(word)[0].tag.gender
    # The fallback must be applied before str(): str(None) is the truthy string 'None',
    # so `str(value) or 'other'` could never produce 'other'.
    return str(gender or 'other')

@lru_cache(maxsize=100000)
def get_number(word):
    """Return the grammatical-number grammeme of the first parse of ``word`` as a string.

    Returns ``'other'`` when the tag's ``number`` attribute is empty.
    """
    number = morph.parse(word)[0].tag.number
    # The fallback must be applied before str(): str(None) is the truthy string 'None',
    # so `str(value) or 'other'` could never produce 'other'.
    return str(number or 'other')

@lru_cache(maxsize=100000)
def get_case(word):
    """Return the case grammeme of the first parse of ``word`` as a string.

    Returns ``'other'`` when the tag's ``case`` attribute is empty.
    """
    case = morph.parse(word)[0].tag.case
    # The fallback must be applied before str(): str(None) is the truthy string 'None',
    # so `str(value) or 'other'` could never produce 'other'.
    return str(case or 'other')

In [10]:
def my_gen(words):
    """Yield, for every word, its zero-based position inside the current sentence.

    A word whose last character is '.', '!' or '?' closes the sentence: it still
    receives its own position, and the counter restarts at 0 for the next word.
    Empty strings are treated as ordinary (non-terminating) words instead of
    raising ``IndexError``.
    """
    position = 0
    for word in words:
        yield position
        # str.endswith accepts a tuple of suffixes and is safe on '', unlike word[-1].
        position = 0 if word.endswith(('.', '!', '?')) else position + 1
            
def is_first(words):
    """Return an integer array holding 1 where ``my_gen`` yields position 0, else 0."""
    positions = np.array(list(my_gen(words)))
    sentence_start = positions == 0
    return sentence_start.astype(int)

In [11]:
# todo запустить это 

## Считываем данные и генерим фичи

In [12]:
# Read the source table and discard its 'Unnamed: 0' column.
df = pd.read_csv('all.csv').drop(columns=['Unnamed: 0'])

In [13]:
# Flag the rows at which my_gen's per-sentence counter is 0.
df['first_in_sentence'] = is_first(df['word'].tolist())

In [14]:
%%time


# Per-word features, each computed by applying one helper to the 'word' column.
for feature_name, extractor in [
    ('is_capital', is_capital),
    ('part_of_speech', get_pos),
    ('gender', get_gender),
    ('number', get_number),
    ('case', get_case),
    ('len', len),
]:
    df[feature_name] = df['word'].apply(extractor)


Wall time: 27.9 s


## Нормализуем и векторизуем

In [None]:
# Replace every entry of the 'word' column with its normalised form.
df['word'] = df['word'].apply(normalize_text)

In [None]:
# One embedding per row, expanded into the w2v_* columns. The embedding frame is built on
# df's own index so the index-based join stays row-aligned even when that index is not 0..n-1.
w2v_features = pd.DataFrame(
    df['word'].apply(vectorise).tolist(),
    columns=w2v_columns,
    index=df.index,
)
df = df.join(w2v_features)

In [None]:
# Remove the 'file' and 'word' columns from the feature table.
df = df.drop(['file', 'word'], axis=1)

In [None]:
from sklearn.preprocessing import LabelEncoder

In [19]:
from joblib import dump, load
import pickle

## Фитим и сохраняем энкодеры

In [19]:
# Fit one LabelEncoder per categorical feature, overwrite the column with its integer
# codes and persist the fitted encoder next to the notebook.
# NOTE(review): the 'number' encoder is stored as 'quantity_encoder.joblib' — name kept as-is.
for categorical_column, encoder_file in [
    ('part_of_speech', 'part_of_speech_encoder.joblib'),
    ('gender', 'gender_encoder.joblib'),
    ('number', 'quantity_encoder.joblib'),
    ('case', 'case_encoder.joblib'),
]:
    enc = LabelEncoder()
    df[categorical_column] = enc.fit_transform(df[categorical_column])
    dump(enc, encoder_file)

# The target: missing labels become the explicit category 'other' before encoding.
df['class'] = df['class'].fillna('other')
enc = LabelEncoder()
df['class'] = enc.fit_transform(df['class'])
dump(enc, 'class_encoder.joblib')

['class_encoder.joblib']

In [20]:
def create_neighbours(df):
    """Extend every row with the features of its previous and next row.

    Parameters
    ----------
    df : 2-D numpy array
        One row per word. Column 0 is renamed to 'class' in the output and is
        removed from both neighbour copies. Columns 1 and 2 are read as flags
        that suppress a neighbour (see below).

    Returns
    -------
    pandas.DataFrame
        Columns, left to right: the previous row without column 0, the row itself
        (column 0 named 'class'), the next row without column 0.

    Neighbour rules, per row:
    * column 2 equals 1  -> the previous-row part is all zeros;
    * else column 1 equals 1 -> the next-row part is all zeros;
    * otherwise both neighbours are used.
    A neighbour that would fall outside the array (before the first row or after
    the last one) is also replaced by zeros, so the first row never wraps around
    to the last row and the last row never raises ``IndexError``.

    NOTE(review): which features sit in columns 1 and 2 depends on the column
    order of the frame passed in — confirm they are the intended boundary flags.
    """
    n_rows = len(df)
    zeros_ = np.zeros(df.shape[1])

    left = []
    right = []

    for idx, row in enumerate(df):
        # Resolve neighbours with explicit bounds checks: df[-1] would silently
        # return the last row and df[n_rows] would raise.
        prev_row = df[idx - 1] if idx > 0 else zeros_
        next_row = df[idx + 1] if idx + 1 < n_rows else zeros_

        if row[2] == 1:
            left.append(zeros_)
            right.append(next_row)
        elif row[1] == 1:
            left.append(prev_row)
            right.append(zeros_)
        else:
            left.append(prev_row)
            right.append(next_row)

    # Column 0 is kept only for the row itself, not for its neighbours.
    left = pd.DataFrame(np.array(left)).drop(columns=[0])
    centre = pd.DataFrame(np.array(df)).rename(columns={0: 'class'})
    right = pd.DataFrame(np.array(right)).drop(columns=[0])

    return pd.concat([left, centre, right], axis=1)
    

## Учитываем соседей

In [21]:
%%time

# Pass the frame's underlying array to create_neighbours; df is rebound to the widened
# DataFrame it returns, so this cell is not meant to be run twice on the same df.
df = create_neighbours(df.values)

Wall time: 6.06 s


In [26]:
# Cast every column to 64-bit floats.
df = df.astype(np.float64)

## Сохраняем ДФ для бустинга

In [29]:
# Write the final feature table to 'boost_ready.csv' without the index column.
df.to_csv('boost_ready.csv', index=False)