In [1]:
# !pip install catboost
# !pip install gensim
# !pip install pymystem3

In [2]:
import numpy as np
import pandas as pd
import datetime
import nltk
from nltk.corpus import stopwords
from string import punctuation
from nltk import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import train_test_split



In [3]:
df = pd.read_csv('./data/tweet_cleaned.csv').sample(frac = 1)

In [4]:
df.head(5)

Unnamed: 0,text,target,clean_tweet
108738,Азазаза ;3\nМне начинает нравится это фото) ht...,0,азазаза начинает нравится это фото
208661,Снова начинается подобная хрень:( http://t.co/...,1,снова начинается подобная хрень печаль
52972,RT @VasilyevaDi: стоим дежурим:)вспомнили наши...,0,пользователь стоим дежурим вспомнили наших мал...
59376,Ертенгі собрга 2 узын ниггер бірге барат) как ...,0,ертенгі собрга узын ниггер бірге барат
45607,Новоиспеченные -Александр Маршал и Эми Уайнхау...,0,новоиспеченные александр маршал эми уайнхауз р...


In [5]:
import gensim.downloader as api

In [6]:
model = api.load("word2vec-ruscorpora-300")

In [2]:
from pymystem3 import Mystem

m = Mystem()

pos_conv = {
    "A": "ADJ",
    "ADV": "ADV",
    "ADVPRO": "ADV",
    "ANUM": "ADJ",
    "APRO": "DET",
    "COM": "ADJ",
    "CONJ": "SCONJ",
    "INTJ": "INTJ",
    "NONLEX": "X",
    "NUM": "NUM",
    "PART": "PART",
    "PR": "ADP",
    "S": "NOUN",
    "SPRO": "PRON",
    "UNKN": "X",
    "V": "VERB",
}

default_word = 'человек_NOUN'

def tag(word):
    try:
        processed = m.analyze(word)[0]
        lemma = processed["analysis"][0]["lex"].lower().strip()
        pos = processed["analysis"][0]["gr"].split(',')[0]
        pos = pos.split('=')[0].strip()
        tagged = f"{lemma}_{pos_conv.get(pos) or 'NOUN'}"
        return tagged
    except:
        return default_word

In [8]:
def get_vec(text, agg_func):
    v = []
    for w in text.split(' '):
        tagged = tag(w)
        v.append(model.get_vector(tagged) if model.vocab.get(tagged) else model.get_vector('человек_NOUN'))
    return agg_func(np.array(v))

In [9]:
def _avg(v):
    return np.sum(v, axis=0)/v.shape[0]

def _max(v):
    return np.max(v, axis=0)

def _min(v):
    return np.min(v, axis=0)

In [10]:
df.fillna(default_word, inplace=True)

In [11]:
example_v = model.get_vector(default_word)

columns_feat = [f'min_{x}' for x in range(example_v.shape[0])] \
    + [f'max_{x}' for x in range(example_v.shape[0])] \
    + [f'avg_{x}' for x in range(example_v.shape[0])]

hash_vectorizer = HashingVectorizer(n_features=2**9)

hash_cols = [f'hvec_{x}' for x in range(2**9)]

df_vec = pd.DataFrame(columns=columns_feat+hash_cols)

In [15]:
with open('./msg_columns.pickle', 'wb') as f:
    pickle.dump(columns_feat+hash_cols, f)

In [12]:
import pickle

with open('./msg_hash_vec.pickle', 'wb') as f:
    pickle.dump(hash_vectorizer, f)

In [13]:
def get_feat(text, aggr):
    if type(text) is not str:
        text = default_word
    vec = get_vec(text, _min).tolist() + get_vec(text, _max).tolist() + get_vec(text, _avg).tolist()
    return vec

In [14]:
%%time

h = np.array(hash_vectorizer.fit_transform(df['clean_tweet']).toarray())

a = datetime.datetime.now().replace(microsecond=0)

c = 1000

for x in range(50):
    df_vec_partial = pd.DataFrame(columns=columns_feat+hash_cols)    

    i=0  

    for index, row in df[x*c:(x+1)*c].iterrows():
        if i % 1000 == 0:
            b = datetime.datetime.now().replace(microsecond=0)
            print(x, b-a)
            a = b
        
        text = row['clean_tweet']

        if type(text) is not str:
            text = default_word

        df_vec_partial.loc[i] = get_vec(text, _min).tolist() \
            + get_vec(text, _max).tolist() + get_vec(text, _avg).tolist() \
            + h[x+i,:].tolist()

        i+=1
        
    df_vec_partial['target'] = df[x*c:(x+1)*c]['target'].to_list()
    df_vec_partial.to_csv(f'./data/tweets_partials/tweet_prepared_{x}.csv', index=False)

0 0:00:00
1 0:00:12
2 0:00:12
3 0:00:11
4 0:00:11
5 0:00:11
6 0:00:11
7 0:00:12
8 0:00:11
9 0:00:11
10 0:00:11
11 0:00:12
12 0:00:11
13 0:00:11
14 0:00:11
15 0:00:11
16 0:00:12
17 0:00:11
18 0:00:13
19 0:00:11
20 0:00:11
21 0:00:11
22 0:00:11
23 0:00:12
24 0:00:11
25 0:00:12
26 0:00:11
27 0:00:12
28 0:00:12
29 0:00:11
30 0:00:11
31 0:00:12
32 0:00:11
33 0:00:12
34 0:00:12
35 0:00:11
36 0:00:11
37 0:00:12
38 0:00:11
39 0:00:12
40 0:00:11
41 0:00:12
42 0:00:11
43 0:00:11
44 0:00:12
45 0:00:12
46 0:00:12
47 0:00:11
48 0:00:11
49 0:00:12
CPU times: user 6min 42s, sys: 52.9 s, total: 7min 35s
Wall time: 9min 33s
