In [1]:
import os
import re
from random import randint

In [2]:
import gensim
from gensim.models import word2vec



In [3]:
# Working directory for this run: every path below is built from it.
# The commented-out assignment is presumably an alternative dataset folder.
# folder = os.path.join('data', '00_dec')
folder = os.path.join('data', '17_dec')

In [6]:
def _in_folder(filename):
    """Return the path of `filename` inside the working `folder`."""
    return os.path.join(folder, filename)


# Input log and the intermediate files the notebook works with.
# NOTE(review): the raw log name hard-codes '17_dec', so switching `folder`
# to another dataset also requires editing this name.
raw_log_file = _in_folder('17_dec_raw.txt')
clean_log_file = _in_folder('clean_log_dratuti.txt')
raw_msgs_file = _in_folder('raw_msgs.txt')
ltrs_only_msgs_file = _in_folder('ltrs_only_msgs.txt')
no_stopwords_msgs_file = _in_folder('no_stopwords_msgs.txt')
normalized_no_stopwords_msgs_file = _in_folder('normalized_no_stopwords_msgs.txt')

In [7]:
def write_msgs(msgs, filename, encoding=None):
    """Write messages to `filename`, one message per line.

    Parameters
    ----------
    msgs : iterable
        Messages to write. A message is either a sequence of tokens, which
        are joined with single spaces, or an already assembled string, which
        is written unchanged (joining a string would otherwise put a space
        between every character).
    filename : str
        Path of the output file; it is overwritten if it already exists.
    encoding : str, optional
        Text encoding passed to `open`. The default `None` keeps the
        platform default encoding.
    """
    lines = (msg if isinstance(msg, str) else ' '.join(msg) for msg in msgs)
    with open(filename, 'w', encoding=encoding) as lf:
        # No trailing newline is written after the last message.
        lf.write('\n'.join(lines))

# Preprocessing

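* convert to lowercase
* replace `ё` with `е`
* remove
    * URLs
    * mentions (start with `@`)
    * punctuation, digits and any other character that is not a Latin or Cyrillic letter
    * Telegram commands (lines starting with `/`)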

In [8]:
# Load the raw log, one list entry per line (line endings are kept).
with open(raw_log_file) as raw_fh:
    log = list(raw_fh)
print(len(log))

108177


In [9]:
# Drop log lines that start with '/'.
log = [line for line in log if not line.startswith('/')]

# Lowercase the whole log and blank out URLs.
regex_url = re.compile(r'https?://[^\s]*')
raw = regex_url.sub(' ', ''.join(log).lower())

# Blank out @mentions. The pattern matches the mention only: also consuming a
# neighbouring character would swallow the newline next to a mention at the
# start or end of a line (gluing two messages together) and would miss a
# mention at the very end of the text.
regex_mention = re.compile(r'@\w+')
raw = regex_mention.sub(' ', raw)

# Treat 'ё' and 'е' as the same letter.
raw = raw.replace('ё', 'е')

raw_msgs = raw.splitlines()
# Every raw message is a plain string, so wrap it in a one-element list:
# write_msgs joins the items of a message with spaces, and a bare string
# would be written with a space between every character.
write_msgs([[msg] for msg in raw_msgs], raw_msgs_file)

# Keep only Latin and Cyrillic letters; any other character becomes a space.
ltrs_str = 'abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщьыъэюя'
ltrs = {ch: ch for ch in ltrs_str}
raw_txts = [''.join(ltrs.get(ch, ' ') for ch in txt) for txt in raw_msgs]

# Trim, collapse runs of spaces, and drop messages that ended up empty.
raw_txts = [re.sub(' +', ' ', txt.strip()) for txt in raw_txts]
raw_txts = [txt for txt in raw_txts if txt]

In [10]:
# Peek at a random window of 10 consecutive cleaned messages.
# randint includes both endpoints, so an upper bound of len(raw_txts) could
# pick an index past the last item and show an empty slice; cap the start so
# the whole window fits whenever the list is long enough.
i = randint(0, max(0, len(raw_txts) - 10))
raw_txts[i : i + 10]

['hell',
 'hell',
 'hell',
 'hell',
 'hell',
 'hell',
 'hell',
 'hell',
 'hell',
 'hell']

In [11]:
# Tokenise every cleaned message on whitespace and save the result.
msgs = list(map(str.split, raw_txts))
write_msgs(msgs, ltrs_only_msgs_file)
msgs[:5]

[['поздравляю'],
 ['последний', 'день'],
 ['отсюда'],
 ['куда'],
 ['уже', 'не', 'уходишь']]

# Remove stopwords

In [12]:
from nltk.corpus import stopwords
# Russian stop words from NLTK's stopwords corpus.
# NOTE(review): presumably the corpus has to be downloaded beforehand
# (nltk.download('stopwords')) — confirm for fresh environments.
sw = stopwords.words('russian')

In [13]:
# Look words up in a set instead of scanning `sw` for every token;
# `sw` itself is left untouched.
sw_lookup = set(sw)

# Strip stop words from every message, then drop messages left empty.
msgs_nosw = [
    kept
    for kept in ([word for word in msg if word not in sw_lookup] for msg in msgs)
    if kept
]
write_msgs(msgs_nosw, no_stopwords_msgs_file)
msgs_nosw[:10]

[['поздравляю'],
 ['последний', 'день'],
 ['отсюда'],
 ['уходишь'],
 ['х', 'т'],
 ['чо', 'хуйня'],
 ['н', 'л', 'т'],
 ['voiceru', 'bot', 'глупый', 'глупый'],
 ['неправильный', 'запрос', 'попробуй'],
 ['работает', 'inline', 'режиме']]

# Normalize words with `pymorphy2`

In [14]:
import pymorphy2
# Module-level analyzer instance; to_norm below uses it to look up normal forms.
morph = pymorphy2.MorphAnalyzer()

# Hand-written overrides applied to the normal form returned by the analyzer.
_NORM_FIXES = {
    'пидора': 'пидор',
    'пидра': 'пидр',
    'вегана': 'веган',
    'нихуй': 'нихуя',
    'хуйль': 'хуйли',
    'хула': 'хуле',
    'штол': 'штоле',
    'гея': 'гей',
}


def to_norm(word, morph_analyzer=None):
    """Return the normal form of `word`, with a few manual corrections.

    Parameters
    ----------
    word : str
        A single token.
    morph_analyzer : optional
        Object exposing `parse(word)` whose first result has a `normal_form`
        attribute. When omitted, the module-level `morph` analyzer is used.
        The argument is honoured when given; it used to be ignored in favour
        of the global.

    Returns
    -------
    str
        `word` itself for 'чай'; otherwise the first parse's normal form,
        replaced by its entry in `_NORM_FIXES` when one exists.
    """
    # 'чай' bypasses the analyzer entirely.
    # NOTE(review): presumably because its normal form comes out wrong — confirm.
    if word == 'чай':
        return word

    if morph_analyzer is None:
        # Resolved at call time, so a re-created `morph` is picked up.
        morph_analyzer = morph

    norm = morph_analyzer.parse(word)[0].normal_form
    return _NORM_FIXES.get(norm, norm)

def normalize_msg(msg):
    """Lazily yield the normalised form of every word in `msg`.

    `msg` may be a whitespace-separated string or an iterable of words;
    each word is passed through `to_norm`.
    """
    tokens = msg.split() if type(msg) is str else msg
    yield from (to_norm(token) for token in tokens)

In [15]:
# Normalise every word of every stop-word-free message and save the result.
msgs_nosw_norm = [[*normalize_msg(tokens)] for tokens in msgs_nosw]
write_msgs(msgs_nosw_norm, normalized_no_stopwords_msgs_file)
msgs_nosw_norm[:25]

[['поздравлять'],
 ['последний', 'день'],
 ['отсюда'],
 ['уходить'],
 ['х', 'том'],
 ['чо', 'хуйня'],
 ['наш', 'литр', 'том'],
 ['voiceru', 'bot', 'глупый', 'глупый'],
 ['неправильный', 'запрос', 'попробовать'],
 ['работать', 'inline', 'режим'],
 ['общааться', 'он', 'напрямую'],
 ['тупой'],
 ['похуй', 'ключ'],
 ['текст'],
 ['хз', 'установиться'],
 ['указать',
  'свой',
  'ключ',
  'yandex',
  'speechkit',
  'cloud',
  'получить',
  'адрес'],
 ['дробный', 'help'],
 ['настройка'],
 ['администратор', 'добавить', 'бот', 'valentin'],
 ['режим', 'работа', 'текст', 'речь', 'речь', 'текст'],
 ['тихий', 'режим', 'служебный', 'сообщение', 'включить'],
 ['команда', 'присылать', 'угодный'],
 ['ключ', 'yandex', 'speechkit', 'cloud', 'установленный'],
 ['голос', 'maxim'],
 ['эмоция', 'доброжелательный']]

-------

In [16]:
# import sklearn
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer(analyzer = "word",
#                              tokenizer = None,
#                              preprocessor = None,
#                              stop_words = sw,
#                              max_features = 1000)

# %%time
# data_features = vectorizer.fit_transform(text_ltrs)

# feats = data_features.toarray()

# sum(sum(feats))

# vectorizer.get_feature_names()[-5:]