In [1]:
%%writefile sentiment.py
import pandas as pd
import re
import pymorphy2
import numpy as np
import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.wrappers.fasttext import FastTextKeyedVectors
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pickle as pkl
from dostoevsky.tokenization import RegexTokenizer
from dostoevsky.models import FastTextSocialNetworkModel
from pymorphy2.analyzer import MorphAnalyzer
import datetime

Overwriting sentiment.py


In [None]:
def add_emotional_columns(x):
    emotions = {')', '!', '('}
    for e in emotions:
        if e in x:
            return 1
    return 0


lemmatizer = MorphAnalyzer()


bad_words = {'–±–æ–º–∂', '–µ–±—É—á–∏–π', '–∞–¥', '–ø–∏–¥–∞—Ä–∞—Å–∏–Ω–∞', '–æ—Ç—Å–æ—Å–∏', '—à–ª—é—Ö–∞', '–Ω–∞—Ö—É–π', '–∫–æ–∫—Å', '–ø–∏–¥–æ—Ä', '–≥–∞–¥–∫–∏–π',
'–ª–æ—Ö', '–ø–∏–¥—Ä', '–ø–æ—à–ª–Ω—Ö', '—É—Ä–æ–¥', '—Ö—É–π', '–ø–æ–¥–∞–≤–∏—Å—å', '–ø–∑–¥—Ü', '–∂–æ–ø–∞', '–ø–µ–¥–∏–∫' '–∑–∞–µ–±–∞–ª', '–µ–±—É—á–∏–π', '–ø–∏–∑–¥–µ—Ü',
'–≥–∞–Ω–¥–æ–Ω', '–¥–æ–ª–±–æ–µ–±', '—Ö—É–µ—Å–æ—Å', '–º—Ä–∞–∑—å', '—Å–∫–æ—Ç–∏–Ω–∞', '–≥–Ω–∏–¥–∞', '–ø—Ä–æ—Å—Ç–∏—Ç—É—Ç–∫–∞', '–º–∞–Ω–¥–∞', '–±–ª—è', '–∫–æ–ª–ª–µ–∫—Ç–æ—Ä',
'–≤–∑—è—Ç–∫–∞', '–æ—Ç–∫–∞—Ç', '—Ç–µ—Ä—Ä–æ—Ä', '—Ç–µ—Ä—Ä–æ—Ä–∏–∑–º', '–≤–∑—Ä—ã–≤', '–±–æ–º–±–∞', '–¥—Ä–∞–∫–∞', '—É–±–∏–π—Å—Ç–≤–æ', '—Ç—Ä—É–ø', '—É–±–∏—Ç—å',
'–µ–±–∞—Ç—å', '—Ä–∞–∑—ä–µ–±–∞—Ç—å', '–≤—ä–µ–±–∞—Ç—å', '—á–ª–µ–Ω'}
 
good_words = {'—Å–ø–∞—Å–∏–±–æ', '–ø–æ–∑–¥—Ä–∞–≤–ª—è—é', '–ª—é–±–ª—é', '–º–∏–ª–∞—è', '–º–∏–ª—ã–π',
'–ø—Ä–∞–∑–¥–Ω–∏–∫', '—Ä–æ–∂–¥–µ–Ω–∏—è', '–∑–¥–æ—Ä–æ–≤—å—è', '–ª—é–±–∏–º–∞—è', '–º—É–∂', '–∫–æ—Ç–∏–∫', '–∫–æ—Ç'
'—Ü–µ–ª—É—é', '—Å–ø—Å', '–¥—Ä', '–¥–Ω—é—Ö–∞', '–±–ª–∞–≥–æ–¥–∞—Ä—é', '‚ù§', 'üòÇ', '—Å–ª–∞–¥–æ—Å—Ç–∏', 'üòô', '–∑–∞—è',
'–¥–æ—á–µ–Ω—å–∫–∞', 'üòä', 'üòÄ', '—Å—ã–Ω–æ–∫', 'üòò', 'üòç', '—Ö–æ—Ä–æ—à–∏–π', '‚ù§', 'üòâ', '–º–∞–º–∞', '–±–æ–≥', '–∑–¥–æ—Ä–æ–≤—å–µ',
'–¥—Ä—É–≥', 'üëã', '–±—Ä–∞—Ç', '–¥–µ—Ç–∫–∞', 'üòΩ', 'üòã', '—á–º–æ–∫', '—Å–ø–∞—Å', '–∫—Ä–∞—Å–∞–≤—á–∏–∫', 'üòà',
'—Å–≤–∞–¥—å–±–∞', '—Å—á–∞—Å—Ç—å–µ', 'üí™', 'üòÅ', 'üòé', 'üí∞', '–¥—Ä—É–≥', '–¥—Ä—É–∂–æ—á–µ–∫', '–±–∏—Å–º–∏–ª–ª–∞—Ö', '–ø–∞–ø–∞', 'üòã', '–ª—é–±–∏–º–æ–π',
'—Ä—ç—Ö–º—ç—Ç', '–ø—Ä–∏—è—Ç–Ω—ã–π', '–ª—é–±–æ–≤—å', '–¥–æ—Ä–æ–≥–∞—è', '–¥–æ—Ä–æ–≥–æ–π', '–ø–æ–∂–∞–ª—É–π—Å—Ç–∞', 'üòÇ', '–ª—é–±–∏–º–æ–π', '–∫—Ä–∞—Å–æ—Ç–∞', 'ü§ó',
'–¥–æ—á—å'}

def razmetka_positive(x):
    x = x.lower().split()
    for i in range(len(x)):
        x[i] = ''.join(e for e in x[i] if e.isalnum())
        
    lemmatized_word = set()
    global good_words
    for w in x:
        lemm = lemmatizer.parse(w)[0].normal_form
        lemmatized_word.add(w)
    
    intersect = len(lemmatized_word & good_words)
    
    if intersect > 0:
        return 1

    return 0
    
    
def razmetka_negative(x):
    x = x.lower().split()
    for i in range(len(x)):
        x[i] = ''.join(e for e in x[i] if e.isalnum())
        
    lemmatized_word = set()
    global bad_words
    for w in x:
        lemm = lemmatizer.parse(w)[0].normal_form
        lemmatized_word.add(w)
    
    intersect = len(lemmatized_word & bad_words)
    
    if intersect > 0:
        return 1

    return 0

In [None]:
data = pd.read_pickle('new_data.pkl')

data['emotional_column'] = data['message'].apply(add_emotional_columns)
data['pos_target'] = data['message'].apply(razmetka_positive)
data['neg_target'] = data['message'].apply(razmetka_negative)

data.drop('target',axis=1,inplace=True)

data.to_csv('training_data_with_razmetka_final.csv', index=False)

In [175]:

        
tokenizer = RegexTokenizer()
model = FastTextSocialNetworkModel(tokenizer=tokenizer)

# morph analyzer for text lemmatization
morph = pymorphy2.MorphAnalyzer()
fasttext = FastTextKeyedVectors.load('187/model.model')
pos_log_reg = pkl.load(open('pos_log_reg.pkl', 'rb'))
neg_log_reg = pkl.load(open('neg_log_reg.pkl', 'rb'))
pos_log_reg_dost = pkl.load(open('pos_log_reg_dost.pkl', 'rb'))
neg_log_reg_dost = pkl.load(open('neg_log_reg_dost.pkl', 'rb'))

old_data = pd.read_pickle('data/new_data.pkl')
old_data['index'] = old_data.index

training_data = pd.read_csv('data/training_data_with_razmetka_final.csv')

data_new = training_data.merge(old_data, on=['index','message'])
cut_date = lambda x: datetime.date(x.year, x.month, x.day)
data_new['local_datetime'] = pd.to_datetime(data_new.local_datetime).apply(cut_date)

# function for performing parallel computing on cpu
def parallelization(func, massive, jobs=None, tq=True):
    num_cores = multiprocessing.cpu_count() if jobs is None else jobs
    if tq:
        results = np.array(Parallel(n_jobs=num_cores)(delayed(func)(i) for i in tqdm(massive)))
        return results
    else:
        results = Parallel(n_jobs=num_cores)(delayed(func)(i) for i in massive)
        return results


def _word2canonical4w2v(word):
    elems = morph.parse(word)
    my_tag = ''
    res = []
    for elem in elems:
        if 'VERB' in elem.tag or 'GRND' in elem.tag or 'INFN' in elem.tag:
            my_tag = 'V'
        if 'NOUN' in elem.tag:
            my_tag = 'S'
        normalised = elem.normalized.word
        res.append((normalised, my_tag))
    tmp = list(filter(lambda x: x[1] != '', res))
    if len(tmp) > 0:
        return tmp[0]
    else:
        return res[0]


def word2canonical(word):
    return _word2canonical4w2v(word)[0]


def get_words(text, filter_short_words=False):
    if filter_short_words:
        return filter(lambda x: len(x) > 2, re.findall('[–∞-—è–ê-–Øa-zA-Z]+', text))#re.findall(r'(?u)\w+', text))
    else:
        return re.findall(r'(?u)\w+', text)

def text2canonicals(text, add_word=False, filter_short_words=True):
    words = []
    for word in get_words(text, filter_short_words=filter_short_words):
        words.append(word2canonical(word.lower()))
        if add_word:
            words.append(word.lower())
    return words


def get_text_vectors(text):
    matrix = np.zeros((len(text), 300))
    for i,word in enumerate(text):
        vector = fasttext[word]
        matrix[i] = vector
        
    return matrix

def get_dost_vector(pred):
    return np.array([pred['positive'], pred['skip'], pred['speech'], pred['neutral'], pred['positive']])


def preprocess(texts):
    # embedding vectors weighted with tfidf
    preprocessed_texts = parallelization(text2canonicals, texts)
    lengths = np.array(list(map(lambda x: len(x) if len(x) > 0 else 1, preprocessed_texts)))
    
    texts = list(map(lambda x: ' '.join(x), preprocessed_texts))
    vectorizer = TfidfVectorizer()
    tfifd_vectorized = vectorizer.fit_transform(texts).toarray()
    unique_words = list(map(lambda x: x[0], sorted(vectorizer.vocabulary_.items())))
    
    all_vectors = get_text_vectors(unique_words)
    weighted_embeddings = tfifd_vectorized @ all_vectors
    weighted_embeddings /= lengths.reshape(-1, 1)
    del tfifd_vectorized, all_vectors
    
    return weighted_embeddings

def preprocess_single_text(text):
    # embedding vectors weighted with tfidf
    preprocessed_text = text2canonicals(text)
    length = len(preprocessed_text) if len(x) > 0 else 1
    
    preprocessed_text = ' '.join(preprocessed_text)
    vectorizer = pkl.load(open('vectorizer.pkl', 'wb'))
    tfifd_vectorized = vectorizer.transform([text]).toarray()
    unique_words = list(map(lambda x: x[0], sorted(vectorizer.vocabulary_.items())))
    
    all_vectors = get_text_vectors(unique_words)
    weighted_embeddings = tfifd_vectorized @ all_vectors
    weighted_embeddings /= length
    del tfifd_vectorized, all_vectors
    
    return weighted_embeddings


def preprocess2(texts, use_dost=False):
    # mean embedding vectors
    if use_dost:
        preds = model.predict(texts)
        dost_vectors = np.array(list(map(get_dost_vector, preds)))
    
    #preprocessed_texts = parallelization(text2canonicals, texts)
    preprocessed_texts = list(map(lambda x: x.lower().split(), texts))
    
    embeddings = np.zeros((len(texts), 300))
    for i, text in enumerate(preprocessed_texts):
        vectors = get_text_vectors(text)
        if vectors.shape[0] > 0:
            vector = np.mean(vectors, axis=0)
        else:
            vector = np.random.randn(300,)
        embeddings[i] = vector
        
    if use_dost:
        return np.concatenate((embeddings, dost_vectors), axis=1)
    return embeddings


def emotional(x):
    emotions = {')', '!', '('}
    for e in emotions:
        if e in x:
            return 1
    return 0


def sentiment_analysis(sentences, use_dost=True):
    emotional_col = list(map(lambda x: emotional(x), sentences))
    prep_sent = preprocess2(sentences, True)
    inputs = np.concatenate((prep_sent, np.array(emotional_col).reshape(-1, 1)), axis=1)
    if use_dost:
        preds_pos = list(map(lambda x: np.round(x[1], 3), pos_log_reg_dost.predict_proba(inputs)))
        preds_neg = list(map(lambda x: np.round(x[1], 3), neg_log_reg_dost.predict_proba(inputs)))
    else:
        preds_pos = list(map(lambda x: np.round(x[1], 3), pos_log_reg.predict_proba(inputs)))
        preds_neg = list(map(lambda x: np.round(x[1], 3), neg_log_reg.predict_proba(inputs)))
    
    return np.array([preds_pos, preds_neg]).T


def get_date_list(numdays, base):
    date_list = [base - datetime.timedelta(days=x) for x in range(numdays)]
    return date_list


def sent_analyse_dates(period, base=datetime.date(2019, 9, 3)):
    if period == 'week':
        date_list = get_date_list(7, base)
        data_period = data_new[data_new.local_datetime.isin(date_list)]

    elif period == 'month':
        date_list = get_date_list(31, base)
        data_period = data_new[data_new.local_datetime.isin(date_list)]

    elif period == 'all':
        data_period = data_new
    else:
        raise ValueError('invalid period name')
        
    predictions = sentiment_analysis(data_period.message.values)
    return np.mean(predictions, axis=0)

Overwriting sentiment.py


In [28]:
#pkl.dump(pos_log_reg, open('pos_log_reg_dost.pkl', 'wb'))
#pkl.dump(neg_log_reg, open('neg_log_reg_dost.pkl', 'wb'))

In [17]:
inds = list(training_data.index)
np.random.shuffle(inds)
train_inds = inds[:int(len(inds) * 0.8)]
test_inds = inds[int(len(inds) * 0.8):]

In [18]:
data_train = training_data[training_data.index.isin(train_inds)]
data_test = training_data[training_data.index.isin(test_inds)]

In [19]:
pos_targets_train = data_train.pos_target#np.array(data.label == 'positive').astype(np.int32)
neg_targets_train = data_train.neg_target

pos_targets_test = data_test.pos_target#np.array(data.label == 'positive').astype(np.int32)
neg_targets_test = data_test.neg_target

In [22]:
embeddings_train = preprocess2(data_train.message.values, use_dost=True)
embeddings_test = preprocess2(data_test.message.values, use_dost=True)

Using TensorFlow backend.
  return f(*args, **kwds)
  return f(*args, **kwds)




In [23]:
embeddings_train = np.concatenate((embeddings_train,
                                      data_train.emotional_column.values.reshape(-1, 1)), axis=1)
embeddings_test = np.concatenate((embeddings_test,
                                      data_test.emotional_column.values.reshape(-1, 1)), axis=1)

In [24]:
pos_log_reg = LogisticRegression()
neg_log_reg = LogisticRegression()

In [25]:
X_train, y_train_pos, y_train_neg =  embeddings_train, pos_targets_train, neg_targets_train
X_test, y_test_pos, y_test_neg = embeddings_test, pos_targets_test, neg_targets_test


In [26]:
pos_log_reg.fit(X_train, y_train_pos)
neg_log_reg.fit(X_train, y_train_neg)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [27]:
print('Positive train accuracy: {}'.format(pos_log_reg.score(X_train, y_train_pos)))
print('Positive test accuracy: {}'.format(pos_log_reg.score(X_test, y_test_pos)))
print()
print('Negative train accuracy: {}'.format(neg_log_reg.score(X_train, y_train_neg)))
print('Negative test accuracy: {}'.format(neg_log_reg.score(X_test, y_test_neg)))

Positive train accuracy: 0.9841030663753485
Positive test accuracy: 0.980864848576164

Negative train accuracy: 0.9991712499058238
Negative test accuracy: 0.9966852493596504


In [87]:
preds_pos = list(map(lambda x: np.round(x[1], 3), pos_log_reg.predict_proba(X_test)))
preds_neg = list(map(lambda x: np.round(x[1], 3), neg_log_reg.predict_proba(X_test)))

In [88]:
data_test['preds_pos'] = preds_pos
data_test['preds_neg'] = preds_neg

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [91]:
data_test[['message', 'preds_pos', 'preds_neg']].sort_values(by='preds_pos', ascending=False).sample(100)

Unnamed: 0,message,preds_pos,preds_neg
17374,–õ–∏–ª–∏—è,0.004,0.003
32099,"–ù–∞–ø–∏—à–∏, –∫–∞–∫ –ø—Ä–∏–¥—É—Ç",0.006,0.000
12180,–†–∂–µ–≤—Å–∫–∏–π,0.000,0.001
33100,–¥–æ–ª–≥,0.003,0.000
7298,–ú–∞—Ä–∫—É—Ö–∏–Ω –ü–∞–≤–µ–ª –ê–ª–µ–∫—Å–∞–Ω–¥—Ä–æ–≤–∏—á,0.000,0.000
12937,—é—Ä–∞—à–∏–∫,0.005,0.000
32019,–ú–∞—Ü—É–∫–æ–≤–∞ –î–∞—Ä—å—è,0.001,0.000
32029,–£ –ú–µ–Ω—è –û—Å—Ç–∞–ª–æ—Å—å -–∞.—Ä—É–±–ª—è.,0.010,0.000
8007,–û—Ç –ï–ª–µ–Ω—ã –Æ,0.003,0.000
1617,–ò—Ä–∏–Ω–∞,0.000,0.000
