In [175]:
%%writefile sentiment.py
import pandas as pd
import re
import pymorphy2
import numpy as np
import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.wrappers.fasttext import FastTextKeyedVectors
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pickle as pkl
from dostoevsky.tokenization import RegexTokenizer
from dostoevsky.models import FastTextSocialNetworkModel
import datetime

        
tokenizer = RegexTokenizer()
model = FastTextSocialNetworkModel(tokenizer=tokenizer)

# morph analyzer for text lemmatization
morph = pymorphy2.MorphAnalyzer()
fasttext = FastTextKeyedVectors.load('187/model.model')
pos_log_reg = pkl.load(open('pos_log_reg.pkl', 'rb'))
neg_log_reg = pkl.load(open('neg_log_reg.pkl', 'rb'))
pos_log_reg_dost = pkl.load(open('pos_log_reg_dost.pkl', 'rb'))
neg_log_reg_dost = pkl.load(open('neg_log_reg_dost.pkl', 'rb'))

old_data = pd.read_pickle('data/new_data.pkl')
old_data['index'] = old_data.index
training_data = pd.read_csv('data/training_data_with_razmetka_final.csv')

data_new = training_data.merge(old_data, on=['index','message'])
cut_date = lambda x: datetime.date(x.year, x.month, x.day)
data_new['local_datetime'] = pd.to_datetime(data_new.local_datetime).apply(cut_date)

# function for performing parallel computing on cpu
def parallelization(func, massive, jobs=None, tq=True):
    num_cores = multiprocessing.cpu_count() if jobs is None else jobs
    if tq:
        results = np.array(Parallel(n_jobs=num_cores)(delayed(func)(i) for i in tqdm(massive)))
        return results
    else:
        results = Parallel(n_jobs=num_cores)(delayed(func)(i) for i in massive)
        return results


def _word2canonical4w2v(word):
    elems = morph.parse(word)
    my_tag = ''
    res = []
    for elem in elems:
        if 'VERB' in elem.tag or 'GRND' in elem.tag or 'INFN' in elem.tag:
            my_tag = 'V'
        if 'NOUN' in elem.tag:
            my_tag = 'S'
        normalised = elem.normalized.word
        res.append((normalised, my_tag))
    tmp = list(filter(lambda x: x[1] != '', res))
    if len(tmp) > 0:
        return tmp[0]
    else:
        return res[0]


def word2canonical(word):
    return _word2canonical4w2v(word)[0]


def get_words(text, filter_short_words=False):
    if filter_short_words:
        return filter(lambda x: len(x) > 2, re.findall('[а-яА-Яa-zA-Z]+', text))#re.findall(r'(?u)\w+', text))
    else:
        return re.findall(r'(?u)\w+', text)

def text2canonicals(text, add_word=False, filter_short_words=True):
    words = []
    for word in get_words(text, filter_short_words=filter_short_words):
        words.append(word2canonical(word.lower()))
        if add_word:
            words.append(word.lower())
    return words


def get_text_vectors(text):
    matrix = np.zeros((len(text), 300))
    for i,word in enumerate(text):
        vector = fasttext[word]
        matrix[i] = vector
        
    return matrix

def get_dost_vector(pred):
    return np.array([pred['positive'], pred['skip'], pred['speech'], pred['neutral'], pred['positive']])


def preprocess(texts):
    # embedding vectors weighted with tfidf
    preprocessed_texts = parallelization(text2canonicals, texts)
    lengths = np.array(list(map(lambda x: len(x) if len(x) > 0 else 1, preprocessed_texts)))
    
    texts = list(map(lambda x: ' '.join(x), preprocessed_texts))
    vectorizer = TfidfVectorizer()
    tfifd_vectorized = vectorizer.fit_transform(texts).toarray()
    unique_words = list(map(lambda x: x[0], sorted(vectorizer.vocabulary_.items())))
    
    all_vectors = get_text_vectors(unique_words)
    weighted_embeddings = tfifd_vectorized @ all_vectors
    weighted_embeddings /= lengths.reshape(-1, 1)
    del tfifd_vectorized, all_vectors
    
    return weighted_embeddings


def preprocess2(texts, use_dost=False):
    # mean embedding vectors
    if use_dost:
        preds = model.predict(texts)
        dost_vectors = np.array(list(map(get_dost_vector, preds)))
    
    #preprocessed_texts = parallelization(text2canonicals, texts)
    preprocessed_texts = list(map(lambda x: x.lower().split(), texts))
    
    embeddings = np.zeros((len(texts), 300))
    for i, text in enumerate(preprocessed_texts):
        vectors = get_text_vectors(text)
        if vectors.shape[0] > 0:
            vector = np.mean(vectors, axis=0)
        else:
            vector = np.random.randn(300,)
        embeddings[i] = vector
        
    if use_dost:
        return np.concatenate((embeddings, dost_vectors), axis=1)
    return embeddings


def emotional(x):
    emotions = {')', '!', '('}
    for e in emotions:
        if e in x:
            return 1
    return 0


def sentiment_analysis(sentences, use_dost=True):
    emotional_col = list(map(lambda x: emotional(x), sentences))
    prep_sent = preprocess2(sentences, True)
    inputs = np.concatenate((prep_sent, np.array(emotional_col).reshape(-1, 1)), axis=1)
    if use_dost:
        preds_pos = list(map(lambda x: np.round(x[1], 3), pos_log_reg_dost.predict_proba(inputs)))
        preds_neg = list(map(lambda x: np.round(x[1], 3), neg_log_reg_dost.predict_proba(inputs)))
    else:
        preds_pos = list(map(lambda x: np.round(x[1], 3), pos_log_reg.predict_proba(inputs)))
        preds_neg = list(map(lambda x: np.round(x[1], 3), neg_log_reg.predict_proba(inputs)))
    
    return np.array([preds_pos, preds_neg]).T


def get_date_list(numdays, base):
    date_list = [base - datetime.timedelta(days=x) for x in range(numdays)]
    return date_list


def sent_analyse_dates(period, base=datetime.date(2019, 9, 3)):
    if period == 'week':
        date_list = get_date_list(7, base)
        data_period = data_new[data_new.local_datetime.isin(date_list)]

    elif period == 'month':
        date_list = get_date_list(31, base)
        data_period = data_new[data_new.local_datetime.isin(date_list)]

    elif period == 'all':
        data_period = data_new
    else:
        raise ValueError('invalid period name')
        
    predictions = sentiment_analysis(data_period.message.values)
    return np.mean(predictions, axis=0)

Overwriting sentiment.py


In [28]:
#pkl.dump(pos_log_reg, open('pos_log_reg_dost.pkl', 'wb'))
#pkl.dump(neg_log_reg, open('neg_log_reg_dost.pkl', 'wb'))

In [2]:
sentiment_analysis(['членосос ты ебаный', 'любовь моя'])

array([[0.002, 0.974],
       [0.998, 0.001]])

In [1]:
from sentiment import sent_analyse_dates

In [2]:
preds = sent_analyse_dates('all')

In [3]:
preds

array([0.11251285, 0.00827071])

In [17]:
inds = list(training_data.index)
np.random.shuffle(inds)
train_inds = inds[:int(len(inds) * 0.8)]
test_inds = inds[int(len(inds) * 0.8):]

In [18]:
data_train = training_data[training_data.index.isin(train_inds)]
data_test = training_data[training_data.index.isin(test_inds)]

In [19]:
pos_targets_train = data_train.pos_target#np.array(data.label == 'positive').astype(np.int32)
neg_targets_train = data_train.neg_target

pos_targets_test = data_test.pos_target#np.array(data.label == 'positive').astype(np.int32)
neg_targets_test = data_test.neg_target

In [22]:
embeddings_train = preprocess2(data_train.message.values, use_dost=True)
embeddings_test = preprocess2(data_test.message.values, use_dost=True)

Using TensorFlow backend.
  return f(*args, **kwds)
  return f(*args, **kwds)




In [23]:
embeddings_train = np.concatenate((embeddings_train,
                                      data_train.emotional_column.values.reshape(-1, 1)), axis=1)
embeddings_test = np.concatenate((embeddings_test,
                                      data_test.emotional_column.values.reshape(-1, 1)), axis=1)

In [24]:
pos_log_reg = LogisticRegression()
neg_log_reg = LogisticRegression()

In [25]:
X_train, y_train_pos, y_train_neg =  embeddings_train, pos_targets_train, neg_targets_train
X_test, y_test_pos, y_test_neg = embeddings_test, pos_targets_test, neg_targets_test


In [26]:
pos_log_reg.fit(X_train, y_train_pos)
neg_log_reg.fit(X_train, y_train_neg)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [27]:
print('Positive train accuracy: {}'.format(pos_log_reg.score(X_train, y_train_pos)))
print('Positive test accuracy: {}'.format(pos_log_reg.score(X_test, y_test_pos)))
print()
print('Negative train accuracy: {}'.format(neg_log_reg.score(X_train, y_train_neg)))
print('Negative test accuracy: {}'.format(neg_log_reg.score(X_test, y_test_neg)))

Positive train accuracy: 0.9841030663753485
Positive test accuracy: 0.980864848576164

Negative train accuracy: 0.9991712499058238
Negative test accuracy: 0.9966852493596504


In [87]:
preds_pos = list(map(lambda x: np.round(x[1], 3), pos_log_reg.predict_proba(X_test)))
preds_neg = list(map(lambda x: np.round(x[1], 3), neg_log_reg.predict_proba(X_test)))

In [88]:
data_test['preds_pos'] = preds_pos
data_test['preds_neg'] = preds_neg

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [91]:
data_test[['message', 'preds_pos', 'preds_neg']].sort_values(by='preds_pos', ascending=False).sample(100)

Unnamed: 0,message,preds_pos,preds_neg
17374,Лилия,0.004,0.003
32099,"Напиши, как придут",0.006,0.000
12180,Ржевский,0.000,0.001
33100,долг,0.003,0.000
7298,Маркухин Павел Александрович,0.000,0.000
12937,юрашик,0.005,0.000
32019,Мацукова Дарья,0.001,0.000
32029,У Меня Осталось -а.рубля.,0.010,0.000
8007,От Елены Ю,0.003,0.000
1617,Ирина,0.000,0.000


# Make predictions for our data

In [14]:
#import pickle as pkl
#data_new = pkl.load(open('data.pkl', 'rb'))

In [434]:
# msg = df[(df.target == 0)].message.values


# unique = np.unique(list(map(lambda x: x.lower(), msg)))

# np.save('negative.npy', unique)

# out = parallelization(text2canonicals, unique)

# all_positive = [x for lst in out for x in lst]

# from collections import Counter
# from nltk.corpus import stopwords

# russian_stopwords = set(stopwords.words('russian'))

# def good_tag(word):
#     elem = morph.parse(word)[0]
#     my_tag = ''
#     res = []
#     lst = ['VERB','GRND', 'INFN','NOUN','ADVB', 'PRTS','PRTF', 'COMP', 'PRED']

#     for tag in lst:
#         if elem.tag.POS is not None and elem.tag.POS in lst:
#             return True
#     return False


# apos = list(filter(lambda x: x not in russian_stopwords and good_tag(x), all_positive))

# cntr = Counter(apos)

In [363]:
#pkl.dump(cntr.most_common(1000), open('most_common_positive.pkl', 'wb'))

In [214]:
message_embeddings = preprocess2(df.message.values)

In [215]:
message_embeddings = np.concatenate((message_embeddings,
                                     df.emotional_column.values.reshape(-1, 1)), axis=1)

In [216]:
pos_preditctions = pos_log_reg.predict_proba(message_embeddings)
neg_predictions = neg_log_reg.predict_proba(message_embeddings)

In [218]:
pos_col = list(map(lambda x: x[1], pos_preditctions))
neg_col = list(map(lambda x: x[1], neg_preditctions))
df['pos_preds'] = pos_col
df['neg_preds'] = neg_col

In [238]:
df[['message', 'pos_preds', 'neg_preds']]#.sort_values('predictions', ascending=False)

Unnamed: 0,message,predictions
51782,Дарю,0.962121
57679,Мама,0.978279
35701,Сын,0.998721
51237,Шампунь,0.939198
33592,Здорово,0.999097
59411,Роллы,0.964916
39541,Тане,0.962099
41372,Ура,0.999857
32797,Прекрасная Встреча,0.969075
46082,Сын,0.998721
