# План

* Вытащить данные из файла
* Достать фичи
* Разделить данные на train и test set
* Отклассифицировать

### Твит

In [1]:
# One-hot slot of every supported emotion inside a 4-element intensity vector.
position_from_emotion = {
    'anger': [1, 0, 0, 0],
    'sadness': [0, 1, 0, 0],
    'joy': [0, 0, 1, 0],
    'fear': [0, 0, 0, 1],
}

# Integer class index of every supported emotion (same position as the one-hot slot).
cl_from_emotion = {
    'anger': 0,
    'sadness': 1,
    'joy': 2,
    'fear': 3,
}


class Tweet(object):
    """A labelled tweet.

    Attributes set by the constructor:
      cl      -- integer class index looked up in ``cl_from_emotion``
      message -- the tweet text exactly as passed in
      res     -- the emotion's one-hot vector from ``position_from_emotion``
                 with every element multiplied by the ``res`` argument
    """

    def __init__(self, message, res, common_class):
        self.cl = cl_from_emotion[common_class]
        self.message = message
        one_hot = position_from_emotion[common_class]
        self.res = [slot * res for slot in one_hot]

    def __str__(self):
        # Message and intensity vector separated by a single space.
        return " ".join((str(self.message), str(self.res)))


def get_tweet(str_tweet, res_acc=1):
    """Parse one tab-separated record into a ``Tweet``.

    The record must split into exactly four tab-separated fields: an id
    (ignored), the message, the emotion name and the intensity.  Only the
    first ``res_acc`` characters of the intensity field are converted to
    float, so the default ``res_acc=1`` keeps just its first character.
    """
    _, text, emotion, raw_score = str_tweet.split('\t')
    score = float(raw_score[:res_acc])
    return Tweet(text, score, emotion)


def get_tweets(str_tweets, res_acc=1):
    """Parse a newline-separated dump into a list of ``Tweet`` objects.

    Empty lines are skipped; every other line is handed to ``get_tweet``
    together with ``res_acc``.
    """
    parsed = []
    for record in str_tweets.split('\n'):
        if record:
            parsed.append(get_tweet(record, res_acc))
    return parsed

### Считываем твиты

In [2]:
# Emotion whose training file is loaded in this cell.
EMOTION = 'anger'
FILENAME = 'main_data/EI-reg-en_' + EMOTION + '_train.txt'

# The context manager closes the handle as soon as the file has been read;
# intensities are parsed from the first 5 characters of the score field.
with open(FILENAME, 'r') as file:
    tweets = get_tweets(file.read(), res_acc=5)

In [3]:
import pandas as pd
from collections import namedtuple
import resource
import re
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer

# Set both the soft and the hard core-dump size limit to unlimited.
resource.setrlimit(resource.RLIMIT_CORE, (resource.RLIM_INFINITY, resource.RLIM_INFINITY))
    
# Shared tokenizer instance used by normalize_text.
tok = WordPunctTokenizer()

# Patterns removed from tweets before normalisation:
# pat1 matches @mentions, pat2 matches http(s) links.
pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
# Single alternation of both patterns.
combined_pat = pat1 + r'|' + pat2

def normalize_text(text):
    """Return ``text`` reduced to lower-case a-z words joined by single spaces.

    Steps: remove everything matching ``combined_pat`` (@mentions and
    http(s) links), decode the value as UTF-8 (BOM-aware) when it offers
    ``.decode``, replace every character outside a-z/A-Z with a space,
    lower-case, tokenize with ``tok`` and re-join the tokens.
    """
    stripped = re.sub(combined_pat, '', text)
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except (AttributeError, UnicodeError):
        # AttributeError: the value is already text (Python 3 str has no
        # .decode).  UnicodeError: the value could not be decoded as UTF-8.
        # In both cases continue with the undecoded value; any other
        # exception (e.g. KeyboardInterrupt) is no longer swallowed.
        clean = stripped
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
    words = tok.tokenize(lower_case)
    return " ".join(words).strip()

In [4]:
import numpy as np

# Shuffle the tweets; np.random.permutation returns a shuffled copy as an array.
tweets = np.random.permutation(tweets)

In [5]:
# Normalised text of every tweet, in the same (shuffled) order as `tweets`.
texts = [normalize_text(tweet.message) for tweet in tweets]

### Получаем модель doc2vec

In [6]:
import gensim
from gensim.models import Doc2Vec
import gensim.models.doc2vec
import multiprocessing

# True: build and train a fresh Doc2Vec model; False: load a saved model instead.
TRAIN_2VEC = False
# True: run additional training passes over the collected docs and save the result as v2.
ADD_TRAIN = False

In [7]:
model = None
# Tagged-document record: a token list plus a list of tags.
Doc = namedtuple('Doc', ['words', 'tags'])
docs = []

# Running document id; it keeps its value for the cells that follow.
i = 0
if TRAIN_2VEC or ADD_TRAIN:
    data = pd.read_csv(
        'tweet_corpus.csv',
        encoding='ISO-8859-1',
        index_col=0,
        parse_dates=True,
        names=['target', 'ids', 'date', 'flag', 'user', 'text'],
    )
    # 'target' is consumed as the index, so position 4 of the remaining
    # columns is the 'text' column.
    df = data.iloc[:, 4]

    for row in df.values:
        tokens = normalize_text(row).split()
        docs.append(Doc(tokens, [i]))
        i += 1
        # Progress report every 20000 documents.
        if not i % 20000:
            print(str(i) + " docs processed")


In [8]:
if TRAIN_2VEC or ADD_TRAIN:
    # Append the task tweets to docs, continuing the tag numbering at i.
    for tag, text in enumerate(texts, start=i):
        docs.append(Doc(text.split(), [tag]))
    i += len(texts)

In [9]:
if TRAIN_2VEC:        
    cores = multiprocessing.cpu_count()

    # Fresh, untrained model; the worker count is twice the CPU count.
    model = Doc2Vec(dm=1, dm_concat=1, vector_size=100, window=5, min_count=2, workers=cores * 2)
    
    # Build the vocabulary from the collected docs.
    model.build_vocab(docs)

In [10]:
if TRAIN_2VEC:
    # 100 passes; pass `ep` trains for 3 epochs on its own slice of docs,
    # each slice holding int(corpus_count / 100) documents.
    # NOTE(review): assuming corpus_count equals len(docs), documents past
    # 100 * int(corpus_count / 100) are never trained on — confirm intended.
    for ep in range(100):
        docs_slice = docs[ep * int(model.corpus_count / 100) : (ep + 1) * int(model.corpus_count / 100)] 
        model.train(docs_slice, total_examples=model.corpus_count / 100, epochs=3)
        print('ep ' + str(ep) + ' finished')
    model.save('doc2vec_model.doc2vec')
else:
    # Not training from scratch: load the previously saved v2 model.
    model = Doc2Vec.load('doc2vec_model_v2.doc2vec')

In [11]:
if ADD_TRAIN:
    # Ten further passes over all collected docs, 10 epochs each, saved as v2.
    for ep in range(10): 
        model.train(docs, total_examples=model.corpus_count, epochs=10)
        print('ep ' + str(ep) + ' finished')
    model.save('doc2vec_model_v2.doc2vec')
else:
    # NOTE(review): when TRAIN_2VEC is also False, `model` was loaded from
    # doc2vec_model_v2.doc2vec, so this writes that model to
    # doc2vec_model.doc2vec — confirm the overwrite is intended.
    model.save('doc2vec_model.doc2vec')

## Решаем как обычную задачу регрессии

### Достаем фичи

In [12]:
# from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Xd2v = np.array([(model.infer_vector(x.split())) for x in texts])
# Xd2v = Xd2v - np.amin(Xd2v)
# Xd2v = Xd2v / (0.1 * np.linalg.norm(Xd2v))

In [13]:
# Xcv = TfidfVectorizer().fit_transform(texts).toarray()
# X = np.array([list(xd2v) + list(Xcv[i]) for i, xd2v in enumerate(list(Xd2v))])

In [14]:
#X = Xcv
# Y = [sum(t.res) for t in tweets]

### Делим на train и test

In [15]:
# split_edge = int(0.8 * len(tweets)) 

# X_train = X[0:split_edge]
# X_test  = X[split_edge:]
# y_train = Y[0:split_edge]
# y_test  = Y[split_edge:]

### Решаем задачу регрессии

In [16]:
# from sklearn.linear_model import LinearRegression
# from sklearn.linear_model import Perceptron
# from sklearn.linear_model import SGDRegressor

# clf = SGDRegressor(max_iter=20)

# clf.fit(np.array(X_train), np.array(y_train))

In [17]:
# preds = clf.predict(X_test)

In [18]:
# from scipy.stats import pearsonr
# print(pearsonr(y_test, preds)[0])

## Решаем через нейронную сеть

### Достаем все чистые данные

In [19]:
# Emotions whose training files are merged into the clean data set.
EMOTIONS = ['anger']

tweets = []
for emotion in EMOTIONS:
    filename = 'main_data/EI-reg-en_' + emotion + '_train.txt'
    # Close each file right after reading instead of leaking one handle
    # per emotion.
    with open(filename, 'r') as file:
        tweets.extend(get_tweets(file.read(), res_acc=5))

In [20]:
# Shuffle the clean tweets; the result is an array, no longer a list.
tweets = np.random.permutation(tweets)

### Достаем грязные данные

In [21]:
# Read the "dirty" tweets; the context manager closes the handle after the
# file content has been parsed.
with open('dirty_data.txt', 'r') as dirty_file:
    dirty_tweets = get_tweets(dirty_file.read(), res_acc=5)

In [22]:
# Dirty tweets come first, so dirty_edge is the index of the first clean tweet.
dirty_edge = len(dirty_tweets)
common_tweets = dirty_tweets + list(tweets)

# Normalised text of every tweet in the combined list.
common_texts = [normalize_text(tweet.message) for tweet in common_tweets]

### Достаем фичи

In [236]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# True: integer token sequences (input for the Embedding/LSTM network).
# False: inferred Doc2Vec vectors concatenated with TF-IDF features.
SEQ = True

if not SEQ:
    Xd2v = np.array([(model.infer_vector(x.split())) for x in common_texts])
    # Shift so the smallest entry is 0, then scale by a tenth of the matrix norm.
    Xd2v = Xd2v - np.amin(Xd2v)
    Xd2v = Xd2v / (0.1 * np.linalg.norm(Xd2v))
    Xcv = TfidfVectorizer(min_df=0.0005).fit_transform(common_texts).toarray()
    # One feature row per text: Doc2Vec vector followed by its TF-IDF row.
    X = np.array([list(xd2v) + list(Xcv[i]) for i, xd2v in enumerate(list(Xd2v))])
else:
    # Vocabulary cap for the tokenizer; the Embedding layer reuses this value.
    max_features = 2000
    # num_words must use max_features; the misspelled name `max_fatures`
    # is undefined and raises NameError.
    tokenizer = Tokenizer(num_words=max_features, split=' ')
    tokenizer.fit_on_texts(common_texts)
    X = tokenizer.texts_to_sequences(common_texts)
    # Pad all sequences to a common length.
    X = pad_sequences(X)

In [237]:
#X = Xcv
# Target matrix: one row per tweet, holding that tweet's `res` vector.
Y = np.array([t.res for t in common_tweets])

### Делим на train и test

In [238]:
# Rows before dirty_edge belong to the dirty tweets, the rest to the clean ones.
dirty_X, clean_X = X[:dirty_edge], X[dirty_edge:]
# Dirty targets are halved; clean targets are used as they are.
dirty_Y = Y[:dirty_edge] * 0.5
clean_Y = Y[dirty_edge:]

In [239]:
# Widen every label row by 4 zero columns: the zeros are placed before the
# dirty labels and after the clean labels.
dirty_Y = np.hstack((np.zeros((dirty_Y.shape[0], 4)), dirty_Y))
clean_Y = np.hstack((clean_Y, np.zeros((clean_Y.shape[0], 4))))

In [240]:
# The first 90% of the clean samples are used for training, the rest is held out.
split_edge = int(0.9 * len(clean_X))

X_train, X_test = clean_X[:split_edge], clean_X[split_edge:]
y_train, y_test = clean_Y[:split_edge], clean_Y[split_edge:]
# Emotion class index of every held-out tweet.
test_category = [tweet.cl for tweet in tweets[split_edge:]]

In [245]:
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.layers import GaussianNoise, SimpleRNN, LSTM, Reshape, Embedding, SpatialDropout1D

In [263]:
from keras import backend as K
import tensorflow as tf

GPU = False
num_cores = 4

# Exactly one CPU device in either mode; one GPU device only when GPU is set.
num_CPU = 1
num_GPU = 1 if GPU else 0

config = tf.ConfigProto(
    intra_op_parallelism_threads=num_cores,
    inter_op_parallelism_threads=num_cores,
    allow_soft_placement=True,
    device_count={'CPU': num_CPU, 'GPU': num_GPU},
)
# Make Keras run on a session created from this configuration.
session = tf.Session(config=config)
K.set_session(session)

def pearson_correlation_f(y_true, y_pred):
    """Keras loss: one minus a correlation-style score of ``y_pred`` vs ``y_true``.

    Both tensors are centred along their last axis; the mean of the product
    of the centred values is then divided by the product of the two standard
    deviations, each taken over the whole tensor.

    NOTE(review): the centring is per row while the mean and the standard
    deviations are over all elements, so this is not the textbook Pearson
    coefficient — confirm this is intended.
    """
    pred_centered = y_pred - K.mean(y_pred, axis=-1, keepdims=True)
    true_centered = y_true - K.mean(y_true, axis=-1, keepdims=True)

    covariance = K.mean(pred_centered * true_centered)
    spread = K.std(y_pred) * K.std(y_true)

    return 1 - covariance / spread

In [273]:
# True: restore the network from NN.h5; False: build and compile a new one.
LOAD = False
neural_model = None

if LOAD:
    # The saved model was compiled with a custom loss; Keras can only
    # restore it when that function is supplied through custom_objects.
    neural_model = load_model(
        'NN.h5',
        custom_objects={'pearson_correlation_f': pearson_correlation_f},
    )
else:
    neural_model = Sequential()
    # Token ids -> 256-dimensional embeddings; the input length is the
    # width of the feature matrix X.
    neural_model.add(Embedding(max_features, 256, input_length = X.shape[1]))
    neural_model.add(SpatialDropout1D(0.4))
    neural_model.add(LSTM(1024, dropout=0.2, recurrent_dropout=0.2))
    neural_model.add(Dense(256, kernel_initializer='uniform', activation='relu'))
    neural_model.add(GaussianNoise(stddev=0.05))
    neural_model.add(Dense(32, kernel_initializer='uniform', activation='relu'))
    # 8 outputs, matching the 8-column label rows.
    neural_model.add(Dense(8, kernel_initializer='uniform', activation='softmax'))

    neural_model.compile(loss=pearson_correlation_f, optimizer='adam')

In [275]:
if not LOAD:
    # First training stage: only the first 1000 dirty samples are used.
    neural_model.fit(dirty_X[0:1000], dirty_Y[0:1000], epochs=10, batch_size=150, verbose=2)

Epoch 1/10
 - 13s - loss: 0.6197
Epoch 2/10
 - 13s - loss: 0.5211
Epoch 3/10
 - 13s - loss: 0.5199
Epoch 4/10
 - 13s - loss: 0.5197
Epoch 5/10
 - 13s - loss: 0.5156
Epoch 6/10
 - 13s - loss: 0.5154
Epoch 7/10
 - 13s - loss: 0.5143
Epoch 8/10
 - 13s - loss: 0.5142
Epoch 9/10
 - 13s - loss: 0.5132
Epoch 10/10
 - 13s - loss: 0.5121


In [276]:
if not LOAD:
    # Second training stage: the clean training split.
    neural_model.fit(X_train, y_train, epochs=22, batch_size=25, verbose=2)

Epoch 1/22
 - 40s - loss: 0.3318
Epoch 2/22


KeyboardInterrupt: 

In [None]:
# Save the network to NN.h5, the file read back when LOAD is True.
neural_model.save("NN.h5")

In [None]:
# Network outputs for the held-out clean samples.
predictions = neural_model.predict(X_test)

In [None]:
# For every held-out sample keep only the output / target value that sits in
# the column of the sample's own emotion class.
preds = [row[test_category[idx]] for idx, row in enumerate(predictions)]
results = [row[test_category[idx]] for idx, row in enumerate(y_test)]
# Quick look at the first three predicted and expected values.
print(preds[:3])
print(results[:3])

In [None]:
from scipy.stats import pearsonr
# pearsonr returns (correlation, p-value); only the correlation is printed.
print(pearsonr(results, preds)[0])