## Считываем твиты, выделяем фичи

1. Достаем чистые твиты
2. Достаем development твиты
3. Достаем грязные твиты
4. Получаем фичи
5. Сохраняем

In [1]:
from os import path, listdir
import numpy as np
import pandas as pd
import gensim
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
import resource
from keras.models import Sequential, load_model, model_from_json
from keras.layers import Dense
from keras.layers import GaussianNoise, SimpleRNN, LSTM, Reshape, Embedding, SpatialDropout1D, GaussianDropout, Conv1D, GlobalMaxPooling1D, Flatten, MaxPooling1D, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K
import tensorflow as tf
from scipy.stats import pearsonr
from sklearn.linear_model import SGDRegressor
from collections import namedtuple
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
import pickle

import json
import random as rn

import os

Using TensorFlow backend.


In [2]:
# Toggle for exposing a GPU device to the TensorFlow session.
GPU = True

# One CPU device is requested either way; the GPU count follows the flag.
num_CPU = 1
num_GPU = 1 if GPU else 0

# Build the session configuration and register the session with Keras.
config = tf.ConfigProto(
    intra_op_parallelism_threads=4,
    inter_op_parallelism_threads=4,
    allow_soft_placement=True,
    device_count={'CPU': num_CPU, 'GPU': num_GPU}
)
session = tf.Session(config=config)
K.set_session(session)

In [3]:
# One-hot slot occupied by each emotion in the 4-element target vector.
position_from_emotion = {
    'anger': [1, 0, 0, 0],
    'sadness': [0, 1, 0, 0],
    'joy': [0, 0, 1, 0],
    'fear': [0, 0, 0, 1],
}

# Integer class id of each emotion (same ordering as the one-hot slots above).
cl_from_emotion = {
    'anger': 0,
    'sadness': 1,
    'joy': 2,
    'fear': 3,
}


class Tweet(object):
    """A single tweet with its emotion class and intensity target.

    Attributes set by the constructor:
        cl: integer class id looked up in ``cl_from_emotion``.
        message: the tweet text as passed in.
        res: the emotion's one-hot vector from ``position_from_emotion``
            with every element multiplied by ``res``.
    """

    def __init__(self, message, res, common_class):
        # Unknown emotion names raise KeyError on the first lookup.
        self.cl = cl_from_emotion[common_class]
        self.message = message
        one_hot = position_from_emotion[common_class]
        self.res = [slot * res for slot in one_hot]

    def __str__(self):
        """Return the message and the target vector separated by a space."""
        return " ".join((str(self.message), str(self.res)))


def get_tweet(str_tweet, res_acc=1):
    """Parse one tab-separated line into a ``Tweet``.

    The line must contain exactly four tab-separated fields: an id (ignored),
    the message, the emotion name and the score. A score of ``'NONE'`` is
    replaced by ``'1.000'``.

    Only the first ``res_acc`` characters of the score string are converted
    to float, so the default ``res_acc=1`` keeps just the leading character
    (e.g. ``'0.583'`` becomes ``0.0``); pass ``res_acc=5`` to keep ``0.583``.

    Raises:
        ValueError: if the line does not split into four fields or the
            truncated score is not a valid float.
    """
    _, message, common_class, res = str_tweet.split('\t')
    score_text = '1.000' if res == 'NONE' else res
    score = float(score_text[:res_acc])
    return Tweet(message, score, common_class)
        


def get_tweets(str_tweets, res_acc=1):
    """Parse a newline-separated blob of tweet lines into a list of ``Tweet``.

    Empty lines are skipped; every other line is handed to ``get_tweet``
    together with ``res_acc``.
    """
    parsed = []
    for line in str_tweets.split('\n'):
        if not line:
            continue
        parsed.append(get_tweet(line, res_acc))
    return parsed

In [4]:
def pearson_correlation_f(y_true, y_pred):
    """Return ``1 - r`` for a correlation-like statistic of two tensors.

    Both tensors are centred with their mean along the last axis; the mean of
    the element-wise product is then divided by the product of the two
    standard deviations taken over all elements.

    NOTE(review): the means are per-row (axis=-1) while the standard
    deviations are global, so this is not a textbook Pearson coefficient -
    confirm which axis was intended. No epsilon guards the division, so a
    constant tensor yields a division by zero.
    """
    centered_true = y_true - K.mean(y_true, axis=-1, keepdims=True)
    centered_pred = y_pred - K.mean(y_pred, axis=-1, keepdims=True)

    covariance = K.mean(centered_pred * centered_true)
    scale = K.std(y_pred) * K.std(y_true)

    return 1 - covariance / scale

In [5]:
# Set to True only when the sent140 part files need to be regenerated.
PARSE_SENT140 = False


def format_line(id, text, em):
    """Build one tab-separated tweet line: id, text, emotion, score 1.000.

    Tabs inside ``text`` are replaced by spaces so the line keeps exactly
    four fields; the line ends with a newline.
    """
    fields = (str(id), text.replace('\t', ' '), em, '1.000')
    return '\t'.join(fields) + '\n'

# Maps the sent140 `target` value to the emotion label written to the part files.
sent_to_emotion = {0: 'sadness', 2: 'no_emotion', 4: 'joy'}

if PARSE_SENT140:
    # `target` becomes the index, so iterrows() yields (target, row) pairs and
    # each row holds the remaining columns: ids, date, flag, user, text.
    data = pd.read_csv('dirty_data/unlabeled/tweet_corpus.csv', encoding='ISO-8859-1', index_col=0, parse_dates=True,
                       names=['target', 'ids', 'date', 'flag', 'user', 'text'])
    # Rows per output part; at least 1 so a corpus with fewer than 10 rows
    # does not cause a modulo-by-zero.
    rows_per_part = max(data.shape[0] // 10, 1)
    file = None
    try:
        for i, (target, row) in enumerate(data.iterrows()):
            if i % rows_per_part == 0:
                # Close the previous part before starting the next one so
                # handles are not leaked and buffered lines are flushed.
                if file is not None:
                    file.close()
                part_number = str(i // rows_per_part)
                file = open('dirty_data/labeled/sent140part' + part_number, 'w+')
            emotion = sent_to_emotion[target]
            if emotion != 'no_emotion':
                # Columns are addressed by label rather than by position.
                line = format_line(row['ids'], row['text'], emotion)
                file.write(line)
    finally:
        if file is not None:
            file.close()

In [6]:
# Emotions for which competition files are read.
EMOTIONS = ['anger', 'joy', 'sadness', 'fear']


def run_competition_files(path_pattern):
    """Read one competition file per emotion and parse it into tweets.

    Args:
        path_pattern: a ``%``-format string with a single ``%s`` placeholder
            that is filled with each name in ``EMOTIONS``.

    Returns:
        dict mapping each emotion name to the list returned by
        ``get_tweets(..., res_acc=5)`` for that emotion's file.
    """
    em_tweets = {}
    for emotion in EMOTIONS:
        # The context manager closes the file even when parsing raises.
        with open(path_pattern % emotion, 'r') as tweet_file:
            em_tweets[emotion] = get_tweets(tweet_file.read(), res_acc=5)
    return em_tweets


train_tweets = run_competition_files('train_data/EI-reg-en_%s_train.txt')
test_tweets = run_competition_files('development_data/2018-EI-reg-En-%s-dev.txt')

In [7]:
# Tweets parsed from every file in the labeled "dirty" data directory.
dirty_tweets = []

directory = 'dirty_data/labeled'
# Sorted so the tweet order (and therefore the row order of every feature
# matrix derived from it) does not depend on the file system's listing order.
for filename in sorted(os.listdir(directory)):
    full_path = path.join(directory, filename)
    if not path.isfile(full_path):
        # Skip sub-directories, which cannot be opened as tweet files.
        continue
    # The context manager closes the file even when parsing raises.
    with open(full_path, 'r') as file:
        dirty_tweets += get_tweets(file.read(), res_acc=5)

In [8]:
# Emotion whose train/development files are used in the rest of the notebook.
EMOTION = 'joy'

# All tweets in one array, ordered as: dirty, then train, then development.
tweets = np.array([*dirty_tweets, *train_tweets[EMOTION], *test_tweets[EMOTION]]) if False else np.array(
    list(dirty_tweets) + list(train_tweets[EMOTION]) + list(test_tweets[EMOTION])
)

In [9]:
import re
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
    
tok = WordPunctTokenizer()

# @mentions and http(s) links are removed before any other cleaning.
pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
combined_pat = r'|'.join((pat1, pat2))


def normalize_text(text):
    """Return a lower-cased, letters-only, single-space-separated version of text.

    Steps: strip mentions and links, decode byte strings as UTF-8 (dropping a
    BOM), replace every non-ASCII-letter character with a space, lower-case,
    tokenize with ``tok`` and join the tokens with single spaces.
    """
    stripped = re.sub(combined_pat, '', text)
    clean = stripped
    # Only byte strings need decoding; text strings are used as they are.
    if isinstance(stripped, bytes):
        try:
            clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
        except UnicodeDecodeError:
            # Best effort: keep the undecoded bytes, as the original code did.
            pass
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
    words = tok.tokenize(lower_case)
    return (" ".join(words)).strip()

In [10]:
# When True the normalized texts are read from features/ instead of recomputed.
LOAD_TEXT = False

texts, dirty_texts, train_texts, test_texts = [], [], [], []
if LOAD_TEXT:
    # One text per line in each cached file.
    dirty_texts = list(np.loadtxt('features/dirty_texts.txt', dtype='str', delimiter='\n'))
    train_texts = list(np.loadtxt('features/train_texts_%s_.txt' % EMOTION, dtype='str', delimiter='\n'))
    test_texts = list(np.loadtxt('features/test_texts_%s_.txt' % EMOTION, dtype='str', delimiter='\n'))
    texts = dirty_texts + train_texts + test_texts
else:
    # Normalize everything once, then cut the list back into its three parts
    # (same dirty / train / development order used to build `tweets`).
    texts = [normalize_text(t.message) for t in tweets]
    dirty_texts = texts[:len(dirty_tweets)]
    train_texts = texts[len(dirty_tweets):len(dirty_tweets) + len(train_tweets[EMOTION])]
    test_texts = texts[len(dirty_tweets) + len(train_tweets[EMOTION]):]

# Both paths must yield one text per train / development tweet.
assert (len(train_texts) == len(train_tweets[EMOTION]))
assert (len(test_texts) == len(test_tweets[EMOTION]))

In [11]:
# Slice bounds into the concatenated data:
# [0:s] dirty, [s:f] train, [f:e] development.
s = len(dirty_tweets)
f = len(train_tweets[EMOTION]) + s
e = len(test_tweets[EMOTION]) + f

In [None]:
# When True the doc2vec features are read from features/ instead of inferred.
LOAD_D2V = False

# doc2vec feature matrix per split, keyed by 'dirty' / 'train' / 'test'.
Xd2v = {}

if LOAD_D2V:
    Xd2v['dirty'] = np.loadtxt('features/dirty_Xd2v.txt')
    Xd2v['train'] = np.loadtxt('features/train_Xd2v_%s.txt' % EMOTION)
    Xd2v['test'] = np.loadtxt('features/dirty_Xd2v_%s.txt' % EMOTION)
else:
    model = Doc2Vec.load('doc2vec/doc2vec_model.doc2vec')
    named_texts = [('dirty', dirty_texts), ('train', train_texts), ('test', test_texts)]
    for name, text_list in named_texts:
        inferred = np.array([(model.infer_vector(x.split())) for x in text_list])
        # Shift so the smallest entry of the split is 0, then divide the whole
        # matrix by its norm (each split is scaled independently).
        shifted = inferred - np.amin(inferred)
        Xd2v[name] = shifted / (np.linalg.norm(shifted))
    np.savetxt('features/dirty_Xd2v.txt', Xd2v['dirty'])
    np.savetxt('features/train_Xd2v_%s.txt' % EMOTION, Xd2v['train'])
    # NOTE(review): the 'test' split is stored under a "dirty_Xd2v_<emotion>"
    # name; the load branch reads the same path, so the two stay consistent.
    np.savetxt('features/dirty_Xd2v_%s.txt' % EMOTION, Xd2v['test'])

In [None]:
# Lift the core-file size limit for this process.
resource.setrlimit(resource.RLIMIT_CORE, (resource.RLIM_INFINITY, resource.RLIM_INFINITY))

vectorizer = TfidfVectorizer(min_df=0.0005, max_features=2048)

# Remember the corpus size before vectorizing; the original cell referenced
# `old_size` without ever defining it, which raised NameError.
old_size = len(texts)

# Dense TF-IDF matrix with one row per text, in the same order as `texts`.
Xcv = vectorizer.fit_transform(texts).toarray()
# Drop the fitted vectorizer so it can be garbage-collected.
vectorizer = None

texts = texts[0:old_size]

In [None]:
# Stack the per-split doc2vec matrices in dirty / train / test order so the
# rows line up with the rows of Xcv.
all_Xd2v = np.vstack([Xd2v['dirty'], Xd2v['train'], Xd2v['test']])

batch_size = len(dirty_tweets)

# Dirty-split features: TF-IDF columns followed by doc2vec columns.
dirty_X = np.hstack((Xcv[:len(dirty_tweets)], all_Xd2v[:len(dirty_tweets)]))

In [None]:
# Train and development features: TF-IDF columns followed by doc2vec columns,
# taken from the [s:f] and [f:e] row ranges respectively.
train_X = np.concatenate((Xcv[s:f], all_Xd2v[s:f]), axis=1)
test_X = np.concatenate((Xcv[f:e], all_Xd2v[f:e]), axis=1)

In [None]:
# Every train / development tweet must have exactly one feature row.
assert (len(train_X) == len(train_tweets[EMOTION]))
assert (len(test_X) == len(test_tweets[EMOTION]))

# Release the intermediate matrices before writing the combined features.
Xcv, Xd2v = None, None

for split_name, split_X in (('train', train_X), ('test', test_X)):
    np.savetxt('features/%s_X_%s_.txt' % (split_name, EMOTION), split_X)

In [12]:
# Vocabulary cap passed to the Keras tokenizer as num_words.
max_features = 20000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(texts)

# Word-index sequences for every text, padded to a common length.
X = pad_sequences(tokenizer.texts_to_sequences(texts))

# Rebinds dirty_X / train_X / test_X to the padded sequences, using the
# dirty / train / development row ranges.
dirty_X, train_X, test_X = X[:s], X[s:f], X[f:e]

In [13]:
# Target vectors (Tweet.res) for every tweet, in the same order as `tweets`.
Y = np.array([t.res for t in tweets])

# Same dirty / train / development row ranges as the feature matrices.
dirty_Y, train_Y, test_Y = Y[:s], Y[s:f], Y[f:e]

In [14]:
# One row per development sample: 1 in the first column, 0 in the other three.
placeholder_Y = np.zeros((test_X.shape[0], 4))
placeholder_Y[:, 0] = 1

In [15]:
# Bundle of everything needed to build and train one model: layer factories,
# loss, optimizer, and epochs / batch size for the dirty and train phases.
Params = namedtuple(
    'Params',
    ['layers', 'loss', 'optimizer', 'dirty_e', 'dirty_bs', 'train_e', 'train_bs']
)

In [16]:
def create_params(dirty_e, dirty_bs, train_e, train_bs, layers, optimizer='adam'):
    """Build a ``Params`` tuple with ``pearson_correlation_f`` as the loss.

    Args:
        dirty_e, dirty_bs: epochs and batch size stored for the dirty phase.
        train_e, train_bs: epochs and batch size stored for the train phase.
        layers: stored unchanged in the ``layers`` field.
        optimizer: stored unchanged in the ``optimizer`` field.
    """
    return Params(
        layers=layers,
        loss=pearson_correlation_f,
        optimizer=optimizer,
        dirty_e=dirty_e,
        dirty_bs=dirty_bs,
        train_e=train_e,
        train_bs=train_bs,
    )

In [17]:
def create_model(params):
    """Build and compile a ``Sequential`` model from ``params``.

    Each entry of ``params.layers`` is called with no arguments and the
    returned layer is appended to the model, in order. The model is compiled
    with ``params.optimizer`` and a mean-squared-error loss.

    NOTE(review): ``params.loss`` is not used here - the loss is always
    'mean_squared_error'. Confirm whether that is intentional.
    """
    nm = Sequential()
    for make_layer in params.layers:
        built_layer = make_layer()
        nm.add(built_layer)
    nm.compile(optimizer=params.optimizer, loss='mean_squared_error')
    return nm

In [18]:
# Width of each embedding row; used to size the embedding weight matrix.
embeddings_dim = 300

# Word vectors loaded from the text-format (non-binary) word2vec file.
embeddings = KeyedVectors.load_word2vec_format("twitter_sgns_subset.txt.gz", binary=False)

In [19]:
# Initial weights for the Embedding layer: row `index` holds the vector of the
# word the tokenizer mapped to `index`; rows that are never assigned stay zero.
embedding_weights = np.zeros((max_features, embeddings_dim))
for word, index in tokenizer.word_index.items():
    if index < max_features:
        try:
            embedding_weights[index, :] = embeddings[word]
        except KeyError:
            # Word missing from the pretrained vectors: use a random row.
            # Only KeyError is caught so that real problems (for example a
            # vector whose length differs from embeddings_dim) still surface.
            embedding_weights[index, :] = np.random.rand(1, embeddings_dim)

In [33]:
np.random.seed(27)

# Parameter sets that the training loop iterates over.
params_list = []

# Layers are stored as zero-argument factories so every create_model() call
# builds fresh layer objects; names such as train_X are resolved at call time.
perceptron_layers = [
    lambda: Dense(1024, input_dim=train_X.shape[1], kernel_initializer='uniform', activation='relu'),
    lambda: Dense(256, kernel_initializer='uniform', activation='relu'),
    lambda: Dense(32, kernel_initializer='uniform', activation='relu'),
    lambda: Dense(8, kernel_initializer='uniform', activation='softmax'),
]

lstm_layers = [
    # input_length follows the padded sequence width instead of a hard-coded
    # 52, so the model still builds when the longest text changes.
    lambda: Embedding(max_features, embeddings_dim, input_length=train_X.shape[1], weights=[embedding_weights]),
    lambda: Dropout(0.5),
    lambda: Conv1D(embeddings_dim, 3, activation='relu', padding='valid', strides=1),
    lambda: MaxPooling1D(pool_size=2),
    lambda: LSTM(embeddings_dim, dropout=0.5, recurrent_dropout=0.5),
    lambda: Dense(4, activation='sigmoid'),
]

# Arguments: dirty epochs, dirty batch size, train epochs, train batch size.
params_list.append(create_params(15, 5000, 20, 16, lstm_layers))

In [34]:
# Best correlation so far; a model is saved only when it beats this by `diff`.
max_pears = 0.64
diff = 0.001

# Column of the 4-slot target / prediction vectors that belongs to EMOTION
# (previously hard-coded to 2, which is only correct for 'joy').
emotion_idx = cl_from_emotion[EMOTION]

for p in params_list:
    average = 0.0
    for i in range(1):
        neural_model = create_model(p)
        # Phase 1: fit on the dirty data. The arrays are passed directly;
        # re-stacking them row by row produced an identical array.
        neural_model.fit(dirty_X, dirty_Y,
                         epochs=p.dirty_e,
                         batch_size=p.dirty_bs, verbose=2)
        # Phase 2: continue fitting the same model on the train split.
        neural_model.fit(train_X, train_Y, epochs=p.train_e, batch_size=p.train_bs)

        print('Attempt %i finished.' % (i + 1))

        # Compare predicted and true values for the selected emotion column.
        predictions = neural_model.predict(test_X)
        preds = [pr[emotion_idx] for pr in predictions]
        results = [r[emotion_idx] for r in test_Y]

        pears = pearsonr(results, preds)[0]
        average += pears
        print(pears)

        if pears >= max_pears + diff:
            json_m = neural_model.to_json()
            json_name = 'best_model_architecture_%s.json' % EMOTION
            weights_name = 'best_model_weights_%s.h5' % EMOTION
            with open('tokenizer_%s.pickle' % EMOTION, 'wb') as handle:
                pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
            # NOTE(review): to_json() presumably already returns a JSON string,
            # so json.dump stores it as a quoted string. Kept unchanged so
            # existing loaders of this file keep working - confirm.
            with open(json_name, 'w') as outfile:
                json.dump(json_m, outfile)
            neural_model.save_weights(weights_name)
            max_pears = pears

Epoch 1/15
 - 76s - loss: 0.0862
Epoch 2/15
 - 72s - loss: 0.0730
Epoch 3/15
 - 72s - loss: 0.0694
Epoch 4/15
 - 73s - loss: 0.0672
Epoch 5/15
 - 73s - loss: 0.0657
Epoch 6/15
 - 72s - loss: 0.0645
Epoch 7/15
 - 73s - loss: 0.0636
Epoch 8/15
 - 73s - loss: 0.0628
Epoch 9/15
 - 73s - loss: 0.0621
Epoch 10/15
 - 72s - loss: 0.0614
Epoch 11/15
 - 73s - loss: 0.0608
Epoch 12/15
 - 73s - loss: 0.0602
Epoch 13/15
 - 73s - loss: 0.0598
Epoch 14/15
 - 73s - loss: 0.0593
Epoch 15/15
 - 73s - loss: 0.0589
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Attempt 1 finished.
0.700869898528
