# Emotion detection from tweets (lexicon + word2vec)

In [1]:
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from feature_extraction import FeatureExtractionContextValenceShifting
from feature_selection import generate_initial_features, feature_selection 
from preprocessing import fix_encoding, split_tweet_sentences, tokenize_tweets, get_lemmas



### 1. Load and preprocess dataset

Load the dataset containing the tweets and their emotions. For text preprocessing, first the encoding of the tweet text is fixed, then the tweet is split into sentences and tokenized, and finally the tokens are lemmatized. The dataset needs to have a column named `tweet`.

In [2]:
# Read the merged tweet dataset from the Excel workbook.
dataset = pd.read_excel('data/merged_datasets.xlsx')

# Preprocessing stages, applied in order: fix the tweet text encoding,
# split the text into sentences, tokenize each sentence, lemmatise the tokens.
for preprocess_step in (fix_encoding, split_tweet_sentences,
                        tokenize_tweets, get_lemmas):
    dataset = preprocess_step(dataset)

# Drop the 'emotion_intensity' and 'tweet' columns, then preview the result.
dataset = dataset.drop(labels=['emotion_intensity', 'tweet'], axis=1)
print(dataset.head(10))

        id    class                                             tokens  \
0  1000001  neutral  [[Check, this, video, out, -, -, President, Ob...   
1  1000002  neutral  [[need, suggestions, for, a, good, IR, filter,...   
2  1000003  neutral  [[@surfit, :, I, just, checked, my, google, fo...   
3  1000004  neutral  [[is, in, San, Francisco, at, Bay, to, Breaker...   
4  1000005  neutral               [[just, landed, at, San, Francisco]]   
5  1000006  neutral  [[San, Francisco, today, .], [Any, suggestions...   
6  1000007  neutral  [[On, my, way, to, see, Star, Trek, @, The, Es...   
7  1000008  neutral  [[Going, to, see, star, trek, soon, with, my, ...   
8  1000009  neutral  [[Bill, Simmons, in, conversation, with, Malco...   
9  1000010  neutral    [[playing, with, cURL, and, the, Twitter, API]]   

                                              lemmas  
0  [[Check, this, video, out, -, -, President, Ob...  
1  [[need, suggestion, for, a, good, IR, filter, ...  
2  [[@user, :, I, ju

### 2. Load lexicon and word2vec model

In [3]:
# Read four columns of the Warriner et al. ratings file; the first one becomes
# the index and the remaining three are renamed to word / valence / arousal.
lexicon = pd.read_csv(
    'lexicons/Ratings_Warriner_et_al.csv',
    index_col=0,
    usecols=[0, 1, 2, 5],
)
lexicon.columns = ['word', 'valence', 'arousal']

# Load the embedding vectors from a text (non-binary) file.
# NOTE(review): load_word2vec_format expects a word2vec-style header line;
# presumably this GloVe file was converted beforehand -- confirm.
model = KeyedVectors.load_word2vec_format(
    'glove.twitter.27B.200d.txt',
    binary=False,
)

### 3. Feature extraction

In [4]:
# Locations of the Stanford parser jar and its models jar.
path_to_jar, path_to_models_jar = (
    'stanford_parser/stanford-parser.jar',
    'stanford_parser/stanford-parser-3.9.1-models.jar',
)

# The feature extractor is built from both jar paths, the lexicon and the
# embedding model.
valence_shifter = FeatureExtractionContextValenceShifting(
    path_to_jar,
    path_to_models_jar,
    lexicon,
    model,
)

In [None]:
# Set the initial valences from the lexicon, then preview the first ten rows.
dataset = valence_shifter.get_initial_valences(dataset)
print(dataset.head(n=10))

### 4. Feature selection

In [None]:
# generate_initial_features returns two values: the featured dataset and
# `vocab` (passed on to feature selection later).
features_and_vocab = generate_initial_features(dataset)
featured_dataset, vocab = features_and_vocab
print(featured_dataset.head(n=10))

In [None]:
# Valence vectors as a plain Python list, class labels as an array.
X = featured_dataset['valences'].values.tolist()
y = featured_dataset['class'].values

# Shorten every emotion label to a single letter, writing in place on `y`.
# NOTE(review): `y` comes straight from `.values`, so depending on the pandas
# version these writes may also change featured_dataset['class'] -- confirm
# whether that is intended.
label_codes = (
    ('neutral', 'n'),
    ('fear', 'f'),
    ('anger', 'a'),
    ('sadness', 's'),
    ('joy', 'j'),
)
for label, code in label_codes:
    y[y == label] = code

# Run feature selection; it yields the selected features and a mask that is
# applied to the valence vectors afterwards.
selected, mask = feature_selection(X, y, vocab)

In [None]:
# Keep only the selected features (those picked out by `mask`) in every
# valence vector.
# `DataFrame.set_value` was deprecated in pandas 0.21 and removed in 1.0, so
# the whole column is rebuilt in a single assignment instead of being written
# cell by cell while iterating over the rows.
featured_dataset['valences'] = [
    np.array(valences[mask]) for valences in featured_dataset['valences']
]
print(featured_dataset.head(10))

In [None]:
# Stack the per-row valence vectors into a single 2-D matrix.
feature_matrix = np.vstack(featured_dataset.valences.values)
temp = pd.DataFrame(feature_matrix)
print(feature_matrix.shape)

# Name the feature columns v_0, v_1, ... (one per selected feature) and
# append the class labels as the 'emotion' column.
temp.columns = list(map('v_{}'.format, range(len(selected))))
temp['emotion'] = featured_dataset['class'].values
print(temp.head(5))

# Write the final feature table to disk without the index column.
temp.to_csv('data_final/features_emotion_detection_w2v.csv', index=False)