# Emotion detection from tweets

In [1]:
import numpy as np
import pandas as pd
from feature_extraction import FeatureExtractionContextValenceShifting
from feature_selection import generate_initial_features, feature_selection 
from preprocessing import fix_encoding, split_tweet_sentences, tokenize_tweets, get_lemmas

### 1. Load and preprocess dataset

Load the dataset containing the tweets and their emotions. For text preprocessing, the encoding of the tweet text is fixed first, then each tweet is split into sentences and tokenized, and finally the tokens are lemmatized. The dataset needs to have a column named `tweet`.

In [2]:
# Preprocessing stages, applied in this order; each one receives the frame
# and returns the frame that is passed on to the next stage.
preprocessing_steps = (
    fix_encoding,           # fix the tweet text
    split_tweet_sentences,  # split the tweet text into sentences
    tokenize_tweets,        # tokenize each sentence of the tweets
    get_lemmas,             # lemmatise the tweets
)

dataset = pd.read_excel('data/merged_datasets.xlsx')
for step in preprocessing_steps:
    dataset = step(dataset)

# drop the emotion intensity and raw tweet text columns before previewing
dataset = dataset.drop(['emotion_intensity', 'tweet'], axis=1)
print(dataset.head(10))

        id    class                                             tokens  \
0  1000001  neutral  [[Check, this, video, out, -, -, President, Ob...   
1  1000002  neutral  [[need, suggestions, for, a, good, IR, filter,...   
2  1000003  neutral  [[@surfit, :, I, just, checked, my, google, fo...   
3  1000004  neutral  [[is, in, San, Francisco, at, Bay, to, Breaker...   
4  1000005  neutral               [[just, landed, at, San, Francisco]]   
5  1000006  neutral  [[San, Francisco, today, .], [Any, suggestions...   
6  1000007  neutral  [[On, my, way, to, see, Star, Trek, @, The, Es...   
7  1000008  neutral  [[Going, to, see, star, trek, soon, with, my, ...   
8  1000009  neutral  [[Bill, Simmons, in, conversation, with, Malco...   
9  1000010  neutral    [[playing, with, cURL, and, the, Twitter, API]]   

                                              lemmas  
0  [[Check, this, video, out, -, -, President, Ob...  
1  [[need, suggestion, for, a, good, IR, filter, ...  
2  [[@user, :, I, ju

### 2. Load lexicon
Load the Warriner et al. lexicon and retain only the valence and arousal dimensions.

In [3]:
# Read four columns of the ratings file: column 0 becomes the index and the
# remaining three are renamed to word / valence / arousal.
lexicon_path = 'lexicons/Ratings_Warriner_et_al.csv'
lexicon_columns = ['word', 'valence', 'arousal']

lexicon = pd.read_csv(lexicon_path, index_col=0, usecols=[0, 1, 2, 5])
lexicon.columns = lexicon_columns
print(lexicon.head())

          word  valence  arousal
1     aardvark     6.26     2.41
2      abalone     5.30     2.65
3      abandon     2.84     3.73
4  abandonment     2.63     4.95
5        abbey     5.85     2.20


### 3. Feature extraction

In [4]:
# Locations of the Stanford parser jar and of its models jar.
path_to_jar = 'stanford_parser/stanford-parser.jar'
path_to_models_jar = 'stanford_parser/stanford-parser-3.9.1-models.jar'

# The extractor is built from both jar paths and the lexicon loaded above.
valence_shifter = FeatureExtractionContextValenceShifting(
    path_to_jar,
    path_to_models_jar,
    lexicon,
)

In [5]:
# set initial valences from the lexicon
# (the lexicon was handed to valence_shifter when it was constructed)
dataset = valence_shifter.get_initial_valences(dataset)
# preview the first ten rows of the updated frame
print(dataset.head(10))

        id    class                                             tokens  \
0  1000001  neutral  [[Check, this, video, out, -, -, President, Ob...   
1  1000002  neutral  [[need, suggestions, for, a, good, IR, filter,...   
2  1000003  neutral  [[@surfit, :, I, just, checked, my, google, fo...   
3  1000004  neutral  [[is, in, San, Francisco, at, Bay, to, Breaker...   
4  1000005  neutral               [[just, landed, at, San, Francisco]]   
5  1000006  neutral  [[San, Francisco, today, .], [Any, suggestions...   
6  1000007  neutral  [[On, my, way, to, see, Star, Trek, @, The, Es...   
7  1000008  neutral  [[Going, to, see, star, trek, soon, with, my, ...   
8  1000009  neutral  [[Bill, Simmons, in, conversation, with, Malco...   
9  1000010  neutral    [[playing, with, cURL, and, the, Twitter, API]]   

                                              lemmas  \
0  [[Check, this, video, out, -, -, President, Ob...   
1  [[need, suggestion, for, a, good, IR, filter, ...   
2  [[@user, :, I,

### 4. Feature selection

In [6]:
# Build the feature frame and the vocabulary that feature selection uses below.
# NOTE(review): judging by the printed preview, the result holds one flat lemma
# list and one valence vector per tweet -- confirm against feature_selection.py.
featured_dataset, vocab = generate_initial_features(dataset)
# preview the first ten rows of the feature frame
print(featured_dataset.head(10))

        id    class                                             lemmas  \
0  1000001  neutral  [Check, this, video, out, -, -, President, Oba...   
1  1000002  neutral  [need, suggestion, for, a, good, IR, filter, f...   
2  1000003  neutral  [@user, :, I, just, check, my, google, for, my...   
3  1000004  neutral  [be, in, San, Francisco, at, Bay, to, Breakers...   
4  1000005  neutral                   [just, land, at, San, Francisco]   
5  1000006  neutral     [San, Francisco, today, ., Any, suggestion, ?]   
6  1000007  neutral  [On, my, way, to, see, Star, Trek, @, The, Esq...   
7  1000008  neutral  [Going, to, see, star, trek, soon, with, my, d...   
8  1000009  neutral  [Bill, Simmons, in, conversation, with, Malcol...   
9  1000010  neutral         [play, with, cURL, and, the, Twitter, API]   

                                            valences  
0  [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....  
1  [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....  
2  [-1.0, -1.0, -1.0

In [7]:
# Single-letter codes for the emotion classes; labels that are not listed
# here are left unchanged by Series.replace.
EMOTION_CODES = {
    'neutral': 'n',
    'fear': 'f',
    'anger': 'a',
    'sadness': 's',
    'joy': 'j',
}

# Abbreviate the labels explicitly in the frame. The previous version rewrote
# the array returned by `.values` in place, which changed the `class` column
# only as a hidden side effect and fails when that array is read-only
# (pandas Copy-on-Write). Later cells read the abbreviated `class` column, so
# the result is written back to the frame. Re-running this cell is a no-op.
featured_dataset['class'] = featured_dataset['class'].replace(EMOTION_CODES)

# One valence vector per tweet as the feature rows, abbreviated labels as targets.
X = featured_dataset['valences'].values.tolist()
y = featured_dataset['class'].values
selected, mask = feature_selection(X, y, vocab)

feature_selection
[('happy', 0.013070030909320362), ('smile', 0.01057974620608353), ('angry', 0.009554506156129681), ('hilarious', 0.009313254163633882), ('bitter', 0.008905144468520258), ('depression', 0.008538926516890795), ('rage', 0.008035108292014218), ('offend', 0.007811266868194009), ('sadness', 0.007451892855713847), ('anger', 0.007440070106215409), ('nervous', 0.007397450499903377), ('sad', 0.007279418311305791), ('revenge', 0.007050725229037773), ('panic', 0.006867959371935231), ('unhappy', 0.006746363371559505), ('cheer', 0.006541960022363224), ('bully', 0.006269300821539021), ('outrage', 0.006076459154610644), ('terror', 0.006031467113356563), ('laughter', 0.005989333044401438), ('terrorism', 0.005978863447635438), ('anxiety', 0.005785289543543587), ('awful', 0.005615627287863572), ('lively', 0.0054229677717405785), ('nightmare', 0.005353371680887192), ('shocking', 0.005081262377614274), ('hilarity', 0.004937975917861861), ('grim', 0.004914686918869022), ('rejoice', 0.00474

In [8]:
# Keep only the selected features in every tweet's valence vector.
# DataFrame.set_value was deprecated in pandas 0.21 and removed in 1.0, and
# writing to a frame while iterating over it with iterrows() is discouraged,
# so the filtered vectors are built first and assigned back as one column.
# NOTE: `valences` is overwritten, so re-running this cell without re-running
# the cell that created featured_dataset would apply the mask a second time.
featured_dataset['valences'] = [
    np.array(valences[mask]) for valences in featured_dataset['valences']
]
print(featured_dataset.head(10))

        id class                                             lemmas  \
0  1000001     n  [Check, this, video, out, -, -, President, Oba...   
1  1000002     n  [need, suggestion, for, a, good, IR, filter, f...   
2  1000003     n  [@user, :, I, just, check, my, google, for, my...   
3  1000004     n  [be, in, San, Francisco, at, Bay, to, Breakers...   
4  1000005     n                   [just, land, at, San, Francisco]   
5  1000006     n     [San, Francisco, today, ., Any, suggestion, ?]   
6  1000007     n  [On, my, way, to, see, Star, Trek, @, The, Esq...   
7  1000008     n  [Going, to, see, star, trek, soon, with, my, d...   
8  1000009     n  [Bill, Simmons, in, conversation, with, Malcol...   
9  1000010     n         [play, with, cURL, and, the, Twitter, API]   

                                            valences  
0  [-1.0, -1.0, -1.0, 10.0, -1.0, -1.0, -1.0, -1....  
1  [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....  
2  [-1.0, 10.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.... 

In [9]:
# Stack the per-tweet valence vectors into one 2-D matrix (one row per tweet).
valence_matrix = np.vstack(featured_dataset['valences'].values)
temp = pd.DataFrame(valence_matrix)
print(valence_matrix.shape)

# Name the feature columns v_0 ... v_{k-1}, one per selected feature, then
# append the emotion label as the last column.
feature_names = ['v_{}'.format(position) for position in range(len(selected))]
temp.columns = feature_names
temp['emotion'] = featured_dataset['class'].values
print(temp.head(5))

# Persist the final feature table without the row index.
temp.to_csv('data_final/features_emotion_detection.csv', index=False)

(5348, 501)
   v_0   v_1  v_2   v_3  v_4  v_5  v_6  v_7   v_8   v_9   ...     v_492  \
0 -1.0  -1.0 -1.0  10.0 -1.0 -1.0 -1.0 -1.0  10.0  -1.0   ...      -1.0   
1 -1.0  -1.0 -1.0  -1.0 -1.0 -1.0 -1.0 -1.0  -1.0  10.0   ...      -1.0   
2 -1.0  10.0 -1.0  -1.0 -1.0 -1.0 -1.0 -1.0  10.0  10.0   ...      -1.0   
3 -1.0  -1.0 -1.0  -1.0 -1.0 -1.0 -1.0 -1.0  -1.0  10.0   ...      -1.0   
4 -1.0  -1.0 -1.0  -1.0 -1.0 -1.0 -1.0 -1.0  -1.0  -1.0   ...      -1.0   

   v_493  v_494  v_495  v_496  v_497  v_498  v_499  v_500  emotion  
0   -1.0   -1.0   -1.0   -1.0   -1.0   -1.0   -1.0   -1.0        n  
1   -1.0   -1.0   -1.0   -1.0   -1.0   -1.0   -1.0   -1.0        n  
2   -1.0   -1.0   -1.0   -1.0   -1.0   -1.0   -1.0   -1.0        n  
3   -1.0   -1.0   -1.0   -1.0   -1.0   -1.0   -1.0   -1.0        n  
4   -1.0   -1.0   -1.0   -1.0   -1.0   -1.0   -1.0   -1.0        n  

[5 rows x 502 columns]
