# Emotion detection from tweets - tf-idf

In [16]:
import numpy as np
import pandas as pd
from itertools import chain
from feature_selection import feature_selection
from preprocessing import fix_encoding, split_tweet_sentences, tokenize_tweets, get_lemmas

### 1. Load and preprocess dataset

Load the dataset containing the tweets and their emotions. For text preprocessing, the encoding of the tweet text is fixed first, then each tweet is split into sentences and tokenized, and finally the tokens are lemmatized. The dataset needs to have a column named `tweet`.

In [6]:
# Read the merged tweet corpus and run it through the preprocessing steps in order:
# encoding fix -> sentence splitting -> per-sentence tokenization -> lemmatization.
dataset = (
    pd.read_excel('data/merged_datasets.xlsx')
    .pipe(fix_encoding)
    .pipe(split_tweet_sentences)
    .pipe(tokenize_tweets)
    .pipe(get_lemmas)
)

# The raw text and the intensity score are not used by the following cells.
dataset = dataset.drop(['emotion_intensity', 'tweet'], axis=1)
print(dataset.head(10))

        id    class                                             tokens  \
0  1000001  neutral  [[Check, this, video, out, -, -, President, Ob...   
1  1000002  neutral  [[need, suggestions, for, a, good, IR, filter,...   
2  1000003  neutral  [[@surfit, :, I, just, checked, my, google, fo...   
3  1000004  neutral  [[is, in, San, Francisco, at, Bay, to, Breaker...   
4  1000005  neutral               [[just, landed, at, San, Francisco]]   
5  1000006  neutral  [[San, Francisco, today, .], [Any, suggestions...   
6  1000007  neutral  [[On, my, way, to, see, Star, Trek, @, The, Es...   
7  1000008  neutral  [[Going, to, see, star, trek, soon, with, my, ...   
8  1000009  neutral  [[Bill, Simmons, in, conversation, with, Malco...   
9  1000010  neutral    [[playing, with, cURL, and, the, Twitter, API]]   

                                              lemmas  
0  [[Check, this, video, out, -, -, President, Ob...  
1  [[need, suggestion, for, a, good, IR, filter, ...  
2  [[@user, :, I, ju

### 2. Feature extraction

In [14]:
# Vocabulary: every distinct lower-cased lemma over all tweets (np.unique also sorts it).
vocab = np.unique([
    token.lower()
    for sentences in dataset.lemmas.values
    for token in chain(*sentences)
])
# dictionary of every word in the vocabulary with its index
dictionary = {word: position for position, word in enumerate(vocab)}

# create initial feature for classification
# .copy() makes this an independent frame, so the cell writes below do not
# target a selection of `dataset` (avoids SettingWithCopy problems).
featured_dataset = dataset[['id', 'class', 'lemmas']].copy()
# Placeholder value so that the 'valences' column exists with object dtype.
featured_dataset['valences'] = ''
for index, row in dataset.iterrows():
    # Flatten the per-sentence lemma lists into one list for the whole tweet.
    lemmas = list(chain(*row.lemmas))
    # DataFrame.set_value was removed from pandas; .at is the scalar-cell setter.
    featured_dataset.at[index, 'lemmas'] = lemmas
    # 0 is the value when the word is not present, 1 when it occurs in the tweet
    initial_valences = np.zeros(len(vocab))
    for lemma in lemmas:
        initial_valences[dictionary[lemma.lower()]] = 1
    featured_dataset.at[index, 'valences'] = initial_valences

print(featured_dataset.head(10))

        id    class                                             lemmas  \
0  1000001  neutral  [Check, this, video, out, -, -, President, Oba...   
1  1000002  neutral  [need, suggestion, for, a, good, IR, filter, f...   
2  1000003  neutral  [@user, :, I, just, check, my, google, for, my...   
3  1000004  neutral  [be, in, San, Francisco, at, Bay, to, Breakers...   
4  1000005  neutral                   [just, land, at, San, Francisco]   
5  1000006  neutral     [San, Francisco, today, ., Any, suggestion, ?]   
6  1000007  neutral  [On, my, way, to, see, Star, Trek, @, The, Esq...   
7  1000008  neutral  [Going, to, see, star, trek, soon, with, my, d...   
8  1000009  neutral  [Bill, Simmons, in, conversation, with, Malcol...   
9  1000010  neutral         [play, with, cURL, and, the, Twitter, API]   

                                            valences  
0  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  
1  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  
2  [0.0, 0.0, 0.0, 0

### 3. Feature selection

In [17]:
# Short one-letter codes for the emotion labels.
CLASS_CODES = {
    'neutral': 'n',
    'fear': 'f',
    'anger': 'a',
    'sadness': 's',
    'joy': 'j',
}
# Recode the labels on the frame itself instead of mutating the array returned by
# `.values`: that relied on the array being a writable view of the column, which is
# not guaranteed (and is read-only under pandas Copy-on-Write). Labels that are not
# in CLASS_CODES (including already-recoded ones) are left unchanged, so re-running
# this cell is harmless.
featured_dataset['class'] = featured_dataset['class'].map(
    lambda label: CLASS_CODES.get(label, label)
)

# One feature vector per tweet, and the matching recoded labels.
X = featured_dataset['valences'].values.tolist()
y = featured_dataset['class'].values
# `selected` and `mask` are consumed by the following cells: `mask` indexes each
# feature vector, `len(selected)` gives the number of exported feature columns.
selected, mask = feature_selection(X, y, vocab)

feature_selection
[('happy', 0.012420160847448316), ('smile', 0.012400006270757793), ('nervous', 0.010819843777147073), ('sad', 0.010372171043425396), ('bitter', 0.009548802876973725), ('depression', 0.00934140622451959), ('hilarious', 0.008282177557744696), ('nightmare', 0.007844092835140917), ('rage', 0.007489818740668494), ('laughter', 0.0074166201571720545), ('panic', 0.0072438030206095055), ('terrorism', 0.007221643206650254), ('angry', 0.007216668028348058), ('revenge', 0.007016176816593769), ('anxiety', 0.006868576023274911), ('bully', 0.006685358814951264), ('unhappy', 0.006558597288690772), ('offend', 0.006509898152800927), ('optimism', 0.006413389736878103), ('sadness', 0.00630535784423931), ('fear', 0.00629811433904764), ('rejoice', 0.005901269943717621), ('cheer', 0.005897968471765254), ('outrage', 0.005855350256562923), ('terrible', 0.005648353344638163), ('terror', 0.005528074393716715), ('anger', 0.005343222679556044), ('lost', 0.005128141561090122), ('depressing', 0.005

In [18]:
# Reduce every tweet's feature vector to the entries picked by `mask`.
# NOTE(review): this overwrites 'valences' in place, so the cell is meant to be run
# once after feature selection; re-running it would index the already reduced vectors.
for index, full_vector in featured_dataset['valences'].items():
    # DataFrame.set_value was removed from pandas; .at is the scalar-cell setter.
    featured_dataset.at[index, 'valences'] = np.array(full_vector[mask])
print(featured_dataset.head(10))

        id class                                             lemmas  \
0  1000001     n  [Check, this, video, out, -, -, President, Oba...   
1  1000002     n  [need, suggestion, for, a, good, IR, filter, f...   
2  1000003     n  [@user, :, I, just, check, my, google, for, my...   
3  1000004     n  [be, in, San, Francisco, at, Bay, to, Breakers...   
4  1000005     n                   [just, land, at, San, Francisco]   
5  1000006     n     [San, Francisco, today, ., Any, suggestion, ?]   
6  1000007     n  [On, my, way, to, see, Star, Trek, @, The, Esq...   
7  1000008     n  [Going, to, see, star, trek, soon, with, my, d...   
8  1000009     n  [Bill, Simmons, in, conversation, with, Malcol...   
9  1000010     n         [play, with, cURL, and, the, Twitter, API]   

                                            valences  
0  [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...  
1  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  
2  [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ... 

In [19]:
# Stack the per-tweet vectors into a single 2-D matrix (one row per tweet).
feature_matrix = np.vstack(featured_dataset.valences.values)
temp = pd.DataFrame(feature_matrix)
print(feature_matrix.shape)

# Name the feature columns v_0 ... v_{k-1}, where k is the number of selected features,
# then append the class label as the last column.
feature_names = []
for i in range(len(selected)):
    feature_names.append('v_' + str(i))
temp.columns = feature_names
temp['emotion'] = featured_dataset['class'].values

print(temp.head(5))
# Persist the final feature table without the DataFrame index.
temp.to_csv('data_final/features_emotion_detection_tfidf.csv', index=False)

(5348, 502)
   v_0  v_1  v_2  v_3  v_4  v_5  v_6  v_7  v_8  v_9   ...     v_493  v_494  \
0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  1.0  0.0   ...       0.0    0.0   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0   ...       0.0    0.0   
2  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  1.0   ...       0.0    0.0   
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0   ...       0.0    0.0   
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   ...       0.0    0.0   

   v_495  v_496  v_497  v_498  v_499  v_500  v_501  emotion  
0    0.0    0.0    0.0    0.0    0.0    0.0    0.0        n  
1    0.0    0.0    0.0    0.0    0.0    0.0    0.0        n  
2    0.0    0.0    0.0    0.0    0.0    0.0    0.0        n  
3    0.0    0.0    0.0    0.0    0.0    0.0    0.0        n  
4    0.0    0.0    0.0    0.0    0.0    0.0    0.0        n  

[5 rows x 503 columns]
