# Datasets

In [2]:
import pandas as pd
import time
import os
import operator
import sys
import re

# Put the notebook's parent directory on the import path (once) so that the
# `from src...` import that follows can be resolved from there.
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)
    
from src.neural_backend.load_data import get_data_sem_eval, get_data_sent_140

In [152]:
# Notebook configuration: the export switch and the (relative) dataset locations.
# When False, the cells guarded by `if save_custom:` skip writing their CSV files.
save_custom = False
sent_140_path = '../data/sent_140/training.1600000.processed.noemoticon.csv'
sem_eval_path = '../data/sem_eval/full/'
weather_path = '../data/weather/weather_emotion.csv'
text_emotion_path = '../data/text_emotion/text_emotion.csv'
san_analytics_path = '../data/sanders_analytics/full_corpus.csv'
# NOTE(review): `custom` is not referenced by any visible cell, and it points at
# '../custom/' while the export cells write to './' or '../data/custom/' — confirm
# which location is intended.
custom = '../custom/sem_eval_balanced_with_sent_140.csv'

## Sent140 and SemEval

In [106]:
# Load the two base corpora through the project loaders imported at the top.
# NOTE(review): `shuffle=True` is passed with no seed visible here — confirm whether
# the loader fixes one; otherwise the loaded Sent140 subset may differ between runs.
sent_140 = get_data_sent_140(sent_140_path, dataset_size=600000, shuffle=True)
sem_eval = get_data_sem_eval(sem_eval_path)

  mask |= (ar1 == a)


In [4]:
# Preview the first rows of the Sent140 frame.
sent_140.head()

Unnamed: 0_level_0,class,date,query,user,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1693237746,positive,Sun May 03 21:57:04 PDT 2009,NO_QUERY,xofirefly,@devincastro no such thing as too much starbuc...
2184943548,positive,Mon Jun 15 16:39:01 PDT 2009,NO_QUERY,klovest70,playing with my 14 week old daughter
2257264825,negative,Sat Jun 20 14:03:39 PDT 2009,NO_QUERY,MarciaPCF,Watching Sanjaya on &quot;I'm a Celeb Get Me O...
2016182874,negative,Wed Jun 03 06:19:25 PDT 2009,NO_QUERY,candyLois,Wtf migrane! Seriously.
1981058075,positive,Sun May 31 08:19:10 PDT 2009,NO_QUERY,jetaimetoujours,@lbowe_elbow oh. well i &lt;3 you too


In [5]:
# Preview the first rows of the SemEval frame.
sem_eval.head()

Unnamed: 0_level_0,class,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
619950566786113536,neutral,"Picturehouse's, Pink Floyd's, 'Roger Waters: T..."
619969366986235905,neutral,Order Go Set a Watchman in store or through ou...
619971047195045888,negative,If these runway renovations at the airport pre...
619974445185302528,neutral,If you could ask an onstage interview question...
619987808317407232,positive,A portion of book sales from our Harper Lee/Go...


## Weather Emotion Dataset

In [20]:
# Read the weather-emotion CSV and preview it.
# NOTE(review): `weather_df` is not used by any later visible cell.
weather_df = pd.read_csv(weather_path)
weather_df.head()

Unnamed: 0,_unit_id,_canary,_unit_state,_trusted_judgments,_last_judgment_at,what_emotion_does_the_author_express_specifically_about_the_weather,what_emotion_does_the_author_express_specifically_about_the_weather:confidence,gold_answer,tweet_id,tweet_text
0,314960380,,finalized,20,8/24/13 0:21,Positive,0.8439,,81990560,Grilling kabobs on the grill last night was am...
1,314960381,,finalized,20,8/24/13 0:49,Negative,0.6963,,84314377,The slowest day ever !! And the weather makes ...
2,314960382,,finalized,20,8/24/13 0:55,Neutral / author is just sharing information,0.8802,,82846118,Fire Weather Watch issued May 17 at 4:21PM CDT...
3,314960383,,finalized,20,8/24/13 0:48,Positive,0.6897,,82843785,Im going to lunch early today. The weather i...
4,314960384,,finalized,20,8/24/13 1:19,Neutral / author is just sharing information,0.6153,,82840144,Weekend Weather Causes Delays In I-270 Bridge ...


## Text Emotion Dataset

In [122]:
# Load the text-emotion CSV and bring it to the (id -> class, text) layout used by
# the other frames: discard the author column, rename the rest, index by tweet id.
text_emotion = (
    pd.read_csv(text_emotion_path)
    .drop(columns='author')
    .rename(columns={'tweet_id': 'id', 'content': 'text', 'sentiment': 'class'})
    .set_index('id')
)
text_emotion.head()

Unnamed: 0_level_0,class,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
1956967696,sadness,Funeral ceremony...gloomy friday...
1956967789,enthusiasm,wants to hang out with friends SOON!
1956968416,neutral,@dannycastillo We want to trade with someone w...


In [123]:
# Distribution of the original fine-grained emotion labels.
text_emotion['class'].value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: class, dtype: int64

In [124]:
# Collapse the fine-grained emotion labels into three sentiment classes.
# Labels that are deliberately left out of the mapping (enthusiasm, fun, relief,
# empty, boredom, worry, surprise) become NaN and are removed by the dropna() cell
# further down.
# The collapsed labels also map to themselves so that re-running this cell leaves
# already-converted rows intact instead of turning them into NaN.
sentiment_by_emotion = {
    'neutral': 'neutral',
    # Positives
    'happiness': 'positive',
    'love': 'positive',
    'positive': 'positive',
    # The Negatives
    'sadness': 'negative',
    'anger': 'negative',
    'hate': 'negative',
    'negative': 'negative',
}
text_emotion['class'] = text_emotion['class'].map(sentiment_by_emotion)

In [125]:
# Class distribution after the mapping; value_counts() leaves out the NaN rows
# produced for unmapped labels.
counts = text_emotion['class'].value_counts()
counts

positive    9051
neutral     8638
negative    6598
Name: class, dtype: int64

In [129]:
# Number of rows that carry a mapped class. Counting on the NaN-free view keeps the
# result the same whether or not the dropna() cell below has already been executed.
len(text_emotion.dropna())

24287

In [127]:
# Discard every row containing a missing value (the rows whose label was not mapped).
text_emotion = text_emotion.dropna()

In [128]:
# Spot-check five random rows; no seed is given, so the rows shown change per run.
text_emotion.sample(5)

Unnamed: 0_level_0,class,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1753902379,neutral,cook ; do you wanna measure my dick? its daddy...
1753837126,negative,@nigs Ah so 15 months. I'm sure he is advanced...
1753728048,positive,@beautyholic woohoooo ;) to BOTH! retail thera...
1960949231,negative,@myhaloromance My camera is brokennn
1695252016,positive,@DonnieWahlberg In Toronto waiting for YOU!!!...


In [90]:
# Export the processed text-emotion frame only when saving is switched on.
if save_custom:
    text_emotion.to_csv('../data/text_emotion/text_emotion_processed.csv')

## Sanders Analytics

In [174]:
# Read the raw Sanders Analytics corpus and preview it.
sanders_analytics = pd.read_csv(san_analytics_path)
sanders_analytics.head()

Unnamed: 0,Topic,Sentiment,TweetId,TweetDate,TweetText
0,apple,positive,126415614616154112,Tue Oct 18 21:53:25 +0000 2011,Now all @Apple has to do is get swype on the i...
1,apple,positive,126404574230740992,Tue Oct 18 21:09:33 +0000 2011,@Apple will be adding more carrier support to ...
2,apple,positive,126402758403305474,Tue Oct 18 21:02:20 +0000 2011,Hilarious @youtube video - guy does a duet wit...
3,apple,positive,126397179614068736,Tue Oct 18 20:40:10 +0000 2011,@RIM you made it too easy for me to switch to ...
4,apple,positive,126395626979196928,Tue Oct 18 20:34:00 +0000 2011,I just realized that the reason I got into twi...


In [175]:
# Bring the Sanders corpus to the (id -> class, text) layout used by the other
# frames and leave out the tweets labelled 'irrelevant'.
sanders_analytics = (
    sanders_analytics
    .drop(columns=['Topic', 'TweetDate'])
    .rename(columns={'TweetId': 'id', 'TweetText': 'text', 'Sentiment': 'class'})
    .set_index('id')
    .loc[lambda frame: frame['class'] != 'irrelevant']
)
sanders_analytics.head()

Unnamed: 0_level_0,class,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
126415614616154112,positive,Now all @Apple has to do is get swype on the i...
126404574230740992,positive,@Apple will be adding more carrier support to ...
126402758403305474,positive,Hilarious @youtube video - guy does a duet wit...
126397179614068736,positive,@RIM you made it too easy for me to switch to ...
126395626979196928,positive,I just realized that the reason I got into twi...


In [176]:
# Class distribution of the Sanders corpus after removing 'irrelevant' tweets.
sanders_analytics['class'].value_counts()

neutral     2333
negative     572
positive     519
Name: class, dtype: int64

In [177]:
# Export the processed Sanders frame only when saving is switched on. The guard
# matches the other export cells, so a plain "Run All" with save_custom = False
# does not overwrite the file on disk.
if save_custom:
    sanders_analytics.to_csv('../data/sanders_analytics/sananalytics_processed.csv')

## Custom Dataset

In [130]:
# Start the custom dataset from a copy so that `sem_eval` itself stays unchanged.
custom_dataset = sem_eval.copy()

In [131]:
# Preview the first rows of the custom dataset.
custom_dataset.head()

Unnamed: 0_level_0,class,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
619950566786113536,neutral,"Picturehouse's, Pink Floyd's, 'Roger Waters: T..."
619969366986235905,neutral,Order Go Set a Watchman in store or through ou...
619971047195045888,negative,If these runway renovations at the airport pre...
619974445185302528,neutral,If you could ask an onstage interview question...
619987808317407232,positive,A portion of book sales from our Harper Lee/Go...


In [132]:
# Work out how many positive and negative rows are missing relative to the
# neutral class; these deficits size the Sent140 samples drawn in the next cell.
counts = custom_dataset['class'].value_counts()
neut_count = counts['neutral']
pos_deficit = neut_count - counts['positive']
neg_deficit = neut_count - counts['negative']

In [133]:
# Draw, per class, exactly as many Sent140 tweets as that class is short of the
# neutral count. The fixed seed makes re-running this cell pick the same rows
# from the same `sent_140` frame.
balance_seed = 42
pos_sample = sent_140[sent_140['class'] == 'positive'].sample(pos_deficit, random_state=balance_seed)
neg_sample = sent_140[sent_140['class'] == 'negative'].sample(neg_deficit, random_state=balance_seed)

In [134]:
# Strip the Sent140-only metadata columns so the samples carry the same columns
# as the SemEval-based frame they are added to.
sent_140_only_columns = ['date', 'query', 'user']
pos_sample = pos_sample.drop(columns=sent_140_only_columns)
neg_sample = neg_sample.drop(columns=sent_140_only_columns)

In [135]:
# Class distribution of the custom dataset before the balancing rows are added.
custom_dataset['class'].value_counts()

neutral     22211
positive    19625
negative     7732
Name: class, dtype: int64

In [136]:
# Add the sampled Sent140 rows to the custom dataset and show the resulting class
# distribution. pd.concat is used because DataFrame.append was deprecated in
# pandas 1.4 and removed in pandas 2.0; with default arguments it stacks the rows
# the same way.
balancing = pd.concat([pos_sample, neg_sample])
custom_dataset = pd.concat([custom_dataset, balancing])
custom_dataset['class'].value_counts()

negative    22211
positive    22211
neutral     22211
Name: class, dtype: int64

In [137]:
# Export the balanced custom dataset only when saving is switched on.
# NOTE(review): this writes next to the notebook ('./'), whereas the composite
# exports go to '../data/custom/' and the `custom` path from the config cell is
# unused — confirm the intended location.
if save_custom:
    custom_dataset.to_csv('./sem_eval_balanced_with_sent_140.csv')

## Composite Datasets

### SemEval + SandersAnalytics + Sent-140

In [178]:
# Stack SemEval and the processed Sanders corpus into a new frame. pd.concat
# returns a new object (the inputs are left unchanged) and replaces
# DataFrame.append, which was removed in pandas 2.0.
composite_dataset = pd.concat([sem_eval, sanders_analytics])

In [179]:
# Class distribution of SemEval + Sanders before balancing.
composite_dataset['class'].value_counts()

neutral     24544
positive    20144
negative     8304
Name: class, dtype: int64

In [180]:
# Balance the composite dataset: top up the positive and negative classes with
# Sent140 tweets until both match the neutral count, then show the distribution.
counts = composite_dataset['class'].value_counts()
neut_count = counts['neutral']
pos_deficit = neut_count - counts['positive']
neg_deficit = neut_count - counts['negative']

# Fixed seed so re-running the cell draws the same rows from the same `sent_140`.
balance_seed = 42
pos_sample = sent_140[sent_140['class'] == 'positive'].sample(pos_deficit, random_state=balance_seed)
neg_sample = sent_140[sent_140['class'] == 'negative'].sample(neg_deficit, random_state=balance_seed)

# Remove the Sent140-only metadata columns before stacking.
pos_sample = pos_sample.drop(labels=['date', 'query', 'user'], axis=1)
neg_sample = neg_sample.drop(labels=['date', 'query', 'user'], axis=1)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
balancing = pd.concat([pos_sample, neg_sample])
composite_dataset = pd.concat([composite_dataset, balancing])
composite_dataset['class'].value_counts()

negative    24544
positive    24544
neutral     24544
Name: class, dtype: int64

In [181]:
# Export the balanced SemEval + Sanders dataset only when saving is switched on.
if save_custom:
    composite_dataset.to_csv('../data/custom/sem_eval_sanders_balanced.csv')

### SemEval + TextEmotion + Sent-140

In [147]:
# Stack SemEval and the processed text-emotion frame into a new frame. pd.concat
# returns a new object (the inputs are left unchanged) and replaces
# DataFrame.append, which was removed in pandas 2.0.
composite_dataset = pd.concat([sem_eval, text_emotion])

In [148]:
# Class distribution of SemEval + text-emotion before balancing.
composite_dataset['class'].value_counts()

neutral     30849
positive    28676
negative    14330
Name: class, dtype: int64

In [149]:
# Size and draw the Sent140 samples needed to lift the positive and negative
# classes to the neutral count.
counts = composite_dataset['class'].value_counts()
neut_count = counts['neutral']
pos_deficit = neut_count - counts['positive']
neg_deficit = neut_count - counts['negative']

# Fixed seed so re-running the cell draws the same rows from the same `sent_140`.
balance_seed = 42
pos_sample = sent_140[sent_140['class'] == 'positive'].sample(pos_deficit, random_state=balance_seed)
neg_sample = sent_140[sent_140['class'] == 'negative'].sample(neg_deficit, random_state=balance_seed)

# Remove the Sent140-only metadata columns so the samples match the composite frame.
pos_sample = pos_sample.drop(labels=['date', 'query', 'user'], axis=1)
neg_sample = neg_sample.drop(labels=['date', 'query', 'user'], axis=1)

In [150]:
# Add the sampled Sent140 rows to the composite dataset and show the resulting
# class distribution. pd.concat is used because DataFrame.append was deprecated
# in pandas 1.4 and removed in pandas 2.0.
balancing = pd.concat([pos_sample, neg_sample])
composite_dataset = pd.concat([composite_dataset, balancing])
composite_dataset['class'].value_counts()

negative    30849
positive    30849
neutral     30849
Name: class, dtype: int64

In [151]:
# Export the balanced SemEval + text-emotion dataset only when saving is switched on.
if save_custom:
    composite_dataset.to_csv('../data/custom/sem_eval_text_emotion_balanced.csv')