# Transfer data preprocessing

## Irony

In [1]:
!ls data/irony

irony_labels.hdf5  semeval18_3


In [2]:
!ls data/irony/semeval18_3/

SemEval2018-T3-train-taskB_emoji.txt


In [3]:
PATH = 'data/irony/semeval18_3/SemEval2018-T3-train-taskB_emoji.txt'

In [4]:
import pandas as pd

In [5]:
df = pd.read_table(PATH)

In [6]:
df.head()

Unnamed: 0,Tweet Index,Label,Tweet text
0,1,1,Sweet United Nations video. Just in time for C...
1,2,1,@mrdahl87 We are rumored to have talked to Erv...
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...
3,4,0,3 episodes left I'm dying over here
4,5,2,I can't breathe! was chosen as the most notabl...


In [7]:
labels = df['Label']
len(labels)

3817

In [8]:
type(labels)

pandas.core.series.Series

In [9]:
import numpy as np

In [10]:
labels = np.array(labels)

In [11]:
type(labels)

numpy.ndarray

In [12]:
from retpred.utils.io import save_array

In [13]:
save_array('data/irony/irony_labels.hdf5', labels, 'irony_labels')

In [14]:
texts = df['Tweet text']

In [15]:
type(texts[0])

str

In [16]:
from retpred.text import tokenize

In [17]:
texts[:5]

0    Sweet United Nations video. Just in time for C...
1    @mrdahl87 We are rumored to have talked to Erv...
2    Hey there! Nice to see you Minnesota/ND Winter...
3                  3 episodes left I'm dying over here
4    I can't breathe! was chosen as the most notabl...
Name: Tweet text, dtype: object

In [18]:
texts = [tokenize(t) for t in texts]

In [19]:
texts[:5]

['sweet united nations video .  just in time for christmas .  <hashtag> imagine <hashtag> no religion  <url>',
 ' <user>  we are rumored to have talked to erv agent .  <repeat> and the angels asked about ed escobar .  <repeat> that hardly nothing     <smile> ',
 'hey there !  nice to see you minnesota / nd <allcaps>  winter weather ',
 ' <number>  episodes left i am dying over here',
 'i can not breathe !  was chosen as the most notable quote of the year in an annual list released by a yale university librarian ']

In [20]:
from retpred.utils.io import save_txt

In [21]:
save_txt('data/irony/irony_texts.txt', texts)

## Topic 

In [23]:
!ls data/topic/us-topics-181109/

train.json


In [24]:
PATH = 'data/topic/us-topics-181109/train.json'

In [25]:
from retpred.utils.io import load_json

In [26]:
topic_data = load_json(fname=PATH)

In [27]:
topic_data

{'1057233267047448577': {'id': 1057233267047448577,
  'text': "Life is challenging enough without self-sabotage. Here's @KimFulcher on how to stop it. https://t.co/OzLMcfe64b",
  'topic': 'news'},
 '1049772181465559042': {'id': 1049772181465559042,
  'text': 'Seventy years ago this December, 58 countries came together at the end of World War II to offer the world a roadmap… https://t.co/WkTBgenSGu',
  'topic': 'government'},
 '1058771571391655938': {'id': 1058771571391655938,
  'text': '@Jkooza Drunkenly walking through the coals.',
  'topic': 'gaming'},
 '1032209530187333632': {'id': 1032209530187333632,
  'text': 'NEW CLASSICS | Introducing our sleek, box-style D-ring bag in a new smaller size https://t.co/v9LhBffW4a https://t.co/b8uZZxsl3b',
  'topic': 'fashion'},
 '1008635429892083712': {'id': 1008635429892083712,
  'text': 'Reminiscent of the sailing rope, red tweed trimming by Lesage adds a sailing touch to the Paris-Hamburg… https://t.co/eBGaF5C2vt',
  'topic': 'fashion'},
 '105

In [31]:
texts = []
labels = []
for tid in topic_data:
    topic = topic_data[tid]
    texts.append(topic['text'])
    labels.append(topic['topic'])

In [32]:
texts[:5]

["Life is challenging enough without self-sabotage. Here's @KimFulcher on how to stop it. https://t.co/OzLMcfe64b",
 'Seventy years ago this December, 58 countries came together at the end of World War II to offer the world a roadmap… https://t.co/WkTBgenSGu',
 '@Jkooza Drunkenly walking through the coals.',
 'NEW CLASSICS | Introducing our sleek, box-style D-ring bag in a new smaller size https://t.co/v9LhBffW4a https://t.co/b8uZZxsl3b',
 'Reminiscent of the sailing rope, red tweed trimming by Lesage adds a sailing touch to the Paris-Hamburg… https://t.co/eBGaF5C2vt']

In [33]:
labels[:5]

['news', 'government', 'gaming', 'fashion', 'fashion']

In [34]:
texts = [tokenize(t) for t in texts]

In [35]:
texts[:5]

['life is challenging enough without self - sabotage .  here  <user>  on how to stop it .  <url>',
 'seventy years ago this december,  <number>  countries came together at the end of world war ii <allcaps>  to offer the world a roadmap  .  <repeat> <url>',
 ' <user>  drunkenly walking through the coals . ',
 'new <allcaps>  classics <allcaps>  | introducing our sleek, box - style d - ring bag in a new smaller size <url> <url>',
 'reminiscent of the sailing rope, red tweed trimming by lesage adds a sailing touch to the paris - hamburg  .  <repeat> <url>']

In [37]:
save_txt('data/topic/topic_texts.txt', texts)

In [38]:
labels = pd.Series(data=labels)

In [43]:
labels = labels.astype('category')

In [48]:
labels.cat.codes[:5]

0    10
1     8
2     7
3     4
4     4
dtype: int8

In [50]:
labels[:5]

0          news
1    government
2        gaming
3       fashion
4       fashion
dtype: category
Categories (13, object): [books, business, entertainment, family, ..., music, news, sports, television]

In [53]:
categories = labels.cat.categories.tolist()

In [54]:
save_txt('data/topic/categories.txt', categories)

In [55]:
labels = np.array(labels.cat.codes)

In [58]:
save_array('data/topic/topic_labels.hdf5', labels, 'topic_labels')

## Offensive language

In [59]:
!ls data/offensive_lang/

davison_2017  hatespeech_waseem_hovy_16


In [60]:
!ls data/offensive_lang/davison_2017/

labeled_data.csv


In [61]:
df = pd.read_csv('data/offensive_lang/davison_2017/labeled_data.csv')
df.head()

Unnamed: 0,id,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [62]:
categories = ['hate_speech', 'offensive_language', 'neither']
save_txt('data/offensive_lang/categories.txt', categories)

In [63]:
texts = df['tweet']
labels = df['class']

In [64]:
texts = [tokenize(t) for t in texts]
texts[:10]

[" !  <repeat> rt <allcaps>   <user>  :  as a woman you shouldn't complain about cleaning up your house .  &amp ;  as a man you should always take the trash out .  <repeat>",
 ' !  <repeat> rt <allcaps>   <user>  :  boy dats cold .  <repeat>tyga dwn bad for cuffin dat hoe in the  <number> st place !  <repeat>',
 ' !  <repeat> rt <allcaps>   <user>  dawg !  <repeat> rt <allcaps>   <user>  :  you ever fuck a bitch and she start to cry ?  you be confused as shit',
 ' !  <repeat> rt <allcaps>   <user>  :   <user>  she look like a tranny',
 ' !  <repeat> rt <allcaps>   <user>  :  the shit you hear about me might be true or it might be faker than the bitch who told it to ya &# <number>  ; ',
 ' !  <repeat>" <user>  :  the shit just blows me .  <repeat>claim you so faithful and down for somebody but still fucking with hoes !  &# <number>  ; &# <number>  ; &# <number>  ; "',
 ' !  <repeat>" <user>  :  i can not just sit up and hate <allcaps>  on another bitch  .  <repeat> i got too much shit g

In [65]:
save_txt('data/offensive_lang/offlang_texts.txt', texts)

In [66]:
labels = np.array(labels)

In [67]:
save_array('data/offensive_lang/offlang_labels.hdf5', labels, 'offlang_labels')

## Sentiment

In [68]:
!ls data/sentiment/

semeval15_11  semeval18_1


In [70]:
!ls data/sentiment/semeval18_1/

2018-E-c-En-train.txt	      EI-oc-En-fear-train.txt
2018-E-c-En-train.zip	      EI-oc-En-joy-train.txt
2018-Valence-oc-En-train.txt  EI-oc-En-sadness-train.txt
2018-Valence-oc-En-train.zip  EI-oc-En-train.zip
EI-oc-En-anger-train.txt


In [77]:
df_ec = pd.read_table('data/sentiment/semeval18_1/2018-E-c-En-train.txt')
print(df_ec.size)
df_ec.head()

88894


Unnamed: 0,ID,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,2017-En-21441,“Worry is a down payment on a problem you may ...,0,1,0,0,0,0,1,0,0,0,1
1,2017-En-31535,Whatever you decide to do make sure it makes y...,0,0,0,0,1,1,1,0,0,0,0
2,2017-En-21068,@Max_Kellerman it also helps that the majorit...,1,0,1,0,1,0,1,0,0,0,0
3,2017-En-31436,Accept the challenges so that you can literall...,0,0,0,0,1,0,1,0,0,0,0
4,2017-En-22195,My roommate: it's okay that we can't spell bec...,1,0,1,0,0,0,0,0,0,0,0


In [76]:
df_voc = pd.read_table('data/sentiment/semeval18_1/2018-Valence-oc-En-train.txt')
print(df_voc.size)
df_voc.head()

4724


Unnamed: 0,ID,Tweet,Affect Dimension,Intensity Class
0,2017-En-30153,@liamch88 yeah! :) playing well,valence,0: neutral or mixed emotional state can be inf...
1,2017-En-40929,At least I don't have a guy trying to discoura...,valence,0: neutral or mixed emotional state can be inf...
2,2017-En-22012,UPLIFT: If you're still discouraged it means y...,valence,0: neutral or mixed emotional state can be inf...
3,2017-En-30837,"...at your age, the heyday in the blood is tam...",valence,0: neutral or mixed emotional state can be inf...
4,2017-En-30838,i was so embarrassed when she saw us i was lik...,valence,-2: moderately negative emotional state can be...


In [88]:
labels = df_voc['Intensity Class'].astype('category')

In [90]:
labels = labels.cat.reorder_categories([
    '-3: very negative emotional state can be inferred',
    '-2: moderately negative emotional state can be inferred',
    '-1: slightly negative emotional state can be inferred',
    '0: neutral or mixed emotional state can be inferred',
    '1: slightly positive emotional state can be inferred',
    '2: moderately positive emotional state can be inferred',
    '3: very positive emotional state can be inferred'])

In [91]:
labels.cat.categories

Index(['-3: very negative emotional state can be inferred',
       '-2: moderately negative emotional state can be inferred',
       '-1: slightly negative emotional state can be inferred',
       '0: neutral or mixed emotional state can be inferred',
       '1: slightly positive emotional state can be inferred',
       '2: moderately positive emotional state can be inferred',
       '3: very positive emotional state can be inferred'],
      dtype='object')

In [94]:
categories = labels.cat.categories.tolist()
save_txt('data/sentiment/valence_categories.txt', categories)

In [95]:
labels = np.array(labels.cat.codes)
save_array('data/sentiment/valence_labels.hdf5', labels, 'valence_labels')

In [96]:
texts = df_voc['Tweet']
texts = [tokenize(t) for t in texts]
save_txt('data/sentiment/valence_texts.txt', texts)

In [75]:
df_aoc = pd.read_table('data/sentiment/semeval18_1/EI-oc-En-anger-train.txt')
print(len(df_aoc))
df_aoc.head()

1701


Unnamed: 0,ID,Tweet,Affect Dimension,Intensity Class
0,2017-En-10264,@xandraaa5 @amayaallyn6 shut up hashtags are c...,anger,2: moderate amount of anger can be inferred
1,2017-En-10072,it makes me so fucking irate jesus. nobody is ...,anger,3: high amount of anger can be inferred
2,2017-En-11383,Lol Adam the Bull with his fake outrage...,anger,1: low amount of anger can be inferred
3,2017-En-11102,@THATSSHAWTYLO passed away early this morning ...,anger,0: no anger can be inferred
4,2017-En-11506,@Kristiann1125 lol wow i was gonna say really?...,anger,1: low amount of anger can be inferred


In [78]:
df_foc = pd.read_table('data/sentiment/semeval18_1/EI-oc-En-fear-train.txt')
print(len(df_foc))
df_foc.head()

2252


Unnamed: 0,ID,Tweet,Affect Dimension,Intensity Class
0,2017-En-20968,@RockSolidShow @Pat_Francis #revolting cocks i...,fear,0: no fear can be inferred
1,2017-En-21816,@Its_just_Huong I will beat you !!! Always tho...,fear,1: low amount of fear can be inferred
2,2017-En-21532,“What worries you masters you.” - Haddon Robin...,fear,1: low amount of fear can be inferred
3,2017-En-20740,@carlybigelow13 first you take the room now yo...,fear,0: no fear can be inferred
4,2017-En-20022,@RogueCoder250 We are in so much trouble!! I d...,fear,3: high amount of fear can be inferred


In [79]:
df_joc = pd.read_table('data/sentiment/semeval18_1/EI-oc-En-joy-train.txt')
print(len(df_joc))
df_joc.head()

1616


Unnamed: 0,ID,Tweet,Affect Dimension,Intensity Class
0,2017-En-30793,"@david_garrett Quite saddened.....no US dates,...",joy,0: no joy can be inferred
1,2017-En-30070,2 days until #GoPackGo and 23 days until #GoGi...,joy,3: high amount of joy can be inferred
2,2017-En-30692,Positive #psychology research shows salespeopl...,joy,0: no joy can be inferred
3,2017-En-31323,As the birds chirp and the cows moo we need to...,joy,1: low amount of joy can be inferred
4,2017-En-31553,Howling with laughter at “WELL DONE BEZZA!” #b...,joy,3: high amount of joy can be inferred


In [80]:
df_soc = pd.read_table('data/sentiment/semeval18_1/EI-oc-En-sadness-train.txt')
print(len(df_soc))
df_soc.head()

1533


Unnamed: 0,ID,Tweet,Affect Dimension,Intensity Class
0,2017-En-40023,This the most depressing shit ever,sadness,3: high amount of sadness can be inferred
1,2017-En-40983,final vestiges of my 90's childhood were just ...,sadness,0: no sadness can be inferred
2,2017-En-20751,@ManUnitedWriter He has had a dreadful first h...,sadness,1: low amount of sadness can be inferred
3,2017-En-41232,feel really sad and down today😒,sadness,3: high amount of sadness can be inferred
4,2017-En-40797,Wow just watched Me Before You and it was seri...,sadness,2: moderate amount of sadness can be inferred


In [85]:
affects = ['anger', 'fear', 'joy', 'sadness']
for a in affects:
    print('processing {} dataset'.format(a))
    path = 'data/sentiment/semeval18_1/EI-oc-En-{}-train.txt'.format(a)
    df = pd.read_table(path)
    texts = df['Tweet']
    texts = [tokenize(t) for t in texts]
    save_txt('data/sentiment/{}_texts.txt'.format(a), texts)
    labels = np.array(df['Intensity Class'].astype('category').cat.codes)
    save_array('data/sentiment/{}_labels.hdf5'.format(a), labels, '{}_labels'.format(a))

processing anger dataset
processing fear dataset
processing joy dataset
processing sadness dataset


In [97]:
!ls data/sentiment/

anger_labels.hdf5  joy_texts.txt	valence_categories.txt
anger_texts.txt    sadness_labels.hdf5	valence_labels.hdf5
fear_labels.hdf5   sadness_texts.txt	valence_texts.txt
fear_texts.txt	   semeval15_11
joy_labels.hdf5    semeval18_1
