# Notebook for Twitter API

## Importing Packages

In [6]:
import pandas as pd
import json
import re
import requests
import pickle

%reload_ext autoreload
%autoreload 2

import sys
sys.path.append("../py")
from utils import *
from config import keys

## Aristotle University Dataset

Citation:

Founta, A., Djouvas, C., Chatzakou, D., Leontiadis, I., Blackburn, J., Stringhini, G., Vakali, A., Sirivianos, M. and Kourtellis, N., 2018. Large Scale Crowdsourcing and Characterization of Twitter Abusive Behavior \[Data file\]. ArXiv. Retrieved from: https://dataverse.mpi-sws.org/dataset.xhtml?persistentId=doi:10.5072/FK2/ZDTEMN

Publication: https://arxiv.org/pdf/1802.00393.pdf

In [50]:
# Load the Founta et al. table of tweet IDs and majority labels. The file holds
# only IDs and labels (columns tweet_id, maj_label), not the tweet text itself.
df = pd.read_csv('../data/hatespeechtwitter.csv')
df.head()

Unnamed: 0,tweet_id,maj_label
0,849667487180259329,abusive
1,850490912954351616,abusive
2,848791766853668864,abusive
3,848306464892604416,abusive
4,850010509969465344,normal


In [51]:
# Inspect dtypes and non-null counts of the raw ID/label table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   80000 non-null  int64 
 1   maj_label  79996 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.2+ MB


In [52]:
# Shorten the column names (tweet_id -> id, maj_label -> label), then show how
# many tweets fall under each label.
df.columns = ['id', 'label']
df['label'].value_counts()

normal     52835
spam       13404
abusive    10122
hateful     3635
Name: label, dtype: int64

In [54]:
# Keep only the rows whose label is 'hateful' and preview the first ten.
df_hateful = df.loc[df['label'].eq('hateful')]
df_hateful.head(10)

Unnamed: 0,id,label
5,850433664890544128,hateful
11,850449456445235200,hateful
12,850100742035836929,hateful
13,847945888995708928,hateful
21,847804507367100416,hateful
33,850090319165050880,hateful
47,848600351381098496,hateful
49,850672203372916736,hateful
55,849303652280020995,hateful
60,849300057782456321,hateful


In [53]:
# Keep only the rows whose label is 'abusive' and preview the first ten.
df_abusive = df.loc[df['label'].eq('abusive')]
df_abusive.head(10)

Unnamed: 0,id,label
0,849667487180259329,abusive
1,850490912954351616,abusive
2,848791766853668864,abusive
3,848306464892604416,abusive
6,847529600108421121,abusive
7,848619867506913282,abusive
8,850411934205845504,abusive
9,848325397985071104,abusive
10,849087242987593728,abusive
14,850577240127623169,abusive


In [55]:
# Keep only the rows whose label is 'normal' (treated as the neutral class)
# and preview the first ten.
df_neutral = df.loc[df['label'].eq('normal')]
df_neutral.head(10)

Unnamed: 0,id,label
4,850010509969465344,normal
27,850344984742174720,normal
28,847482196096987137,normal
31,850660404770590720,normal
40,849881409284182016,normal
41,848437713061634052,normal
44,848926030723031040,normal
51,848975292794318848,normal
57,850346419164553218,normal
89,848338236770582529,normal


In [6]:
# Split the hateful tweet IDs into batches with the project helper group_list,
# then report how many batches were produced.
hate_ids = group_list(list(df_hateful['id']))
len(hate_ids)

37

In [56]:
# Split the abusive tweet IDs into batches with the project helper group_list,
# then report how many batches were produced.
abusive_ids = group_list(list(df_abusive['id']))
len(abusive_ids)

102

In [57]:
# Split the neutral tweet IDs into batches with the project helper group_list,
# then report how many batches were produced.
neutral_ids = group_list(list(df_neutral['id']))
len(neutral_ids)

529

In [14]:
# Smoke-test the Twitter API v2 tweet-lookup endpoint with three fixed tweet IDs.
url = "https://api.twitter.com/2/tweets?ids=847661947159891972,847799130277675008,848933211375779840&tweet.fields=created_at,entities,geo,id,public_metrics,text&user.fields=description,entities,id,location,name,public_metrics,username"
payload = {}
# Authenticate with the bearer token only. The session-specific Cookie header
# (personalization/guest IDs) that used to be hard-coded here was removed so
# it is no longer committed with the notebook.
headers = {'Authorization': 'Bearer ' + keys['bearer_token']}
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
r = requests.get(url, headers=headers, data=payload, timeout=30)
# Fail loudly on 4xx/5xx (e.g. bad token, rate limit) instead of silently
# parsing an error body as if it were tweet data.
r.raise_for_status()
data = r.json()

In [16]:
# Fetch the tweets for every ID batch in hate_ids via the project helper
# tweets_request (presumably from utils; its internals are not shown here).
df_hate = tweets_request(hate_ids)

100%|██████████| 37/37 [00:22<00:00,  1.66it/s]


In [68]:
# Discard the index returned by tweets_request in favour of a fresh 0..n-1
# index, then preview the first ten fetched tweets.
df_hate = df_hate.reset_index(drop=True)
df_hate.head(10)

Unnamed: 0,created_at,text,id,public_metrics,entities,geo
0,2017-04-07T19:42:40.000Z,I hate er chase because if the Bitch that work...,850433664890544128,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",,
1,2017-04-06T21:39:45.000Z,RT @nyctophil3: Pineapples do not belong on pi...,850100742035836929,"{'retweet_count': 5, 'reply_count': 0, 'like_c...","{'mentions': [{'start': 3, 'end': 14, 'usernam...",
2,2017-03-31T13:35:20.000Z,Niggas keep talking about women wearing weave ...,847804507367100416,"{'retweet_count': 1, 'reply_count': 0, 'like_c...",,
3,2017-04-06T20:58:20.000Z,@vappywave idiot that's not gonna work. you go...,850090319165050880,"{'retweet_count': 0, 'reply_count': 1, 'like_c...","{'mentions': [{'start': 0, 'end': 10, 'usernam...",
4,2017-04-03T00:00:48.000Z,RT @ayevonnn: bruh i fucking hate people like ...,848686686930382848,"{'retweet_count': 5, 'reply_count': 0, 'like_c...","{'urls': [{'start': 53, 'end': 76, 'url': 'htt...",
5,2017-04-07T16:11:57.000Z,RT @mattmfm: I'm really fucking sick of watchi...,850380636300820480,"{'retweet_count': 916, 'reply_count': 0, 'like...","{'annotations': [{'start': 53, 'end': 68, 'pro...",
6,2017-03-31T03:33:05.000Z,@JayFoee_ just another dumbass bronco fan swea...,847652946217009155,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",,
7,2017-04-03T09:52:55.000Z,RT @THESLUMPGOD: I Sampled Jaws \n\nPull Up Wi...,848835698006401024,"{'retweet_count': 6131, 'reply_count': 0, 'lik...","{'mentions': [{'start': 3, 'end': 15, 'usernam...",
8,2017-04-07T05:17:15.000Z,RT @arianam0lina: if you litter you're a bitch...,850215875680522240,"{'retweet_count': 19, 'reply_count': 0, 'like_...","{'mentions': [{'start': 3, 'end': 16, 'usernam...",
9,2017-04-06T09:53:03.000Z,RT @cybeque: Don't take out the anger of being...,849922895132459008,"{'retweet_count': 2, 'reply_count': 0, 'like_c...","{'urls': [{'start': 74, 'end': 97, 'url': 'htt...",


In [58]:
# Fetch the tweets for every ID batch in abusive_ids.
# NOTE: this rebinds df_abusive, which until now held the ID/label rows used
# to build abusive_ids; that label frame is no longer available afterwards.
df_abusive = tweets_request(abusive_ids)

100%|██████████| 102/102 [01:01<00:00,  1.65it/s]


In [69]:
# Discard the index returned by tweets_request in favour of a fresh 0..n-1
# index, then preview the first ten fetched tweets.
df_abusive = df_abusive.reset_index(drop=True)
df_abusive.head(10)

Unnamed: 0,created_at,text,public_metrics,entities,id,geo
0,2017-04-07T23:30:09.000Z,Alex Brosas another idiot #ALDUBKSGoesToUS ht...,"{'retweet_count': 0, 'reply_count': 0, 'like_c...","{'hashtags': [{'start': 26, 'end': 42, 'tag': ...",850490912954351616,
1,2017-04-03T06:58:21.000Z,"RT @ItIzBiz: as Nancy Reagan would say, 'just ...","{'retweet_count': 47, 'reply_count': 0, 'like_...","{'mentions': [{'start': 3, 'end': 11, 'usernam...",848791766853668864,
2,2017-04-02T19:35:17.000Z,RT @chevleia: don't hmu when u get tired of ur...,"{'retweet_count': 756, 'reply_count': 0, 'like...","{'mentions': [{'start': 3, 'end': 12, 'usernam...",848619867506913282,
3,2017-04-07T18:16:19.000Z,RT @ashllyd: SICK OF BITCHES ON THE INTERNET 🐍...,"{'retweet_count': 1, 'reply_count': 0, 'like_c...","{'mentions': [{'start': 3, 'end': 11, 'usernam...",850411934205845504,
4,2017-04-04T15:29:55.000Z,But he still with the shits so he started smok...,"{'retweet_count': 0, 'reply_count': 0, 'like_c...","{'urls': [{'start': 101, 'end': 124, 'url': 'h...",849282894682050564,
5,2017-04-02T11:04:55.000Z,RT @Configa: April Fools fucking #dope If you ...,"{'retweet_count': 9, 'reply_count': 0, 'like_c...","{'mentions': [{'start': 3, 'end': 11, 'usernam...",848491429517295616,
6,2017-03-30T19:41:20.000Z,Not having access to my money is fucking pissi...,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",,847534226413105152,{'place_id': '1d9a5370a355ab0c'}
7,2017-04-05T09:59:54.000Z,"""God, you're fucking pathetic."" https://t.co/u...","{'retweet_count': 0, 'reply_count': 0, 'like_c...","{'annotations': [{'start': 1, 'end': 3, 'proba...",849562231129993216,
8,2017-03-31T16:42:00.000Z,You Worried About Somebody Bein Ugly... Bitch ...,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",,847851483571814400,
9,2017-04-02T22:49:05.000Z,Damn dean just put Corbin to sleep. That Match...,"{'retweet_count': 0, 'reply_count': 0, 'like_c...","{'annotations': [{'start': 5, 'end': 8, 'proba...",848668638869671939,


In [60]:
# Cut the neutral ID batches into four chunks of 100 batches each, plus a
# final chunk holding everything from batch 400 onwards, so that each chunk
# can be fetched in its own cell.
neutral_1, neutral_2, neutral_3, neutral_4 = (
    neutral_ids[start:start + 100] for start in range(0, 400, 100)
)
neutral_5 = neutral_ids[400:]

In [61]:
# Fetch the tweets for the first chunk of neutral ID batches (batches 0-99).
df_neutral_1 = tweets_request(neutral_1)

100%|██████████| 100/100 [01:01<00:00,  1.62it/s]


In [62]:
# Fetch the tweets for the second chunk of neutral ID batches (batches 100-199).
df_neutral_2 = tweets_request(neutral_2)

100%|██████████| 100/100 [01:00<00:00,  1.66it/s]


In [63]:
# Fetch the tweets for the third chunk of neutral ID batches (batches 200-299).
df_neutral_3 = tweets_request(neutral_3)

100%|██████████| 100/100 [01:00<00:00,  1.65it/s]


In [64]:
# Fetch the tweets for the fourth chunk of neutral ID batches (batches 300-399).
df_neutral_4 = tweets_request(neutral_4)

100%|██████████| 100/100 [01:02<00:00,  1.59it/s]


In [65]:
# Fetch the tweets for the last chunk of neutral ID batches (batch 400 onwards).
df_neutral_5 = tweets_request(neutral_5)

100%|██████████| 129/129 [01:25<00:00,  1.51it/s]


In [70]:
# Stack the five fetched neutral chunks row-wise and renumber the rows 0..n-1.
# This rebinds df_neutral, which previously held the ID/label rows.
df_neutral = pd.concat(
    [df_neutral_1, df_neutral_2, df_neutral_3, df_neutral_4, df_neutral_5],
    axis=0,
).reset_index(drop=True)
df_neutral.head(10)

Unnamed: 0,created_at,text,entities,public_metrics,id,geo,withheld
0,2017-04-06T15:41:12.000Z,RT @MailOnline: The Nazi death gas so horrific...,"{'mentions': [{'start': 3, 'end': 14, 'usernam...","{'retweet_count': 17, 'reply_count': 0, 'like_...",850010509969465344,,
1,2017-04-07T13:50:17.000Z,Carlos Correa had gyalchester as his walkup mu...,"{'annotations': [{'start': 0, 'end': 12, 'prob...","{'retweet_count': 2, 'reply_count': 0, 'like_c...",850344984742174720,,
2,2017-04-08T10:43:39.000Z,"""THE FORCE AWAKENS: A Bad Lip Reading"" (Featur...","{'annotations': [{'start': 1, 'end': 36, 'prob...","{'retweet_count': 0, 'reply_count': 0, 'like_c...",850660404770590720,,
3,2017-04-03T15:51:52.000Z,RT @HeeeyMonica: Papaya has to be the worst fr...,"{'mentions': [{'start': 3, 'end': 15, 'usernam...","{'retweet_count': 2, 'reply_count': 0, 'like_c...",848926030723031040,,
4,2017-04-03T19:07:37.000Z,@Pineaqples @DenialEsports btw I watched where...,"{'mentions': [{'start': 0, 'end': 11, 'usernam...","{'retweet_count': 0, 'reply_count': 0, 'like_c...",848975292794318848,,
5,2017-04-07T13:55:59.000Z,"@NikkisBubble Every bird turd is talking ""Chil...","{'mentions': [{'start': 0, 'end': 13, 'usernam...","{'retweet_count': 0, 'reply_count': 0, 'like_c...",850346419164553218,,
6,2017-04-02T00:56:11.000Z,Dick Tracy Meets Gruesome - the 2017 re-boot\n...,"{'annotations': [{'start': 5, 'end': 9, 'proba...","{'retweet_count': 0, 'reply_count': 0, 'like_c...",848338236770582529,,
7,2017-03-31T10:38:47.000Z,"Up at 2am, still sick, dr. Pissing me off, ugh...",,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",847760077108822016,,
8,2017-04-02T16:29:39.000Z,Something is deeply wrong with him! That and t...,"{'urls': [{'start': 83, 'end': 106, 'url': 'ht...","{'retweet_count': 0, 'reply_count': 0, 'like_c...",848573151328043011,,
9,2017-04-03T07:08:10.000Z,"Yang susah itu yang disini (dada;hati), that u...",,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",848794237306937344,,


In [67]:
# Persist the fetched hateful tweets. The context manager guarantees the file
# handle is flushed and closed, which a bare open() inside the call did not.
with open("../pickle/aristotle_hate.pickle", "wb") as fh:
    pickle.dump(df_hate, fh)
# pickle.dump(df_abusive, open("../pickle/aristotle_abusive.pickle", "wb"))
# pickle.dump(df_neutral, open("../pickle/aristotle_neutral.pickle", "wb"))

## University of Copenhagen Dataset

**Reference:**

Waseem, Z., Hovy, D. (2016). Hateful Symbols or Hateful People? Predictive Features for Hate Speech Detection on Twitter \[Data file\]. In: *Proceedings of the NAACL Student Research Workshop*. San Diego, California: Association for Computational Linguistics, pages 88-93. Retrieved from: https://github.com/ZeerakW/hatespeech.

Publication:  https://www.aclweb.org/anthology/N16-2013.pdf

In [46]:
# Load the Waseem & Hovy (NAACL SRW 2016) tweet-ID/label table and give its
# two columns the same short names used for the first dataset.
df2 = pd.read_csv('../data/NAACL_SRW_2016.csv')
df2.columns = ['id', 'label']
df2.head()

Unnamed: 0,id,label
0,572341498827522049,racism
1,572340476503724032,racism
2,572334712804384768,racism
3,572332655397629952,racism
4,575949086055997440,racism


In [47]:
# Count how many tweet IDs carry each label.
df2['label'].value_counts()

none      11559
sexism     3378
racism     1969
Name: label, dtype: int64

In [21]:
# Keep only the rows labelled 'racism' or 'sexism'.
df_racsex = df2[df2['label'].isin(['racism', 'sexism'])]

In [22]:
# Batch the racism/sexism tweet IDs, then fetch the tweets for every batch.
racsex_id = group_list(list(df_racsex['id']))
df_rac_sex = tweets_request(racsex_id)

100%|██████████| 54/54 [00:25<00:00,  2.08it/s]


In [23]:
# Discard the index returned by tweets_request in favour of a fresh 0..n-1
# index, then display the fetched tweets.
df_rac_sex = df_rac_sex.reset_index(drop=True)
df_rac_sex

Unnamed: 0,text,id,geo,created_at,public_metrics,entities
0,Drasko they didn't cook half a bird you idiot ...,572341498827522049,"{'place_id': '017453ae077eafd3', 'coordinates'...",2015-03-02T10:23:41.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...","{'hashtags': [{'start': 46, 'end': 50, 'tag': ..."
1,Hopefully someone cooks Drasko in the next ep ...,572340476503724032,,2015-03-02T10:19:37.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...","{'annotations': [{'start': 24, 'end': 29, 'pro..."
2,of course you were born in serbia...you're as ...,572334712804384768,,2015-03-02T09:56:43.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...","{'annotations': [{'start': 27, 'end': 32, 'pro..."
3,These girls are the equivalent of the irritati...,572332655397629952,,2015-03-02T09:48:33.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...","{'hashtags': [{'start': 95, 'end': 99, 'tag': ..."
4,RT @YesYoureRacist: At least you're only a tin...,446460991396917248,,2014-03-20T01:39:29.000Z,"{'retweet_count': 46, 'reply_count': 0, 'like_...","{'mentions': [{'start': 3, 'end': 18, 'usernam..."
...,...,...,...,...,...,...
2721,"RT @Superjutah: @MT8_9 £34,000 is £9000 above ...",569990621315919872,,2015-02-23T22:42:08.000Z,"{'retweet_count': 2, 'reply_count': 0, 'like_c...","{'urls': [{'start': 94, 'end': 116, 'url': 'ht..."
2722,RT @DufresneFrank: “@gabystama: @DufresneFrank...,570244879265206272,,2015-02-24T15:32:28.000Z,"{'retweet_count': 2, 'reply_count': 0, 'like_c...","{'mentions': [{'start': 3, 'end': 17, 'usernam..."
2723,RT @BoycottBrandy: @ActionFlickDoc this person...,571013698111860736,,2015-02-26T18:27:29.000Z,"{'retweet_count': 3, 'reply_count': 0, 'like_c...","{'mentions': [{'start': 3, 'end': 17, 'usernam..."
2724,RT @Lyall: Patriarchy™ will expel me if I divu...,571304517930774528,,2015-02-27T13:43:06.000Z,"{'retweet_count': 2, 'reply_count': 0, 'like_c...","{'mentions': [{'start': 3, 'end': 9, 'username..."


In [24]:
# Persist the fetched racism/sexism tweets. The context manager guarantees
# the file handle is flushed and closed after the dump.
with open("../pickle/copenhagen_2.pickle", "wb") as fh:
    pickle.dump(df_rac_sex, fh)

## Georgia Tech Dataset

In [27]:
# Load the Georgia Tech table and give its six columns short names.
# NOTE(review): 'hate', 'chate' and 'neutral' look like per-class scores in
# [0, 1] judging by the displayed values; confirm against the dataset docs.
df3 = pd.read_csv("../data/hate.csv")
df3.columns = ['id', 'user', 'hate', 'chate', 'neutral', 'label']

In [28]:
# Order the rows from the highest to the lowest 'hate' value and overwrite the
# label column with the constant 1 for every row.
df_anti_asian = df3.sort_values(by='hate', ascending=False)
df_anti_asian['label'] = 1
df_anti_asian

Unnamed: 0,id,user,hate,chate,neutral,label
560314,1240976182494908416,884055068265164801,0.999974,0.000520,0.000382,1
515476,1238239331879464960,1136405401770106880,0.999966,0.000625,0.000105,1
396712,1247813175996792834,1234197517006450688,0.999959,0.003865,0.000018,1
424027,1245317927823233024,1232772980763054081,0.999958,0.001476,0.000318,1
380359,1241373862756888577,907414928,0.999952,0.018423,0.000788,1
...,...,...,...,...,...,...
596002,1236335083738537985,4384833496,0.500002,0.004678,0.297274,1
334098,1250769238274572291,1199735925355556864,0.500002,0.087211,0.045595,1
889914,1232245722831052801,1053001766806200322,0.500001,0.004162,0.253713,1
198091,1249377836646924291,223299902,0.500001,0.003931,0.210026,1


In [29]:
# Batch the tweet IDs of the sorted frame with the project helper group_list,
# then report how many batches were produced.
anti_asian_ids = group_list(list(df_anti_asian['id']))
len(anti_asian_ids)

8913

In [30]:
# Take the first 300 ID batches and split them into three chunks of 100 so
# that each chunk is fetched in its own cell.
# anti_asian_ids was built from the frame sorted by 'hate' descending, so these
# are presumably the highest-scoring tweets (assuming group_list keeps order).
# df_anti_asian is already sorted and labelled in the earlier cell; the
# copy-pasted re-sort that used to sit here had no effect and was dropped.
asian_100 = anti_asian_ids[0:100]
asian_200 = anti_asian_ids[100:200]
asian_300 = anti_asian_ids[200:300]

In [31]:
# Fetch the tweets for the first chunk of ID batches (batches 0-99).
df_1 = tweets_request(asian_100)

100%|██████████| 100/100 [01:08<00:00,  1.46it/s]


In [32]:
# Fetch the tweets for the second chunk of ID batches (batches 100-199).
df_2 = tweets_request(asian_200)

100%|██████████| 100/100 [01:04<00:00,  1.54it/s]


In [33]:
# Fetch the tweets for the third chunk of ID batches (batches 200-299).
df_3 = tweets_request(asian_300)

100%|██████████| 100/100 [01:04<00:00,  1.55it/s]


In [48]:
# Stack the three fetched chunks row-wise and renumber the rows 0..n-1.
df_asian = pd.concat(
    [df_1, df_2, df_3],
    axis=0,
).reset_index(drop=True)
df_asian

Unnamed: 0,public_metrics,text,entities,created_at,id,geo
0,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",Not the people of China but the Chinese Commun...,"{'hashtags': [{'start': 56, 'end': 65, 'tag': ...",2020-04-04T15:40:23.000Z,1246462627158491136,
1,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",182 countries are currently suffering the scou...,"{'hashtags': [{'start': 82, 'end': 88, 'tag': ...",2020-04-05T23:59:06.000Z,1246950520914313216,
2,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",OH SHIT OH SHIT OH SHIT OH SHIT OH SHIT OH SHI...,"{'urls': [{'start': 275, 'end': 298, 'url': 'h...",2020-03-17T18:10:23.000Z,1239977396473860096,{'place_id': '006b2a835a30b702'}
3,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",@RealJamesWoods Either #WuhanVirus or #Chinese...,"{'hashtags': [{'start': 23, 'end': 34, 'tag': ...",2020-03-18T18:50:54.000Z,1240349980738506752,
4,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",@NYGovCuomo #China lied about the #WuhanVirus ...,"{'hashtags': [{'start': 12, 'end': 18, 'tag': ...",2020-04-05T11:56:23.000Z,1246768643913814019,
...,...,...,...,...,...,...
16764,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",@Oceanle15689731 corona is from China because ...,"{'annotations': [{'start': 32, 'end': 36, 'pro...",2020-04-12T15:41:47.000Z,1249362085248524289,
16765,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",President Donald Trump and Congress should tel...,"{'annotations': [{'start': 10, 'end': 21, 'pro...",2020-03-18T21:30:38.000Z,1240390179048824833,
16766,"{'retweet_count': 0, 'reply_count': 2, 'like_c...",@almostzdq @cgtnamerica @CGTNOfficial Hard to ...,"{'mentions': [{'start': 0, 'end': 10, 'usernam...",2020-04-04T00:56:49.000Z,1246240273455296512,
16767,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",@MarshaBlackburn @RepJimBanks @teammoulton #Ch...,"{'annotations': [{'start': 88, 'end': 92, 'pro...",2020-03-25T13:46:10.000Z,1242810006715527168,


In [49]:
# Persist the fetched tweets. The context manager guarantees the file handle
# is flushed and closed after the dump.
with open("../pickle/georgia_2.pickle", "wb") as fh:
    pickle.dump(df_asian, fh)

## HASOC 2019 and 2020 Dataset

In [11]:
# Load the English HASOC spreadsheet using the openpyxl engine.
# NOTE(review): the saved output of this cell is a TypeError mentioning
# '_NoValueType', so the load failed in the recorded run; this may be a
# library-version incompatibility -- confirm and re-run before relying on `hasoc`.
hasoc = pd.read_excel("../data/original/hasoc2019_en.xlsx", engine='openpyxl')
hasoc

TypeError: int() argument must be a string, a bytes-like object or a number, not '_NoValueType'

In [12]:
# Keep only the rows whose task1 value is 'NOT'.
neutral = hasoc.loc[hasoc['task1'].eq('NOT')]
neutral

NameError: name 'hasoc' is not defined

In [4]:
# Batch the tweet IDs of the task1 == 'NOT' rows, then fetch the tweets for
# every batch.
hasoc_neutral_id = group_list(list(neutral['tweet_id']))
df_hasoc_neutral = tweets_request(hasoc_neutral_id)

NameError: name 'group_list' is not defined

In [6]:
# Keep only the rows whose task2 value is 'HATE'.
hate = hasoc.loc[hasoc['task2'].eq('HATE')]
hate

Unnamed: 0,tweet_id,text,task1,task2,ID
43,1130312562124267520,@Boogie2988 I’m assuming you are talking about...,HOF,HATE,hasoc_2020_en_3785
58,1126848394184482816,RT @realDonaldTrump: James Comey is a disgrace...,HOF,HATE,hasoc_2020_en_422
59,1123605475789033472,RT @DatNiggaBooty: Put yo face in his ass 🍑😝💦 ...,HOF,HATE,hasoc_2020_en_2563
71,1126818526558347264,RT @sohmer: @realDonaldTrump The Importer pays...,HOF,HATE,hasoc_2020_en_4070
74,1126861128091480064,RT @readkropotkin: We hate nazis right?? We ex...,HOF,HATE,hasoc_2020_en_3341
...,...,...,...,...,...
3494,1126977188660752384,RT @DVATW: British Muslims.\n*More than half o...,HOF,HATE,hasoc_2020_en_2421
3531,1126814646806011905,RT @JoeyNoCollusion: You had a coup against th...,HOF,HATE,hasoc_2020_en_3786
3537,1126819247982817281,@GOPChairwoman @realDonaldTrump Cause they nev...,HOF,HATE,hasoc_2020_en_4936
3622,1127069249477009408,RT @evewhite5500: Father of Colorado school sh...,HOF,HATE,hasoc_2020_en_1838


In [11]:
# Batch the tweet IDs of the task2 == 'HATE' rows, then fetch the tweets for
# every batch.
hasoc_hate_id = group_list(list(hate['tweet_id']))
df_hasoc_hate = tweets_request(hasoc_hate_id)

100%|██████████| 2/2 [00:01<00:00,  1.68it/s]


In [8]:
# Keep only the rows whose task2 value is 'OFFN'.
offensive = hasoc.loc[hasoc['task2'].eq('OFFN')]
offensive

Unnamed: 0,tweet_id,text,task1,task2,ID
10,1123800523499552768,Nobody:\nAngie: U getting dick ? Cus ur ass lo...,HOF,OFFN,hasoc_2020_en_2451
12,1123589327718686720,RT @tikkkii: bitches in real life be bird as f...,HOF,OFFN,hasoc_2020_en_1111
20,1130320959146057728,RT @melissafumeros: i mean honestly i’ve seen ...,HOF,OFFN,hasoc_2020_en_884
42,1126769344149483520,RT @blktoppa: Look at that ass!! https://t.co/...,HOF,OFFN,hasoc_2020_en_1929
50,1130236603291643905,Shame on you,HOF,OFFN,hasoc_2020_en_2855
...,...,...,...,...,...
3665,1123643157420417026,@funder Keep smiling @LindseyGrahamSC you trum...,HOF,OFFN,hasoc_2020_en_496
3676,1123554686949310465,👫☝ https://t.co/68LexxZh3z little cute girl fu...,HOF,OFFN,hasoc_2020_en_1336
3681,1126913259087339520,RT @DrGPradhan: Island booked for PM\n\nCan @R...,HOF,OFFN,hasoc_2020_en_878
3683,1130197080356655105,do it better bitch!,HOF,OFFN,hasoc_2020_en_5256


In [12]:
# Batch the tweet IDs of the task2 == 'OFFN' rows, then fetch the tweets for
# every batch.
hasoc_offensive_id = group_list(list(offensive['tweet_id']))
df_hasoc_offensive = tweets_request(hasoc_offensive_id)

100%|██████████| 4/4 [00:02<00:00,  1.77it/s]


In [13]:
# Persist the three fetched HASOC frames. Each file is opened in a context
# manager so its handle is flushed and closed before the next one is written.
with open("../pickle/hasoc_hate.pickle", "wb") as fh:
    pickle.dump(df_hasoc_hate, fh)
with open("../pickle/hasoc_offensive.pickle", "wb") as fh:
    pickle.dump(df_hasoc_offensive, fh)
with open("../pickle/hasoc_neutral.pickle", "wb") as fh:
    pickle.dump(df_hasoc_neutral, fh)

## ACL 2017 Dataset

In [10]:
# Load the tab-separated list of hostile-sexism tweet IDs.
# NOTE(review): no header argument is passed, so the first line of the file is
# consumed as the column name; if the file has no header row, that first ID is
# lost when the column is renamed later -- confirm against the file.
sexist = pd.read_csv("../data/original/hostile_sexist.tsv", delimiter="\t")

In [12]:
# Name the single column 'tweet_id' and preview the first rows.
sexist.columns = ['tweet_id']
sexist.head()

Unnamed: 0,tweet_id
0,572348198062170112
1,572348106202750976
2,572319306387599360
3,572347842456522752
4,572347584339046402


In [None]:
# Batch the ACL 2017 hostile-sexism tweet IDs with the project helper group_list
# and report how many batches were produced.
# This cell used to re-batch df_anti_asian from the Georgia Tech section, a
# leftover copy of an earlier cell that ignored the `sexist` frame loaded in
# this section.
sexist_ids = group_list(list(sexist.tweet_id))
len(sexist_ids)