# Exploring Twitter API

## Importing Packages

In [1]:
import pandas as pd
import json
import re
import requests
import pickle
%reload_ext autoreload
%autoreload 2
from utils import *

## Aristotle University Dataset

Citation:

Founta, A., Djouvas, C., Chatzakou, D., Leontiadis, I., Blackburn, J., Stringhini, G., Vakali, A., Sirivianos, M. and Kourtellis, N., 2018. Large Scale Crowdsourcing and Characterization of Twitter Abusive Behavior \[Data file\]. ArXiv. Retrieved from: https://dataverse.mpi-sws.org/dataset.xhtml?persistentId=doi:10.5072/FK2/ZDTEMN

Publication: https://arxiv.org/pdf/1802.00393.pdf

In [2]:
# Load the tweet-ID/label table for the dataset cited above.
# The path is relative to the notebook's working directory.
df = pd.read_csv('../data/hatespeechtwitter.csv')
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,tweet_id,maj_label
0,849667487180259329,abusive
1,850490912954351616,abusive
2,848791766853668864,abusive
3,848306464892604416,abusive
4,850010509969465344,normal


In [3]:
# Print dtypes and non-null counts per column (shows whether any labels are missing).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   80000 non-null  int64 
 1   maj_label  79996 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.2+ MB


In [4]:
# Rename the two columns in place to the short ('id', 'label') names used by the rest of the notebook.
df.columns = ['id', 'label']
# Number of tweets per label.
df['label'].value_counts()

normal     52835
spam       13404
abusive    10122
hateful     3635
Name: label, dtype: int64

In [5]:
# Keep only the rows labelled 'hateful'; rows with a missing label compare as False and are dropped.
df_hateful = df.loc[df['label'].eq('hateful')]
df_hateful

Unnamed: 0,id,label
5,850433664890544128,hateful
11,850449456445235200,hateful
12,850100742035836929,hateful
13,847945888995708928,hateful
21,847804507367100416,hateful
...,...,...
79853,849246676854484992,hateful
79909,849635522389651456,hateful
79917,850456293181538304,hateful
79922,850308804663181312,hateful


In [6]:
# group_list is a project helper that is not defined in this notebook;
# presumably it chunks the IDs into request-sized groups (TODO confirm).
hate_ids = group_list(df_hateful['id'].tolist())
# Number of groups returned by the helper.
len(hate_ids)

37

In [8]:
# Fetch the tweets for every ID group via the project helper tweets_request
# (not defined in this notebook; presumably one Twitter API call per group, TODO confirm).
df_hate = tweets_request(hate_ids)

100%|██████████| 37/37 [00:24<00:00,  1.53it/s]


In [9]:
# Replace the existing index with a fresh 0..n-1 index; drop=True discards the old index
# instead of keeping it as a column.
df_hate = df_hate.reset_index(drop=True)
# Display the fetched tweets.
df_hate

Unnamed: 0,created_at,id,text,author_id
0,2017-04-07T19:42:40.000Z,850433664890544128,I hate er chase because if the Bitch that work...,399817418
1,2017-04-06T21:39:45.000Z,850100742035836929,RT @nyctophil3: Pineapples do not belong on pi...,928512230
2,2017-03-31T13:35:20.000Z,847804507367100416,Niggas keep talking about women wearing weave ...,46267097
3,2017-04-06T20:58:20.000Z,850090319165050880,@vappywave idiot that's not gonna work. you go...,14413604
4,2017-04-03T00:00:48.000Z,848686686930382848,RT @ayevonnn: bruh i fucking hate people like ...,3137373930
...,...,...,...,...
1758,2017-04-08T13:25:50.000Z,850701219568001025,Baekhyun won 'Most Popular Artist of Korea'\n\...,465448529
1759,2017-04-04T17:42:11.000Z,849316180670259205,#TripleTalaqArrest 2fgt dis social menace #Tri...,179136092
1760,2017-04-05T17:26:51.000Z,849674709763358720,White nationalists' latest tactic to recruit c...,60473620
1761,2017-04-04T13:06:00.000Z,849246676854484992,We are in a comfortable lead fight against gal...,736124767518986244


## University of Copenhagen Dataset

**Reference:**

Waseem, Z., Hovy, D. (2016). Hateful Symbols or Hateful People? Predictive Features for Hate Speech Detection on Twitter \[Data file\]. In: *Proceedings of the NAACL Student Research Workshop*. San Diego, California: Association for Computational Linguistics, pages 88-93. Retrieved from: https://github.com/ZeerakW/hatespeech.

Publication:  https://www.aclweb.org/anthology/N16-2013.pdf

In [12]:
# Load the tweet-ID/label table for the dataset referenced above.
df2 = pd.read_csv('../data/NAACL_SRW_2016.csv')
# Overwrite the header with the ('id', 'label') names; this assignment requires the
# CSV to have exactly two columns, otherwise pandas raises a length-mismatch error.
df2.columns = ['id', 'label']
df2.head()

Unnamed: 0,id,label
0,572341498827522049,racism
1,572340476503724032,racism
2,572334712804384768,racism
3,572332655397629952,racism
4,575949086055997440,racism


In [13]:
# Number of tweets per label.
df2['label'].value_counts()

none      11559
sexism     3378
racism     1969
Name: label, dtype: int64

In [14]:
# Keep the tweets labelled 'racism' or 'sexism'; every other label is excluded.
df_racsex = df2[df2['label'].isin(['racism', 'sexism'])]

In [15]:
# group_list and tweets_request are project helpers that are not defined in this notebook;
# presumably the first chunks the IDs and the second fetches each chunk (TODO confirm).
racsex_id = group_list(df_racsex['id'].tolist())
df_rac_sex = tweets_request(racsex_id)

100%|██████████| 54/54 [00:26<00:00,  2.01it/s]


In [16]:
# Replace the existing index with a fresh 0..n-1 index; drop=True discards the old index
# instead of keeping it as a column.
df_rac_sex = df_rac_sex.reset_index(drop=True)
# Display the fetched tweets.
df_rac_sex

Unnamed: 0,text,author_id,id,created_at
0,Drasko they didn't cook half a bird you idiot ...,110114783,572341498827522049,2015-03-02T10:23:41.000Z
1,Hopefully someone cooks Drasko in the next ep ...,38650214,572340476503724032,2015-03-02T10:19:37.000Z
2,of course you were born in serbia...you're as ...,2587278392,572334712804384768,2015-03-02T09:56:43.000Z
3,These girls are the equivalent of the irritati...,2601524623,572332655397629952,2015-03-02T09:48:33.000Z
4,RT @YesYoureRacist: At least you're only a tin...,930620467,446460991396917248,2014-03-20T01:39:29.000Z
...,...,...,...,...
2734,"RT @Superjutah: @MT8_9 £34,000 is £9000 above ...",2756873076,569990621315919872,2015-02-23T22:42:08.000Z
2735,RT @DufresneFrank: “@gabystama: @DufresneFrank...,2756873076,570244879265206272,2015-02-24T15:32:28.000Z
2736,RT @BoycottBrandy: @ActionFlickDoc this person...,2756873076,571013698111860736,2015-02-26T18:27:29.000Z
2737,RT @Lyall: Patriarchy™ will expel me if I divu...,2756873076,571304517930774528,2015-02-27T13:43:06.000Z


In [17]:
# Persist the fetched tweets to disk.
# The context manager guarantees the file handle is flushed and closed even if
# pickling raises; the original inline open(...) left the handle open.
# NOTE: the ../pickle directory must already exist, open() does not create it.
with open("../pickle/copenhagen.pickle", "wb") as pickle_file:
    pickle.dump(df_rac_sex, pickle_file)

## Georgia Tech Dataset

In [18]:
# Load the third dataset; the path is relative to the notebook's working directory.
df3 = pd.read_csv("../data/hate.csv")
# Overwrite the header with six short names (the CSV must have exactly six columns).
# NOTE(review): 'hate', 'chate' and 'neutral' look like per-class classifier scores - confirm against the data source.
df3.columns = ['id', 'user', 'hate', 'chate', 'neutral', 'label']

In [19]:
# Order the rows by descending 'hate' value, then overwrite the whole 'label'
# column with the constant 1 (every row gets the same label, whatever its scores).
df_anti_asian = (
    df3
    .sort_values(by='hate', ascending=False)
    .assign(label=1)
)
df_anti_asian

Unnamed: 0,id,user,hate,chate,neutral,label
560314,1240976182494908416,884055068265164801,0.999974,0.000520,0.000382,1
515476,1238239331879464960,1136405401770106880,0.999966,0.000625,0.000105,1
396712,1247813175996792834,1234197517006450688,0.999959,0.003865,0.000018,1
424027,1245317927823233024,1232772980763054081,0.999958,0.001476,0.000318,1
380359,1241373862756888577,907414928,0.999952,0.018423,0.000788,1
...,...,...,...,...,...,...
596002,1236335083738537985,4384833496,0.500002,0.004678,0.297274,1
334098,1250769238274572291,1199735925355556864,0.500002,0.087211,0.045595,1
889914,1232245722831052801,1053001766806200322,0.500001,0.004162,0.253713,1
198091,1249377836646924291,223299902,0.500001,0.003931,0.210026,1


In [20]:
# group_list is a project helper that is not defined in this notebook;
# presumably it chunks the IDs into request-sized groups (TODO confirm).
anti_asian_ids = group_list(df_anti_asian['id'].tolist())
# Number of groups returned by the helper.
len(anti_asian_ids)

8913

In [21]:
# Take the first three runs of 100 ID groups each (groups 0-99, 100-199, 200-299).
# Only these three slices are used by the cells shown here.
asian_100, asian_200, asian_300 = (
    anti_asian_ids[start:start + 100] for start in (0, 100, 200)
)

In [22]:
# Fetch the tweets for ID groups 0-99 via the project helper tweets_request
# (not defined in this notebook; presumably one Twitter API call per group, TODO confirm).
df_1 = tweets_request(asian_100)

100%|██████████| 100/100 [01:10<00:00,  1.43it/s]


In [23]:
# Fetch the tweets for ID groups 100-199 via the project helper tweets_request
# (not defined in this notebook; presumably one Twitter API call per group, TODO confirm).
df_2 = tweets_request(asian_200)

100%|██████████| 100/100 [01:08<00:00,  1.47it/s]


In [24]:
# Fetch the tweets for ID groups 200-299 via the project helper tweets_request
# (not defined in this notebook; presumably one Twitter API call per group, TODO confirm).
df_3 = tweets_request(asian_300)

100%|██████████| 100/100 [01:06<00:00,  1.51it/s]


In [27]:
# Stack the three fetched frames row-wise, then renumber the rows 0..n-1 so the
# per-frame indexes do not repeat in the combined result.
df_asian = (
    pd.concat([df_1, df_2, df_3], axis=0)
    .reset_index(drop=True)
)
df_asian

Unnamed: 0,author_id,id,text,created_at
0,884055068265164801,1240976182494908416,Fuck the ding dongs. Fuck the ching chongs. An...,2020-03-20T12:19:12.000Z
1,1136405401770106880,1238239331879464960,Fuck china fuck China fuck China\nFuck China f...,2020-03-12T23:03:56.000Z
2,907414928,1241373862756888577,"I do n’t believe this is eating bats, which is...",2020-03-21T14:39:27.000Z
3,1134698470760017920,1238588936391290880,"@Peoples_Pundit CCP Wuhan Virus, more Chinese ...",2020-03-13T22:13:08.000Z
4,35525602,1240774162760617984,Dr. Sean Lin said call it what it is - #ChinaV...,2020-03-19T22:56:27.000Z
...,...,...,...,...
17370,3193940959,1249362085248524289,@Oceanle15689731 corona is from China because ...,2020-04-12T15:41:47.000Z
17371,437970401,1240390179048824833,President Donald Trump and Congress should tel...,2020-03-18T21:30:38.000Z
17372,409760405,1246240273455296512,@almostzdq @cgtnamerica @CGTNOfficial Hard to ...,2020-04-04T00:56:49.000Z
17373,1279408782,1242810006715527168,@MarshaBlackburn @RepJimBanks @teammoulton #Ch...,2020-03-25T13:46:10.000Z


In [28]:
# Persist the combined tweets to disk.
# The context manager guarantees the file handle is flushed and closed even if
# pickling raises; the original inline open(...) left the handle open.
# NOTE: the ../pickle directory must already exist, open() does not create it.
with open("../pickle/georgia.pickle", "wb") as pickle_file:
    pickle.dump(df_asian, pickle_file)