# Exploring Twitter API

## Importing Packages

In [4]:
import pandas as pd
import json
import re
import requests
import pickle
%reload_ext autoreload
%autoreload 2
from utils import *

## Aristotle University Dataset

Citation:

Founta, A., Djouvas, C., Chatzakou, D., Leontiadis, I., Blackburn, J., Stringhini, G., Vakali, A., Sirivianos, M. and Kourtellis, N., 2018. Large Scale Crowdsourcing and Characterization of Twitter Abusive Behavior \[Data file\]. ArXiv. Retrieved from: https://dataverse.mpi-sws.org/dataset.xhtml?persistentId=doi:10.5072/FK2/ZDTEMN

Publication: https://arxiv.org/pdf/1802.00393.pdf

In [5]:
# Load the tweet-id / majority-label table of the dataset cited above
# and preview its first rows.
df = pd.read_csv('../data/hatespeechtwitter.csv')
df.head()

Unnamed: 0,tweet_id,maj_label
0,849667487180259329,abusive
1,850490912954351616,abusive
2,848791766853668864,abusive
3,848306464892604416,abusive
4,850010509969465344,normal


In [6]:
# Inspect dtypes and non-null counts; in the recorded run `maj_label` has
# 79996 non-null values out of 80000 rows, i.e. 4 tweets carry no label.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   80000 non-null  int64 
 1   maj_label  79996 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.2+ MB


In [7]:
# Give the two columns short names (assigned by position), then count how
# many tweets fall into each label. `value_counts` leaves out missing labels.
df.columns = ['id', 'label']
df['label'].value_counts()

normal     52835
spam       13404
abusive    10122
hateful     3635
Name: label, dtype: int64

In [8]:
# Select only the rows whose label is 'hateful' and display them.
is_hateful = df['label'] == 'hateful'
df_hateful = df.loc[is_hateful]
df_hateful

Unnamed: 0,id,label
5,850433664890544128,hateful
11,850449456445235200,hateful
12,850100742035836929,hateful
13,847945888995708928,hateful
21,847804507367100416,hateful
...,...,...
79853,849246676854484992,hateful
79909,849635522389651456,hateful
79917,850456293181538304,hateful
79922,850308804663181312,hateful


In [9]:
# Split the hateful tweet ids into groups with the project helper `group_list`
# and report how many groups were produced.
# NOTE(review): presumably batches of up to 100 ids each (3635 ids gave 37
# groups in the recorded run) — confirm against the helper's implementation.
hate_ids = group_list(list(df_hateful['id']))
len(hate_ids)

37

In [37]:
# Fetch the tweets for every id group through the project helper
# `tweets_request` (its implementation is not shown in this notebook).
# NOTE(review): presumably one API request per group — the recorded progress
# bar ran 37 iterations for 37 groups; confirm in the helper.
df_hate = tweets_request(hate_ids)

100%|██████████| 37/37 [00:15<00:00,  2.45it/s]


In [38]:
# Replace the index with a clean 0..n-1 range (old index discarded) and
# display the result. In the recorded run only 1765 rows came back for the
# 3635 hateful ids that were requested.
df_hate = df_hate.reset_index(drop=True)
df_hate

Unnamed: 0,author_id,created_at,text,id
0,399817418,2017-04-07T19:42:40.000Z,I hate er chase because if the Bitch that work...,850433664890544128
1,928512230,2017-04-06T21:39:45.000Z,RT @nyctophil3: Pineapples do not belong on pi...,850100742035836929
2,46267097,2017-03-31T13:35:20.000Z,Niggas keep talking about women wearing weave ...,847804507367100416
3,14413604,2017-04-06T20:58:20.000Z,@vappywave idiot that's not gonna work. you go...,850090319165050880
4,3137373930,2017-04-03T00:00:48.000Z,RT @ayevonnn: bruh i fucking hate people like ...,848686686930382848
...,...,...,...,...
1761,465448529,2017-04-08T13:25:50.000Z,Baekhyun won 'Most Popular Artist of Korea'\n\...,850701219568001025
1762,179136092,2017-04-04T17:42:11.000Z,#TripleTalaqArrest 2fgt dis social menace #Tri...,849316180670259205
1763,60473620,2017-04-05T17:26:51.000Z,White nationalists' latest tactic to recruit c...,849674709763358720
1764,736124767518986244,2017-04-04T13:06:00.000Z,We are in a comfortable lead fight against gal...,849246676854484992


In [39]:
# Persist the fetched hateful tweets to disk.
# NOTE(review): with the default `index=True` the freshly reset 0..n-1 index
# is written as an unnamed first column; consider `index=False` once it is
# confirmed that no reader of this file relies on that column.
df_hate.to_csv('../data/df_hateful.csv')

## University of Copenhagen Dataset

**Reference:**

Waseem, Z., Hovy, D. (2016). Hateful Symbols or Hateful People? Predictive Features for Hate Speech Detection on Twitter \[Data file\]. In: *Proceedings of the NAACL Student Research Workshop*. San Diego, California: Association for Computational Linguistics, pages 88-93. Retrieved from: https://github.com/ZeerakW/hatespeech.

Publication:  https://www.aclweb.org/anthology/N16-2013.pdf

In [11]:
# Load the tweet-id / label table of the dataset referenced above, give its
# two columns short names (assigned by position) and preview the first rows.
df2 = pd.read_csv('../data/NAACL_SRW_2016.csv')
df2.columns = ['id', 'label']
df2.head()

Unnamed: 0,id,label
0,572341498827522049,racism
1,572340476503724032,racism
2,572334712804384768,racism
3,572332655397629952,racism
4,575949086055997440,racism


In [48]:
# Count how many tweets fall into each label of the second dataset.
df2['label'].value_counts()

none      11559
sexism     3378
racism     1969
Name: label, dtype: int64

In [51]:
# Keep only the tweets labelled 'racism' or 'sexism'.
df_racsex = df2[df2['label'].isin(['racism', 'sexism'])]

In [55]:
# Group the racism/sexism tweet ids with the project helper `group_list`,
# then fetch the tweets for all groups with `tweets_request`.
# NOTE(review): both helpers are defined outside this notebook; the recorded
# progress bar ran 54 iterations, presumably one request per group — confirm.
racsex_id = group_list(list(df_racsex['id']))
df_rac_sex = tweets_request(racsex_id)

100%|██████████| 54/54 [00:22<00:00,  2.42it/s]


In [62]:
# Replace the index with a clean 0..n-1 range (old index discarded) and
# display the result. In the recorded run 2740 rows came back for the
# 5347 racism/sexism ids (1969 + 3378) that were selected.
df_rac_sex = df_rac_sex.reset_index(drop=True)
df_rac_sex

Unnamed: 0,id,author_id,created_at,text
0,572341498827522049,110114783,2015-03-02T10:23:41.000Z,Drasko they didn't cook half a bird you idiot ...
1,572340476503724032,38650214,2015-03-02T10:19:37.000Z,Hopefully someone cooks Drasko in the next ep ...
2,572334712804384768,2587278392,2015-03-02T09:56:43.000Z,of course you were born in serbia...you're as ...
3,572332655397629952,2601524623,2015-03-02T09:48:33.000Z,These girls are the equivalent of the irritati...
4,446460991396917248,930620467,2014-03-20T01:39:29.000Z,RT @YesYoureRacist: At least you're only a tin...
...,...,...,...,...
2736,569990621315919872,2756873076,2015-02-23T22:42:08.000Z,"RT @Superjutah: @MT8_9 £34,000 is £9000 above ..."
2737,570244879265206272,2756873076,2015-02-24T15:32:28.000Z,RT @DufresneFrank: “@gabystama: @DufresneFrank...
2738,571013698111860736,2756873076,2015-02-26T18:27:29.000Z,RT @BoycottBrandy: @ActionFlickDoc this person...
2739,571304517930774528,2756873076,2015-02-27T13:43:06.000Z,RT @Lyall: Patriarchy™ will expel me if I divu...


In [63]:
# Persist the fetched racism/sexism tweets to disk.
# NOTE(review): with the default `index=True` the freshly reset 0..n-1 index
# is written as an unnamed first column; consider `index=False` once it is
# confirmed that no reader of this file relies on that column.
df_rac_sex.to_csv('../data/df_ras_sex_hate.csv')

In [16]:
# Load a third dataset and give its six columns short names (assigned by
# position, so the file must contain exactly six columns in this order).
# NOTE(review): the provenance of hate.csv is not recorded in this notebook —
# add a citation like the ones given for the two datasets above.
# NOTE(review): `hate`, `chate` and `neutral` hold values between 0 and 1 in
# the rows displayed further down; presumably classifier scores — confirm.
df3 = pd.read_csv("../data/hate.csv")
df3.columns = ['id', 'user', 'hate', 'chate', 'neutral', 'label']

In [19]:
# Order the tweets from highest to lowest `hate` value and overwrite `label`
# with 1 for every row, then display the result.
# `assign` sets the column explicitly and returns a new frame. This avoids
# attribute-style assignment (`frame.label = 1`), which only works when the
# column already exists; otherwise pandas merely attaches a Python attribute
# (with a warning) instead of creating a column.
df_anti_asian = (
    df3
    .sort_values(by=['hate'], ascending=False)
    .assign(label=1)
)
df_anti_asian

Unnamed: 0,id,user,hate,chate,neutral,label
560314,1240976182494908416,884055068265164801,0.999974,0.000520,0.000382,1
515476,1238239331879464960,1136405401770106880,0.999966,0.000625,0.000105,1
396712,1247813175996792834,1234197517006450688,0.999959,0.003865,0.000018,1
424027,1245317927823233024,1232772980763054081,0.999958,0.001476,0.000318,1
380359,1241373862756888577,907414928,0.999952,0.018423,0.000788,1
...,...,...,...,...,...,...
596002,1236335083738537985,4384833496,0.500002,0.004678,0.297274,1
334098,1250769238274572291,1199735925355556864,0.500002,0.087211,0.045595,1
889914,1232245722831052801,1053001766806200322,0.500001,0.004162,0.253713,1
198091,1249377836646924291,223299902,0.500001,0.003931,0.210026,1


In [20]:
# Group the sorted tweet ids with the project helper `group_list` and report
# how many groups were produced (8913 in the recorded run).
anti_asian_ids = group_list(list(df_anti_asian['id']))
len(anti_asian_ids)

8913

In [21]:
# Carve the first 300 id groups into three consecutive slices of 100 groups
# each: groups 0-99, 100-199 and 200-299.
asian_100, asian_200, asian_300 = (
    anti_asian_ids[start:start + 100] for start in (0, 100, 200)
)

In [24]:
# Request the tweets for the first 100 id groups of the third dataset.
# NOTE(review): the recorded run of this cell aborted on the first iteration
# with "AttributeError: module 'config' has no attribute 'bearer_token'", so
# `df_asian_1` was never created. A `config` module exposing `bearer_token`
# appears to be required by the helper — confirm and restore it before
# re-running.
df_asian_1 = tweets_request(asian_100)

  0%|          | 0/100 [00:00<?, ?it/s]


AttributeError: module 'config' has no attribute 'bearer_token'