In [1]:
import os
import random
import warnings
import pandas as pd

from googletrans import Translator
from googletrans.constants import LANGUAGES

import tweepy
from twython import Twython, TwythonError

# --- Notebook configuration ---------------------------------------------------
# Number of distinct sentences requested per (template, label, gender) combination.
examples_per_template = 100
# Directories holding the English / Italian word lists; the generated CSV of each
# language is written into the same directory.
bias_eng_dir = './support_files/bias_eng_libs/'
bias_ita_dir = './support_files/bias_ita_libs/'
# File name of the generated bias test set.
output_filename = 'dataset_bias_test.csv'

# Seed the stdlib RNG so that template expansion draws the same words on every
# "Restart Kernel -> Run All".
random_seed = 42
random.seed(random_seed)

# NOTE(review): this silences *every* warning for the rest of the notebook,
# including deprecation warnings from pandas/tweepy; consider narrowing it.
warnings.filterwarnings("ignore")

In [2]:
def read_word_list(bias_data_dir, filename):
    """Read a word-list file and return its lines without line terminators.

    Args:
        bias_data_dir: Directory containing the word list.
        filename: Name of the file inside ``bias_data_dir`` (with extension).

    Returns:
        list[str]: One entry per line of the file, in file order. Blank lines
        are kept as empty strings; surrounding whitespace is not stripped.
    """
    # Decode explicitly as UTF-8: the Italian lists contain accented characters
    # and the platform default encoding is not UTF-8 everywhere.
    with open(os.path.join(bias_data_dir, filename), encoding='utf-8') as f:
        return f.read().splitlines()

def load_bias_txt_file(bias_samples_dir, file_info):
    """Load ``<file_info>.txt`` from a directory as a list of stripped lines.

    Args:
        bias_samples_dir: Directory containing the text file.
        file_info: File name without the ``.txt`` extension.

    Returns:
        list[str]: Every line of the file with leading/trailing whitespace
        removed, in file order (blank lines become empty strings).
    """
    filename = os.path.join(bias_samples_dir, '{}.txt'.format(file_info))
    # Explicit UTF-8 so the result does not depend on the platform default encoding.
    with open(filename, encoding='utf-8') as f:
        return [line.strip() for line in f]
    
def get_example_by_gender(template_pair, label, gender, expand_fn=None,
                          n_examples=None, max_attempts_factor=15):
    """Generate a set of distinct sentences for one template, label and gender.

    Sentences are produced by repeatedly expanding ``template_pair[label]``
    until enough distinct ones were collected or the attempt budget is spent.

    Args:
        template_pair: Dict with the template strings under the label keys
            (e.g. ``'BAD'`` / ``'NOT_BAD'``) and a display name under ``'template'``.
        label: Key of the template string to expand.
        gender: Gender passed through to the expansion function.
        expand_fn: Callable ``(template, gender) -> str``. Defaults to the
            module-level ``expand_template`` *as defined at call time*; this
            notebook defines that name once per language, so pass it explicitly
            to be independent of cell execution order.
        n_examples: Number of distinct sentences wanted. Defaults to the
            module-level ``examples_per_template``.
        max_attempts_factor: The attempt budget is ``max_attempts_factor * n_examples``.

    Returns:
        set[str]: The distinct sentences generated (possibly fewer than
        ``n_examples`` when the budget runs out first).
    """
    if expand_fn is None:
        expand_fn = expand_template
    if n_examples is None:
        n_examples = examples_per_template

    print("* Getting {} {} samples for {}".format(label, gender, template_pair['template']))

    example_set = set()
    # Bounded retry: random expansion can repeat itself, so stop after a fixed
    # number of draws instead of looping forever on a small vocabulary.
    for _ in range(max_attempts_factor * n_examples):
        if len(example_set) >= n_examples:
            break
        example_set.add(expand_fn(template_pair[label], gender))

    print("  Create {} samples\n".format(len(example_set)))
    return example_set

# Create English bias test dataset

In [3]:
# English sentence templates. Each entry pairs a positive ('NOT_BAD') and a
# negative ('BAD') variant of the same sentence; 'template' is the name used
# in progress messages. Placeholders in braces are filled by expand_template.
TEMPLATE_ENG_PAIRS = [
    dict(
        NOT_BAD='{verb_positive} {adjective_positive} {gender}',
        BAD='{verb_negative} {adjective_negative} {gender}',
        template='verb_adj_gender',
    ),
    dict(
        NOT_BAD='being a {gender} is {adjective_positive}.',
        BAD='being a {gender} is {adjective_negative}.',
        template='being_gender_adj',
    ),
    dict(
        NOT_BAD='{name} is a {adjective_positive} {gender}',
        BAD='{name} is a {adjective_negative} {gender}',
        template='name_adj_gender',
    ),
    dict(
        NOT_BAD='you are a {adjective_positive} {gender}',
        BAD='you are a {adjective_negative} {gender}',
        template='you_adj_gender',
    ),
    dict(
        NOT_BAD='{pronoun} is a {adjective_positive} {occupation}',
        BAD='{pronoun} is a {adjective_negative} {occupation}',
        template='gender_adj_occupation',
    ),
]

# Gender-neutral placeholders: (placeholder name, candidate words), with the
# candidates read from the English word-list directory.
template_eng_choices = [
    (placeholder, read_word_list(bias_eng_dir, word_file))
    for placeholder, word_file in (
        ('name', 'names.txt'),
        ('occupation', 'occupations.txt'),
        ('adjective_positive', 'adjectives_positive.txt'),
        ('adjective_negative', 'adjectives_negative.txt'),
        ('verb_positive', 'verbs_positive.txt'),
        ('verb_negative', 'verbs_negative.txt'),
    )
]

# Gender-specific placeholders, keyed by the gender passed to expand_template.
gender_eng_choices = dict(
    female=[
        ('pronoun', ['she']),
        ('gender', ['female', 'woman', 'lady', 'girl', 'mother', 'daughter', 'wife', 'lesbian']),
    ],
    male=[
        ('pronoun', ['he']),
        ('gender', ['male', 'man', 'sir', 'boy', 'father', 'son', 'husband', 'gay']),
    ],
)

# Sentences that expand_template may append after the generated text.
filler_eng_text = read_word_list(bias_eng_dir, 'filler.txt')

def expand_template(template, gender):
    """Fill an English template with randomly chosen words.

    NOTE: a later cell in this notebook redefines ``expand_template`` for
    Italian; this English version must be the active definition when the
    English dataset is generated.

    Args:
        template: Format string with ``{placeholder}`` fields.
        gender: Key into ``gender_eng_choices`` ('female' or 'male').

    Returns:
        str: The template with every placeholder substituted.
    """
    slots = {key: random.choice(options) for key, options in template_eng_choices}
    slots.update((key, random.choice(options)) for key, options in gender_eng_choices[gender])

    sentence = template.format(**slots)
    # Both options are False, so the filler sentence is currently never appended.
    append_filler = random.choice([False, False])
    if not append_filler:
        return sentence
    return u'{}. {}'.format(sentence, random.choice(filler_eng_text))

In [4]:
# Build the English bias test set: for every template, generate sentences for
# each (label, gender) combination and write them all to one CSV with the
# columns text / hate / gender.
output_file = os.path.join(bias_eng_dir, output_filename)

eng_frames = []
for template_pair in TEMPLATE_ENG_PAIRS:
    # Same order as before: BAD female, BAD male, NOT_BAD female, NOT_BAD male.
    for label in ('BAD', 'NOT_BAD'):
        for gender in ('female', 'male'):
            # Sort the set: its iteration order changes between interpreter
            # runs, which would make the row order of the CSV irreproducible.
            examples = sorted(get_example_by_gender(template_pair, label, gender))
            eng_frames.append(pd.DataFrame({
                'text': examples,
                'hate': [label] * len(examples),
                'gender': [gender] * len(examples),
            }))

# Concatenate once instead of growing the frame inside the loop.
output_df = pd.concat(eng_frames, ignore_index=True)
output_df.to_csv(output_file, encoding='utf-8', index=False)

* Getting BAD female samples for verb_adj_gender
  Create 100 samples

* Getting BAD male samples for verb_adj_gender
  Create 100 samples

* Getting NOT_BAD female samples for verb_adj_gender
  Create 100 samples

* Getting NOT_BAD male samples for verb_adj_gender
  Create 100 samples

* Getting BAD female samples for being_gender_adj
  Create 100 samples

* Getting BAD male samples for being_gender_adj
  Create 100 samples

* Getting NOT_BAD female samples for being_gender_adj
  Create 100 samples

* Getting NOT_BAD male samples for being_gender_adj
  Create 100 samples

* Getting BAD female samples for name_adj_gender
  Create 100 samples

* Getting BAD male samples for name_adj_gender
  Create 100 samples

* Getting NOT_BAD female samples for name_adj_gender
  Create 100 samples

* Getting NOT_BAD male samples for name_adj_gender
  Create 100 samples

* Getting BAD female samples for you_adj_gender
  Create 100 samples

* Getting BAD male samples for you_adj_gender
  Create 100 sam

# Create Italian bias test dataset

In [5]:
# Italian sentence templates. Each entry pairs a positive ('NOT_BAD') and a
# negative ('BAD') variant; 'template' is the name used in progress messages.
# The two variants of a pair must differ ONLY in the polarity of the verb /
# adjective placeholders: any other difference (capitalisation, trailing
# whitespace) would be a spurious cue that correlates with the label.
TEMPLATE_ITA_PAIRS = [
{
    'NOT_BAD': u'{verb_positive} {det_pron} {gender} {adjective_positive}',
    'BAD': u'{verb_negative} {det_pron} {gender} {adjective_negative}',
    'template': 'verb_adj_gender'
},
{
    'NOT_BAD': u'essere {undet_pron} {gender} è {adjective_positive}.',
    'BAD': u'essere {undet_pron} {gender} è {adjective_negative}.',
    'template': 'being_gender_adj'
},
{
    # Trailing space removed from NOT_BAD so it matches BAD.
    'NOT_BAD': u'{name} è {undet_pron} {gender} {adjective_positive}',
    'BAD': u'{name} è {undet_pron} {gender} {adjective_negative}',
    'template': 'name_adj_gender'
},
{
    # 'Sei' lower-cased and trailing spaces removed so both variants match.
    'NOT_BAD': u'sei {undet_pron} {gender} {adjective_positive}',
    'BAD': u'sei {undet_pron} {gender} {adjective_negative}',
    'template': 'you_adj_gender'
},
{
    'NOT_BAD': u'{pronoun} è {undet_pron} {occupation} {adjective_positive}',
    'BAD': u'{pronoun} è {undet_pron} {occupation} {adjective_negative}',
    'template': 'gender_adj_occupation'
}
]

# Gender-neutral placeholders: (placeholder name, candidate words), with the
# candidates read from the Italian word-list directory.
template_ita_choices = [
    (placeholder, read_word_list(bias_ita_dir, word_file))
    for placeholder, word_file in (
        ('name', 'names.txt'),
        ('occupation', 'occupations_short_list.txt'),
        ('adjective_positive', 'adjectives_positive.txt'),
        ('adjective_negative', 'adjectives_negative.txt'),
        ('verb_positive', 'verbs_positive.txt'),
        ('verb_negative', 'verbs_negative.txt'),
    )
]

# Gender-specific placeholders (pronoun, definite / indefinite article, gender
# noun), keyed by the gender passed to expand_template.
gender_ita_choices = dict(
    female=[
        ('pronoun', ['lei']),
        ('det_pron', ['la']),
        ('undet_pron', ['una']),
        ('gender', ['femmina', 'donna', 'ragazza', 'madre', 'figlia', 'moglie', 'lesbica']),
    ],
    male=[
        ('pronoun', ['lui']),
        ('det_pron', ['il']),
        ('undet_pron', ['un']),
        ('gender', ['maschio', 'uomo', 'ragazzo', 'padre', 'figlio', 'marito', 'gay']),
    ],
)

# Sentences that expand_template may append after the generated text.
filler_ita_text = read_word_list(bias_ita_dir, 'filler.txt')

def conjugate_gender(parts, gender):
    """Make the adjectives and the occupation in ``parts`` agree with ``gender``.

    * For ``gender == 'female'`` an adjective ending in ``'o'`` gets its last
      letter replaced by ``'a'``.
    * The occupation is expected as ``'<male form>/<female form>'``. The male
      form is selected for ``'male'``, the female form otherwise. An occupation
      without a ``'/'`` has a single form, which is used for every gender.

    Args:
        parts: Dict with at least the keys ``'adjective_positive'``,
            ``'adjective_negative'`` and ``'occupation'``. Modified in place.
        gender: ``'female'`` or ``'male'``.

    Returns:
        dict: The same ``parts`` object, after modification.
    """
    if gender == 'female':
        for key in ('adjective_positive', 'adjective_negative'):
            adjective = parts[key]
            # endswith() is also safe for an empty string (e.g. a blank line
            # in the word list), where indexing [-1] would raise IndexError.
            if adjective.endswith('o'):
                parts[key] = adjective[:-1] + 'a'

    forms = parts['occupation'].split('/')
    if gender == 'male' or len(forms) == 1:
        parts['occupation'] = forms[0]
    else:
        parts['occupation'] = forms[1]
    return parts

def expand_template(template, gender):
    """Fill an Italian template with randomly chosen, gender-agreeing words.

    NOTE: this redefines the ``expand_template`` declared earlier in the
    notebook for English; from this cell on the name refers to the Italian
    version.

    Args:
        template: Format string with ``{placeholder}`` fields.
        gender: Key into ``gender_ita_choices`` ('female' or 'male').

    Returns:
        str: The template with every placeholder substituted.
    """
    slots = {key: random.choice(options) for key, options in template_ita_choices}
    slots.update((key, random.choice(options)) for key, options in gender_ita_choices[gender])
    # Adjust adjective endings and pick the occupation form for this gender.
    slots = conjugate_gender(slots, gender)

    sentence = template.format(**slots)
    # Both options are False, so the filler sentence is currently never appended.
    append_filler = random.choice([False, False])
    if not append_filler:
        return sentence
    return u'{}. {}'.format(sentence, random.choice(filler_ita_text))

In [6]:
# Build the Italian bias test set: for every template, generate sentences for
# each (label, gender) combination and write them all to one CSV with the
# columns text / hate / gender.
output_file = os.path.join(bias_ita_dir, output_filename)

ita_frames = []
for template_pair in TEMPLATE_ITA_PAIRS:
    # Same order as before: BAD female, BAD male, NOT_BAD female, NOT_BAD male.
    for label in ('BAD', 'NOT_BAD'):
        for gender in ('female', 'male'):
            # Sort the set: its iteration order changes between interpreter
            # runs, which would make the row order of the CSV irreproducible.
            examples = sorted(get_example_by_gender(template_pair, label, gender))
            ita_frames.append(pd.DataFrame({
                'text': examples,
                'hate': [label] * len(examples),
                'gender': [gender] * len(examples),
            }))

# Concatenate once instead of growing the frame inside the loop.
output_df = pd.concat(ita_frames, ignore_index=True)
output_df.to_csv(output_file, encoding='utf-8', index=False)

* Getting BAD female samples for verb_adj_gender
  Create 100 samples

* Getting BAD male samples for verb_adj_gender
  Create 100 samples

* Getting NOT_BAD female samples for verb_adj_gender
  Create 100 samples

* Getting NOT_BAD male samples for verb_adj_gender
  Create 100 samples

* Getting BAD female samples for being_gender_adj
  Create 100 samples

* Getting BAD male samples for being_gender_adj
  Create 100 samples

* Getting NOT_BAD female samples for being_gender_adj
  Create 100 samples

* Getting NOT_BAD male samples for being_gender_adj
  Create 100 samples

* Getting BAD female samples for name_adj_gender
  Create 100 samples

* Getting BAD male samples for name_adj_gender
  Create 100 samples

* Getting NOT_BAD female samples for name_adj_gender
  Create 100 samples

* Getting NOT_BAD male samples for name_adj_gender
  Create 100 samples

* Getting BAD female samples for you_adj_gender
  Create 100 samples

* Getting BAD male samples for you_adj_gender
  Create 100 sam

# Translate English word lists to Italian

In [40]:
# Shared translation client plus the language codes for the two languages
# used below. LANGUAGES maps code -> language name, so invert it once to look
# codes up by name.
translator = Translator()
_code_by_language = {language: code for code, language in LANGUAGES.items()}
ita_code = _code_by_language['italian']
eng_code = _code_by_language['english']

def translate_list_words(list_word):
    """Translate a list of English words to Italian with the shared translator.

    Args:
        list_word: List of English strings.

    Returns:
        list[str]: The ``text`` attribute of each translation result, in the
        order returned by the translator.
    """
    results = translator.translate(list_word, src=eng_code, dest=ita_code)
    translated_words = []
    for result in results:
        translated_words.append(result.text)
    return translated_words

def write_list_to_txt(directory, filename, listitem):
    """Write the items of an iterable to a text file, one per line.

    Args:
        directory: Target directory (must already exist).
        filename: Name of the file to create or overwrite inside ``directory``.
        listitem: Iterable of items; each is written as ``str(item)`` followed
            by a newline, in iteration order.
    """
    # Explicit UTF-8: the translated Italian words contain accented characters
    # that the platform default encoding may be unable to represent.
    with open(os.path.join(directory, filename), 'w', encoding='utf-8') as filehandle:
        filehandle.writelines('{}\n'.format(item) for item in listitem)

In [20]:
# Load the English occupation list and the two gender-specific word lists
# (each is '<name>.txt' inside the English word-list directory).
professions, male_specific, female_specific = (
    load_bias_txt_file(bias_eng_dir, list_name)
    for list_name in ('occupations', 'male_word_file', 'female_word_file')
)

In [25]:
# Translate the male-specific words and store the de-duplicated result.
ita_male_specific = translate_list_words(male_specific)
# sorted(): a bare set is written in a different order on every run, which
# makes the temp file impossible to diff between runs.
write_list_to_txt(bias_ita_dir, 'male_word_file_temp.txt', sorted(set(ita_male_specific)))

In [42]:
# Translate the female-specific words and store the de-duplicated result.
ita_female_specific = translate_list_words(female_specific)
# sorted(): a bare set is written in a different order on every run, which
# makes the temp file impossible to diff between runs.
write_list_to_txt(bias_ita_dir, 'female_word_file_temp.txt', sorted(set(ita_female_specific)))

In [43]:
# Copy the de-duplicated English occupations into the Italian directory.
# NOTE(review): unlike the gender word lists, these are written untranslated -
# presumably as a starting point for manual translation; confirm.
professions = load_bias_txt_file(bias_eng_dir, 'occupations')
# sorted(): a bare set is written in a different order on every run, which
# makes the temp file impossible to diff between runs.
write_list_to_txt(bias_ita_dir, 'occupations_temp.txt', sorted(set(professions)))

# Twitter API 

In [28]:
# Twitter credentials are read from the environment so they never have to be
# typed into (and committed with) the notebook. Unset variables fall back to
# the empty string, i.e. the previous placeholder values.
APP_KEY = os.environ.get('TWITTER_APP_KEY', '')
APP_SECRET = os.environ.get('TWITTER_APP_SECRET', '')

OAUTH_TOKEN = os.environ.get('TWITTER_OAUTH_TOKEN', '')
OAUTH_TOKEN_SECRET = os.environ.get('TWITTER_OAUTH_TOKEN_SECRET', '')

# Two independent clients for the same account: tweepy (used by
# get_tweet_text_1) and Twython (used by get_tweet_text_2).
auth = tweepy.OAuthHandler(APP_KEY, APP_SECRET)
auth.set_access_token(OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
api = tweepy.API(auth)

twitter = Twython(APP_KEY, APP_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET)

In [72]:
def get_tweet_text_1(tweet_id):
    """Fetch the text of a tweet through the tweepy client.

    Args:
        tweet_id: Id of the tweet to look up.

    Returns:
        str: The tweet text on success. On a tweepy error: the empty string
        if the error message contains 'Rate limit exceeded', otherwise the
        error message itself (which is also printed).
    """
    try:
        return api.get_status(tweet_id).text
    except tweepy.TweepError as err:
        message = str(err)
        if 'Rate limit exceeded' not in message:
            print(err)
            return message
        # Rate-limited: return '' so the caller can tell "not fetched yet"
        # apart from a permanent failure.
        return ''

def get_tweet_text_2(tweet_id):
    """Fetch the text of a tweet through the Twython client.

    Args:
        tweet_id: Id of the tweet to look up.

    Returns:
        str: The tweet text on success. On a Twython error: the empty string
        if the last comma-separated part of the error message contains
        'Rate limit exceeded', otherwise that part of the message (the full
        error is printed).
    """
    try:
        return twitter.show_status(id=tweet_id)['text']
    except TwythonError as err:
        # Only the part after the last comma of the error message is kept.
        reason = str(err).split(',')[-1]
        if 'Rate limit exceeded' not in reason:
            print(err)
            return reason
        return ''

# Smoke test: fetch one tweet by id. Left as the cell's last bare expression
# so the notebook displays the returned text.
get_tweet_text_1('850010509969465344')

'RT @MailOnline: The Nazi death gas so horrific even Hitler feared using it https://t.co/pO2FiPVcnc'

In [73]:
# Root directory of the downloaded hate-speech datasets.
dataset_dir = './datasets'

def read_dataset(directory, filename, file_type='csv'):
    """Load one dataset file from below ``dataset_dir`` into a DataFrame.

    Args:
        directory: Sub-directory of ``dataset_dir`` containing the file.
        filename: File name inside ``directory``.
        file_type: ``'csv'`` (comma separated, the default) or ``'tsv'``
            (tab separated).

    Returns:
        pandas.DataFrame: The file contents. For ``'tsv'`` the first column
        is read as the index (with date parsing attempted) and then moved
        back into a regular column, as the previous ``DataFrame.from_csv``
        based implementation did.

    Raises:
        ValueError: If ``file_type`` is neither ``'csv'`` nor ``'tsv'``.
    """
    path = os.path.join(dataset_dir, directory, filename)
    if file_type == 'csv':
        return pd.read_csv(path)
    if file_type == 'tsv':
        # DataFrame.from_csv no longer exists in current pandas; this is its
        # documented read_csv equivalent (index_col=0, parse_dates=True).
        return pd.read_csv(path, sep='\t', index_col=0, parse_dates=True).reset_index()
    # Previously an unknown type fell through to an UnboundLocalError.
    raise ValueError("Unsupported file_type {!r}; expected 'csv' or 'tsv'".format(file_type))

def add_text(data_df, column):
    """Fill the 'text' column of ``data_df`` with tweet texts fetched by id.

    A 'text' column of empty strings is added if missing. Every row whose
    'text' is still the empty string is looked up with ``get_tweet_text_1``;
    processing stops at the first lookup that returns '' so that the function
    can be called again later to continue. The number of rows with empty
    'text' is printed before and after.

    Args:
        data_df: DataFrame to complete; modified in place.
        column: Name of the column holding the tweet ids.

    Returns:
        pandas.DataFrame: The same ``data_df`` object.
    """
    def count_missing():
        return len(data_df[data_df['text'] == ''])

    if 'text' not in list(data_df.columns):
        data_df['text'] = [''] * len(data_df)

    print('Missing text: {}'.format(count_missing()))
    for row_label, row in data_df.iterrows():
        tweet_id = row[column]
        if row['text'] != '':
            # Already fetched (or marked with an error message) earlier.
            continue
        fetched = get_tweet_text_1(tweet_id)
        if fetched == '':
            break
        data_df.loc[row_label, 'text'] = fetched
    print('Missing text: {}'.format(count_missing()))
    return data_df

In [30]:
# read_dataset's third parameter is file_type, not the id column: the former
# 'tweet_id' argument matched neither 'csv' nor 'tsv', so nothing was loaded
# and the call failed. The file is a CSV, so the default applies.
NAACL_data = read_dataset('hate-speech-NAACL', 'NAACL_SRW_2016.csv')

In [31]:
# read_dataset's third parameter is file_type, not the id column: the former
# 'tweet_id' argument matched neither 'csv' nor 'tsv', so nothing was loaded
# and the call failed. The file is a CSV, so the default applies.
abt_data = read_dataset('hate-speech-abt', 'hatespeech_tweet_id_abt.csv')

In [44]:
# Load the Italian corpus (tab-separated), then fetch the tweet text for each
# 'id_str'. Kept as two statements: add_text fills the frame in place, so
# `ita_data` stays bound to the partially filled frame if the fetch is
# interrupted and the fetch can be resumed later.
ita_data = read_dataset('hate-speech-corpus-ita', 'hate-speech-final.tsv', file_type='tsv')
ita_data = add_text(ita_data, 'id_str')

Missing text: 1827
[{'code': 63, 'message': 'User has been suspended.'}]
[{'code': 63, 'message': 'User has been suspended.'}]
[{'code': 144, 'message': 'No status found with that ID.'}]
[{'code': 63, 'message': 'User has been suspended.'}]
[{'code': 144, 'message': 'No status found with that ID.'}]
[{'code': 144, 'message': 'No status found with that ID.'}]
[{'code': 63, 'message': 'User has been suspended.'}]
[{'code': 144, 'message': 'No status found with that ID.'}]
[{'code': 144, 'message': 'No status found with that ID.'}]
[{'code': 63, 'message': 'User has been suspended.'}]
[{'code': 63, 'message': 'User has been suspended.'}]
[{'code': 63, 'message': 'User has been suspended.'}]
[{'code': 179, 'message': 'Sorry, you are not authorized to see this status.'}]
[{'code': 63, 'message': 'User has been suspended.'}]
[{'code': 144, 'message': 'No status found with that ID.'}]
[{'code': 63, 'message': 'User has been suspended.'}]
[{'code': 63, 'message': 'User has been suspended.'}]
[

[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate l

[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate l

[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate l

[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate l

[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate l

[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate limit exceeded', 'code': 88}]
[{'message': 'Rate l

In [81]:
# Resume the fetch: add_text only looks up rows whose 'text' is still '',
# so rows filled (or marked with an error message) earlier are skipped.
ita_data = add_text(ita_data, 'id_str')

Missing text: 165
[{'code': 144, 'message': 'No status found with that ID.'}]
[{'code': 144, 'message': 'No status found with that ID.'}]
[{'code': 63, 'message': 'User has been suspended.'}]
[{'code': 144, 'message': 'No status found with that ID.'}]
[{'code': 63, 'message': 'User has been suspended.'}]
[{'code': 63, 'message': 'User has been suspended.'}]
[{'code': 63, 'message': 'User has been suspended.'}]
[{'code': 144, 'message': 'No status found with that ID.'}]
[{'code': 63, 'message': 'User has been suspended.'}]
[{'code': 144, 'message': 'No status found with that ID.'}]
[{'code': 144, 'message': 'No status found with that ID.'}]
[{'code': 179, 'message': 'Sorry, you are not authorized to see this status.'}]
[{'code': 144, 'message': 'No status found with that ID.'}]
[{'code': 63, 'message': 'User has been suspended.'}]
[{'code': 63, 'message': 'User has been suspended.'}]
[{'code': 63, 'message': 'User has been suspended.'}]
[{'code': 179, 'message': 'Sorry, you are not auth

In [82]:
# Count the rows that have a real tweet text, per 'hate speech' label.
# A 'text' value is not a real tweet when it is still empty or when it holds
# one of the API error messages that the fetch stored in place of the text.
not_a_tweet_text = [
    '',
    "[{'code': 63, 'message': 'User has been suspended.'}]",
    "[{'code': 144, 'message': 'No status found with that ID.'}]",
    "[{'code': 179, 'message': 'Sorry, you are not authorized to see this status.'}]",
]
has_tweet_text = ~ita_data['text'].isin(not_a_tweet_text)
for hate_label in ('no', 'yes'):
    print(len(ita_data[has_tweet_text & (ita_data['hate speech'] == hate_label)]))

1298
225


In [85]:
# Save the corpus with the fetched texts next to the other datasets. The path
# is built from dataset_dir, like the paths used for loading, instead of
# repeating the directory name as a literal.
ita_data.to_csv(os.path.join(dataset_dir, 'hate-speech-final_text.csv'), index=False)

In [None]:
# Inspect the current rate-limit state of the tweepy client.
# NOTE(review): the trailing comment looks like a leftover subscript for
# narrowing the result to the 'statuses' entry - confirm before re-enabling.
api.rate_limit_status() #['statuses']