<a href="https://colab.research.google.com/github/dbckz/crossing-the-line/blob/master/notebooks/processing_full_hatebase.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
import pandas as pd
import ast
import os
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from tqdm import tqdm
from google.colab import drive
import plotly.graph_objects as go

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Set up paths
root_path = "/content/drive/MyDrive/University/Dissertation/data_collection"
graph_path = root_path + "/graphs"

day_paths = day_paths = [
        "/01",
        "/02",
        "/03",
        "/04",
        "/05",
        "/06",
        "/07",
        "/08",
        "/09",
        "/10",
        "/11",
        "/12",
        "/13",
        "/14",
        "/15",
        "/16",
        "/17",
        "/18",
        "/19",
        "/20",
        "/21",
        "/22",
        "/23",
        "/24",
        "/25",
        "/26",
        "/27",
        "/28",
        "/29",
        "/30",
        "/31",
        "/32",
        "/33",
        "/34",
        "/35",
        "/36"
    ]

In [4]:
# Create directory to store visualisations
try:
    os.mkdir(graph_path)
except OSError as error:
    print(error)

[Errno 17] File exists: '/content/drive/MyDrive/University/Dissertation/data_collection/graphs'


In [5]:
# Load data
threshold = '90'
in_tweets = pd.DataFrame()
hb_guard = pd.DataFrame()
emojis = pd.DataFrame()
for path in day_paths:
    directory = root_path + path
    tweets_csv = directory + "/tweets.csv"
    matched_terms_csv = directory + "/hatebase_processed_tweets.csv"
    emojis_csv = directory + "/emoji.csv"

    print(f"Loading CSVs for directory {path}...")
    in_tweets = pd.concat([in_tweets, 
                           pd.read_csv(tweets_csv,
                                       usecols = [
                                                  'created_at',
                                                  'tweet_id',
                                                  'tweet_text',
                                                  'accounts_mentioned'
                                       ],
                                       dtype = {
                                          # 'created_at':
                                          'tweet_id': np.int64,
                                          'tweet_text': str,
                                          'accounts_mentioned': object
                                       },
                                       parse_dates=['created_at'])])

    hb_guard = pd.concat([hb_guard, pd.read_csv(matched_terms_csv,
                                                usecols = [
                                                           'tweet_id',
                                                           f'matching_hatebase_terms_over_{threshold}',
                                                           f'matching_hatebase_terms_ethnicity_over_{threshold}',
                                                           f'matching_hatebase_terms_nationality_over_{threshold}',
                                                           f'matching_hatebase_terms_gender_over_{threshold}',
                                                           f'matching_hatebase_terms_sexual_orientation_over_{threshold}',
                                                           f'matching_hatebase_terms_class_over_{threshold}',
                                                           f'matching_hatebase_terms_religion_over_{threshold}',
                                                           f'matching_hatebase_terms_disability_over_{threshold}'
                                                ],
                                                dtype = {
                                                    'tweet_id': np.int64,
                                                    f'matching_hatebase_terms_over_{threshold}': str,
                                                    f'matching_hatebase_terms_ethnicity_over_{threshold}': str,
                                                    f'matching_hatebase_terms_nationality_over_{threshold}': str,
                                                    f'matching_hatebase_terms_gender_over_{threshold}': str,
                                                    f'matching_hatebase_terms_sexual_orientation_over_{threshold}': str,
                                                    f'matching_hatebase_terms_class_over_{threshold}': str,
                                                    f'matching_hatebase_terms_religion_over_{threshold}': str,
                                                    f'matching_hatebase_terms_disability_over_{threshold}': str
                                                })])

    emojis = pd.concat([emojis,
                        pd.read_csv(emojis_csv,
                                    dtype = {
                                        'tweet_id': np.int64,
                                        'banana_count': np.int16,
                                        'monkey_count': np.int16,
                                        'monkey_face_count': np.int16,
                                        'speak_no_evil_monkey_count': np.int16,
                                        'hear_no_evil_monkey_count': np.int16,
                                        'see_no_evil_monkey_count': np.int16,
                                        'gorilla_count': np.int16,
                                        'watermelon_count': np.int16,
                                        'total_emoji_count': np.int16
                                    }
                                    )])

# Dedup
original_tweets_length = len(in_tweets)
original_hatebase_length = len(hb_guard)
original_emojis_length = len(emojis)
in_tweets.drop_duplicates(subset=['tweet_id'], inplace=True)
hb_guard.drop_duplicates(subset=['tweet_id'], inplace=True)
emojis.drop_duplicates(subset=['tweet_id'], inplace=True)
print(f"Size of tweets dataframe: {len(in_tweets)}, having dropped {original_tweets_length - len(in_tweets)} duplicate rows")
print(f"Size of hatebase dataframe: {len(hb_guard)}, having dropped {original_hatebase_length - len(hb_guard)} duplicate rows")
print(f"Size of emojis dataframe: {len(emojis)}, having dropped {original_emojis_length - len(emojis)} duplicate rows")


Loading CSVs for directory /01...
Loading CSVs for directory /02...
Loading CSVs for directory /03...
Loading CSVs for directory /04...
Loading CSVs for directory /05...
Loading CSVs for directory /06...
Loading CSVs for directory /07...
Loading CSVs for directory /08...
Loading CSVs for directory /09...
Loading CSVs for directory /10...
Loading CSVs for directory /11...
Loading CSVs for directory /12...
Loading CSVs for directory /13...
Loading CSVs for directory /14...
Loading CSVs for directory /15...
Loading CSVs for directory /16...
Loading CSVs for directory /17...
Loading CSVs for directory /18...
Loading CSVs for directory /19...
Loading CSVs for directory /20...
Loading CSVs for directory /21...
Loading CSVs for directory /22...
Loading CSVs for directory /23...
Loading CSVs for directory /24...
Loading CSVs for directory /25...
Loading CSVs for directory /26...
Loading CSVs for directory /27...
Loading CSVs for directory /28...
Loading CSVs for directory /29...
Loading CSVs f

In [6]:
# Up the pandas display limits so printed dataframes aren't so truncated
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_info_rows', 100)
pd.set_option('display.max_info_columns', 100)

# Data manipulation

In [7]:
# Join tables + drop old ones!
joined_df = pd.merge(in_tweets, emojis, how='outer', on='tweet_id')
joined_df = pd.merge(joined_df, hb_guard, how='outer', on='tweet_id')
del emojis
del hb_guard
del in_tweets

In [8]:
joined_df = joined_df[(joined_df['created_at'] > '2021-06-19 08:10:18+00:00') & (joined_df['created_at'] < '2021-07-17 00:00:00+00:00')]
len(joined_df)

1274885

In [9]:
# Create a column indicating whether tweet contains slurs (hacky > 2 as empty list is stored as string "[]")
joined_df['contains_slurs'] = joined_df[f'matching_hatebase_terms_over_{threshold}'].str.len() > 2
joined_df['contains_ethnicity_slurs'] = joined_df[f'matching_hatebase_terms_ethnicity_over_{threshold}'].str.len() > 2
joined_df['contains_nationality_slurs'] = joined_df[f'matching_hatebase_terms_nationality_over_{threshold}'].str.len() > 2
joined_df['contains_gender_slurs'] = joined_df[f'matching_hatebase_terms_gender_over_{threshold}'].str.len() > 2
joined_df['contains_sexual_orientation_slurs'] = joined_df[f'matching_hatebase_terms_sexual_orientation_over_{threshold}'].str.len() > 2
joined_df['contains_class_slurs'] = joined_df[f'matching_hatebase_terms_class_over_{threshold}'].str.len() > 2
joined_df['contains_religion_slurs'] = joined_df[f'matching_hatebase_terms_religion_over_{threshold}'].str.len() > 2
joined_df['contains_disability_slurs'] = joined_df[f'matching_hatebase_terms_disability_over_{threshold}'].str.len() > 2


In [10]:
emoji_tweet_file = root_path + "/emoji_tweets.csv"

banana_tweet_count = joined_df['banana_count'][joined_df['banana_count'] != 0].count()
monkey_tweet_count = joined_df['monkey_count'][joined_df['banana_count'] != 0].count()
monkey_face_tweet_count = joined_df['monkey_face_count'][joined_df['monkey_face_count'] != 0].count()
speak_no_evil_monkey_tweet_count = joined_df['speak_no_evil_monkey_count'][joined_df['speak_no_evil_monkey_count'] != 0].count()
hear_no_evil_monkey_tweet_count = joined_df['hear_no_evil_monkey_count'][joined_df['hear_no_evil_monkey_count'] != 0].count()
see_no_evil_monkey_tweet_count = joined_df['see_no_evil_monkey_count'][joined_df['see_no_evil_monkey_count'] != 0].count()
gorilla_tweet_count = joined_df['gorilla_count'][joined_df['gorilla_count'] != 0].count()
watermelon_tweet_count = joined_df['watermelon_count'][joined_df['watermelon_count'] != 0].count()

banana_total = joined_df['banana_count'][joined_df['banana_count'] != 0].sum()
monkey_total = joined_df['monkey_count'][joined_df['monkey_count'] != 0].sum()
monkey_face_total = joined_df['monkey_face_count'][joined_df['monkey_face_count'] != 0].sum()
speak_no_evil_monkey_total = joined_df['speak_no_evil_monkey_count'][joined_df['speak_no_evil_monkey_count'] != 0].sum()
hear_no_evil_monkey_total = joined_df['hear_no_evil_monkey_count'][joined_df['hear_no_evil_monkey_count'] != 0].sum()
see_no_evil_monkey_total = joined_df['see_no_evil_monkey_count'][joined_df['see_no_evil_monkey_count'] != 0].sum()
gorilla_total = joined_df['gorilla_count'][joined_df['gorilla_count'] != 0].sum()
watermelon_total = joined_df['watermelon_count'][joined_df['watermelon_count'] != 0].sum()

total_emoji_tweets = joined_df.query('banana_count != 0 or monkey_count != 0 or monkey_face_count != 0 or speak_no_evil_monkey_count != 0 or hear_no_evil_monkey_count != 0 or see_no_evil_monkey_count != 0 or gorilla_count != 0 or watermelon_count != 0')['tweet_id'].count()
total_emoji_count = joined_df['total_emoji_count'][joined_df['total_emoji_count'] != 0].sum()

joined_df[['tweet_id', 'tweet_text']][joined_df['total_emoji_count'] > 0].to_csv(emoji_tweet_file, index=False)
joined_df['contains_emoji_slurs'] = joined_df['total_emoji_count'] > 0

In [11]:
# NOTE: This shouldn't go here - this should be part of the evaluation process

# # NOTE: at this point you need to manually review the emoji tweets in emoji_tweets.csv, and put them into a emoji_tweets_reviewed.csv

# reviewed_emojis = pd.read_csv(root_path + '/emoji_tweets_reviewed.csv')
# reviewed_emojis['manually_reviewed_emoji_is_offensive'] = True
# reviewed_emojis.drop('tweet_text', axis=1, inplace=True)
# joined_df = pd.merge(joined_df, reviewed_emojis, how='outer', on='tweet_id')
# joined_df['manually_reviewed_emoji_is_offensive'] = joined_df['manually_reviewed_emoji_is_offensive'].fillna(False)

In [12]:
# Extract players
england = ["JPickford1", "kylewalker2", "LukeShaw23", "_DeclanRice", "HarryMaguire93", "JackGrealish",
                    "JHenderson", "HKane", "sterling7", "MarcusRashford", "trippier2", "deanhenderson",
                    "Kalvinphillips", "OfficialTM_3", "Sanchooo10", "CalvertLewin14", "masonmount_10", "PhilFoden",
                    "BenChilwell", "ben6white", "samjohnstone50", "reecejames_24", "BukayoSaka87", "BellinghamJude"]

netherlands = ["joel_veltman", "mdeligt_04", "NathanAke", "Stefandevrij", "GWijnaldum", "LuukdeJong9", "Memphis", "QPromes", "pvanaanholt", "TimKrul", "DavyKlaassen", "Dirono", "RGravenberch", "BlindDaley", "DeJongFrenkie21", "DenzelJMD2"]

germany = ["Manuel_Neuer", "ToniRuediger", "MatzeGinter", "matshummels", "kaihavertz29", "ToniKroos", "KeVolland", "SergeGnabry", "Bernd_Leno", "JamalMusiala", "lukaskl96", "leongoretzka_", "leroy_sane", "IlkayGuendogan", "emrecan_", "RobinKoch25", "esmuellert_"]

scotland = ["MarshallDavid23", "sodonnell15", "andrewrobertso5", "mctominay10", "granthanley5", "kierantierney1", "jmcginn7", "Callummcgregor8", "Lyndon_Dykes", "CheAdams_", "CraigGordon01", "declang31", "LiamCooper__", "10DavidTurnbull", "kevinnisbet16", "np4tterson", "billygilmourrr", "Jack_Hendry2", "Scottmckenna3"]

france = ["BenPavard28", "kimpembe_3", "raphaelvarane", "clement_lenglet", "paulpogba", "AntoGriezmann", "_OlivierGiroud_", "KMbappe", "CorentinTolisso", "nglkante", "KurtZouma", "SteveMandanda", "MoussaSissoko", "LucasDigne", "Benzema", "LucasHernandez", "WissBenYedder", "mmseize", "leodubois15", "jkeey4", "MarcusThuram"]

belgium = ["thibautcourtois", "AlderweireldTob", "thomasvermaelen", "JanVertonghen", "axelwitsel28", "DeBruyneKev", "RomeluLukaku9", "hazardeden10", "CarrascoY21", "SMignolet", "dries_mertens14", "ThomMills", "HazardThorgan8", "VanakenHans", "Jasondenayer", "chrisbenteke", "NChadli", "mbatshuayi", "LTrossard", "JeremyDoku", "dennispraet"]

list_of_players = england + netherlands + germany + scotland + france + belgium

for player in list_of_players:
    print(f"Extracting {player}...")
    joined_df[player] = joined_df['accounts_mentioned'].str.contains(f"'username': '{player}'").astype(np.bool)

# player_tweet_map = pd.DataFrame(columns=["username", "tweets_received"])

# i = 0
# for player in list_of_players:
#     tweets = joined_df[player].sum()
#     player_tweet_map.loc[i] = player, tweets
#     i += 1

# player_tweet_map.sort_values('tweets_received', axis=0, ascending=False, inplace=True)



Extracting JPickford1...
Extracting kylewalker2...
Extracting LukeShaw23...
Extracting _DeclanRice...
Extracting HarryMaguire93...
Extracting JackGrealish...
Extracting JHenderson...
Extracting HKane...
Extracting sterling7...
Extracting MarcusRashford...
Extracting trippier2...
Extracting deanhenderson...
Extracting Kalvinphillips...
Extracting OfficialTM_3...
Extracting Sanchooo10...
Extracting CalvertLewin14...
Extracting masonmount_10...
Extracting PhilFoden...
Extracting BenChilwell...
Extracting ben6white...
Extracting samjohnstone50...
Extracting reecejames_24...
Extracting BukayoSaka87...
Extracting BellinghamJude...
Extracting joel_veltman...
Extracting mdeligt_04...
Extracting NathanAke...
Extracting Stefandevrij...
Extracting GWijnaldum...
Extracting LuukdeJong9...
Extracting Memphis...
Extracting QPromes...
Extracting pvanaanholt...
Extracting TimKrul...
Extracting DavyKlaassen...
Extracting Dirono...
Extracting RGravenberch...
Extracting BlindDaley...
Extracting DeJongFren

In [13]:
# Sort by ascending date
joined_df.sort_values('created_at', axis=0, inplace=True)
# joined_df['created_at'] = pd.to_datetime(joined_df['created_at'])

joined_df['contains_slurs_or_offensive_emoji'] = joined_df['contains_slurs'] | joined_df['contains_emoji_slurs']
# joined_df['contains_ethnicity_slurs'] = joined_df['contains_ethnicity_slurs'] | joined_df['manually_reviewed_emoji_is_offensive'] # assuming all emoji ones are racist

In [14]:
joined_df = joined_df[
    (joined_df["JPickford1"]) |
    (joined_df["kylewalker2"]) |
    (joined_df["LukeShaw23"]) |
    (joined_df["kylewalker2"]) |
    (joined_df["_DeclanRice"]) |
    (joined_df["HarryMaguire93"]) |
    (joined_df["JackGrealish"]) |
    (joined_df["JHenderson"]) |
    (joined_df["HKane"]) |
    (joined_df["sterling7"]) |
    (joined_df["MarcusRashford"]) |
    (joined_df["trippier2"]) |
    (joined_df["deanhenderson"]) |
    (joined_df["Kalvinphillips"]) |
    (joined_df["OfficialTM_3"]) |
    (joined_df["Sanchooo10"]) |
    (joined_df["CalvertLewin14"]) |
    (joined_df["masonmount_10"]) |
    (joined_df["PhilFoden"]) |
    (joined_df["BenChilwell"]) |
    (joined_df["ben6white"]) |
    (joined_df["samjohnstone50"]) |
    (joined_df["reecejames_24"]) |
    (joined_df["BukayoSaka87"]) |
    (joined_df["BellinghamJude"]) |
    (joined_df["joel_veltman"]) |
    (joined_df["mdeligt_04"]) |
    (joined_df["LukeShaw23"]) |
    (joined_df["NathanAke"]) |
    (joined_df["GWijnaldum"]) |
    (joined_df["LuukdeJong9"]) |
    (joined_df["Memphis"]) |
    (joined_df["QPromes"]) |
    (joined_df["pvanaanholt"]) |
    (joined_df["TimKrul"]) |
    (joined_df["DavyKlaassen"]) |
    (joined_df["Dirono"]) |
    (joined_df["RGravenberch"]) |
    (joined_df["BlindDaley"]) |
    (joined_df["DeJongFrenkie21"]) |
    (joined_df["DenzelJMD2"]) |
    (joined_df["Manuel_Neuer"]) |
    (joined_df["ToniRuediger"]) |
    (joined_df["MatzeGinter"]) |
    (joined_df["matshummels"]) |
    (joined_df["kaihavertz29"]) |
    (joined_df["ToniKroos"]) |
    (joined_df["KeVolland"]) |
    (joined_df["SergeGnabry"]) |
    (joined_df["Bernd_Leno"]) |
    (joined_df["JamalMusiala"]) |
    (joined_df["lukaskl96"]) |
    (joined_df["leongoretzka_"]) |
    (joined_df["leroy_sane"]) |
    (joined_df["IlkayGuendogan"]) |
    (joined_df["emrecan_"]) |
    (joined_df["RobinKoch25"]) |
    (joined_df["esmuellert_"]) |
    (joined_df["MarshallDavid23"]) |
    (joined_df["sodonnell15"]) |
    (joined_df["andrewrobertso5"]) |
    (joined_df["mctominay10"]) |
    (joined_df["granthanley5"]) |
    (joined_df["kierantierney1"]) |
    (joined_df["jmcginn7"]) |
    (joined_df["Callummcgregor8"]) |
    (joined_df["Lyndon_Dykes"]) |
    (joined_df["CheAdams_"]) |
    (joined_df["CraigGordon01"]) |
    (joined_df["declang31"]) |
    (joined_df["LiamCooper__"]) |
    (joined_df["10DavidTurnbull"]) |
    (joined_df["kevinnisbet16"]) |
    (joined_df["np4tterson"]) |
    (joined_df["billygilmourrr"]) |
    (joined_df["Jack_Hendry2"]) |
    (joined_df["Scottmckenna3"]) |
    (joined_df["BenPavard28"]) |
    (joined_df["kimpembe_3"]) |
    (joined_df["raphaelvarane"]) |
    (joined_df["clement_lenglet"]) |
    (joined_df["paulpogba"]) |
    (joined_df["AntoGriezmann"]) |
    (joined_df["_OlivierGiroud_"]) |
    (joined_df["KMbappe"]) |
    (joined_df["CorentinTolisso"]) |
    (joined_df["nglkante"]) |
    (joined_df["KurtZouma"]) |
    (joined_df["SteveMandanda"]) |
    (joined_df["MoussaSissoko"]) |
    (joined_df["LucasDigne"]) |
    (joined_df["Benzema"]) |
    (joined_df["LucasHernandez"]) |
    (joined_df["WissBenYedder"]) |
    (joined_df["mmseize"]) |
    (joined_df["leodubois15"]) |
    (joined_df["jkeey4"]) |
    (joined_df["ben6white"]) |
    (joined_df["MarcusThuram"]) |
    (joined_df["thibautcourtois"]) |
    (joined_df["AlderweireldTob"]) |
    (joined_df["thomasvermaelen"]) |
    (joined_df["JanVertonghen"]) |
    (joined_df["axelwitsel28"]) |
    (joined_df["DeBruyneKev"]) |
    (joined_df["RomeluLukaku9"]) |
    (joined_df["hazardeden10"]) |
    (joined_df["CarrascoY21"]) |
    (joined_df["SMignolet"]) |
    (joined_df["dries_mertens14"]) |
    (joined_df["ThomMills"]) |
    (joined_df["HazardThorgan8"]) |
    (joined_df["VanakenHans"]) |
    (joined_df["Jasondenayer"]) |
    (joined_df["chrisbenteke"]) |
    (joined_df["NChadli"]) |
    (joined_df["mbatshuayi"]) |
    (joined_df["LTrossard"]) |
    (joined_df["JeremyDoku"]) |
    (joined_df["dennispraet"])
    ]

In [15]:
total_off_tweets = joined_df['contains_slurs_or_offensive_emoji'].sum()
total_tweets = len(joined_df)
print(f"Total tweets containing slurs: {total_off_tweets}")
print(f"Total tweets: {total_tweets}")
print(f"Percentage of tweets containing slurs: {(100*total_off_tweets)/total_tweets}")

Total tweets containing slurs: 4970
Total tweets: 1046319
Percentage of tweets containing slurs: 0.47499854250950235


In [16]:
# Saving a file for manual review, taken at threshold 90 - one-off ad-hoc task
# joined_df[['tweet_id', 'tweet_text']][joined_df['contains_slurs_or_offensive_emoji'] == True].to_csv("/content/drive/MyDrive/University/Dissertation/evaluation/tweets_hb.csv", index=False)


In [17]:
# Maybe we don't need these cols and we can just calculate ad-hoc?
for player in list_of_players:
    joined_df[f'{player}_offensive'] = joined_df['contains_slurs_or_offensive_emoji'] & joined_df[player]
    # joined_df[f'{player}_ethnicity'] = joined_df['contains_ethnicity_slurs'] & joined_df[player]
    # joined_df[f'{player}_nationality'] = joined_df['contains_nationality_slurs'] & joined_df[player]
    # joined_df[f'{player}_gender'] = joined_df['contains_gender_slurs'] & joined_df[player]
    # joined_df[f'{player}_sexual_orientation'] = joined_df['contains_sexual_orientation_slurs'] & joined_df[player]
    # joined_df[f'{player}_class'] = joined_df['contains_class_slurs'] & joined_df[player]
    # joined_df[f'{player}_religion'] = joined_df['contains_religion_slurs'] & joined_df[player]
    # joined_df[f'{player}_disability'] = joined_df['contains_disability_slurs'] & joined_df[player]
    # joined_df[f'{player}_emoji'] = joined_df['contains_emoji_slurs'] & joined_df[player]

In [18]:
player_offensive_tweet_map = pd.DataFrame(columns=["username",
                                                   "tweets_received",
                                                   "offensive_tweets_received",
                                                  #  "ethnicity_tweets_received",
                                                  #  "nationality_tweets_received",
                                                  #  "gender_tweets_received",
                                                  #  "sexual_orientation_tweets_received",
                                                  #  "class_tweets_received",
                                                  #  "religion_tweets_received",
                                                  #  "disability_tweets_received",
                                                  #  "emoji_tweets_received",
                                                   "percentage_offensive"
                                                   ])

i = 0
for player in list_of_players:
    off_tweets = joined_df[player + '_offensive'].sum()
    # ethnicity_tweets = joined_df[player + '_ethnicity'].sum()
    # nationality_tweets = joined_df[player + '_nationality'].sum()
    # gender_tweets = joined_df[player + '_gender'].sum()
    # sexual_orientation_tweets = joined_df[player + '_sexual_orientation'].sum()
    # class_tweets = joined_df[player + '_class'].sum()
    # religion_tweets = joined_df[player + '_religion'].sum()
    # disability_tweets = joined_df[player + '_disability'].sum()
    # emoji_tweets = joined_df[player + '_emoji'].sum()
    tweets = joined_df[player].sum()
    percentage = 100 * (off_tweets / tweets)
    # player_offensive_tweet_map.loc[i] = player, tweets, off_tweets, ethnicity_tweets, nationality_tweets, gender_tweets, sexual_orientation_tweets, class_tweets, religion_tweets, disability_tweets, emoji_tweets, percentage
    player_offensive_tweet_map.loc[i] = player, tweets, off_tweets, percentage
    i += 1


invalid value encountered in long_scalars



In [19]:
# # Create per country map
# country_offensive_tweet_map = pd.DataFrame(columns=["country",
#                                                    "tweets_received",
#                                                    "offensive_tweets_received",
#                                                    "ethnicity_tweets_received",
#                                                    "nationality_tweets_received",
#                                                    "gender_tweets_received",
#                                                    "sexual_orientation_tweets_received",
#                                                    "class_tweets_received",
#                                                    "religion_tweets_received",
#                                                    "disability_tweets_received",
#                                                    "emoji_tweets_received",
#                                                    "percentage_offensive"
#                                                   ])

# country_offensive_tweet_map.loc[0] = "england", \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(england)]['tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(england)]['offensive_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(england)]['ethnicity_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(england)]['nationality_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(england)]['gender_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(england)]['sexual_orientation_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(england)]['class_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(england)]['religion_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(england)]['disability_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(england)]['emoji_tweets_received'].sum(), \
#                                      0
                                     
# country_offensive_tweet_map.loc[0]['percentage_offensive'] = 100 * (country_offensive_tweet_map.loc[0]['offensive_tweets_received'] / country_offensive_tweet_map.loc[0]['tweets_received'])

# country_offensive_tweet_map.loc[1] = "netherlands", \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(netherlands)]['tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(netherlands)]['offensive_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(netherlands)]['ethnicity_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(netherlands)]['nationality_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(netherlands)]['gender_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(netherlands)]['sexual_orientation_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(netherlands)]['class_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(netherlands)]['religion_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(netherlands)]['disability_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(england)]['emoji_tweets_received'].sum(), \
#                                      0
                                     
# country_offensive_tweet_map.loc[1]['percentage_offensive'] = 100 * (country_offensive_tweet_map.loc[1]['offensive_tweets_received'] / country_offensive_tweet_map.loc[1]['tweets_received'])

# country_offensive_tweet_map.loc[2] = "germany", \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(germany)]['tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(germany)]['offensive_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(germany)]['ethnicity_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(germany)]['nationality_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(germany)]['gender_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(germany)]['sexual_orientation_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(germany)]['class_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(germany)]['religion_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(germany)]['disability_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(england)]['emoji_tweets_received'].sum(), \
#                                      0
                                     
# country_offensive_tweet_map.loc[2]['percentage_offensive'] = 100 * (country_offensive_tweet_map.loc[2]['offensive_tweets_received'] / country_offensive_tweet_map.loc[2]['tweets_received'])

# country_offensive_tweet_map.loc[3] = "france", \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(france)]['tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(france)]['offensive_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(france)]['ethnicity_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(france)]['nationality_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(france)]['gender_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(france)]['sexual_orientation_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(france)]['class_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(france)]['religion_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(france)]['disability_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(england)]['emoji_tweets_received'].sum(), \
#                                      0
                                     
# country_offensive_tweet_map.loc[3]['percentage_offensive'] = 100 * (country_offensive_tweet_map.loc[3]['offensive_tweets_received'] / country_offensive_tweet_map.loc[3]['tweets_received'])

# country_offensive_tweet_map.loc[4] = "scotland", \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(scotland)]['tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(scotland)]['offensive_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(scotland)]['ethnicity_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(scotland)]['nationality_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(scotland)]['gender_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(scotland)]['sexual_orientation_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(scotland)]['class_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(scotland)]['religion_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(scotland)]['disability_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(england)]['emoji_tweets_received'].sum(), \
#                                      0
                                     
# country_offensive_tweet_map.loc[4]['percentage_offensive'] = 100 * (country_offensive_tweet_map.loc[4]['offensive_tweets_received'] / country_offensive_tweet_map.loc[4]['tweets_received'])

# country_offensive_tweet_map.loc[5] = "belgium", \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(belgium)]['tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(belgium)]['offensive_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(belgium)]['ethnicity_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(belgium)]['nationality_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(belgium)]['gender_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(belgium)]['sexual_orientation_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(belgium)]['class_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(belgium)]['religion_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(belgium)]['disability_tweets_received'].sum(), \
#                                      player_offensive_tweet_map[player_offensive_tweet_map['username'].isin(england)]['emoji_tweets_received'].sum(), \
#                                      0
                                     
# country_offensive_tweet_map.loc[5]['percentage_offensive'] = 100 * (country_offensive_tweet_map.loc[5]['offensive_tweets_received'] / country_offensive_tweet_map.loc[5]['tweets_received'])



In [20]:
# Create dataframe for analysing abuse by type
slurs_by_type = pd.DataFrame(columns=["type", "total"])
slurs_by_type.loc[0] = "ethnicity", joined_df['contains_ethnicity_slurs'].sum()
slurs_by_type.loc[1] = "nationality", joined_df['contains_nationality_slurs'].sum()
slurs_by_type.loc[2] = "gender", joined_df['contains_gender_slurs'].sum()
slurs_by_type.loc[3] = "sexual_orientation", joined_df['contains_sexual_orientation_slurs'].sum()
slurs_by_type.loc[4] = "class", joined_df['contains_class_slurs'].sum()
slurs_by_type.loc[5] = "religion", joined_df['contains_religion_slurs'].sum()
slurs_by_type.loc[6] = "disability", joined_df['contains_disability_slurs'].sum()
slurs_by_type.loc[7] = "emoji", joined_df['contains_emoji_slurs'].sum()


# Analysis

In [None]:
# Print some headline figures
print(f"Earliest tweet: {joined_df['created_at'].min()}")
print(f"Latest tweet: {joined_df['created_at'].max()}")

print(f"Number of tweets (player-only): {len(joined_df)}")

Earliest tweet: 2021-06-19 08:10:19+00:00
Latest tweet: 2021-07-16 23:59:28+00:00
Number of tweets (player-only): 1046319


In [None]:
# Print emoji stats
print(f"{banana_tweet_count} tweets containing banana emoji. {banana_total} banana emojis used in total")
print(f"{monkey_tweet_count} tweets containing monkey emoji. {monkey_total} monkey emojis used in total")
print(f"{monkey_face_tweet_count} tweets containing monkey face emoji. {monkey_face_total} monkey face emojis used in total")
print(f"{speak_no_evil_monkey_tweet_count} tweets containing speak-no-evil monkey emoji. {speak_no_evil_monkey_total} speak-no-evil monkey emojis used in total")
print(f"{hear_no_evil_monkey_tweet_count} tweets containing hear-no-evil monkey emoji. {hear_no_evil_monkey_total} hear-no-evil monkey emojis used in total")
print(f"{see_no_evil_monkey_tweet_count} tweets containing see-no-evil monkey emoji. {see_no_evil_monkey_total} see-no-evil monkey emojis used in total")
print(f"{gorilla_tweet_count} tweets containing gorilla emoji. {gorilla_total} gorilla emojis used in total")
print(f"{watermelon_tweet_count} tweets containing watermelon emoji. {watermelon_total} watermelon emojis used in total")
print(f"{total_emoji_tweets} tweets containing potentially racist emojis. {total_emoji_count} potentially racist emojis used in total")

In [None]:
player_offensive_tweet_map.sort_values('offensive_tweets_received', axis=0, ascending=False, inplace=True)
# offensive_tweets_all = player_offensive_tweet_map['offensive_tweets_received'].sum()
offensive_tweets_all = joined_df['contains_slurs_or_offensive_emoji'].sum()
offensive_tweets_top_10 = player_offensive_tweet_map['offensive_tweets_received'].head(10).sum()
top_10_proportion = offensive_tweets_top_10 / offensive_tweets_all
print(f"Top 10 proportion: {top_10_proportion * 100}%")

print(player_offensive_tweet_map[['username', 'offensive_tweets_received']].head(10))

Top 10 proportion: 92.8169014084507%
           username offensive_tweets_received
9    MarcusRashford                      1177
22     BukayoSaka87                       845
7             HKane                       498
14       Sanchooo10                       481
8         sterling7                       419
5      JackGrealish                       407
103   RomeluLukaku9                       234
13     OfficialTM_3                       224
4    HarryMaguire93                       173
81    AntoGriezmann                       155


# Visualisation - wordclouds of slurs

In [None]:
# Menthod for creating a wordcloud, and print out the most frequently used slurs
def create_slur_wordcloud(col_name):
    list_of_terms = [a for b in joined_df[col_name][joined_df[col_name] != "[]"].str.replace("'", "").str.replace("[", "").str.replace("]", "").str.split(", ") for a in b]
    wc = WordCloud(background_color="white", collocations=False).generate(" ".join(list_of_terms))
    plt.figure()
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.show()

    slur_df = pd.DataFrame(columns=["term", "count"])
    slur_df['term'] = list_of_terms
    slur_df['count'] = slur_df['count'].fillna(0)
    slur_df = slur_df.groupby(['term']).count()
    slur_df = slur_df.sort_values('count', axis=0, ascending=False)
    print(slur_df.head(20))

In [None]:
create_slur_wordcloud("matching_hatebase_terms_over_50")

In [None]:
create_slur_wordcloud("matching_hatebase_terms_ethnicity_over_50")

In [None]:
create_slur_wordcloud("matching_hatebase_terms_religion_over_50")

In [None]:
create_slur_wordcloud("matching_hatebase_terms_gender_over_50")

In [None]:
create_slur_wordcloud("matching_hatebase_terms_sexual_orientation_over_50")

In [None]:
create_slur_wordcloud("matching_hatebase_terms_class_over_50")

In [None]:
create_slur_wordcloud("matching_hatebase_terms_disability_over_50")

In [None]:
create_slur_wordcloud("matching_hatebase_terms_nationality_over_50")

# Visulisations - tweet frequency

In [21]:
def create_frequency_plot(df, y_values, title, nticks):
    fig = go.Figure()
    for y_value in y_values:
        fig.add_trace(go.Scatter(x=df['created_at'], y=df[y_value],
                      mode='lines',
                      name=y_value))
        
    fig.update_layout(
        xaxis_title="Date",
        yaxis_title="Number of tweets",
        xaxis = {
          'tickformat': '%d %B',
          'tickmode': 'auto',
          'nticks': nticks,
        },
        xaxis_tickformat = '%d %B',
        title= {
          'text': title,
          'y':0.9,
          'x':0.5,
          'xanchor': 'center',
          'yanchor': 'top'
        }
    )
    return fig

In [22]:
INTERVAL = pd.offsets.Minute(60)

In [25]:
# Plot all tweets frequency
df = joined_df.resample(INTERVAL, on='created_at')['tweet_id'].count().reset_index()
fig = create_frequency_plot(df, ['tweet_id'], "Frequency of tweets (aggregated over 60 minute intervals)", 28)
fig.write_html("/content/drive/MyDrive/University/Dissertation/images/figure1.html", include_plotlyjs='cdn')
fig.show()

In [None]:
# Plot Sterling, Rashford, Kane tweet frequency
df = joined_df.resample(INTERVAL, on='created_at')['MarcusRashford', 'sterling7', 'HKane'].sum().reset_index()
fig = create_frequency_plot(df, ['MarcusRashford', 'sterling7', 'HKane'], "Frequency of tweets (aggregated over 60 minute intervals)", 28)
fig.show()

In [None]:
# Plot German players
df = joined_df.resample(INTERVAL, on='created_at')[germany].sum().reset_index()
fig = create_frequency_plot(df, germany, "Frequency of tweets for German players (aggregated over 60 minute intervals)", 28)
fig.show()

# Kroos announced retirement on 2nd July, after they'd been knocked out by England

In [None]:
# Plot Netherlands players
df = joined_df.resample(INTERVAL, on='created_at')[netherlands].sum().reset_index()
fig = create_frequency_plot(df, netherlands, "Frequency of tweets for Dutch players (aggregated over 60 minute intervals)", 28)
fig.show()

In [None]:
# Plot England players
df = joined_df.resample(INTERVAL, on='created_at')[england].sum().reset_index()
fig = create_frequency_plot(df, england, "Frequency of tweets for England players (aggregated over 60 minute intervals)", 28)
fig.show()

In [None]:
# Plot Scotland players
df = joined_df.resample(INTERVAL, on='created_at')[scotland].sum().reset_index()
fig = create_frequency_plot(df, scotland, "Frequency of tweets for Scottish players (aggregated over 60 minute intervals)", 28)
fig.show()

# 25th June - Tierney signs new contract
# In days following 18th June - Gilmour has Covid
# 2nd July - Gilmour signs for Norwich

In [None]:
# Plot Belgian players
df = joined_df.resample(INTERVAL, on='created_at')[belgium].sum().reset_index()
fig = create_frequency_plot(df, belgium, "Frequency of tweets for Belgian players (aggregated over 60 minute intervals)", 28)
fig.show()

In [None]:
# Plot French players
df = joined_df.resample(INTERVAL, on='created_at')[france].sum().reset_index()
fig = create_frequency_plot(df, france, "Frequency of tweets for French players (aggregated over 60 minute intervals)", 28)
fig.show()

In [None]:
# Plot frequency of offensive tweets
df = joined_df.resample(INTERVAL, on='created_at')['contains_slurs_or_offensive_emoji'].sum().reset_index()
fig = create_frequency_plot(df, ['contains_slurs_or_offensive_emoji'], "Frequency of tweets containing HateBase slurs or emoji slurs (aggregated over 60 minute intervals)", 28)
fig.show()

In [None]:
# Plot frequency of offensive tweets for England players
df = joined_df.resample(INTERVAL, on='created_at')[[x + '_offensive' for x in england]].sum().reset_index()
fig = create_frequency_plot(df, [x + '_offensive' for x in england], "Frequency of tweets containing HateBase slurs or emoji slurs for England players (aggregated over 60 minute intervals)", 28)
fig.show()

In [None]:
# Plot frequency of offensive tweets for Scotland players
df = joined_df.resample(INTERVAL, on='created_at')[[x + '_offensive' for x in scotland]].sum().reset_index()
fig = create_frequency_plot(df, [x + '_offensive' for x in scotland], "Frequency of tweets containing HateBase slurs or emoji slurs for Scotland players (aggregated over 60 minute intervals)", 28)
fig.show()

In [None]:
# Plot frequency of offensive tweets for Belgium players
df = joined_df.resample(INTERVAL, on='created_at')[[x + '_offensive' for x in belgium]].sum().reset_index()
fig = create_frequency_plot(df, [x + '_offensive' for x in belgium], "Frequency of tweets containing HateBase slurs or emoji slurs for Belgium players (aggregated over 60 minute intervals)", 28)
fig.show()

In [None]:
# Plot frequency of offensive tweets for France players
df = joined_df.resample(INTERVAL, on='created_at')[[x + '_offensive' for x in france]].sum().reset_index()
fig = create_frequency_plot(df, [x + '_offensive' for x in france], "Frequency of tweets containing HateBase slurs or emoji slurs for France players (aggregated over 60 minute intervals)", 28)
fig.show()

In [None]:
# Plot frequency of offensive tweets for Netherlands players
df = joined_df.resample(INTERVAL, on='created_at')[[x + '_offensive' for x in netherlands]].sum().reset_index()
fig = create_frequency_plot(df, [x + '_offensive' for x in netherlands], "Frequency of tweets containing HateBase slurs or emoji slurs for Netherlands players (aggregated over 60 minute intervals)", 28)
fig.show()

In [None]:
# Plot frequency of offensive tweets for Germany players
df = joined_df.resample(INTERVAL, on='created_at')[[x + '_offensive' for x in germany]].sum().reset_index()
fig = create_frequency_plot(df, [x + '_offensive' for x in germany], "Frequency of tweets containing HateBase slurs or emoji slurs for Germany players (aggregated over 60 minute intervals)", 28)
fig.show()

# Visualisations - stack charts

In [26]:
player_offensive_tweet_map.sort_values('offensive_tweets_received', ascending=False, inplace=True)

def create_ethnicity_stack_chart(number):
    df = player_offensive_tweet_map.head(number)
    data0 = go.Bar(
        x = df.username,
        y = df.ethnicity_tweets_received,
        name = 'Ethnicity-related',
        text = df.ethnicity_tweets_received,
        textposition = 'inside',
        texttemplate = '%{text:.2}'
    )

    data1 = go.Bar(
        x = df.username,
        y = df.offensive_tweets_received - df.ethnicity_tweets_received,
        name = 'Other',
        text = df.offensive_tweets_received - df.ethnicity_tweets_received,
        textposition = 'inside',
        texttemplate = '%{text:.2}'
    )

    data = [data0, data1]

    layout = go.Layout(title = "Proportion of tweets containing abusive slurs where slurs related to ethnicity", barmode='stack')

    figure = go.Figure(data = data, layout = layout)
    figure.show()



In [27]:
create_ethnicity_stack_chart(20)

AttributeError: ignored

In [29]:
slurs_by_type.sort_values('total', ascending=False, inplace=True)

data = go.Bar(
    x = slurs_by_type['type'],
    y = slurs_by_type.total,
    text = slurs_by_type.total,
    textposition = 'outside',
    texttemplate = '%{text:.2}'
)

layout = go.Layout(
    title= {
          'text': "Tweets containing slurs by type",
          'y':0.9,
          'x':0.5,
          'xanchor': 'center',
          'yanchor': 'top'
        },
    yaxis_title="Number of tweets",
    )

figure = go.Figure(data = data, layout = layout)
figure.write_html("/content/drive/MyDrive/University/Dissertation/images/figure3.html", include_plotlyjs='cdn')
figure.show()