<a href="https://colab.research.google.com/github/dbckz/dissertation/blob/master/notebooks/create_hatebase_regression_file.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
import pandas as pd
import ast
import os
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from tqdm import tqdm
from google.colab import drive
import plotly.graph_objects as go

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Set up paths
root_path = "/content/drive/MyDrive/University/Dissertation/data_collection"
graph_path = root_path + "/graphs"

day_paths = day_paths = [
        "/01",
        "/02",
        "/03",
        "/04",
        "/05",
        "/06",
        "/07",
        "/08",
        "/09",
        "/10",
        "/11",
        "/12",
        "/13",
        "/14",
        "/15",
        "/16",
        "/17",
        "/18",
        "/19",
        "/20",
        "/21",
        "/22",
        "/23",
        "/24",
        "/25",
        "/26",
        "/27",
        "/28",
        "/29",
        "/30",
        "/31",
        "/32",
        "/33",
        "/34",
        "/35",
        "/36"
    ]

In [4]:
# Create directory to store visualisations
try:
    os.mkdir(graph_path)
except OSError as error:
    print(error)

[Errno 17] File exists: '/content/drive/MyDrive/University/Dissertation/data_collection/graphs'


In [5]:
# Load data
threshold = 90

in_tweets = pd.DataFrame()
hb_guard = pd.DataFrame()
for path in day_paths:
    directory = root_path + path
    tweets_csv = directory + "/tweets.csv"
    matched_terms_csv = directory + "/hatebase_processed_tweets.csv"

    print(f"Loading CSVs for directory {path}...")
    in_tweets = pd.concat([in_tweets, 
                           pd.read_csv(tweets_csv,
                                       usecols = [
                                                  'created_at',
                                                  'tweet_id',
                                                  'tweet_text',
                                                  'accounts_mentioned'
                                       ],
                                       dtype = {
                                          # 'created_at':
                                          'tweet_id': np.int64,
                                          'tweet_text': str,
                                          'accounts_mentioned': object
                                       },
                                       parse_dates=['created_at'])])

    hb_guard = pd.concat([hb_guard, pd.read_csv(matched_terms_csv,
                                                usecols = [
                                                           'tweet_id',
                                                           f'matching_hatebase_terms_over_{threshold}'
                                                ],
                                                dtype = {
                                                    'tweet_id': np.int64,
                                                    f'matching_hatebase_terms_over_{threshold}': str
                                                })])

# Dedup
original_tweets_length = len(in_tweets)
original_hatebase_length = len(hb_guard)
in_tweets.drop_duplicates(subset=['tweet_id'], inplace=True)
hb_guard.drop_duplicates(subset=['tweet_id'], inplace=True)
print(f"Size of tweets dataframe: {len(in_tweets)}, having dropped {original_tweets_length - len(in_tweets)} duplicate rows")
print(f"Size of hatebase dataframe: {len(hb_guard)}, having dropped {original_hatebase_length - len(hb_guard)} duplicate rows")


Loading CSVs for directory /01...
Loading CSVs for directory /02...
Loading CSVs for directory /03...
Loading CSVs for directory /04...
Loading CSVs for directory /05...
Loading CSVs for directory /06...
Loading CSVs for directory /07...
Loading CSVs for directory /08...
Loading CSVs for directory /09...
Loading CSVs for directory /10...
Loading CSVs for directory /11...
Loading CSVs for directory /12...
Loading CSVs for directory /13...
Loading CSVs for directory /14...
Loading CSVs for directory /15...
Loading CSVs for directory /16...
Loading CSVs for directory /17...
Loading CSVs for directory /18...
Loading CSVs for directory /19...
Loading CSVs for directory /20...
Loading CSVs for directory /21...
Loading CSVs for directory /22...
Loading CSVs for directory /23...
Loading CSVs for directory /24...
Loading CSVs for directory /25...
Loading CSVs for directory /26...
Loading CSVs for directory /27...
Loading CSVs for directory /28...
Loading CSVs for directory /29...
Loading CSVs f

In [6]:
# Up the pandas display limits so printed dataframes aren't so truncated
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_info_rows', 100)
pd.set_option('display.max_info_columns', 100)

# Data manipulation

In [7]:
# Join tables + drop old ones!
joined_df = pd.merge(in_tweets, hb_guard, how='outer', on='tweet_id')
del hb_guard
del in_tweets

In [8]:
joined_df = joined_df[(joined_df['created_at'] > '2021-06-19 08:10:18+00:00') & (joined_df['created_at'] < '2021-07-17 00:00:00+00:00')]
len(joined_df)

1274885

In [9]:
# Create a column indicating whether tweet contains slurs (hacky > 2 as empty list is stored as string "[]")
joined_df['contains_slurs'] = joined_df[f'matching_hatebase_terms_over_{threshold}'].str.len() > 2


In [10]:
# Extract players
england = ["JPickford1", "kylewalker2", "LukeShaw23", "_DeclanRice", "HarryMaguire93", "JackGrealish",
                    "JHenderson", "HKane", "sterling7", "MarcusRashford", "trippier2", "deanhenderson",
                    "Kalvinphillips", "OfficialTM_3", "Sanchooo10", "CalvertLewin14", "masonmount_10", "PhilFoden",
                    "BenChilwell", "ben6white", "samjohnstone50", "reecejames_24", "BukayoSaka87", "BellinghamJude"]

netherlands = ["joel_veltman", "mdeligt_04", "NathanAke", "Stefandevrij", "GWijnaldum", "LuukdeJong9", "Memphis", "QPromes", "pvanaanholt", "TimKrul", "DavyKlaassen", "Dirono", "RGravenberch", "BlindDaley", "DeJongFrenkie21", "DenzelJMD2"]

germany = ["Manuel_Neuer", "ToniRuediger", "MatzeGinter", "matshummels", "kaihavertz29", "ToniKroos", "KeVolland", "SergeGnabry", "Bernd_Leno", "JamalMusiala", "lukaskl96", "leongoretzka_", "leroy_sane", "IlkayGuendogan", "emrecan_", "RobinKoch25", "esmuellert_"]

scotland = ["MarshallDavid23", "sodonnell15", "andrewrobertso5", "mctominay10", "granthanley5", "kierantierney1", "jmcginn7", "Callummcgregor8", "Lyndon_Dykes", "CheAdams_", "CraigGordon01", "declang31", "LiamCooper__", "10DavidTurnbull", "kevinnisbet16", "np4tterson", "billygilmourrr", "Jack_Hendry2", "Scottmckenna3"]

france = ["BenPavard28", "kimpembe_3", "raphaelvarane", "clement_lenglet", "paulpogba", "AntoGriezmann", "_OlivierGiroud_", "KMbappe", "CorentinTolisso", "nglkante", "KurtZouma", "SteveMandanda", "MoussaSissoko", "LucasDigne", "Benzema", "LucasHernandez", "WissBenYedder", "mmseize", "leodubois15", "jkeey4", "MarcusThuram"]

belgium = ["thibautcourtois", "AlderweireldTob", "thomasvermaelen", "JanVertonghen", "axelwitsel28", "DeBruyneKev", "RomeluLukaku9", "hazardeden10", "CarrascoY21", "SMignolet", "dries_mertens14", "ThomMills", "HazardThorgan8", "VanakenHans", "Jasondenayer", "chrisbenteke", "NChadli", "mbatshuayi", "LTrossard", "JeremyDoku", "dennispraet"]

list_of_players = england + netherlands + germany + scotland + france + belgium

for player in list_of_players:
    print(f"Extracting {player}...")
    joined_df[player] = joined_df['accounts_mentioned'].str.contains(f"'username': '{player}'").astype(np.bool)

# player_tweet_map = pd.DataFrame(columns=["username", "tweets_received"])

# i = 0
# for player in list_of_players:
#     tweets = joined_df[player].sum()
#     player_tweet_map.loc[i] = player, tweets
#     i += 1

# player_tweet_map.sort_values('tweets_received', axis=0, ascending=False, inplace=True)



Extracting JPickford1...
Extracting kylewalker2...
Extracting LukeShaw23...
Extracting _DeclanRice...
Extracting HarryMaguire93...
Extracting JackGrealish...
Extracting JHenderson...
Extracting HKane...
Extracting sterling7...
Extracting MarcusRashford...
Extracting trippier2...
Extracting deanhenderson...
Extracting Kalvinphillips...
Extracting OfficialTM_3...
Extracting Sanchooo10...
Extracting CalvertLewin14...
Extracting masonmount_10...
Extracting PhilFoden...
Extracting BenChilwell...
Extracting ben6white...
Extracting samjohnstone50...
Extracting reecejames_24...
Extracting BukayoSaka87...
Extracting BellinghamJude...
Extracting joel_veltman...
Extracting mdeligt_04...
Extracting NathanAke...
Extracting Stefandevrij...
Extracting GWijnaldum...
Extracting LuukdeJong9...
Extracting Memphis...
Extracting QPromes...
Extracting pvanaanholt...
Extracting TimKrul...
Extracting DavyKlaassen...
Extracting Dirono...
Extracting RGravenberch...
Extracting BlindDaley...
Extracting DeJongFren

In [11]:
# Sort by ascending date
joined_df.sort_values('created_at', axis=0, inplace=True)
# joined_df['created_at'] = pd.to_datetime(joined_df['created_at'])

In [12]:
joined_df = joined_df[
    (joined_df["JPickford1"]) |
    (joined_df["kylewalker2"]) |
    (joined_df["LukeShaw23"]) |
    (joined_df["kylewalker2"]) |
    (joined_df["_DeclanRice"]) |
    (joined_df["HarryMaguire93"]) |
    (joined_df["JackGrealish"]) |
    (joined_df["JHenderson"]) |
    (joined_df["HKane"]) |
    (joined_df["sterling7"]) |
    (joined_df["MarcusRashford"]) |
    (joined_df["trippier2"]) |
    (joined_df["deanhenderson"]) |
    (joined_df["Kalvinphillips"]) |
    (joined_df["OfficialTM_3"]) |
    (joined_df["Sanchooo10"]) |
    (joined_df["CalvertLewin14"]) |
    (joined_df["masonmount_10"]) |
    (joined_df["PhilFoden"]) |
    (joined_df["BenChilwell"]) |
    (joined_df["ben6white"]) |
    (joined_df["samjohnstone50"]) |
    (joined_df["reecejames_24"]) |
    (joined_df["BukayoSaka87"]) |
    (joined_df["BellinghamJude"]) |
    (joined_df["joel_veltman"]) |
    (joined_df["mdeligt_04"]) |
    (joined_df["LukeShaw23"]) |
    (joined_df["NathanAke"]) |
    (joined_df["GWijnaldum"]) |
    (joined_df["LuukdeJong9"]) |
    (joined_df["Memphis"]) |
    (joined_df["QPromes"]) |
    (joined_df["pvanaanholt"]) |
    (joined_df["TimKrul"]) |
    (joined_df["DavyKlaassen"]) |
    (joined_df["Dirono"]) |
    (joined_df["RGravenberch"]) |
    (joined_df["BlindDaley"]) |
    (joined_df["DeJongFrenkie21"]) |
    (joined_df["DenzelJMD2"]) |
    (joined_df["Manuel_Neuer"]) |
    (joined_df["ToniRuediger"]) |
    (joined_df["MatzeGinter"]) |
    (joined_df["matshummels"]) |
    (joined_df["kaihavertz29"]) |
    (joined_df["ToniKroos"]) |
    (joined_df["KeVolland"]) |
    (joined_df["SergeGnabry"]) |
    (joined_df["Bernd_Leno"]) |
    (joined_df["JamalMusiala"]) |
    (joined_df["lukaskl96"]) |
    (joined_df["leongoretzka_"]) |
    (joined_df["leroy_sane"]) |
    (joined_df["IlkayGuendogan"]) |
    (joined_df["emrecan_"]) |
    (joined_df["RobinKoch25"]) |
    (joined_df["esmuellert_"]) |
    (joined_df["MarshallDavid23"]) |
    (joined_df["sodonnell15"]) |
    (joined_df["andrewrobertso5"]) |
    (joined_df["mctominay10"]) |
    (joined_df["granthanley5"]) |
    (joined_df["kierantierney1"]) |
    (joined_df["jmcginn7"]) |
    (joined_df["Callummcgregor8"]) |
    (joined_df["Lyndon_Dykes"]) |
    (joined_df["CheAdams_"]) |
    (joined_df["CraigGordon01"]) |
    (joined_df["declang31"]) |
    (joined_df["LiamCooper__"]) |
    (joined_df["10DavidTurnbull"]) |
    (joined_df["kevinnisbet16"]) |
    (joined_df["np4tterson"]) |
    (joined_df["billygilmourrr"]) |
    (joined_df["Jack_Hendry2"]) |
    (joined_df["Scottmckenna3"]) |
    (joined_df["BenPavard28"]) |
    (joined_df["kimpembe_3"]) |
    (joined_df["raphaelvarane"]) |
    (joined_df["clement_lenglet"]) |
    (joined_df["paulpogba"]) |
    (joined_df["AntoGriezmann"]) |
    (joined_df["_OlivierGiroud_"]) |
    (joined_df["KMbappe"]) |
    (joined_df["CorentinTolisso"]) |
    (joined_df["nglkante"]) |
    (joined_df["KurtZouma"]) |
    (joined_df["SteveMandanda"]) |
    (joined_df["MoussaSissoko"]) |
    (joined_df["LucasDigne"]) |
    (joined_df["Benzema"]) |
    (joined_df["LucasHernandez"]) |
    (joined_df["WissBenYedder"]) |
    (joined_df["mmseize"]) |
    (joined_df["leodubois15"]) |
    (joined_df["jkeey4"]) |
    (joined_df["ben6white"]) |
    (joined_df["MarcusThuram"]) |
    (joined_df["thibautcourtois"]) |
    (joined_df["AlderweireldTob"]) |
    (joined_df["thomasvermaelen"]) |
    (joined_df["JanVertonghen"]) |
    (joined_df["axelwitsel28"]) |
    (joined_df["DeBruyneKev"]) |
    (joined_df["RomeluLukaku9"]) |
    (joined_df["hazardeden10"]) |
    (joined_df["CarrascoY21"]) |
    (joined_df["SMignolet"]) |
    (joined_df["dries_mertens14"]) |
    (joined_df["ThomMills"]) |
    (joined_df["HazardThorgan8"]) |
    (joined_df["VanakenHans"]) |
    (joined_df["Jasondenayer"]) |
    (joined_df["chrisbenteke"]) |
    (joined_df["NChadli"]) |
    (joined_df["mbatshuayi"]) |
    (joined_df["LTrossard"]) |
    (joined_df["JeremyDoku"]) |
    (joined_df["dennispraet"])
    ]

In [13]:
len(joined_df)

1046319

In [14]:
# Maybe we don't need these cols and we can just calculate ad-hoc?
for player in list_of_players:
    joined_df[f'{player}_offensive'] = joined_df['contains_slurs'] & joined_df[player]

In [15]:
tweets_regression_file = root_path + "/regression_tweets_hb.csv"
joined_df.to_csv(tweets_regression_file, index=False)