<a href="https://colab.research.google.com/github/csralvall/online_game_toxicity/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Import util functions

In [1]:
from IPython.display import clear_output

### Install dependencies

In [2]:
# install/upgrade the packages used by this notebook in the current runtime
!pip install -U pip setuptools wheel pandas spacy fasttext-langdetect wget
# hide the verbose pip log once the installation has finished
clear_output()

### Import libraries

In [3]:
from google.colab import drive
import pandas as pd
import spacy
import re
from spacy.tokens import Token
from spacy.language import Language
from ftlangdetect import detect
from joblib import Parallel, delayed

### Mount storage

In [4]:
# mount Google Drive at /content/drive so computationally expensive results
# can be saved and reloaded
drive.mount('/content/drive')

Mounted at /content/drive


### Load dataset from disk to memory

In [None]:
# read the chat messages CSV from Google Drive into a pandas DataFrame
df = pd.read_csv('/content/drive/MyDrive/nlp/dota2_chat_messages.csv')

### Annotate language for each row

- Some failed attempts:
> Using langdetect to detect the language of each chat message took at least 12 hours.

In [None]:
# This cell dates from when 'check_lang' used the langdetect library to detect
# the language. Langdetect was very slow and required workarounds to avoid
# losing data when Google Colab decided to shut down the instance.
# The following code splits the row range of the dataset into disjoint
# intervals; `step` sets the interval size (1/200th of the rows, rounded down).
# NOTE: `step` is 0 when the dataset has fewer than 200 rows, which makes the
# range() call that builds `ranges` raise ValueError.
step = df.shape[0]//200
print(f'step: {step}')
def get_stop(start, step, len):
    """Return the end bound of the interval that begins at ``start``.

    The bound is ``start + step - 1``, capped so it never exceeds ``len``.
    """
    return min(start + step - 1, len)
# list of (start, stop) pairs, one per interval of `step` rows starting at row 0
ranges = [(start, get_stop(start, step, df.shape[0])) for start in range(0, df.shape[0], step)]

In [None]:
# Process the DataFrame in chunks with langdetect, because it was very slow
# (~12hs to process the whole DataFrame), saving every chunk as soon as it is
# annotated.
# NOTE: `check_lang` is defined in a later cell; that cell must be run first.
pd.options.mode.chained_assignment = None
for (idx, (start, stop)) in enumerate(ranges):
    # `stop` is the inclusive end of the interval, while iloc excludes the end
    # of a slice: go up to stop + 1 so the last row of each chunk is not lost
    # (iloc clamps slices that run past the end of the DataFrame).
    sub_df = df.iloc[start:stop + 1].copy()
    # IMPORTANT: this joblib version has not been run; joblib was discovered
    # after doing the task with dask. From a few tests it seems to be about
    # 2 hours faster than dask, apparently because it has no graph overhead.
    languages = Parallel(n_jobs=8, verbose=11, backend='multiprocessing', prefer="processes")(
        delayed(check_lang)(text) for text in sub_df["text"])
    sub_df["language"] = languages
    # The first chunk (re)creates the file with its header; the following
    # chunks are appended, so finished chunks are no longer overwritten.
    sub_df.to_csv('/content/drive/MyDrive/nlp/dota2_chat_messages_lang.csv',
                  mode='w' if idx == 0 else 'a', header=(idx == 0), index=False)

--------------------------
#### Successful case:
> Using fasttext
--------------------------

In [None]:
# auxiliary function to detect the language used in a chat message;
# it uses a wrapper around the fastText model
def check_lang(text):
    """Return the language detected for ``text``.

    Args:
        text: chat message to classify.

    Returns:
        The value stored under the 'lang' key of the detector's result, or
        the sentinel string "nal" when the detector raises an exception.
    """
    try:
        lang = detect(text, low_memory=False)['lang']
    except Exception:
        # Best effort: a message the detector cannot handle gets the sentinel
        # instead of aborting the whole run. Only Exception is caught, so
        # KeyboardInterrupt and SystemExit still stop the job.
        lang = "nal"

    return lang

In [None]:
# create new column with the detected language
# use joblib's Parallel to parallelize the detection over 8 worker processes
languages = Parallel(n_jobs=8, verbose=11, backend='multiprocessing', prefer="processes")(
    delayed(check_lang)(df.loc[i, "text"]) for i in range(0, df.shape[0]))
df["language"] = languages
# persist the annotated DataFrame to Google Drive
df.to_csv(f'/content/drive/MyDrive/nlp/dota2_chat_messages_lang.csv', index=False)
clear_output()

#### Load dataset with annotated languages from storage

In [5]:
# load the DataFrame with the annotated languages from Google Drive
processed_lang = '/content/drive/MyDrive/nlp/dota2_chat_messages_lang.csv'
df_lang = pd.read_csv(processed_lang)

In [6]:
# replace missing values with empty strings
df_lang = df_lang.fillna("")

In [7]:
# keep only the chats whose detected language is English ("en"), as an
# independent copy with a fresh 0-based index
df_nlp = df_lang.loc[df_lang["language"] == "en", :].reset_index(drop=True).copy()

### Load bad word list from storage

In [8]:
# path of the downloaded bad word list (one entry per line)
word_list = "/content/drive/MyDrive/nlp/bad_words.txt"
# use a set for fast membership queries; the context manager closes the file
# handle as soon as the list has been read
with open(word_list, 'r') as word_file:
    bad_words = set(line.strip() for line in word_file)
# add new bad words
bad_words.update(['noob', 'noobs', 'stfu', 'fukign', 'fuking', 'fukin', 'nooob'])
# dictionary with every bad word as key and 0 as initial value
bad_dict = dict.fromkeys(bad_words, 0)

### Cleaner function:

In [9]:
# keep only the relevant pieces of each message so spaCy works on clean text
def cleaner(df):
    """Add a 'clean' column holding the regex-extracted text of 'text'.

    Every run of 3 to 300 characters made of letters, digits, hyphens or
    exclamation marks is extracted from 'text'; the runs are joined with a
    single space. The DataFrame is modified in place and also returned.
    """
    token_regex = re.compile(r"[!A-Za-z0-9\-]{3,300}")
    matches = df['text'].str.findall(token_regex)
    df['clean'] = matches.str.join(' ')
    return df

In [10]:
# add the 'clean' column with the regex-extracted text of every chat message
df_nlp = cleaner(df_nlp)

### Clean strings and extract features

In [11]:
# download the small spaCy model for the English language
!python -m spacy download en_core_web_sm
# hide the download log
clear_output()

In [12]:
def _token_is_exclamation(token):
    """Getter of the `is_exclamation` token extension: True when the lemma is '!'."""
    return token.lemma_ == '!'


# Register the extension once (force=True refreshes it when the cell is
# re-run). The getter is stateless: it no longer closes over a per-document
# dictionary, so it is valid for tokens of any document.
Token.set_extension("is_exclamation", getter=_token_is_exclamation, force=True)


@Language.component("exclamation_flag")
def is_exclamation(doc):
    '''
        custom component that makes the `is_exclamation` token flag available;
        `token._.is_exclamation` is True when the token's lemma is '!'.
        Returns the doc unchanged.
    '''
    # Safety net for processes in which the registration above has not run.
    if not Token.has_extension("is_exclamation"):
        Token.set_extension("is_exclamation", getter=_token_is_exclamation)
    return doc

In [13]:
# load the small English spaCy model with the listed pipeline components
# disabled (check nlp.pipe_names to see which components remain active)
nlp = spacy.load('en_core_web_sm', disable=['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'ner'])

# add the custom component at the end of the pipeline under the name "exclamation"
nlp.add_pipe("exclamation_flag", name="exclamation", last=True)

def process(doc):
    """Build a normalized, space-separated string from the tokens of ``doc``.

    A token is kept when it is not a stop word, is alphabetic and has at
    least 3 characters. Kept tokens are lowercased, stripped and cleared of
    every character other than a-z, 0-9 and whitespace.
    """
    unwanted = r'[^a-z0-9\s]'
    kept = [
        re.sub(unwanted, '', tok.text.lower().strip())
        for tok in doc
        if not tok.is_stop and tok.is_alpha and len(tok) >= 3
    ]
    return " ".join(kept)

def get_bad_words_score(doc):
    """Count the tokens of ``doc`` that appear in the ``bad_words`` set.

    A token counts when its text is in the set either as written or
    lowercased, so bad words typed in capitals (e.g. "NOOB") are scored too.

    Args:
        doc: iterable of tokens exposing a ``text`` attribute.

    Returns:
        int: number of tokens recognized as bad words.
    """
    return sum(
        1
        for token in doc
        if token.text in bad_words or token.text.lower() in bad_words
    )

def get_intensity_score(doc):
    """Return the intensity score of ``doc``.

    Every token adds one point when it is uppercase and one more point when
    its ``is_exclamation`` extension flag is set.
    """
    return sum(
        bool(tok.is_upper) + bool(tok._.is_exclamation)
        for tok in doc
    )

# utility functions to parallelize the data processing
def chunker(iterable, total_length, chunksize):
    """Return a generator of consecutive slices of ``iterable``.

    Slices start at 0, chunksize, 2 * chunksize, ... below ``total_length``
    and span at most ``chunksize`` items each.
    """
    starts = range(0, total_length, chunksize)
    return (iterable[begin: begin + chunksize] for begin in starts)

def flatten(list_of_lists):
    """Concatenate the items of every sublist into one flat list."""
    combined = []
    for sublist in list_of_lists:
        for item in sublist:
            combined.append(item)
    return combined

def process_chunk(texts, function=process):
    """Run ``texts`` through the spaCy pipeline and apply ``function`` to each doc.

    Returns the list of results, one per text, in input order.
    """
    return [function(doc) for doc in nlp.pipe(texts, batch_size=100)]

def preprocess_parallel(texts, processor=process_chunk, chunksize=100, n_jobs=8):
    """Apply ``processor`` to ``texts`` in parallel, chunk by chunk.

    Args:
        texts: sliceable sequence of texts with a length (e.g. a pandas Series).
        processor: function that receives one chunk of texts and returns a
            list with one result per text.
        chunksize: number of texts handed to each task.
        n_jobs: number of worker processes.

    Returns:
        list: the per-text results of every chunk, flattened in input order.
    """
    executor = Parallel(n_jobs=n_jobs, backend='multiprocessing', prefer="processes")
    do = delayed(processor)
    # Size the chunks from `texts` itself rather than from the global df_nlp,
    # so the function is correct for any input passed to it.
    tasks = (do(chunk) for chunk in chunker(texts, len(texts), chunksize=chunksize))
    return flatten(executor(tasks))

In [14]:
def wrapper_intensity(text):
    """Score a chunk of texts with ``get_intensity_score`` through ``process_chunk``."""
    scorer = get_intensity_score
    return process_chunk(text, function=scorer)

def wrapper_bad_words(text):
    """Score a chunk of texts with ``get_bad_words_score`` through ``process_chunk``."""
    scorer = get_bad_words_score
    return process_chunk(text, function=scorer)

In [15]:
# create new column in DataFrame with the normalized, space-joined tokens of
# each chat message
df_nlp['tokens'] = preprocess_parallel(df_nlp['clean'], chunksize=1000)
clear_output()

In [16]:
# create new column in DataFrame with the intensity score
# (uppercase tokens plus exclamation marks per message)
df_nlp['intensity'] = preprocess_parallel(df_nlp['clean'], processor=wrapper_intensity, chunksize=1000)
clear_output()

In [17]:
# create new column in DataFrame with the toxicity score
# (number of bad-word tokens per message)
df_nlp['toxicity'] = preprocess_parallel(df_nlp['clean'], processor=wrapper_bad_words, chunksize=1000)
clear_output()

In [18]:
# save the annotated DataFrame to Google Drive
df_nlp.to_csv(f'/content/drive/MyDrive/nlp/dota2_chat_eng_annotated.csv', index=False)