<a href="https://colab.research.google.com/github/csralvall/online_game_toxicity/blob/main/nlp_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prelude
- Install dependencies.
- Import libraries.

In [1]:
from IPython.display import clear_output

In [2]:
# install dependencies
!pip install -U kaggle pip setuptools wheel pandas sklearn numpy spacy nltk gensim fasttext-langdetect wget tqdm mr4mp
clear_output()

In [4]:
# import libraries
from google.colab import drive, files
import os
import io
import pandas as pd
import spacy
import numpy as np
import pickle
import mr4mp
import re
from spacy.tokens import Token
from spacy.language import Language
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans
from ftlangdetect import detect
from joblib import Parallel, delayed
from google.colab import files
from functools import reduce
from timeit import default_timer
from collections import Counter
from gensim.models import Word2Vec

- Mount storage.
- Load API keys.
- Download datasets and store them locally.

In [5]:
# mount google drive unit to save computationally expensive results
drive.mount('/content/drive')

Mounted at /content/drive


#### - Only run the following to recreate project from zero

In [None]:
uploaded = files.upload()

# load Kaggle API keys
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

In [None]:
# then move kaggle.json into the folder where the API expects to find it
# if it is not already present
![[ ! -d "~/.kaggle" ]] && mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

In [None]:
# download dataset if not saved in drive
![[ ! -f "/content/gosuai-dota-2-game-chats.zip" ]] && kaggle datasets download -d romovpa/gosuai-dota-2-game-chats

In [None]:
# unzip file if it is not already inflated
![[ ! -f "/content/dota2_chat_messages.csv" ]] && unzip gosuai-dota-2-game-chats.zip

In [None]:
# make drive directory if it is not already created
%env drive_dir=/content/drive/MyDrive/nlp
![[ ! -d $drive_dir ]] && mkdir -p $drive_dir

In [None]:
# move downloaded dataset to drive folder if it is not already there
%env drive_file=/content/drive/MyDrive/nlp/dota2_chat_messages.csv
![[ ! -f $drive_file ]] && mv dota2_chat_messages.csv $drive_file

In [None]:
# check google drive folder status
!ls /content/drive/MyDrive/nlp

# Preprocessing

In [None]:
# read dataset and create Pandas DataFrame for it
df = pd.read_csv('/content/drive/MyDrive/nlp/dota2_chat_messages.csv')

### Anotate language for each row

In [None]:
# create auxiliary function to detect language used in chat message
# this function uses a wrapper around the Fasttext model
def check_lang(text):
    """Will return the language corresponding to the
    input text"""
    try:
        lang = detect(text, low_memory=False)['lang']
    except:
        lang = "nal"

    return lang

In [None]:
# create new column with the detected language
# use joblib Parallel function to paralelize detection
languages = Parallel(n_jobs=8, verbose=11, backend='multiprocessing', prefer="processes")(
    delayed(check_lang)(df.loc[i, "text"]) for i in range(0, df.shape[0]))
df["language"] = languages
df.to_csv(f'/content/drive/MyDrive/nlp/dota2_chat_messages_lang.csv', index=False)
clear_output()

In [6]:
# get processed dataframe with languages anotated
processed_lang = '/content/drive/MyDrive/nlp/dota2_chat_messages_lang.csv'
df_lang = pd.read_csv(processed_lang)

In [7]:
# fill null values
df_lang = df_lang.fillna("")

In [8]:
# take only english chats
df_nlp = df_lang.loc[df_lang["language"] == "en", :].reset_index(drop=True).copy()

### Download list of bad words in english (lexicon)

In [None]:
# get a bad-word list
![[ ! -f "/content/drive/MyDrive/nlp/bad_words.txt" ]] && wget -O bad_words.txt https://www.cs.cmu.edu/~biglou/resources/bad-words.txt
# copy file to drive
![[ ! -f "/content/drive/MyDrive/nlp/bad_words.txt" ]] && mv /content/bad_words.txt /content/drive/MyDrive/nlp/

### Get bad word list from memory

In [None]:
# get downloaded bad word list
word_list = "/content/drive/MyDrive/nlp/bad_words.txt"
# use set for fast queries
bad_words = set(line.strip() for line in open(word_list, 'r'))
# add new bad words
bad_words.update(['noob', 'noobs', 'stfu', 'fukign', 'fuking', 'fukin', 'nooob'])
bad_dict = dict.fromkeys(bad_words, 0)

### Cleaner function:

In [None]:
# clean text from non alphanumeric text to use spacy over clean text
def cleaner(df):
    "Extract relevant text from DataFrame using a regex"
    # regex pattern for only alphanumeric, hyphenated text with 3 or more chars
    pattern = re.compile(r"[!A-Za-z0-9\-]{3,300}")
    df['clean'] = df['text'].str.findall(pattern).str.join(' ')
    return df

In [None]:
df_nlp = cleaner(df_nlp)

### Clean strings and extract features

- Some failed attempts:
> The usage of langdetect to detect the language of each chat required at least 12hs.

In [None]:
# This cell was created when 'check_lang' used the library langdetect to detect
# the language. Langdetect was very slow and required workarounds to avoid the
# lose of data when google colab decided to shutdown the instance.
# The following code creates a fixed number of disjoint intervals in the range of
# the dataset, with a preset step to adjust the interval size.
step = df.shape[0]//200
print(f'step: {step}')
def get_stop(start, step, len):
    stop = start + step - 1
    if stop > len:
        stop = len
    return stop
ranges = [(start, get_stop(start, step, df.shape[0])) for start in range(0, df.shape[0], step)]

In [None]:
# Code cell created to process DataFrame in chunks with langdetect
# because it was very slow (~12hs to process whole DataFrame)
pd.options.mode.chained_assignment = None
for (idx, (start, stop)) in enumerate(ranges):
    sub_df = df.iloc[start:stop]
    # IMPORTANT: haven't run this but discovered about joblib after doing the task
    # with dask. Apparently is faster than dask since there is no graph overhead as in dask
    # so from a few tests it seems that it might be faster than dask for about 2 hours.
    languages = Parallel(n_jobs=8, verbose=11, backend='multiprocessing', prefer="processes")(
        delayed(check_lang)(sub_df.loc[i, "text"]) for i in range(start, stop))
    sub_df["language"] = languages
    sub_df.to_csv(f'/content/drive/MyDrive/nlp/dota2_chat_messages_lang.csv', index=False)

In [None]:
# download spacy model for english language
!python -m spacy download en_core_web_sm
clear_output()

In [None]:
@Language.component("exclamation_flag")
def is_exclamation(doc):
    '''
        custom component to set flag if token is exclamation sign
    '''
    exclamation_signs = {token.lemma_: (token.lemma_ == '!') for token in doc}
    is_exclamation = lambda x: exclamation_signs[x.lemma_]
    Token.set_extension("is_exclamation", getter = is_exclamation, force=True)
    return doc

In [None]:
# removing stop words and unused tokens
# use only lemmatizer to get 
nlp = spacy.load('en_core_web_sm', disable=['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'ner'])

# add custom component to pipeline
nlp.add_pipe("exclamation_flag", name="exclamation", last=True)

def tokenize(doc):
    tokens = list()

    for token in doc:
        if not token.is_stop and token.is_alpha and len(token) >= 3:
            tke = token.text.lower().strip()
            tke = re.sub(r'[^a-z0-9\s]', '', tke)
            tokens.append(tke)

    return " ".join(tokens)

def get_bad_words_score(doc):
    bad_words_score = 0
    for token in doc:
        if token.text in bad_words:
            bad_words_score += 1
    
    return bad_words_score

def get_intensity_score(doc):
    intensity_score = 0
    for token in doc:
        if token.is_upper:
            intensity_score += 1
        if token._.is_exclamation:
            intensity_score += 1

    return intensity_score

# utility functions to paralellize dataprocessing
def chunker(iterable, total_length, chunksize):
    return (iterable[pos: pos + chunksize] for pos in range(0, total_length, chunksize))

def flatten(list_of_lists):
    "Flatten a list of lists to a combined list"
    return [item for sublist in list_of_lists for item in sublist]

def process_chunk(texts, function=tokenize):
    preproc_pipe = []
    for doc in nlp.pipe(texts, batch_size=100):
        preproc_pipe.append(function(doc))
    return preproc_pipe

def preprocess_parallel(texts, processor=process_chunk, chunksize=100):
    executor = Parallel(n_jobs=8, backend='multiprocessing', prefer="processes")
    do = delayed(processor)
    tasks = (do(chunk) for chunk in chunker(texts, len(df_nlp), chunksize=chunksize))
    result = executor(tasks)
    return flatten(result)

In [None]:
def wrapper_intensity(text):
    return process_chunk(text, get_intensity_score)

def wrapper_bad_words(text):
    return process_chunk(text, get_bad_words_score)

In [None]:
# create new column in DataFrame with tokenized words from chat
df_nlp['tokens'] = preprocess_parallel(df_nlp['clean'], chunksize=1000)



In [None]:
# create new column in DataFrame with the intensity score
df_nlp['intensity'] = preprocess_parallel(df_nlp['clean'], processor=wrapper_intensity, chunksize=1000)



In [None]:
# create new column in DataFrame with the toxicity score
df_nlp['toxicity'] = preprocess_parallel(df_nlp['clean'], processor=wrapper_bad_words, chunksize=1000)



In [None]:
# save anotated DataFrame in memory
df_nlp.to_csv(f'/content/drive/MyDrive/nlp/dota2_chat_eng_annotated.csv', index=False)

In [9]:
# english chats from original dataset with anotations
eng_annotated = '/content/drive/MyDrive/nlp/dota2_chat_eng_annotated.csv'
df_eng = pd.read_csv(eng_annotated)
df_test = df_eng[:10000]

In [10]:
df_eng.head()

Unnamed: 0,match,time,slot,text,language,clean,tokens,intensity,toxicity
0,0,1808.40822,9,100%,en,100,,0,0
1,1,-131.14018,0,twitch.tv/rage_channel,en,twitch rage channel,twitch rage channel,0,0
2,1,-121.60481,0,https://www.twitch.tv/rage_channel,en,https www twitch rage channel,https www twitch rage channel,0,0
3,1,700.72893,0,https://www.twitch.tv/rage_channel,en,https www twitch rage channel,https www twitch rage channel,0,0
4,1,702.99503,0,https://www.twitch.tv/rage_channel,en,https www twitch rage channel,https www twitch rage channel,0,0


In [11]:
df_eng.tail()

Unnamed: 0,match,time,slot,text,language,clean,tokens,intensity,toxicity
6921683,999998,917.21927,8,damn you!!!!,en,damn you!!!!,damn,4,1
6921684,999998,1709.49237,6,baited,en,baited,baited,0,0
6921685,999998,1765.54537,7,lmao,en,lmao,lmao,0,0
6921686,999999,974.04976,0,sec please,en,sec please,sec,0,0
6921687,999999,2674.38856,3,ggwp lol,en,ggwp lol,ggwp lol,0,0


### Create Bag of Words (BOW)

In [None]:
def flatten(t):
    return [item for sublist in t for item in sublist]

In [None]:
# from cleaned english chats get all of them without nan values
chats = df_test[['tokens']].dropna().astype(str).values

In [None]:
chats = flatten(chats)

In [None]:
# function to transform chats in sets of words
def chat_to_set(chat: [str]) -> {str}:
    return set(chat.split())

# function to join all chat sets in one big set
def join_chat_sets(chat: {str},bag: {str}) -> {str}:
    return bag.union(chat)

In [None]:
# use map reduce model to create the Bag of Words (BOW)
start = default_timer()
pool = mr4mp.pool(10) # roughly 1hs with gpu with full eng dataset
set_of_words = pool.mapreduce(chat_to_set, join_chat_sets, chats)
pool.close()
bag_of_words = dict.fromkeys(set_of_words, 0)
print("Finished in " + str(default_timer()-start) + "s using " + str(len(pool)) + " process(es).")

Finished in 0.45472574099994745s using 10 process(es).


In [None]:
# save bag of words in drive (very expensive to compute)
# use when running code with full dataset
with open('/content/drive/MyDrive/nlp/bag_of_words.pkl', 'wb') as dict_file:
    pickle.dump(bag_of_words, dict_file)
    dict_file.close()

# Some experiments with bigrams

In [None]:
from nltk import word_tokenize 
from nltk.util import ngrams
import nltk
nltk.download('punkt')

bigrams = []
trigrams = []
for line in chats:
    token = line.split()
    bigrams.append(list(map(lambda x: '_'.join(x), ngrams(token, 2))))
    trigrams.append(list(map(lambda x: '_'.join(x), ngrams(token, 3))))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
len(trigrams)

8157

In [None]:
from gensim.models.phrases import Phrases, Phraser
def build_phrases(sentences):
    phrases = Phrases(sentences,
                      min_count=5,
                      threshold=7,
                      progress_per=1000)
    return Phraser(phrases)

bigrams = build_phrases(chats)

In [None]:
bigrams = list(map(lambda x: [x], bigrams[chats]))

In [None]:
w2v_bi = generate_embedding(bigrams)

In [None]:
w2v_bi.wv.most_similar('report')

[('nice', 0.2529045641422272),
 ('carry', 0.20082908868789673),
 ('ggwp', 0.17018888890743256),
 ('team', 0.15016482770442963),
 ('mid', 0.13887985050678253),
 ('feed', 0.10852647572755814),
 ('guys', 0.09936434030532837),
 ('commend', 0.03476494550704956),
 ('def', 0.0330718494951725),
 ('win', 0.019886532798409462)]

# Generate word embeddings

In [None]:
# function to create embeddings of words in each chat
def generate_embedding(sentences: [[str]]) -> ([str], np.ndarray):
  w2v_model = Word2Vec(
                     min_count=20,
                     window=2,
                     #size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=1)

  w2v_model.build_vocab(sentences, progress_per=10000)

  w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

  return w2v_model

In [None]:
vocab = list(map(lambda x: x.split(), chats))

In [None]:
w2v_model = generate_embedding(vocab)

# Create clusters

In [None]:
bow_serie = []
for idx, chat in enumerate(chats):
    bow = dict.fromkeys(set_of_words, 0)
    bad_dict = dict.fromkeys(bad_words, 0)
    word_list = chat.split()
    prod = np.ones(100)
    for word in word_list:
        if word in w2v_model.wv:
            prod *= w2v_model.wv[word] 
        if word in bad_dict:
            bad_dict[word] += 1
        if word in bow:
            bow[word] += 1
    bad_array = np.fromiter(bad_dict.values(), dtype=int)
    bag_array = np.fromiter(bow.values(), dtype=int)
    chat_embed = np.concatenate((prod, bad_array))
    chat_array = np.concatenate((bad_array, bag_array))
    np.append(chat_array, df_test.loc[idx, ['intensity']])
    np.append(chat_embed, df_test.loc[idx, ['intensity']])
    if idx == 0:
        bow_serie = chat_array
        embd_serie = chat_embed
    else:
        bow_serie = np.concatenate((bow_serie, chat_array))
        embd_serie = np.concatenate((embd_serie, chat_embed))


In [None]:
bow_length = len(set_of_words) + len(bad_words)
bow_serie = bow_serie.reshape((bow_length, -1))

In [None]:
embd_length = 100 + len(bad_words)
embd_serie = embd_serie.reshape((embd_length, -1))

In [None]:
bow_matrix = bow_serie[~np.isnan(bow_serie)].reshape((bow_length, -1))
embd_matrix = embd_serie[~np.isnan(embd_serie)].reshape((embd_length, -1))

In [None]:
def reduce_matrix(matrix: np.ndarray, *, variance_treshold: float):
    print(f'INPUT SHAPE: {matrix.shape}')
    # reduce all vectors to [0, 1] space
    normalized_matrix = normalize(matrix, axis=1)
    # compute variances in each row
    matrix_variances = np.var(matrix, axis=0)
    # create mask for features with high correlation (low variance)
    bool_mask = np.where(matrix_variances < variance_treshold)
    # filter features with high correlation (variance under treshold)
    raked_matrix = np.delete(normalized_matrix, bool_mask, axis=1)
    print(f'OUTPUT SHAPE: {raked_matrix.shape}')
    return raked_matrix

In [None]:
bow_reduced = reduce_matrix(bow_matrix, variance_treshold=0.0001)
embd_reduced = reduce_matrix(embd_matrix, variance_treshold=0.0215)

INPUT SHAPE: (5227, 8157)
OUTPUT SHAPE: (5227, 6866)
INPUT SHAPE: (1491, 8157)
OUTPUT SHAPE: (1491, 7964)


In [None]:
def generate_clusters(
    matrix: np.ndarray,
    n_clusters: int
) -> KMeans:
    # generate word clusters using the KMeans algorithm.
    print("\nClustering started")
    # Instantiate KMeans clusterer for n_clusters
    km_model = KMeans(n_clusters=n_clusters, random_state=3)
    # create clusters
    km_model.fit(matrix)
    print("Clustering finished")
    return km_model

In [None]:
bow_clusters = generate_clusters(bow_serie, 50)
embd_clusters = generate_clusters(embd_serie, 50)


Clustering started
Clustering finished

Clustering started
Clustering finished


In [None]:
def display_summary(clusters: KMeans):
    cluster_count = Counter(sorted(clusters.labels_))
    for cluster in cluster_count:
        print ("Cluster#", cluster," - Total words:", cluster_count[cluster])

In [None]:
# show number of words captured by each cluster
display_summary(embd_clusters)

Cluster# 0  - Total words: 16
Cluster# 1  - Total words: 33
Cluster# 2  - Total words: 382
Cluster# 3  - Total words: 13
Cluster# 4  - Total words: 23
Cluster# 5  - Total words: 22
Cluster# 6  - Total words: 29
Cluster# 7  - Total words: 24
Cluster# 8  - Total words: 20
Cluster# 9  - Total words: 14
Cluster# 10  - Total words: 38
Cluster# 11  - Total words: 19
Cluster# 12  - Total words: 25
Cluster# 13  - Total words: 18
Cluster# 14  - Total words: 10
Cluster# 15  - Total words: 15
Cluster# 16  - Total words: 22
Cluster# 17  - Total words: 27
Cluster# 18  - Total words: 18
Cluster# 19  - Total words: 17
Cluster# 20  - Total words: 34
Cluster# 21  - Total words: 21
Cluster# 22  - Total words: 21
Cluster# 23  - Total words: 13
Cluster# 24  - Total words: 16
Cluster# 25  - Total words: 23
Cluster# 26  - Total words: 18
Cluster# 27  - Total words: 23
Cluster# 28  - Total words: 30
Cluster# 29  - Total words: 9
Cluster# 30  - Total words: 18
Cluster# 31  - Total words: 24
Cluster# 32  - Tot

In [None]:
display_summary(bow_clusters)

Cluster# 0  - Total words: 1
Cluster# 1  - Total words: 5070
Cluster# 2  - Total words: 5
Cluster# 3  - Total words: 1
Cluster# 4  - Total words: 1
Cluster# 5  - Total words: 1
Cluster# 6  - Total words: 1
Cluster# 7  - Total words: 1
Cluster# 8  - Total words: 1
Cluster# 9  - Total words: 1
Cluster# 10  - Total words: 6
Cluster# 11  - Total words: 1
Cluster# 12  - Total words: 5
Cluster# 13  - Total words: 6
Cluster# 14  - Total words: 1
Cluster# 15  - Total words: 1
Cluster# 16  - Total words: 5
Cluster# 17  - Total words: 1
Cluster# 18  - Total words: 1
Cluster# 19  - Total words: 1
Cluster# 20  - Total words: 4
Cluster# 21  - Total words: 3
Cluster# 22  - Total words: 1
Cluster# 23  - Total words: 1
Cluster# 24  - Total words: 7
Cluster# 25  - Total words: 1
Cluster# 26  - Total words: 4
Cluster# 27  - Total words: 1
Cluster# 28  - Total words: 1
Cluster# 29  - Total words: 6
Cluster# 30  - Total words: 7
Cluster# 31  - Total words: 7
Cluster# 32  - Total words: 1
Cluster# 33  - To

In [None]:
df_test = df_test.copy()

In [None]:
def annotate_dataframe(clusters: KMeans, df: pd.DataFrame, col_name: str):
    cluster_count = Counter(sorted(clusters.labels_))
    #sort cluster centers by proximity to centroid
    order_centroids = clusters.cluster_centers_.argsort()[:, ::-1] 

    clusters_df = np.zeros(len(df))
    
    for cluster_idx in cluster_count:
        # get words inside each cluster
        cluster_words = np.where(clusters.labels_ == cluster_idx)[0]
        # anotate all chats in cluster
        for idx in cluster_words:
            clusters_df[idx] = int(cluster_idx)

    df[col_name] = clusters_df

In [None]:
annotate_dataframe(bow_clusters, df_test, 'bow_clusters')
annotate_dataframe(embd_clusters, df_test, 'embd_clusters')

In [None]:
bow_group = df_test.groupby('bow_clusters')
embd_group = df_test.groupby('embd_clusters')

In [None]:
bow_group.get_group(36)['text']

278     If I roll a one
1225    lol i win game 
1323          comend me
1766       unbelievable
2132        Forgot game
2174              fast 
2726              gg wp
3276               HELP
3481                 XD
4650            fuckers
4910              GG WP
Name: text, dtype: object

In [None]:
embd_group.get_group(38)['text']

54                 free farming ls
139              for farming a lot
175                 happy new year
192            passive heross here
226                           lmao
277           its our midlaner lol
294                             ff
381                 guys wtf is up
415                            lol
466                         archon
483                          right
500                           ggwp
517               u just like bara
568     Worst country of the world
585         what does the rat say?
689                            me?
723                Merry Christmas
757                          i win
774                     these guys
876            16 kills still lose
929                  and feels bad
946        or do you want gay porn
997              must chat all now
1014                           Dog
1048                           ???
1135      true sight is 100 longer
1203                   small favor
1220           impossible game gg 
1288                