<a href="https://colab.research.google.com/github/csralvall/online_game_toxicity/blob/main/clustering_ftt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import utility functions

In [1]:
from IPython.display import clear_output

### Install dependencies

In [2]:
# install dependencies
!pip install -U pip setuptools wheel pandas numpy gensim wget
clear_output()

### Import libraries

In [3]:
# import libraries
from google.colab import drive
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans
from collections import Counter
from gensim.models import FastText

### Setup dataframe print options

In [4]:
pd.set_option("display.max_rows", None, "display.max_columns", None)

### Mount storage

In [5]:
# mount google drive unit to save computationally expensive results
drive.mount('/content/drive')

Mounted at /content/drive


### Load subset of whole dataset from storage

In [6]:
# english chats from original dataset with anotations
eng_annotated = '/content/drive/MyDrive/nlp/dota2_chat_eng_annotated.csv'
df_test = pd.read_csv(eng_annotated)[:10000]

### Get bad word list from memory

In [8]:
# get downloaded bad word list
word_list = "/content/drive/MyDrive/nlp/bad_words.txt"
# use set for fast queries
bad_words = set(line.strip() for line in open(word_list, 'r'))
# add new bad words
bad_words.update(['noob', 'noobs', 'stfu', 'fukign', 'fuking', 'fukin', 'nooob'])
bad_dict = dict.fromkeys(bad_words, 0)

# Some experiments with bigrams

In [None]:
from nltk import word_tokenize 
from nltk.util import ngrams
import nltk
nltk.download('punkt')

bigrams = []
trigrams = []
for line in chats:
    token = line.split()
    bigrams.append(list(map(lambda x: '_'.join(x), ngrams(token, 2))))
    trigrams.append(list(map(lambda x: '_'.join(x), ngrams(token, 3))))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
len(trigrams)

8157

In [None]:
from gensim.models.phrases import Phrases, Phraser
def build_phrases(sentences):
    phrases = Phrases(sentences,
                      min_count=5,
                      threshold=7,
                      progress_per=1000)
    return Phraser(phrases)

bigrams = build_phrases(chats)

In [None]:
bigrams = list(map(lambda x: [x], bigrams[chats]))

In [None]:
w2v_bi = generate_embedding(bigrams)

In [None]:
w2v_bi.wv.most_similar('report')

[('nice', 0.2529045641422272),
 ('carry', 0.20082908868789673),
 ('ggwp', 0.17018888890743256),
 ('team', 0.15016482770442963),
 ('mid', 0.13887985050678253),
 ('feed', 0.10852647572755814),
 ('guys', 0.09936434030532837),
 ('commend', 0.03476494550704956),
 ('def', 0.0330718494951725),
 ('win', 0.019886532798409462)]

# Generate word embeddings

#### Fasttext embeddings:

In [9]:
# function to create embeddings of words in each chat
def generate_embedding_ftt(sentences: [[str]]):
  ftt_model = FastText(
      sentences=sentences,
      vector_size=100,
      window=5,
      min_count=1,
      workers=1
  )

  ftt_model.build_vocab(sentences, progress_per=10000)

  ftt_model.train(sentences, total_examples=len(sentences), epochs=100)

  return ftt_model

### Unroll chats as list of words

In [10]:
def flatten(t):
    return [item for sublist in t for item in sublist]

In [12]:
# from cleaned english chats get all of them without nan values
chats = df_test[['tokens']].dropna().astype(str).values
chats = flatten(chats)
vocab = list(map(lambda x: x.split(), chats))

#### Generate word embeddings from chat

In [13]:
ftt_model = generate_embedding_ftt(vocab)

In [None]:
# save FastText model
ftt_model.save('/content/drive/MyDrive/nlp/fasttext.model')

#### Load word embeddings from disk

In [14]:
# load FastText model
ftt_model = FastText.load('/content/drive/MyDrive/nlp/fasttext.model')

#### Embeddings utils

In [15]:
# generate embedding from chat
def chat_embedding(model, chat_words):
    chat_embedding = np.ones(100)
    for word in chat_words:
        if word in model.wv:
            chat_embedding *= model.wv[word] 
    return chat_embedding

#### Clustering utils

In [16]:
def get_bad_vec(lexicon, chat_words):
    for word in chat_words:
        if word in lexicon:
            bad_dict[word] += 1

    bad_vec = np.fromiter(bad_dict.values(), dtype=int)
    
    return bad_vec

In [17]:
def get_bow_vec(bow, chat_words):
    for word in word_list:
        if word in bow:
            bow[word] += 1

    bow_vec = np.fromiter(bow.values(), dtype=int)

    return bow_vec

In [18]:
intensity = df_test[['intensity']].copy()

### Create ftt vectors for clustering

In [19]:
ftt_serie = np.array([])
for idx, chat in enumerate(chats):
    lexicon = dict.fromkeys(bad_words, 0)
    chat_intensity = intensity.loc[idx]
    chat_words = chat.split()
    bad_vec = get_bad_vec(lexicon, chat_words)
    ftt_embedding = chat_embedding(ftt_model, chat_words)
    ftt_bad_int_vec = np.hstack((ftt_embedding, bad_vec, chat_intensity)).ravel()
    ftt_serie = np.concatenate((ftt_serie, ftt_bad_int_vec))

#### Reshape data

In [20]:
ftt_ncolumns = ftt_model.wv.vectors.shape[1] + len(bad_words) + 1
ftt_serie = ftt_serie.astype('float').reshape((-1, ftt_ncolumns))
ftt_matrix = ftt_serie[~np.isnan(ftt_serie)].reshape((-1, ftt_ncolumns))

#### Matrix reduction

In [21]:
def reduce_matrix(matrix: np.ndarray, *, variance_treshold: float):
    print(f'INPUT SHAPE: {matrix.shape}')
    # reduce all vectors to [0, 1] space
    normalized_matrix = normalize(matrix, axis=1)
    # compute variances in each row
    matrix_variances = np.var(matrix, axis=0)
    # create mask for features with high correlation (low variance)
    bool_mask = np.where(matrix_variances < variance_treshold)
    # filter features with high correlation (variance under treshold)
    raked_matrix = np.delete(normalized_matrix, bool_mask, axis=1)
    print(f'OUTPUT SHAPE: {raked_matrix.shape}')
    return raked_matrix

In [22]:
ftt_reduced = reduce_matrix(ftt_matrix, variance_treshold=0.01)

INPUT SHAPE: (8157, 1492)
OUTPUT SHAPE: (8157, 239)


### Save matrix

In [23]:
with open('/content/drive/MyDrive/nlp/ftt_serie_10000.npy', 'wb') as output_file:
    np.save(output_file, ftt_reduced)

### Load matrix from storage

In [24]:
ftt_reduced = np.load('/content/drive/MyDrive/nlp/ftt_serie_10000.npy')

### Clustering

In [25]:
def generate_clusters(
    matrix: np.ndarray,
    n_clusters: int
) -> KMeans:
    # generate word clusters using the KMeans algorithm.
    print("\nClustering started")
    # Instantiate KMeans clusterer for n_clusters
    km_model = KMeans(n_clusters=n_clusters, random_state=3)
    # create clusters
    km_model.fit(matrix)
    print("Clustering finished")
    return km_model

### Create clusters

In [26]:
ftt_clusters = generate_clusters(ftt_reduced, 50)


Clustering started
Clustering finished


### Cluster utility functions

In [27]:
def display_summary(clusters: KMeans):
    cluster_count = Counter(sorted(clusters.labels_))
    for cluster in cluster_count:
        print ("Cluster#", cluster," - Total words:", cluster_count[cluster])

In [28]:
def annotate_dataframe(clusters: KMeans, df: pd.DataFrame, col_name: str):
    cluster_count = Counter(sorted(clusters.labels_))
    #sort cluster centers by proximity to centroid
    order_centroids = clusters.cluster_centers_.argsort()[:, ::-1] 

    clusters_df = np.zeros(len(df))
    
    for cluster_idx in cluster_count:
        # get words inside each cluster
        cluster_words = np.where(clusters.labels_ == cluster_idx)[0]
        # anotate all chats in cluster
        for idx in cluster_words:
            clusters_df[idx] = int(cluster_idx)

    df[col_name] = clusters_df

### Show info about clusters

In [29]:
display_summary(ftt_clusters)

Cluster# 0  - Total words: 1620
Cluster# 1  - Total words: 42
Cluster# 2  - Total words: 293
Cluster# 3  - Total words: 16
Cluster# 4  - Total words: 61
Cluster# 5  - Total words: 16
Cluster# 6  - Total words: 32
Cluster# 7  - Total words: 26
Cluster# 8  - Total words: 565
Cluster# 9  - Total words: 18
Cluster# 10  - Total words: 20
Cluster# 11  - Total words: 47
Cluster# 12  - Total words: 34
Cluster# 13  - Total words: 2018
Cluster# 14  - Total words: 375
Cluster# 15  - Total words: 11
Cluster# 16  - Total words: 62
Cluster# 17  - Total words: 38
Cluster# 18  - Total words: 333
Cluster# 19  - Total words: 19
Cluster# 20  - Total words: 121
Cluster# 21  - Total words: 17
Cluster# 22  - Total words: 47
Cluster# 23  - Total words: 164
Cluster# 24  - Total words: 45
Cluster# 25  - Total words: 8
Cluster# 26  - Total words: 600
Cluster# 27  - Total words: 58
Cluster# 28  - Total words: 5
Cluster# 29  - Total words: 34
Cluster# 30  - Total words: 28
Cluster# 31  - Total words: 27
Cluster# 

### Annotate cluster for each row in dataframe

In [30]:
df_test = df_test.copy()

In [31]:
annotate_dataframe(ftt_clusters, df_test, 'ftt_clusters')

In [32]:
df_test.to_csv(f'/content/drive/MyDrive/nlp/ftt_clusters_df.csv', index=False)

### Explore results

In [33]:
ftt_group = df_test.groupby('ftt_clusters')

Get toxicity score for each cluster

In [34]:
ftt_group['toxicity'].sum() / ftt_group.size()

ftt_clusters
0.0     0.114352
1.0     0.214286
2.0     0.109215
3.0     0.062500
4.0     0.213115
5.0     0.125000
6.0     0.125000
7.0     0.000000
8.0     0.097345
9.0     0.055556
10.0    0.050000
11.0    0.106383
12.0    0.117647
13.0    0.107532
14.0    0.109333
15.0    0.363636
16.0    0.112903
17.0    0.131579
18.0    0.093093
19.0    0.157895
20.0    0.132231
21.0    0.176471
22.0    0.106383
23.0    0.170732
24.0    0.066667
25.0    0.000000
26.0    0.095000
27.0    0.017241
28.0    0.000000
29.0    0.058824
30.0    0.142857
31.0    0.037037
32.0    0.094374
33.0    0.110497
34.0    0.228571
35.0    0.333333
36.0    0.051282
37.0    0.000000
38.0    0.000000
39.0    0.236842
40.0    0.000000
41.0    0.102273
42.0    0.076923
43.0    0.139706
44.0    0.103448
45.0    0.000000
46.0    0.043478
47.0    0.111111
48.0    0.175439
49.0    0.096774
dtype: float64

### Explore clusters

In [35]:
ftt_group.get_group(23)['text']

494                                           shutup nerd
507                                            then i ded
509                                             fair play
524                                                 chill
531                                               a trick
536                                                    Oh
542                               kewl you still are 2-10
548                                                  I am
549                                        Skeleton pussy
555                          little fairplay rusians shit
556             well if you remuse it will be uncount map
587                                                  lmao
598                            Cant even bark to make fun
632                                          so nice game
640                                                     (
641                                                   wtf
658                                                     s
669           