<a href="https://colab.research.google.com/github/csralvall/online_game_toxicity/blob/main/clustering_bow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import utility functions

In [1]:
from IPython.display import clear_output

### Install dependencies

In [2]:
!pip install -U kaggle pip setuptools wheel pandas sklearn numpy wget mr4mp
clear_output()

### Import libraries

In [3]:
from google.colab import drive, files
import pandas as pd
import spacy
import numpy as np
import pickle
import mr4mp
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans
from timeit import default_timer
from collections import Counter

In [52]:
pd.set_option("display.max_rows", None, "display.max_columns", None)

### Mount storage

In [4]:
# mount google drive unit to save computationally expensive results
drive.mount('/content/drive')

Mounted at /content/drive


### Get subset of whole dataset

In [5]:
# english chats from original dataset with anotations
eng_annotated = '/content/drive/MyDrive/nlp/dota2_chat_eng_annotated.csv'
df_test = pd.read_csv(eng_annotated)[:10000]

### Get bad word list from memory

In [19]:
# get downloaded bad word list
word_list = "/content/drive/MyDrive/nlp/bad_words.txt"
# use set for fast queries
bad_words = set(line.strip() for line in open(word_list, 'r'))
# add new bad words
bad_words.update(['noob', 'noobs', 'stfu', 'fukign', 'fuking', 'fukin', 'nooob'])
bad_dict = dict.fromkeys(bad_words, 0)

### Create Bag of Words (BOW)

In [10]:
def flatten(t):
    return [item for sublist in t for item in sublist]

In [11]:
# function to transform chats in sets of words
def chat_to_set(chat: [str]) -> {str}:
    return set(chat.split())

# function to join all chat sets in one big set
def join_chat_sets(chat: {str},bag: {str}) -> {str}:
    return bag.union(chat)

In [12]:
# from cleaned english chats get all of them without nan values
chats = df_test[['tokens']].dropna().astype(str).values

In [13]:
chats = flatten(chats)

In [14]:
# use map reduce model to create the Bag of Words (BOW)
start = default_timer()
pool = mr4mp.pool(10) # roughly 1hs with gpu with full eng dataset
set_of_words = pool.mapreduce(chat_to_set, join_chat_sets, chats)
pool.close()
bag_of_words = dict.fromkeys(set_of_words, 0)
print("Finished in " + str(default_timer()-start) + "s using " + str(len(pool)) + " process(es).")

Finished in 0.27319918500000995s using 10 process(es).


In [None]:
# save bag of words in drive (very expensive to compute)
# use when running code with full dataset
with open('/content/drive/MyDrive/nlp/bag_of_words.pkl', 'wb') as dict_file:
    pickle.dump(bag_of_words, dict_file)
    dict_file.close()

#### Clustering utils

In [15]:
def get_bad_vec(lexicon, chat_words):
    for word in chat_words:
        if word in lexicon:
            bad_dict[word] += 1

    bad_vec = np.fromiter(bad_dict.values(), dtype=int)
    
    return bad_vec

In [16]:
def get_bow_vec(bow, chat_words):
    for word in word_list:
        if word in bow:
            bow[word] += 1

    bow_vec = np.fromiter(bow.values(), dtype=int)

    return bow_vec

In [17]:
intensity = df_test[['intensity']].copy()

### Create bow vectors for clustering

In [20]:
for idx, chat in enumerate(chats):
    bow = dict.fromkeys(bag_of_words, 0)
    lexicon = dict.fromkeys(bad_words, 0)
    chat_intensity = intensity.loc[idx]
    chat_words = chat.split()
    bad_vec = get_bad_vec(lexicon, chat_words)
    bow_vec = get_bow_vec(bow, chat_words)
    bow_bad_int_vec = np.hstack((bow_vec, bad_vec, chat_intensity)).ravel()
    if idx == 0:
        bow_serie = bow_bad_int_vec
    else:
        bow_serie = np.concatenate((bow_serie, bow_bad_int_vec))


### Reshape data

In [25]:
bow_ncolumns = len(set_of_words) + len(bad_words) + 1
bow_serie = bow_serie.astype('float').reshape((-1, bow_ncolumns))
bow_matrix = bow_serie[~np.isnan(bow_serie)].reshape((-1, bow_ncolumns))

### Matrix reduction

In [26]:
def reduce_matrix(matrix: np.ndarray, *, variance_treshold: float):
    print(f'INPUT SHAPE: {matrix.shape}')
    # reduce all vectors to [0, 1] space
    normalized_matrix = normalize(matrix, axis=1)
    # compute variances in each row
    matrix_variances = np.var(matrix, axis=0)
    # create mask for features with high correlation (low variance)
    bool_mask = np.where(matrix_variances < variance_treshold)
    # filter features with high correlation (variance under treshold)
    raked_matrix = np.delete(normalized_matrix, bool_mask, axis=1)
    print(f'OUTPUT SHAPE: {raked_matrix.shape}')
    return raked_matrix

### Reduce matrix

In [27]:
bow_reduced = reduce_matrix(bow_matrix, variance_treshold=0.0001)

INPUT SHAPE: (8157, 5228)
OUTPUT SHAPE: (8157, 141)


### Save matrix

In [28]:
with open('/content/drive/MyDrive/nlp/bow_serie_10000.npy', 'wb') as output_file:
    np.save(output_file, bow_matrix)

### Load matrix from storage

In [29]:
bow_serie = np.load('/content/drive/MyDrive/nlp/bow_serie_10000.npy')

#### Clustering

In [30]:
def generate_clusters(
    matrix: np.ndarray,
    n_clusters: int
) -> KMeans:
    # generate word clusters using the KMeans algorithm.
    print("\nClustering started")
    # Instantiate KMeans clusterer for n_clusters
    km_model = KMeans(n_clusters=n_clusters, random_state=3)
    # create clusters
    km_model.fit(matrix)
    print("Clustering finished")
    return km_model

### Create clusters

In [31]:
bow_clusters = generate_clusters(bow_serie, 50)


Clustering started
Clustering finished


### Cluster utility functions

In [32]:
def display_summary(clusters: KMeans):
    cluster_count = Counter(sorted(clusters.labels_))
    for cluster in cluster_count:
        print ("Cluster#", cluster," - Total words:", cluster_count[cluster])

In [50]:
def annotate_dataframe(clusters: KMeans, df: pd.DataFrame, col_name: str):
    cluster_count = Counter(sorted(clusters.labels_))
    #sort cluster centers by proximity to centroid
    order_centroids = clusters.cluster_centers_.argsort()[:, ::-1] 

    clusters_df = np.zeros(len(df))
    
    for cluster_idx in cluster_count:
        # get words inside each cluster
        cluster_words = np.where(clusters.labels_ == cluster_idx)[0]
        # anotate all chats in cluster
        for idx in cluster_words:
            clusters_df[idx] = int(cluster_idx)

    df[col_name] = clusters_df

### Show info about clusters

In [51]:
display_summary(bow_clusters)

Cluster# 0  - Total words: 234
Cluster# 1  - Total words: 144
Cluster# 2  - Total words: 128
Cluster# 3  - Total words: 190
Cluster# 4  - Total words: 156
Cluster# 5  - Total words: 236
Cluster# 6  - Total words: 225
Cluster# 7  - Total words: 204
Cluster# 8  - Total words: 108
Cluster# 9  - Total words: 183
Cluster# 10  - Total words: 186
Cluster# 11  - Total words: 227
Cluster# 12  - Total words: 211
Cluster# 13  - Total words: 150
Cluster# 14  - Total words: 210
Cluster# 15  - Total words: 164
Cluster# 16  - Total words: 90
Cluster# 17  - Total words: 215
Cluster# 18  - Total words: 126
Cluster# 19  - Total words: 148
Cluster# 20  - Total words: 134
Cluster# 21  - Total words: 188
Cluster# 22  - Total words: 210
Cluster# 23  - Total words: 224
Cluster# 24  - Total words: 137
Cluster# 25  - Total words: 144
Cluster# 26  - Total words: 211
Cluster# 27  - Total words: 126
Cluster# 28  - Total words: 206
Cluster# 29  - Total words: 201
Cluster# 30  - Total words: 141
Cluster# 31  - Tota

In [34]:
df_test = df_test.copy()

### Annotate cluster for each row in dataframe

In [37]:
annotate_dataframe(bow_clusters, df_test, 'bow_clusters')

In [38]:
df_test.to_csv(f'/content/drive/MyDrive/nlp/bow_clusters_df.csv', index=False)

### Explore results

In [39]:
bow_group = df_test.groupby('bow_clusters')

#### Get toxicity score for each cluster

In [49]:
bow_group['toxicity'].sum() / bow_group.size()

bow_clusters
0.0     0.125181
1.0     0.083333
2.0     0.164062
3.0     0.047368
4.0     0.128205
5.0     0.063559
6.0     0.111111
7.0     0.122549
8.0     0.148148
9.0     0.098361
10.0    0.155914
11.0    0.158590
12.0    0.090047
13.0    0.073333
14.0    0.114286
15.0    0.054878
16.0    0.111111
17.0    0.125581
18.0    0.126984
19.0    0.108108
20.0    0.089552
21.0    0.106383
22.0    0.185714
23.0    0.098214
24.0    0.087591
25.0    0.097222
26.0    0.104265
27.0    0.055556
28.0    0.135922
29.0    0.099502
30.0    0.085106
31.0    0.111111
32.0    0.054054
33.0    0.071161
34.0    0.163636
35.0    0.067797
36.0    0.126316
37.0    0.069767
38.0    0.148148
39.0    0.088235
40.0    0.128571
41.0    0.078947
42.0    0.054545
43.0    0.190476
44.0    0.136585
45.0    0.131429
46.0    0.114583
47.0    0.060241
48.0    0.105590
49.0    0.091954
dtype: float64

#### Explore clusters

In [53]:
bow_group.get_group(11)['text']

1535    and Io just doenst available on mid cuz like that
1536                       ruins every single game hes in
1537                  this qop is just ruining the moment
1538                                      i had him muted
1539    he just dont realize omni is the worst 5 u can...
1540                                     what did he say?
1541                                              nothing
1542         gg this omni watches too much fucking slacks
1543                                              gl next
1544                                      my fucking tide
1545                                              running
1546                                               he not
1547                                         even hitting
1548                                                  why
1549                                                   5k
1550                                             all suck
1551                                          same like u
1552          