<a href="https://colab.research.google.com/github/csralvall/online_game_toxicity/blob/main/clustering_w2v.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import utility functions

In [1]:
from IPython.display import clear_output

### Install dependencies

In [2]:
# install dependencies
!pip install -U pip setuptools wheel pandas sklearn numpy gensim wget
clear_output()

### Import libraries

In [3]:
# import libraries
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans
from collections import Counter
from gensim.models import Word2Vec

### Setup dataframe print options

In [52]:
pd.set_option("display.max_rows", None, "display.max_columns", None)

### Mount storage

In [4]:
# mount google drive unit to save computationally expensive results
drive.mount('/content/drive')

Mounted at /content/drive


### Load subset of whole dataset from storage

In [5]:
# english chats from original dataset with anotations
eng_annotated = '/content/drive/MyDrive/nlp/dota2_chat_eng_annotated.csv'
df_test = pd.read_csv(eng_annotated)[:10000]

In [6]:
df_test.head()

Unnamed: 0,match,time,slot,text,language,clean,tokens,intensity,toxicity
0,0,1808.40822,9,100%,en,100,,0,0
1,1,-131.14018,0,twitch.tv/rage_channel,en,twitch rage channel,twitch rage channel,0,0
2,1,-121.60481,0,https://www.twitch.tv/rage_channel,en,https www twitch rage channel,https www twitch rage channel,0,0
3,1,700.72893,0,https://www.twitch.tv/rage_channel,en,https www twitch rage channel,https www twitch rage channel,0,0
4,1,702.99503,0,https://www.twitch.tv/rage_channel,en,https www twitch rage channel,https www twitch rage channel,0,0


### Get bad word list from memory

In [17]:
# get downloaded bad word list
word_list = "/content/drive/MyDrive/nlp/bad_words.txt"
# use set for fast queries
bad_words = set(line.strip() for line in open(word_list, 'r'))
# add new bad words
bad_words.update(['noob', 'noobs', 'stfu', 'fukign', 'fuking', 'fukin', 'nooob'])
bad_dict = dict.fromkeys(bad_words, 0)

#### Word2Vec embeddings:

In [7]:
# function to create embeddings of words in each chat
def generate_embedding_w2v(sentences: [[str]]):
  w2v_model = Word2Vec(
                     min_count=20,
                     window=2,
                     #size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=1)

  w2v_model.build_vocab(sentences, progress_per=10000)

  w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

  return w2v_model

### Unroll chats as list of words

In [8]:
def flatten(t):
    return [item for sublist in t for item in sublist]

In [9]:
# from cleaned english chats get all of them without nan values
chats = df_test[['tokens']].dropna().astype(str).values
chats = flatten(chats)
vocab = list(map(lambda x: x.split(), chats))

#### Generate word embeddings from chat

In [10]:
w2v_model = generate_embedding_w2v(vocab)

In [None]:
# save Word2Vec model
w2v_model.save('/content/drive/MyDrive/nlp/word2vec.model')

#### Load word embeddings from disk

In [12]:
# load Word2Vec model
w2v_model = Word2Vec.load('/content/drive/MyDrive/nlp/word2vec.model')

#### Embeddings utils

In [13]:
# generate embedding from chat
def chat_embedding(model, chat_words):
    chat_embedding = np.ones(100)
    for word in chat_words:
        if word in model.wv:
            chat_embedding *= model.wv[word] 
    return chat_embedding

#### Clustering utils

In [14]:
def get_bad_vec(lexicon, chat_words):
    for word in chat_words:
        if word in lexicon:
            bad_dict[word] += 1

    bad_vec = np.fromiter(bad_dict.values(), dtype=int)
    
    return bad_vec

In [15]:
intensity = df_test[['intensity']].copy()

#### Create w2v vectors for clustering

In [18]:
w2v_serie = np.array([])
for idx, chat in enumerate(chats):
    lexicon = dict.fromkeys(bad_words, 0)
    chat_intensity = intensity.loc[idx]
    chat_words = chat.split()
    bad_vec = get_bad_vec(lexicon, chat_words)
    w2v_embedding = chat_embedding(w2v_model, chat_words)
    w2v_bad_int_vec = np.hstack((w2v_embedding, bad_vec, chat_intensity)).ravel()
    w2v_serie = np.concatenate((w2v_serie, w2v_bad_int_vec))

#### Reshape data

In [19]:
w2v_ncolumns = w2v_model.wv.vectors.shape[1] + len(bad_words) + 1
w2v_serie = w2v_serie.astype('float').reshape((-1, w2v_ncolumns))
w2v_matrix = w2v_serie[~np.isnan(w2v_serie)].reshape((-1, w2v_ncolumns))

#### Matrix reduction

In [22]:
def reduce_matrix(matrix: np.ndarray, *, variance_treshold: float):
    print(f'INPUT SHAPE: {matrix.shape}')
    # reduce all vectors to [0, 1] space
    normalized_matrix = normalize(matrix, axis=1)
    # compute variances in each row
    matrix_variances = np.var(matrix, axis=0)
    # create mask for features with high correlation (low variance)
    bool_mask = np.where(matrix_variances < variance_treshold)
    # filter features with high correlation (variance under treshold)
    raked_matrix = np.delete(normalized_matrix, bool_mask, axis=1)
    print(f'OUTPUT SHAPE: {raked_matrix.shape}')
    return raked_matrix

### Reduce matrix

In [36]:
w2v_reduced = reduce_matrix(w2v_matrix, variance_treshold=0.01)

INPUT SHAPE: (8157, 1492)
OUTPUT SHAPE: (8157, 239)


### Save matrix

In [37]:
with open('/content/drive/MyDrive/nlp/w2v_serie_10000.npy', 'wb') as output_file:
    np.save(output_file, w2v_reduced)

### Load matrix from storage

In [39]:
w2v_reduced = np.load('/content/drive/MyDrive/nlp/w2v_serie_10000.npy')

### Clustering

In [40]:
def generate_clusters(
    matrix: np.ndarray,
    n_clusters: int
) -> KMeans:
    # generate word clusters using the KMeans algorithm.
    print("\nClustering started")
    # Instantiate KMeans clusterer for n_clusters
    km_model = KMeans(n_clusters=n_clusters, random_state=3)
    # create clusters
    km_model.fit(matrix)
    print("Clustering finished")
    return km_model

### Create clusters

In [41]:
w2v_clusters = generate_clusters(w2v_reduced, 40)


Clustering started
Clustering finished


### Cluster utility functions

In [42]:
def display_summary(clusters: KMeans):
    cluster_count = Counter(sorted(clusters.labels_))
    for cluster in cluster_count:
        print ("Cluster#", cluster," - Total words:", cluster_count[cluster])

In [43]:
def annotate_dataframe(clusters: KMeans, df: pd.DataFrame, col_name: str):
    cluster_count = Counter(sorted(clusters.labels_))
    #sort cluster centers by proximity to centroid
    order_centroids = clusters.cluster_centers_.argsort()[:, ::-1] 

    clusters_df = np.zeros(len(df))
    
    for cluster_idx in cluster_count:
        # get words inside each cluster
        cluster_words = np.where(clusters.labels_ == cluster_idx)[0]
        # anotate all chats in cluster
        for idx in cluster_words:
            clusters_df[idx] = int(cluster_idx)

    df[col_name] = clusters_df

### Show info about clusters

In [44]:
# show number of words captured by each cluster
display_summary(w2v_clusters)

Cluster# 0  - Total words: 118
Cluster# 1  - Total words: 408
Cluster# 2  - Total words: 121
Cluster# 3  - Total words: 151
Cluster# 4  - Total words: 369
Cluster# 5  - Total words: 8
Cluster# 6  - Total words: 317
Cluster# 7  - Total words: 43
Cluster# 8  - Total words: 1131
Cluster# 9  - Total words: 758
Cluster# 10  - Total words: 6
Cluster# 11  - Total words: 67
Cluster# 12  - Total words: 96
Cluster# 13  - Total words: 311
Cluster# 14  - Total words: 1
Cluster# 15  - Total words: 434
Cluster# 16  - Total words: 2
Cluster# 17  - Total words: 50
Cluster# 18  - Total words: 53
Cluster# 19  - Total words: 3
Cluster# 20  - Total words: 11
Cluster# 21  - Total words: 737
Cluster# 22  - Total words: 69
Cluster# 23  - Total words: 398
Cluster# 24  - Total words: 3
Cluster# 25  - Total words: 8
Cluster# 26  - Total words: 83
Cluster# 27  - Total words: 28
Cluster# 28  - Total words: 2
Cluster# 29  - Total words: 1
Cluster# 30  - Total words: 15
Cluster# 31  - Total words: 664
Cluster# 32  

### Annotate cluster for each row in dataframe

In [45]:
df_test = df_test.copy()

In [46]:
annotate_dataframe(w2v_clusters, df_test, 'w2v_clusters')

In [47]:
df_test.to_csv(f'/content/drive/MyDrive/nlp/w2v_clusters_df.csv', index=False)

### Explore results

In [48]:
w2v_group = df_test.groupby('w2v_clusters')

#### Get toxicity score for each cluster

In [49]:
w2v_group['toxicity'].sum() / w2v_group.size()

w2v_clusters
0.0     0.128506
1.0     0.134804
2.0     0.066116
3.0     0.092715
4.0     0.113821
5.0     0.125000
6.0     0.113565
7.0     0.023256
8.0     0.104332
9.0     0.129288
10.0    0.000000
11.0    0.059701
12.0    0.072917
13.0    0.138264
14.0    0.000000
15.0    0.085253
16.0    0.000000
17.0    0.120000
18.0    0.056604
19.0    0.333333
20.0    0.000000
21.0    0.099050
22.0    0.072464
23.0    0.130653
24.0    0.000000
25.0    0.250000
26.0    0.180723
27.0    0.107143
28.0    0.000000
29.0    0.000000
30.0    0.066667
31.0    0.099398
32.0    0.000000
33.0    0.111111
34.0    0.093156
35.0    0.125000
36.0    0.250000
37.0    0.185629
38.0    0.000000
39.0    0.078351
dtype: float64

### Explore clusters

In [54]:
w2v_group.get_group(23)['text']

1762                          can only farm and do noting
1763                                           like creep
1764                                                  end
1765                  cant believe how alch can have item
1766                                         unbelievable
1767                             Team can feed every noob
1768                               And let me die to ward
1769                                            best team
1770                                            easy gyro
1771                                            cant 1on5
1772                                                 mute
1773                                               report
1774                                      just impossible
1775    unbelievable how people can be so bad to lose ...
1776                            fucking trash of humanity
1777                                nice fair matchmaking
1778           give me the 4 worst players in all of dota
1779          