# Questions Clustering

## Expected
Questions to be sorted out such that the response to the whole cluster is samel.

<!--### To Do-->
Author: Sunanda Bansal  
Organization: Dataperformers  
Date: 24 Mar, 2020 (Start)  

In [3]:
import re
import os
import csv
import sys
import json
import time
import scipy
import socket
import pickle
import numpy as np
import pandas as pd
import multiprocessing as mp

from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Normalizer   
from sklearn import metrics   
from sklearn.metrics import confusion_matrix
# from sklearn.metrics.pairwise import paired_distances as sklearn_paired_distances

# Plotting
import seaborn as sn
import matplotlib.pyplot as plt

# importing personal development helper classes
import utils

Using TensorFlow backend.


## Define variables here

Mostly the code will be intended to use with arguments that can be passed in comman line, but jupyter notebook doesn't handle `argparse` well, so the Args class is a temporary way to write the code assumming the variables to be an attribute of an object.

In [4]:
class Args:
    def __init__(self):
        # The very big scraped file, give absolute path, outside the repo
        self.dataset = "data/train_set_covid.csv"
        self.vector_size = 100
        self.window = 5
        self.max_vocab_size=None
        self.min_count = 5
        self.repr_method = "doc2vec"
args=Args()

In [5]:
# Disable (FALSE) displaying warnings from the OpenMP* run-time library during program execution.
os.environ['KMP_WARNINGS'] = "FALSE"

## Reading Dataset

In [6]:
dataset = pd.read_csv(args.dataset)

In [7]:
dataset.head()

Unnamed: 0,timestamp_est,anonymous_id,language,question
0,2020-03-22 10:55:35.958,dfc284f8-e162-44fe-b6ab-4dc25083293e,es,Are you sick?
1,2020-03-21 22:41:05.702,7a8ef480-4e63-429e-b7ff-4763ad47cfe2,pt,When should I go to the hospital?
2,2020-03-22 20:44:40.577,b3923f7c-ea90-4af2-94c5-5ee8736975fd,ko,I think I have a fever. What do I do?
3,2020-03-21 23:21:56.689,8e750805-9fc2-4702-a244-0b7e67cf29ad,vi,1+1
4,2020-03-22 11:24:07.614,7977f4b9-e495-45f7-a5ca-841359f618ac,es,What's covid19?


## Preprocess

In [8]:
# Will add later, right now, leaving it to the utils doc2vec

In [9]:
def detect_lang(text):
    try:
        return detect(text)
    except:
        return "unidentifiable"

In [10]:
from langdetect import detect
dataset["detected_lang"] = dataset.question.apply(detect_lang)

In [11]:
dataset = dataset[dataset.detected_lang == "en"]

## Doc2Vec

In [12]:
model = utils.text.representation.Doc2Vec(args).initialize(data = [dataset.question])

Training Doc2Vec with 684 Texts 


In [13]:
dataset["embedding"] = model.generate_embedding(dataset.question, returnarray=False)

### Kmeans

In [14]:
kmeans = KMeans(n_clusters=8, random_state=0).fit(list(dataset["embedding"]))

In [15]:
dataset["cluster_label"] = list(kmeans.labels_)

In [16]:
kmeans.cluster_centers_

array([[ 4.88135032e-04,  2.15189368e-03,  1.02763379e-03,
         4.48831823e-04, -7.63452030e-04,  1.45894114e-03,
        -6.24127861e-04,  3.91772995e-03,  4.63662762e-03,
        -1.16558478e-03,  2.91725039e-03,  2.88949203e-04,
         6.80445635e-04,  4.25596628e-03, -4.28963965e-03,
        -4.12870711e-03, -4.79781581e-03,  3.32619855e-03,
         2.78156740e-03,  3.70012154e-03,  4.78618359e-03,
         2.99158553e-03, -3.85206367e-04,  2.80529168e-03,
        -3.81725584e-03,  1.39921019e-03, -3.56646720e-03,
         4.44668904e-03,  2.18483212e-04, -8.53380596e-04,
        -2.35444377e-03,  2.74233683e-03, -4.38496674e-04,
         6.84339495e-04, -4.81210183e-03,  1.17635494e-03,
         1.12095720e-03,  1.16933999e-03,  4.43748059e-03,
         1.81820302e-03, -1.40492094e-03, -6.29680464e-04,
         1.97631191e-03, -4.39774524e-03,  1.66766718e-03,
         1.70637865e-03, -2.89617432e-03, -3.71073699e-03,
        -1.84571650e-03, -1.36289233e-03,  7.01967685e-0

In [None]:
dataset.sort_values("cluster_label")

In [82]:
clustered = dataset.reset_index().pivot(index="index",columns="cluster_label", values="question").fillna("-")

In [None]:
pd.DataFrame(dataset.groupby("ahc_label")["question"].apply(list).values.tolist()).T

In [83]:
clustered.to_csv("output/clustered.csv")

### Agglomerative Clustering

In [144]:
from sklearn.cluster import AgglomerativeClustering
import numpy as np
X = pd.DataFrame(dataset["embedding"].values.tolist(), index= dataset.index).to_numpy()
clustering = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, distance_threshold=0.12).fit(X)
dataset["ahc_label"] = clustering.labels_

# dataset.sort_values("ahc_label")[["question","ahc_label"]]

In [145]:
dataset.groupby("ahc_label").count()

Unnamed: 0_level_0,timestamp_est,anonymous_id,language,question,detected_lang,embedding
ahc_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,640,640,640,640,640,640
1,27,27,27,27,27,27
2,23,23,23,23,23,23


In [None]:
for label in dataset.ahc_label.unique():
    print(dataset[dataset.ahc_label==label][:10].question)

In [31]:
dataset.sort_values("ahc_label")[["question","ahc_label"]].to_csv("output/doc2vec_ahc.csv")

In [33]:
dataset.reset_index().pivot(index="index",columns="ahc_label", values="question").fillna("-").to_csv("output/doc2vec_ahc_cluster_view.csv")

In [148]:
dataset_2 = dataset[dataset.ahc_label == 0]

In [None]:
X = pd.DataFrame(dataset_2["embedding"].values.tolist(), index= dataset_2.index).to_numpy()
# lv2 threshold backup 0.061
clustering = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, distance_threshold=0.055).fit(X)
dataset_2["ahc_label_lv2"] = clustering.labels_
# dataset_2.sort_values("ahc_label_lv2")[["question","ahc_label_lv2"]].head()
dataset_2.groupby("ahc_label_lv2")["question"].count().sort_values()

In [None]:
dataset_2.sort_values("ahc_label_lv2")

In [155]:
group_counts = dataset_2.groupby("ahc_label_lv2")["question"].count().sort_values().to_frame(name="count")

In [112]:
# dataset_2.join(group_counts, on="ahc_label_lv2").sort_values(["count","ahc_label_lv2"],ascending=True)[["question","ahc_label","ahc_label_lv2"]].to_csv("output/doc2vec_ahc_lv2.csv")

In [None]:
dataset_2.reset_index().pivot(index="index",columns="ahc_label_lv2", values="question").fillna("-")

In [162]:
temp = dataset_2.join(group_counts, on="ahc_label_lv2").sort_values(["count","ahc_label_lv2"],ascending=True)[["question","ahc_label","ahc_label_lv2","count"]]

In [163]:
temp.head()

Unnamed: 0,question,ahc_label,ahc_label_lv2,count
213,How does it spread ?,0,20,3
284,How does COVID-19 spread?,0,20,3
1055,How does Covid-19 spread?,0,20,3
619,What is the incubation period for COVID-19?,0,27,3
659,What’s the incubation period?,0,27,3


In [157]:
dataset = dataset.join(temp["ahc_label_lv2"])
dataset["ahc_label_lv2"] = dataset["ahc_label_lv2"].fillna(value=0)

In [160]:
dataset.head()

Unnamed: 0,timestamp_est,anonymous_id,language,question,detected_lang,embedding,ahc_label,ahc_label_lv2
0,2020-03-22 10:55:35.958,dfc284f8-e162-44fe-b6ab-4dc25083293e,es,Are you sick?,en,"[0.003665695, -0.0012658612, -0.004298671, 0.0...",0,8.0
1,2020-03-21 22:41:05.702,7a8ef480-4e63-429e-b7ff-4763ad47cfe2,pt,When should I go to the hospital?,en,"[0.004421982, -0.0015855284, 1.4623756e-05, -0...",0,28.0
2,2020-03-22 20:44:40.577,b3923f7c-ea90-4af2-94c5-5ee8736975fd,ko,I think I have a fever. What do I do?,en,"[2.2750562e-05, -0.004664215, -0.0045801885, -...",0,4.0
4,2020-03-22 11:24:07.614,7977f4b9-e495-45f7-a5ca-841359f618ac,es,What's covid19?,en,"[-0.004842487, -0.0034736327, 0.00083430874, 0...",0,32.0
5,2020-03-22 16:34:39.551,4b388591-5ea5-401f-b387-9a4e6be70a91,pt,Does the virus spread by rain?,en,"[-0.0015365956, 0.0011491243, 0.0013140386, -0...",0,11.0


In [166]:
group_counts = dataset.groupby(["ahc_label","ahc_label_lv2"])["question"].count().sort_values().to_frame(name="count")

In [168]:
dataset.join(group_counts, on=["ahc_label","ahc_label_lv2"]).sort_values(["count","ahc_label_lv2"],ascending=True)[["question","ahc_label","ahc_label_lv2"]].to_csv("output/doc2vec_ahc_lv2.csv")

In [None]:
dataset.groupby(["ahc_label","ahc_label_lv2"]).count()

# Level 2 with Rules Based

In [19]:
dataset = pd.read_csv("output/text_rules_based_lv2.csv").fillna("unclassified")
dataset = dataset[["question", "cluster"]]
d = dataset[dataset.cluster=="unclassified"]
d.head()

Unnamed: 0,question,cluster
0,Are research fundings going to be cut as a res...,unclassified
1,Is it ok to go outside to get fresh air,unclassified
2,Should I star home and even avoid going to the...,unclassified
3,Are we going to have trouble buying milk,unclassified
4,Will Covid19 go away like the 1918 influenza a...,unclassified


In [20]:
model = utils.text.representation.Doc2Vec(args).initialize(data = [d.question])
d["embedding"] = model.generate_embedding(d.question, returnarray=False)

Training Doc2Vec with 250 Texts 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [25]:
from sklearn.cluster import AgglomerativeClustering
import numpy as np
X = pd.DataFrame(d["embedding"].values.tolist(), index= d.index).to_numpy()
clustering = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, distance_threshold=0.05).fit(X)
d["ahc_label"] = clustering.labels_

# Misc.
args.n_clusters = len(d["ahc_label"].unique())
print(f"Found {args.n_clusters} clusters")
d.groupby("ahc_label")["question"].count().sort_values()

Found 32 clusters


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


ahc_label
28     2
21     2
17     2
15     4
27     4
23     4
30     4
20     5
0      6
31     7
26     7
24     7
22     7
10     8
11     8
29     8
13     8
25     8
4      9
5      9
16     9
18    10
3     10
7     10
14    10
12    10
19    10
1     11
8     12
2     12
9     12
6     15
Name: question, dtype: int64

In [26]:
for label in d.ahc_label.unique():
    print(f"cluster #{label}, count - {len(d[d.ahc_label==label])}")
    print(d[d.ahc_label==label][:10].question)

cluster #24, count - 7
0      Are research fundings going to be cut as a res...
17     Is it true that COVID 19 can survive up to a f...
143    Will the heat of the summer help fighting the ...
152            What are the best food to get good immune
214    Does washing clothes in the washing machine pr...
303               How often should i take my temperature
331                        Why is it called Corona virus
Name: question, dtype: object
cluster #6, count - 15
1                Is it ok to go outside to get fresh air
8                                           how is going
27     Will herd immunity be the only means of eradic...
91                                   Is the virus deadly
94                           How did this virus transmit
107                                How is it transmitted
145                                     I think I m sick
193                     is it made by human or by nature
232    What s the point of asking you questions if yo...
266         