# Questions Clustering

## Expected
Questions to be sorted out such that the response to the whole cluster is samel.

<!--### To Do-->
Author: Sunanda Bansal  
Organization: Dataperformers  
Date: 24 Mar, 2020 (Start)  

In [1]:
import re
import os
import csv
import sys
import json
import time
import scipy
import socket
import pickle
import numpy as np
import pandas as pd
import multiprocessing as mp

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Normalizer   
from sklearn import metrics   
from sklearn.metrics import confusion_matrix
# from sklearn.metrics.pairwise import paired_distances as sklearn_paired_distances

# Plotting
import seaborn as sn
import matplotlib.pyplot as plt

# importing personal development helper classes
import utils


# Disable (FALSE) displaying warnings from the OpenMP* run-time library during program execution.
os.environ['KMP_WARNINGS'] = "FALSE"

Using TensorFlow backend.


## Define variables here

Mostly the code will be intended to use with arguments that can be passed in comman line, but jupyter notebook doesn't handle `argparse` well, so the Args class is a temporary way to write the code assumming the variables to be an attribute of an object.

In [2]:
class Args:
    def __init__(self):
        self.dataset = "data/train_set_covid.csv"
        self.embedding_file = "/home/designer1/Documents/sunanda/Data/common/embeddings/glove.6B.100d.txt"
        self.n_clusters = 100
        self.cluster_function = "normavg"
args=Args()

In [3]:
def read_embeddings_to_dict(embedding_file, limit = None, sep = " ", key = 0):
    ''' Read the word embeddings from the file and converts it into df - GloVe by default '''
    
    # Load the embeddings
    embeddings = pd.read_csv(embedding_file, sep=sep, index_col=key, header=None, nrows = limit, quoting=csv.QUOTE_NONE)
    
    # converting to dict where the word is the key and the word embedding array is the value
    embeddings = embeddings.apply(np.asarray, axis = 1).to_dict()
    
    return embeddings


## Dataset

In [4]:
dataset = pd.read_csv(args.dataset)

# Language Detection
def detect_lang(text):
    try:
        return detect(text)
    except:
        return "unidentifiable"
    
from langdetect import detect
dataset["detected_lang"] = dataset.question.apply(detect_lang)

dataset = dataset[dataset.detected_lang == "en"]

In [5]:
# Read embeddings to dictionary
if args.embedding_file:
    embeddings_dict = read_embeddings_to_dict(args.embedding_file)

## WcDe and AHC

In [10]:
dataset = pd.read_csv("output/text_rules_based_plusplus_15lsa_0.3dt_71ahcx.csv").fillna("unclassified")
dataset = dataset[["question", "cluster"]]
dataset = dataset[dataset.cluster=="unclassified"]
dataset.head()

Unnamed: 0,question,cluster
0,where can I help,unclassified
1,Will the heat of the summer help fighting the ...,unclassified
2,Does eating Oranges daily help to fight this v...,unclassified
3,Is there any medication to take to help,unclassified
4,How long are you contagious if you have Corona...,unclassified


In [None]:
args.n_clusters = 100
# Initialize WcDe
model = utils.text.representation.WcDe().initialize(
                                                        data = [dataset.question], 
                                                        n_clusters = args.n_clusters,
                                                        embeddings = embeddings_dict,
                                                        removeStopwords = True
                                                    )
dataset["embedding"] = model.generate_embedding(dataset.question, returnarray=False)

In [None]:
args.dist_thresh = 2
# Cluster
X = pd.DataFrame(dataset["embedding"].values.tolist(), index= dataset.index).to_numpy()
clustering = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, distance_threshold=args.dist_thresh).fit(X)
dataset["ahc_label"] = clustering.labels_

# Misc.
args.n_clusters = len(dataset["ahc_label"].unique())
print(f"Found {args.n_clusters} clusters")
dataset.groupby("ahc_label")["question"].count().sort_values()

In [None]:
dataset.groupby("ahc_label")["question"].count().describe()

In [13]:
# Generating visualization csv for clusters
counts = pd.DataFrame(list(dataset.groupby("ahc_label")["question"].apply(list))).T.count().to_frame("Count")
visual = counts.T.append(pd.DataFrame(list(dataset.groupby("ahc_label")["question"].apply(list))).T.fillna("-")).add_prefix("cluster_#")

In [14]:
visual

Unnamed: 0,cluster_#0,cluster_#1,cluster_#2,cluster_#3,cluster_#4,cluster_#5,cluster_#6,cluster_#7,cluster_#8,cluster_#9,...,cluster_#34,cluster_#35,cluster_#36,cluster_#37,cluster_#38,cluster_#39,cluster_#40,cluster_#41,cluster_#42,cluster_#43
Count,6,5,6,6,13,26,21,17,9,8,...,1,1,4,2,2,1,1,3,2,7
0,How do I treat coronavirus,What is the advice surrounding outdoor exercis...,In this time of crisis is it okay to feed the ...,What is,What should I do,My head hurts my body aches a little and I fee...,What is the difference between a bacteria and ...,Will the heat of the summer help fighting the ...,Should I star home and even avoid going to the...,How severe is the virus,...,How about covid situations,Non,How Covid 19 started,Has the virus evolved,Can I fo,Hypertension,Mortality rate today,When this virus terminated,Does Wearing gloves help,How long are you contagious if you have Corona...
1,Is it true that warm kills Coronavirus,Are vet clinics still open,is it made by human or by nature,What is it,Should I not touch my mail,Does a mask actually help if I m not sick,Is it transmissible through anal sex,Are research fundings going to be cut as a res...,How s the shortness of breath,How contagious is the virus,...,-,-,how can one fight the virus,Corona virus characteristics,im sneezing should i be worried,-,-,When will the schools reopen,Should I wear mask,how are you
2,Will high temperature kill coronavirus,Are there still flights within Canada,What s the capital of Palestine,What the size,who can you do,Do we see a future where this virus won t be s...,I recently relocated to Quebec Can I request R...,As of today can we gather only 2,What is your name,is there a new form of corona virus,...,-,-,how is going,-,-,-,-,Will the groceries shut down,-,How are you
3,Is it true that warm kills Coronavirus,what is canada quarantine policy,What is the situation now in my country,How is covid transmitted,How does covid 19 feel,How long does the virus remain active and aliv...,Chlono,Is it safe to get a new job that deals with th...,WHat is your name,Is the virus deadly,...,-,-,How long can the virus stay in the air,-,-,-,-,-,-,How are you
4,Is the phlegm color suggestive of coronavirus,age wise statistics for covid 19,What is the situation right now in the world,How is it transmitted,My brother has COVID 19 What to do,how much fever should i tolerate before going ...,Will it get better in summer with higher tempe...,Is it safe to order food during the COVID 19 p...,why is my dick so big,How deadly is the virus,...,-,-,-,-,-,-,-,-,-,how old are you
5,Can anti viral drugs be used to treat the coro...,-,How much it s prevelent in Pakistan,What is the probability of dying,How do i knew if i got covid,hat should I do if my 2 year old child starts ...,What are the side effects of hydroxychloroquin...,Will herd immunity be the only means of eradic...,What s the point of asking you questions if yo...,How deadly is the virus,...,-,-,-,-,-,-,-,-,-,could we have done better
6,-,-,-,-,what do you know about COVID 19,If this is really deep learning how deep is yo...,Is it true that COVID 19 can survive up to a f...,Is it useful to wear a mask if we are not sick,Why is it called Corona virus,Does the fever persists in corona,...,-,-,-,-,-,-,-,-,-,When will be done
7,-,-,-,-,How often should i take my temperature,What should i do if i am short of breath,How effective is the new drug manufactured by ...,Could stores open a few hours a week for the e...,what is canada s situation right now,Is there more than one strain of COVID 19,...,-,-,-,-,-,-,-,-,-,-
8,-,-,-,-,How i know that i m ill,When do I expect to hear back from you,Covid19,It would possible to predict the time of this ...,Why is called COVID 19,-,...,-,-,-,-,-,-,-,-,-,-


In [55]:
dataset.sort_values("ahc_label")[["question","ahc_label"]].to_csv(f"output/{args.n_topics}lsa_{args.dist_thresh}dt_{args.n_clusters}ahcx.csv")
visual.to_csv(f"output/clusters_{args.n_topics}lsa_{args.dist_thresh}dt_{args.n_clusters}ahcx.csv")

# LEVEL 2

In [228]:
dataset = pd.read_csv("output/text_rules_based_lv2.csv").fillna("unclassified")
dataset = dataset[["question", "cluster"]]
d = dataset[dataset.cluster=="unclassified"]
d.head()

Unnamed: 0,question,cluster
0,Are research fundings going to be cut as a res...,unclassified
1,Is it ok to go outside to get fresh air,unclassified
2,Should I star home and even avoid going to the...,unclassified
3,Are we going to have trouble buying milk,unclassified
4,Will Covid19 go away like the 1918 influenza a...,unclassified


In [235]:
args.n_topics = 15
args.dist_thresh = 0.6
model = utils.text.representation.LSI(args)
d["embedding"] = model.generate_embedding(d.question, returnarray=False)

# Cluster
X = pd.DataFrame(d["embedding"].values.tolist(), index= d.index).to_numpy()
clustering = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, distance_threshold=args.dist_thresh).fit(X)
d["ahc_label"] = clustering.labels_

# Misc.
args.n_clusters = len(d["ahc_label"].unique())
print(f"Found {args.n_clusters} clusters")
d.groupby("ahc_label")["question"].count().sort_values()

Found 32 clusters


ahc_label
21     2
20     2
18     2
9      2
31     3
22     3
19     3
12     3
25     3
29     4
28     4
27     4
17     4
7      4
8      4
26     4
23     4
11     4
10     4
4      5
16     5
30     6
3      6
6      6
14     6
15     8
5     10
13    12
24    12
0     13
1     15
2     83
Name: question, dtype: int64

In [236]:
for label in d.ahc_label.unique():
    print(f"cluster #{label}, count - {len(d[d.ahc_label==label])}")
    print(d[d.ahc_label==label][:10].question)

cluster #5, count - 10
0     Are research fundings going to be cut as a res...
1               Is it ok to go outside to get fresh air
2     Should I star home and even avoid going to the...
3              Are we going to have trouble buying milk
4     Will Covid19 go away like the 1918 influenza a...
6                        when covid 19 is going to stop
7                            When should I go to the ER
9                         Am I allowed to go for a walk
10                     When will it be ok to go outside
11    how much fever should i tolerate before going ...
Name: question, dtype: object
cluster #20, count - 2
5    Where should I go
8         how is going
Name: question, dtype: object
cluster #24, count - 12
12         Is there more than one strain of COVID 19
343    What do I do if someone with COVID 19 spotted
344                         COVID 19 stands for what
345                           Why is called COVID 19
346                     Where did COVID 19 come from

In [217]:
d.head()

Unnamed: 0,question,cluster,embedding,ahc_label
0,Are research fundings going to be cut as a res...,unclassified,"[0.03784225848917501, 0.14792442906499279, -0....",11
1,Is it ok to go outside to get fresh air,unclassified,"[0.10863800543379393, 0.19925573895788562, -0....",11
2,Should I star home and even avoid going to the...,unclassified,"[0.029156588250339985, 0.039139206493714655, -...",11
3,Are we going to have trouble buying milk,unclassified,"[0.03265145602538317, 0.028593934327626164, -0...",11
4,Will Covid19 go away like the 1918 influenza a...,unclassified,"[0.019492929504519453, 0.03203605628662124, -0...",11


In [231]:
d.groupby("ahc_label")["question"].count().describe()

count    71.000000
mean      3.521127
std       6.626482
min       1.000000
25%       1.000000
50%       2.000000
75%       4.000000
max      55.000000
Name: question, dtype: float64

In [141]:
# d.sort_values("ahc_label")[["question","ahc_label"]].to_csv(f"output/lv2_{args.n_topics}lsa_{args.dist_thresh}dt_{args.n_clusters}ahcx.csv")

In [232]:
dataset = dataset.join(d["ahc_label"])

In [220]:
dataset.sort_values("ahc_label")[["question","ahc_label","cluster"]]

Unnamed: 0,question,ahc_label,cluster
133,Does Wearing gloves help,0.0,unclassified
308,Should I wear mask,0.0,unclassified
310,Why does the government think that we don t ne...,0.0,unclassified
307,Is it useful to wear a mask if we are not sick,0.0,unclassified
311,Does a mask actually help if I m not sick,0.0,unclassified
309,Do I need a mask,0.0,unclassified
93,When this virus terminated,1.0,unclassified
111,What is the difference between a bacteria and ...,1.0,unclassified
110,how can one fight the virus,1.0,unclassified
109,Can this virus infect me in an airborne fashion,1.0,unclassified


In [233]:
dataset.sort_values("ahc_label")[["question","ahc_label","cluster"]].to_csv(f"output/text_rules_based_plusplus_{args.n_topics}lsa_{args.dist_thresh}dt_{args.n_clusters}ahcx.csv")

In [177]:
dataset = pd.read_csv(f"output/text_rules_based_plus_{args.n_topics}lsa_{args.dist_thresh}dt_{args.n_clusters}ahcx_manual.csv")

In [180]:
dataset["cluster"] = dataset.rule_based
dataset["cluster"][dataset.rule_based=="unclassified"] = dataset.ahc_manual_clutser_label

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [184]:
len(dataset[dataset.cluster.isna()])

250

In [185]:
# dataset.to_csv("output/text_rules_based_lv2.csv")

In [None]:
# Generating visualization csv for clusters
counts = pd.DataFrame(list(dataset.groupby("cluster")["question"].apply(list))).T.count().to_frame("Count")
visual = counts.T.append(pd.DataFrame(list(dataset.groupby("ahc_label")["question"].apply(list))).T.fillna("-")).add_prefix("cluster_#")

In [173]:
pd.DataFrame(list(dataset.groupby(["cluster","ahc_label"])["question"].apply(list))).T.count().to_frame("Count")

Unnamed: 0,Count
0,5
1,7
2,1
3,2
4,7
5,7
6,6
7,2
8,1
9,2
