# Questions Clustering

## Expected
Questions to be sorted out such that the response to the whole cluster is samel.

<!--### To Do-->
Author: Sunanda Bansal  
Organization: Dataperformers  
Date: 24 Mar, 2020 (Start)  

In [1]:
import re
import os
import csv
import sys
import json
import time
import scipy
import socket
import pickle
import numpy as np
import pandas as pd
import multiprocessing as mp

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Normalizer   
from sklearn import metrics   
from sklearn.metrics import confusion_matrix
# from sklearn.metrics.pairwise import paired_distances as sklearn_paired_distances

# Plotting
import seaborn as sn
import matplotlib.pyplot as plt

# importing personal development helper classes
import utils


# Disable (FALSE) displaying warnings from the OpenMP* run-time library during program execution.
os.environ['KMP_WARNINGS'] = "FALSE"

Using TensorFlow backend.


## Define variables here

Mostly the code will be intended to use with arguments that can be passed in comman line, but jupyter notebook doesn't handle `argparse` well, so the Args class is a temporary way to write the code assumming the variables to be an attribute of an object.

In [2]:
class Args:
    def __init__(self):
        self.dataset = "data/train_set_covid.csv"
        self.embedding_file = "/home/designer1/Documents/sunanda/Data/common/embeddings/glove.6B.100d.txt"
        self.n_clusters = 100
        self.cluster_function = "normavg"
args=Args()

In [3]:
def read_embeddings_to_dict(embedding_file, limit = None, sep = " ", key = 0):
    ''' Read the word embeddings from the file and converts it into df - GloVe by default '''
    
    # Load the embeddings
    embeddings = pd.read_csv(embedding_file, sep=sep, index_col=key, header=None, nrows = limit, quoting=csv.QUOTE_NONE)
    
    # converting to dict where the word is the key and the word embedding array is the value
    embeddings = embeddings.apply(np.asarray, axis = 1).to_dict()
    
    return embeddings


## Dataset

In [38]:
dataset = pd.read_csv(args.dataset)

# Language Detection
def detect_lang(text):
    try:
        return detect(text)
    except:
        return "unidentifiable"
    
from langdetect import detect
dataset["detected_lang"] = dataset.question.apply(detect_lang)

dataset = dataset[dataset.detected_lang == "en"]

In [5]:
# Read embeddings to dictionary
if args.embedding_file:
    embeddings_dict = read_embeddings_to_dict(args.embedding_file)

## WcDe and AHC

In [58]:
dataset = pd.read_csv("output/text_rules_based_plusplus_15lsa_0.3dt_71ahcx.csv").fillna("unclassified")
dataset = dataset[["question", "cluster"]]
d = dataset[dataset.cluster=="unclassified"]
d.head()

Unnamed: 0,question,cluster
0,where can I help,unclassified
1,Will the heat of the summer help fighting the ...,unclassified
2,Does eating Oranges daily help to fight this v...,unclassified
3,Is there any medication to take to help,unclassified
4,How long are you contagious if you have Corona...,unclassified


In [75]:
d.question.values.tolist()

['where can I help',
 'Will the heat of the summer help fighting the virus',
 'Does eating Oranges daily help to fight this virus',
 'Is there any medication to take to help',
 'How long are you contagious if you have Coronavirus',
 'How do I treat coronavirus',
 'What have you implemented to help older people who do not know how to use the internet phone to obtain their groceries and medicine',
 'Are there any federal or provincial services to help senior citizens get grocery safely',
 'How can I differ between covid and normal flu',
 'My head hurts my body aches a little and I feel a cold coming How do I know if this is Covid 19 or just an ordinary cold',
 'Does a mask actually help if I m not sick',
 'Does Wearing gloves help',
 'Can my cat get the virus',
 'If I catch the virus will I be immune from getting it again',
 'Can I get the virus from a stray cat',
 'Can you get the virus twice',
 'What is the difference between a bacteria and a virus',
 'Can this virus infect me in an ai

In [65]:
args.n_clusters = 200
# Initialize WcDe
model = utils.text.representation.WcDe().initialize(
                                                        data = [dataset.question], 
                                                        n_clusters = args.n_clusters,
                                                        embeddings = embeddings_dict,
                                                        removeStopwords = False
                                                    )

1143 unique words found in the training set
1110 words from the training set found in the embeddings


In [66]:
d["embedding"] = model.generate_embedding(d.question, returnarray=False)

0 new words found to be added


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [67]:
args.dist_thresh = 2.2
# Cluster
X = pd.DataFrame(d["embedding"].values.tolist(), index= d.index).to_numpy()
clustering = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, distance_threshold=args.dist_thresh).fit(X)
d["ahc_label"] = clustering.labels_

# Misc.
args.n_clusters = len(d["ahc_label"].unique())
print(f"Found {args.n_clusters} clusters")
d.groupby("ahc_label")["question"].count().sort_values()

Found 31 clusters


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


ahc_label
15     1
28     1
27     1
26     1
23     1
19     1
18     1
17     1
21     2
0      2
8      2
29     3
30     4
3      4
24     4
25     5
12     6
16     6
6      6
22     7
4      9
10     9
9     10
14    10
7     12
13    12
20    17
5     18
1     26
11    33
2     35
Name: question, dtype: int64

In [68]:
d.groupby("ahc_label")["question"].count().describe()

count    31.000000
mean      8.064516
std       9.121168
min       1.000000
25%       1.500000
50%       5.000000
75%      10.000000
max      35.000000
Name: question, dtype: float64

In [69]:
# Generating visualization csv for clusters
counts = pd.DataFrame(list(d.groupby("ahc_label")["question"].apply(list))).T.count().to_frame("Count")
visual = counts.T.append(pd.DataFrame(list(d.groupby("ahc_label")["question"].apply(list))).T.fillna("-")).add_prefix("cluster_#")

In [70]:
visual

Unnamed: 0,cluster_#0,cluster_#1,cluster_#2,cluster_#3,cluster_#4,cluster_#5,cluster_#6,cluster_#7,cluster_#8,cluster_#9,...,cluster_#21,cluster_#22,cluster_#23,cluster_#24,cluster_#25,cluster_#26,cluster_#27,cluster_#28,cluster_#29,cluster_#30
Count,2,26,35,4,9,18,6,12,2,10,...,2,7,1,4,5,1,1,1,3,4
0,How about covid situations,If I catch the virus will I be immune from get...,Will the heat of the summer help fighting the ...,Has the virus evolved,How to avoid exposure,How can I differ between covid and normal flu,What is the advice surrounding outdoor exercis...,What is the difference between a bacteria and ...,What is the mortality rate,How severe is the virus,...,Impacts of it,When this virus terminated,Hypertension,how are you,Do I need to disinfect things I buy like groce...,What the size,Is ACE Inhibitor harmful,mental health,Is the virus airborne,When will we be back to work
1,Infomation about covid,How many will lose their jobs due to the chine...,Does a mask actually help if I m not sick,Corona virus characteristics,how to prevent,Do we see a future where this virus won t be s...,is the grocery open,How s the shortness of breath,Mortality rate today,How contagious is the virus,...,Impact of covid,where did it start,-,How are you,What should I use to disinfect grocery bags an...,-,-,-,Is Covid19 an airborne,who can you do
2,-,Should I star home and even avoid going to the...,Can my cat get the virus,Define covid 18,My brother has COVID 19 What to do,In this time of crisis is it okay to feed the ...,COVID 19 stands for what,Suffering from trout echiting,-,Is it transmissible through anal sex,...,-,How Covid 19 started,-,How are you,waht are thenoptiin to disinfect hands and sur...,-,-,-,Is COVID 19 air borne,When will it be ok to go outside
3,-,Are we going to have trouble buying milk,Can I get the virus from a stray cat,Is the phlegm color suggestive of coronavirus,Is it true that warm kills Coronavirus,Will there be any changes in PGWP,What COVID 19 stands for,What s the capital of Palestine,-,is it made by human or by nature,...,-,what do you know about COVID 19,-,how old are you,Should I use Sudafed,-,-,-,-,When will be done
4,-,how much fever should i tolerate before going ...,Can this virus infect me in an airborne fashion,-,Will high temperature kill coronavirus,Will it get better in summer with higher tempe...,Are vet clinics still open,What are the side effects of hydroxychloroquin...,-,What is the situation now in my country,...,-,How did this virus transmit,-,-,Is it safe to jog run outside,-,-,-,-,-
5,-,hat should I do if my 2 year old child starts ...,Will the virus affect my sexual ability,-,Is it true that warm kills Coronavirus,how can one fight the virus,is the liquor store open,How effective is the new drug manufactured by ...,-,What is the situation right now in the world,...,-,Is there more than one strain of COVID 19,-,-,-,-,-,-,-,-
6,-,I recently relocated to Quebec Can I request R...,If you recover from the virus can you still be...,-,Is it useful to wear a mask if we are not sick,In how many days can covid be cured,-,is there a new form of corona virus,-,Is the virus deadly,...,-,How much it s prevelent in Pakistan,-,-,-,-,-,-,-,-
7,-,When will the schools reopen,Are research fundings going to be cut as a res...,-,How do we best organize people so that duplica...,how to interact with people,-,Is ordering food from a restaurant discouraged,-,How deadly is the virus,...,-,-,-,-,-,-,-,-,-,-
8,-,im sneezing should i be worried,As of today can we gather only 2,-,could we have done better,Will the pandemic affect university research,-,What is the probability of dying,-,How deadly is the virus,...,-,-,-,-,-,-,-,-,-,-


In [48]:
for label in dataset.ahc_label.unique():
    print(f"cluster #{label}, count - {len(dataset[dataset.ahc_label==label])}")
    print(dataset[dataset.ahc_label==label][:10].question)

cluster #3, count - 41
0                                      where can I help
6     What have you implemented to help older people...
8         How can I differ between covid and normal flu
9     My head hurts my body aches a little and I fee...
21              Will the virus affect my sexual ability
30    Should I star home and even avoid going to the...
32    how much fever should i tolerate before going ...
38    hat should I do if my 2 year old child starts ...
41                                Why is Trudeau asleep
43    I recently relocated to Quebec Can I request R...
Name: question, dtype: object
cluster #6, count - 47
1     Will the heat of the summer help fighting the ...
16    What is the difference between a bacteria and ...
28    How long does the virus remain active and aliv...
29    Are research fundings going to be cut as a res...
33                        How s the shortness of breath
37    In this time of crisis is it okay to feed the ...
44                     is it

In [180]:
dataset["cluster"] = dataset.rule_based
dataset["cluster"][dataset.rule_based=="unclassified"] = dataset.ahc_manual_clutser_label

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [55]:
dataset.sort_values("ahc_label")[["question","ahc_label"]].to_csv(f"output/{args.n_topics}lsa_{args.dist_thresh}dt_{args.n_clusters}ahcx.csv")
visual.to_csv(f"output/clusters_{args.n_topics}lsa_{args.dist_thresh}dt_{args.n_clusters}ahcx.csv")

In [233]:
dataset.sort_values("ahc_label")[["question","ahc_label","cluster"]].to_csv(f"output/text_rules_based_plusplus_{args.n_topics}lsa_{args.dist_thresh}dt_{args.n_clusters}ahcx.csv")

In [177]:
dataset = pd.read_csv(f"output/text_rules_based_plus_{args.n_topics}lsa_{args.dist_thresh}dt_{args.n_clusters}ahcx_manual.csv")

In [184]:
len(dataset[dataset.cluster.isna()])

250

In [185]:
# dataset.to_csv("output/text_rules_based_lv2.csv")

In [None]:
# Generating visualization csv for clusters
counts = pd.DataFrame(list(dataset.groupby("cluster")["question"].apply(list))).T.count().to_frame("Count")
visual = counts.T.append(pd.DataFrame(list(dataset.groupby("ahc_label")["question"].apply(list))).T.fillna("-")).add_prefix("cluster_#")

In [173]:
pd.DataFrame(list(dataset.groupby(["cluster","ahc_label"])["question"].apply(list))).T.count().to_frame("Count")

Unnamed: 0,Count
0,5
1,7
2,1
3,2
4,7
5,7
6,6
7,2
8,1
9,2
