# Questions Clustering

## Expected
Questions to be sorted out such that the response to the whole cluster is samel.

<!--### To Do-->
Author: Sunanda Bansal  
Organization: Dataperformers  
Date: 24 Mar, 2020 (Start)  

In [1]:
import re
import os
import csv
import sys
import json
import time
import nltk
import scipy
import socket
import pickle
import numpy as np
import pandas as pd
import multiprocessing as mp

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Normalizer   
from sklearn import metrics   
from sklearn.metrics import confusion_matrix
# from sklearn.metrics.pairwise import paired_distances as sklearn_paired_distances

# Plotting
import seaborn as sn
import matplotlib.pyplot as plt

# importing personal development helper classes
import utils


# Disable (FALSE) displaying warnings from the OpenMP* run-time library during program execution.
os.environ['KMP_WARNINGS'] = "FALSE"

Using TensorFlow backend.


## Define variables here

Mostly the code will be intended to use with arguments that can be passed in comman line, but jupyter notebook doesn't handle `argparse` well, so the Args class is a temporary way to write the code assumming the variables to be an attribute of an object.

In [4]:
class Args:
    def __init__(self):
        # The very big scraped file, give absolute path, outside the repo
        self.dataset = "data/train_set_covid.csv"
        self.vector_mode = "tfidf"
        self.n_topics = 230
        self.dist_thresh = 1.5
        self.lang = "fr"
args=Args()

In [58]:
import re
regex = re.compile('[^a-zA-Z]')

from nltk.corpus import stopwords as sw
if args.lang == "en": stopwords = sw.words('english')
if args.lang == "fr": stopwords = sw.words('french')

from nltk.stem.snowball import SnowballStemmer
if args.lang == "en": stemmer = SnowballStemmer("english")
if args.lang == "fr": stemmer = SnowballStemmer("english")

def tokenize(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            clean_words = regex.sub(' ', word).split()
            tokens.extend([word.lower() for word in clean_words])
    return tokens


def stem(word):
    return stemmer.stem(word).strip()

def preprocess(text):    
    tokenized = tokenize(text)
    cleaned = [word for word in tokenized if word not in stopwords and word is not '']
#     stemed = [stem(word) for word in cleaned]
    #stemed = [stem(word) for word in tokenized]
    #corpus[i] = ' '.join(tokenized)
    return ' '.join(tokenized)

## Dataset

In [3]:
dataset = pd.read_csv(args.dataset)

# Language Detection
def detect_lang(text):
    try:
        return detect(text)
    except:
        return "unidentifiable"   

from langdetect import detect
dataset["detected_lang"] = dataset.question.apply(detect_lang)

dataset = dataset[dataset.detected_lang == args.lang]

dataset["cleaned_question"] = dataset.question.apply(preprocess)

## LSI and AHC

In [44]:
args.n_topics = 230

In [50]:
args.dist_thresh = 3
model = utils.text.representation.LSI(args)
dataset["embedding"] = model.generate_embedding(dataset.question, returnarray=False)

# Cluster
X = pd.DataFrame(dataset["embedding"].values.tolist(), index= dataset.index).to_numpy()
clustering = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, distance_threshold=args.dist_thresh).fit(X)
dataset["ahc_label"] = clustering.labels_

# Misc.
args.n_clusters = len(dataset["ahc_label"].unique())
print(f"Found {args.n_clusters} clusters")
dataset.groupby("ahc_label")["question"].count().sort_values()

Found 14 clusters


ahc_label
10      9
11      9
7      18
13     20
3      22
2      23
6      25
8      25
1      27
4      27
5      27
9      29
12     30
0     397
Name: question, dtype: int64

In [51]:
dataset.groupby("ahc_label")["question"].count().describe()

count     14.000000
mean      49.142857
std      100.336248
min        9.000000
25%       20.500000
50%       25.000000
75%       27.000000
max      397.000000
Name: question, dtype: float64

In [52]:
# Generating visualization csv for clusters
counts = pd.DataFrame(list(dataset.groupby("ahc_label")["question"].apply(list))).T.count().to_frame("Count")
visual = counts.T.append(pd.DataFrame(list(dataset.groupby("ahc_label")["question"].apply(list))).T.fillna("-")).add_prefix("cluster_#")

In [53]:
visual

Unnamed: 0,cluster_#0,cluster_#1,cluster_#2,cluster_#3,cluster_#4,cluster_#5,cluster_#6,cluster_#7,cluster_#8,cluster_#9,cluster_#10,cluster_#11,cluster_#12,cluster_#13
Count,397,27,23,22,27,27,25,18,25,29,9,9,30,20
0,Are you sick?,what are the symptoms,I think I have a fever. What do I do?,What's covid19?,Is there a vaccine for corona virus?,How many confirmed cases in Ontario now?,What is coronavirus ?,How do I know if I haver coronavirus?,How long before symptoms start to show?,Which country infected ?,What is the incubation time of the virus?,Where should I be tested ?,what is covid-19,How many people have dies
1,When should I go to the hospital?,what are the symptoms of COVID-19?,How's the shortness of breath?,covid19 symptons?,what is the death rate of the new corona virus,How many cases in Quebec?,How long are you contagious if you have Corona...,may I know what is corona?,How long does it last?,How many infected person in south korea?,Is the virus infectious during incubation period?,Where can I get tested?,What is Covid-19?,who is at risk of dying after contracting the ...
2,Does the virus spread by rain?,What are the symptoms,I have fever what I should do,Will Covid19 go away like the 1918 influenza a...,Symptoms of corona,How many cases in Montreal?,How do I treat coronavirus?,what do you know about COVID-19?,How long does it last once infected ?,I have cough. Does that mean I might be infected?,What is the incubation time of covid-19 by age,where can I test for covid,What is the covid?,how many people positive result with COVID-19
3,What are the synthoms?,what are the symptoms,I have cold for couple of weeks. Have cough ov...,What is Covid19,"Hello, what happens after corona?",how many cases are there in Washington state,How do people get infected with Coronavirus?,I'd love to know how long I can be contagious?,How long am I infectious for,What is the current situation of COVID-19 infe...,What is the incubation period for COVID-19?,"I would like to get COVID19 test, where can I ...",What is covid,how to interact with people
4,What mesures needs to take a pregnant women to...,What are the symptoms,Is fever the main symptom for vivid-19,Can drinking alcohol helps COVID19?,Whats corona,How many cases in Montreal,Is it true that warm kills Coronavirus?,What have you implemented to help older people...,How long until an infected person shows symptoms?,How many people are infected in Lithuania?,What’s the incubation period?,Where do we get the test for COVID-19?,What is the COVID-19?,how many people died?
5,Do you think Taiwan is good?,What are the symptoms,What if I have cough ?,How can I make the difference between a cold a...,how much does corona virus stay on each kind o...,how many deaths are in Italy as of March 21,How does coronavirus propagate?,How do I know if I have the corona virus,How long can the covid-19 survive outside the ...,Am I infected,What is the period of incubation?,Were can I get tested?,what's is COVID-19?,What is the probability of dying?
6,Is the Asymptomatic Carrier a temporary status?,Symptoms of Covid,Must the fever-symptom related to COVID-19 be ...,What are the symptoms of the covid19?,What are the symptoms of Corona?,How many days can you be contagious without sh...,Is it true that warm kills Coronavirus?,How do I know if my chest is hurt because lung...,How long is the treatment?,What do I do if I suspect I'm infected?,What is the incubation period?,"if i think i have it, should i got get tested",Covid-19,how many people in America?
7,What is it,What are the symptoms of Covid-19?,Is a cough but no fever a symptom?,What's covid19?,Why is it called Corona virus?,How many case in montreal,"I think I have coronavirus, should I go to the...",How do I know if I have the flu or coronavirus,How long does it take to recover?,how can I prevent to get infected?,How long is incubation period?,where can I get tested,What is covid?,How many people have it in my town?
8,How to avoid exposure?,Symptoms,I am coughing but I don't have fever. Could I ...,Can you explain to me what is the covid19?,What shall I do if I have to go out to buy thi...,How many thing you do not understand,Will high temperature kill coronavirus?,How do I know if I have the covid-19,How long will it last?,Which country infected ?,How long does this virus incubate?,How do I know when to get tested?,What is covid 19?,How does it die in asymptomatic people?


In [55]:
dataset.sort_values("ahc_label")[["question","ahc_label"]].to_csv(f"output/{args.n_topics}lsa_{args.dist_thresh}dt_{args.n_clusters}ahcx.csv")
visual.to_csv(f"output/clusters_{args.n_topics}lsa_{args.dist_thresh}dt_{args.n_clusters}ahcx.csv")

# LEVEL 2

In [228]:
dataset = pd.read_csv("output/text_rules_based_lv2.csv").fillna("unclassified")
dataset = dataset[["question", "cluster"]]
d = dataset[dataset.cluster=="unclassified"]
d.head()

Unnamed: 0,question,cluster
0,Are research fundings going to be cut as a res...,unclassified
1,Is it ok to go outside to get fresh air,unclassified
2,Should I star home and even avoid going to the...,unclassified
3,Are we going to have trouble buying milk,unclassified
4,Will Covid19 go away like the 1918 influenza a...,unclassified


In [235]:
args.n_topics = 15
args.dist_thresh = 0.6
model = utils.text.representation.LSI(args)
d["embedding"] = model.generate_embedding(d.question, returnarray=False)

# Cluster
X = pd.DataFrame(d["embedding"].values.tolist(), index= d.index).to_numpy()
clustering = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, distance_threshold=args.dist_thresh).fit(X)
d["ahc_label"] = clustering.labels_

# Misc.
args.n_clusters = len(d["ahc_label"].unique())
print(f"Found {args.n_clusters} clusters")
d.groupby("ahc_label")["question"].count().sort_values()

Found 32 clusters


ahc_label
21     2
20     2
18     2
9      2
31     3
22     3
19     3
12     3
25     3
29     4
28     4
27     4
17     4
7      4
8      4
26     4
23     4
11     4
10     4
4      5
16     5
30     6
3      6
6      6
14     6
15     8
5     10
13    12
24    12
0     13
1     15
2     83
Name: question, dtype: int64

In [236]:
for label in d.ahc_label.unique():
    print(f"\ncluster #{label}, count - {len(d[d.ahc_label==label])}")
    print(d[d.ahc_label==label][:10].question)

cluster #5, count - 10
0     Are research fundings going to be cut as a res...
1               Is it ok to go outside to get fresh air
2     Should I star home and even avoid going to the...
3              Are we going to have trouble buying milk
4     Will Covid19 go away like the 1918 influenza a...
6                        when covid 19 is going to stop
7                            When should I go to the ER
9                         Am I allowed to go for a walk
10                     When will it be ok to go outside
11    how much fever should i tolerate before going ...
Name: question, dtype: object
cluster #20, count - 2
5    Where should I go
8         how is going
Name: question, dtype: object
cluster #24, count - 12
12         Is there more than one strain of COVID 19
343    What do I do if someone with COVID 19 spotted
344                         COVID 19 stands for what
345                           Why is called COVID 19
346                     Where did COVID 19 come from

In [217]:
d.head()

Unnamed: 0,question,cluster,embedding,ahc_label
0,Are research fundings going to be cut as a res...,unclassified,"[0.03784225848917501, 0.14792442906499279, -0....",11
1,Is it ok to go outside to get fresh air,unclassified,"[0.10863800543379393, 0.19925573895788562, -0....",11
2,Should I star home and even avoid going to the...,unclassified,"[0.029156588250339985, 0.039139206493714655, -...",11
3,Are we going to have trouble buying milk,unclassified,"[0.03265145602538317, 0.028593934327626164, -0...",11
4,Will Covid19 go away like the 1918 influenza a...,unclassified,"[0.019492929504519453, 0.03203605628662124, -0...",11


In [231]:
d.groupby("ahc_label")["question"].count().describe()

count    71.000000
mean      3.521127
std       6.626482
min       1.000000
25%       1.000000
50%       2.000000
75%       4.000000
max      55.000000
Name: question, dtype: float64

In [141]:
# d.sort_values("ahc_label")[["question","ahc_label"]].to_csv(f"output/lv2_{args.n_topics}lsa_{args.dist_thresh}dt_{args.n_clusters}ahcx.csv")

In [232]:
dataset = dataset.join(d["ahc_label"])

In [220]:
dataset.sort_values("ahc_label")[["question","ahc_label","cluster"]]

Unnamed: 0,question,ahc_label,cluster
133,Does Wearing gloves help,0.0,unclassified
308,Should I wear mask,0.0,unclassified
310,Why does the government think that we don t ne...,0.0,unclassified
307,Is it useful to wear a mask if we are not sick,0.0,unclassified
311,Does a mask actually help if I m not sick,0.0,unclassified
309,Do I need a mask,0.0,unclassified
93,When this virus terminated,1.0,unclassified
111,What is the difference between a bacteria and ...,1.0,unclassified
110,how can one fight the virus,1.0,unclassified
109,Can this virus infect me in an airborne fashion,1.0,unclassified


In [233]:
dataset.sort_values("ahc_label")[["question","ahc_label","cluster"]].to_csv(f"output/text_rules_based_plusplus_{args.n_topics}lsa_{args.dist_thresh}dt_{args.n_clusters}ahcx.csv")

In [177]:
dataset = pd.read_csv(f"output/text_rules_based_plus_{args.n_topics}lsa_{args.dist_thresh}dt_{args.n_clusters}ahcx_manual.csv")

In [180]:
dataset["cluster"] = dataset.rule_based
dataset["cluster"][dataset.rule_based=="unclassified"] = dataset.ahc_manual_clutser_label

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [184]:
len(dataset[dataset.cluster.isna()])

250

In [185]:
# dataset.to_csv("output/text_rules_based_lv2.csv")

In [None]:
# Generating visualization csv for clusters
counts = pd.DataFrame(list(dataset.groupby("cluster")["question"].apply(list))).T.count().to_frame("Count")
visual = counts.T.append(pd.DataFrame(list(dataset.groupby("ahc_label")["question"].apply(list))).T.fillna("-")).add_prefix("cluster_#")

In [173]:
pd.DataFrame(list(dataset.groupby(["cluster","ahc_label"])["question"].apply(list))).T.count().to_frame("Count")

Unnamed: 0,Count
0,5
1,7
2,1
3,2
4,7
5,7
6,6
7,2
8,1
9,2
