# Questions Clustering

## Expected
Questions to be sorted out such that the response to the whole cluster is samel.

<!--### To Do-->
Author: Sunanda Bansal  
Organization: Dataperformers  
Date: 24 Mar, 2020 (Start)  

In [4]:
import re
import regex
import os
import csv
import sys
import json
import time
import scipy
import socket
import pickle
import numpy as np
import pandas as pd
import multiprocessing as mp

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Normalizer   
from sklearn import metrics   
from sklearn.metrics import confusion_matrix
# from sklearn.metrics.pairwise import paired_distances as sklearn_paired_distances

# Plotting
import seaborn as sn
import matplotlib.pyplot as plt

# importing personal development helper classes
import utils

Using TensorFlow backend.


## Define variables here

Mostly the code will be intended to use with arguments that can be passed in comman line, but jupyter notebook doesn't handle `argparse` well, so the Args class is a temporary way to write the code assumming the variables to be an attribute of an object.

In [100]:
class Args:
    def __init__(self):
        # The very big scraped file, give absolute path, outside the repo
        # self.filename = "query_result_2020-03-27T19_12_30.866993Z.csv"
        self.filename = "data_dump_2020-04-14.csv"
        self.dataset = f"data/{self.filename}"        
        self.suffix = "_".join([word for word in self.filename.split("_") if not word.isalpha()])[:-4]
        self.vector_mode = "tfidf"
        self.n_topics = 230
        self.dist_thresh = 1.5
        self.lang = "en"
args=Args()

In [101]:
new_labels = {
                "about": "covid-what",
                "animals": "covid-animals",
                "caution": "personal-caution",
                "dangerisk": "covid-contagious",
                "diff": "covid-versus",
                "future": "situation-future",
                "guideme": "personal-whatif",
                "incubation": "covid-incubation",
                "infection": "covid-infection",
                "isolation": "personal-isolation",
                "lockdown": "situation-lockdown",
                "nextsteps": "personal-symptoms",
                "past": "situation-past",
                "recover": "covid-recovery",
                "statistics": "situation-stats",
                "symptom": "covid-symptoms",
                "test": "personal-testing",
                "transmission": "covid-transmission",
                "treatment": "covid-med",
                "unclassified": "unclassified",
                "virusfight": "covid-fight",
                "viruskill": "covid-kill",
                "viruslife": "covid-life"
            }

In [102]:
# Disable (FALSE) displaying warnings from the OpenMP* run-time library during program execution.
os.environ['KMP_WARNINGS'] = "FALSE"

In [103]:
def surrounding(word,area=2):
    return

def fuzzy_match(word,pattern):
    if regex.search(pattern, word, re.IGNORECASE):
        return True
    else:
        return False    

In [104]:
import re
import nltk
import unidecode

alpha_regex = re.compile('[^a-zA-Z]')

from nltk.corpus import stopwords as sw
if args.lang == "en": stopwords = sw.words('english')
if args.lang == "fr": stopwords = sw.words('french')

from nltk.stem.snowball import SnowballStemmer
if args.lang == "en": stemmer = SnowballStemmer("english")
if args.lang == "fr": stemmer = SnowballStemmer("french")

def tokenize(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            word = unidecode.unidecode(word)
            clean_words = alpha_regex.sub(' ', word).split()
            tokens.extend([word.lower() for word in clean_words])
    return tokens

def stem(word):
    return stemmer.stem(word).strip()

def preprocess(text):    
    tokenized = tokenize(text)
    cleaned = [word for word in tokenized if word not in stopwords and word is not '']
#     stemed = [stem(word) for word in cleaned]
    #stemed = [stem(word) for word in tokenized]
    #corpus[i] = ' '.join(tokenized)
    return ' '.join(tokenized)

## Dataset

In [105]:
# Will add later, right now, leaving it to the utils doc2vec

In [106]:
dataset = pd.read_csv(f"data/{args.filename}")
print(f"Dataset has {len(dataset)} documents")

# Language Detection
def detect_lang(text):
    try:
        return detect(text)
    except:
        return "unidentifiable"   

from langdetect import detect
dataset["detected_lang"] = dataset.question.apply(detect_lang)

dataset = dataset[dataset.detected_lang == args.lang]

dataset["text"] = dataset.question.apply(preprocess)
print(f"Dataset has {len(dataset)} english documents")

Dataset has 5005 documents
Dataset has 2007 english documents


In [107]:
# dataset[(dataset.language != "en") & (dataset.detected_lang == "en")]

In [108]:
# dataset[(dataset.language == "en") & (dataset.detected_lang != "en")]

## Rules
Note: The order of these rules matters in resolving conflicts

In [109]:
# Statistics
dataset["statistics"] = (
                        dataset.text.str.contains("cases",case=False)|
                        dataset.text.str.contains("death",case=False)|
                        dataset.text.str.contains("died",case=False)|
                        dataset.text.str.contains("mortality rate",case=False)|
                        dataset.text.str.contains("death rate",case=False)|
                        dataset.text.str.contains("deadly",case=False)|
                        dataset.text.str.contains("statistic",case=False)|
                        (
                            dataset.text.str.contains("how",case=False)&
                            dataset.text.str.contains("many",case=False)&
                            dataset.text.str.contains("people",case=False)
                        )
                    ).apply(int)

In [110]:
dataset["animals"] = (
                        dataset.text.str.contains(r"\b(?:animal|bird|cat|dog)s?\b",case=False)
                    ).apply(int)

In [111]:
dataset["caution"] = (
                        dataset.text.str.contains("prevent",case=False)|
                        dataset.text.str.contains("protect",case=False)|
                        dataset.text.str.contains("precaution",case=False)|
                        dataset.text.str.contains("safety",case=False)|
                        (
                            dataset.text.str.contains("keep",case=False)&
                            dataset.text.str.contains("safe",case=False)
                        )
                    ).apply(int)

In [112]:
dataset["viruslife"] = (
                            (
                                dataset.text.apply(fuzzy_match, pattern="(?:covid){e<=2}")|
                                dataset.text.str.contains("corona",case=False)|
                                dataset.text.str.contains("virus",case=False)
                            )&
                            (
                                dataset.text.str.contains("live|stay|survive",case=False)
                            )&
                            (
                                dataset.text.str.contains("on",case=False)
                            )
                    ).apply(int)

In [113]:
dataset["viruskill"] = (
                            (
                                dataset.text.apply(fuzzy_match, pattern="(?:covid){e<=2}")|
                                dataset.text.str.contains("corona",case=False)|
                                dataset.text.str.contains("virus",case=False)
                            )&
                            (
                                dataset.text.str.contains("kills",case=False)
                            )
                    ).apply(int)

In [114]:
dataset["virusfight"] = (
                            (
                                (
                                    dataset.text.apply(fuzzy_match, pattern="(?:covid){e<=2}")|
                                    dataset.text.str.contains("corona",case=False)|
                                    dataset.text.str.contains("virus",case=False)
                                )&
                                (
                                    dataset.text.str.contains("fight",case=False)
                                )&
                                (
                                    dataset.text.str.contains("help",case=False)
                                )
                            )|
                            (
                                dataset.text.str.contains("mask",case=False)|
                                dataset.text.str.contains("glove",case=False)
                            )
                        ).apply(int)

In [115]:
dataset["treatment"] = (
                        dataset.text.str.contains("treatment",case=False)|
                        dataset.text.str.contains("cure",case=False)|
                        dataset.text.str.contains("vaccine",case=False)|
                        dataset.text.str.contains("medic",case=False)
                    ).apply(int)

In [116]:
dataset["incubation"] = (
                        dataset.text.str.contains("incubate",case=False)|
                        dataset.text.str.contains("incubation",case=False)
                    ).apply(int)

In [117]:
dataset["nextsteps"] = (
                        dataset.text.str.contains("i have",case=False) 
                    ).apply(int)

In [118]:
dataset["guideme"] = (
                        dataset.text.str.contains("if",case=False)
                    ).apply(int)

In [119]:
# dataset["hospital"] = (
# #                         dataset.text.str.contains(r"\bgo\b",case=False)&
#                         (
#                             dataset.text.str.contains("hospital",case=False)|                            
#                             dataset.text.str.contains(r"\bER\b",case=False)
#                         )
#                     ).apply(int)

In [120]:
# Dos and Donts
dataset["lockdown"] = (
                        (
                            (
                                dataset.text.str.contains("go (?:on|to|for|out)",case=False)|
                                dataset.text.str.contains("walk",case=False)
                            )&
                            (
                                dataset.text.str.contains("allow",case=False)|
                                dataset.text.str.contains("can",case=False)|
                                dataset.text.str.contains("ok|okay",case=False)|
                                dataset.text.str.contains("should|shall",case=False)
                            )
                        )|
                        (
                            dataset.text.str.contains("lockdown",case=False)|
                            dataset.text.str.contains(r"\bopen\b",case=False)|
                            dataset.text.str.contains(r"\bclose",case=False)
                        )
                    ).apply(int)

In [121]:
dataset["infection"] = (
                        dataset.text.str.contains("infected",case=False)|
                        dataset.text.str.contains("infection",case=False)
                    ).apply(int)

In [122]:
dataset["diff"] = (
                        dataset.text.str.contains("diff",case=False)|
                        dataset.text.apply(fuzzy_match, pattern="(?:distinguish){e<=3}")
                    ).apply(int)

In [123]:
dataset["recover"] = (
                        dataset.text.str.contains("recover",case=False)
                    ).apply(int)

In [124]:
dataset["test"] = (
                        dataset.text.str.contains("tested",case=False)|
                        dataset.text.str.contains("test",case=False)
                    ).apply(int)

In [125]:
dataset["isolation"] = (
                            dataset.text.str.contains(r"\bisolat",case=False)|
                            dataset.text.str.contains(r"\bsocial dist",case=False)
                        ).apply(int)

In [126]:
dataset["dangerisk"] = (
                            dataset.text.str.contains("dangerous",case=False)|
                            dataset.text.str.contains("risk",case=False)
                        ).apply(int)

In [127]:
dataset["transmission"] = (
                            dataset.text.str.contains("transmi",case=False)|
                            dataset.text.str.contains("contract",case=False)|
                            dataset.text.str.contains("spread",case=False)|
                            dataset.text.apply(fuzzy_match, pattern="(?:airborne){e<=3}")
                        ).apply(int)

In [128]:
# Fuzzy Matching of 'Symptom' keyword (accounting for spelling errors)
dataset["symptom"] = (
                        dataset.text.apply(fuzzy_match, pattern="(?:symptom){1<=e<=3}")
                    ).apply(int)

In [129]:
dataset["about"] = (
                        (
                            dataset.text.apply(fuzzy_match, pattern="(?:whats|what (?:is|s))")
                        ) & 
                        (
                            dataset.text.apply(fuzzy_match, pattern="(?:covid){e<=2}")|
                            dataset.text.str.contains("corona",case=False)
                        )
                    ).apply(int)

In [130]:
dataset["future"] = (
                        (
                            (
                                dataset.text.str.contains("how",case=False) &
                                dataset.text.str.contains("long",case=False)
                            )|
                            dataset.text.str.contains("when",case=False)
                        )&
                            dataset.text.str.contains("will",case=False)&
                        (
                            dataset.text.str.contains("last|end|over|normal|done",case=False)
                        )
                    ).apply(int)

In [131]:
dataset["past"] = (
                        (
                            dataset.text.str.contains("how|when|where",case=False) 
                        )&
                            dataset.text.str.contains("did",case=False)&
                        (
                            dataset.text.str.contains("start|begin|began",case=False)
                        )
                    ).apply(int)

In [132]:
dataset = dataset.rename(columns=new_labels)

In [133]:
features = [col_name for col_name in dataset.columns.values.tolist() if "-" in col_name]

In [134]:
features

['situation-stats',
 'covid-animals',
 'personal-caution',
 'covid-life',
 'covid-kill',
 'covid-fight',
 'covid-med',
 'covid-incubation',
 'personal-symptoms',
 'personal-whatif',
 'situation-lockdown',
 'covid-infection',
 'covid-versus',
 'covid-recovery',
 'personal-testing',
 'personal-isolation',
 'covid-contagious',
 'covid-transmission',
 'covid-symptoms',
 'covid-what',
 'situation-future',
 'situation-past']

In [135]:
dataset["total"] = dataset[features].sum(axis=1)

In [136]:
dataset.groupby("total").describe()

Unnamed: 0_level_0,situation-stats,situation-stats,situation-stats,situation-stats,situation-stats,situation-stats,situation-stats,situation-stats,covid-animals,covid-animals,...,situation-future,situation-future,situation-past,situation-past,situation-past,situation-past,situation-past,situation-past,situation-past,situation-past
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
total,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,549.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,549.0,0.0,...,0.0,0.0,549.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1058.0,0.07845,0.269005,0.0,0.0,0.0,0.0,1.0,1058.0,0.009452,...,0.0,1.0,1058.0,0.003781,0.0614,0.0,0.0,0.0,0.0,1.0
2,289.0,0.058824,0.235702,0.0,0.0,0.0,0.0,1.0,289.0,0.044983,...,0.0,1.0,289.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,80.0,0.025,0.15711,0.0,0.0,0.0,0.0,1.0,80.0,0.0375,...,0.0,0.0,80.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,21.0,0.047619,0.218218,0.0,0.0,0.0,0.0,1.0,21.0,0.047619,...,0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.285714,...,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,1.0,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0


In [137]:
dataset["cluster"] = "unclassified"

# For single features
for col in features:
    dataset["cluster"][(dataset.total == 1) & (dataset[col] == True)] = col

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [138]:
for col in features[::-1]:
    dataset["cluster"][(dataset.total > 1) & (dataset[col] == True)] = col

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [139]:
path = f"output/simple_{args.suffix}_{args.lang}.csv"
dataset.drop(features, axis="columns").drop(["text","total"], axis="columns").to_csv(path)
print(f"Rules based output saved to {path}")

Rules based output saved to output/simple_2020-04-14_en.csv


In [140]:
dataset.groupby("cluster")["question"].count()

cluster
covid-animals          29
covid-contagious       47
covid-fight            43
covid-incubation       16
covid-infection        66
covid-kill              6
covid-life             56
covid-med              58
covid-recovery         15
covid-symptoms        227
covid-transmission     55
covid-versus            3
covid-what            106
personal-caution       55
personal-isolation     39
personal-symptoms     159
personal-testing      103
personal-whatif       187
situation-future       19
situation-lockdown     62
situation-past          4
situation-stats       103
unclassified          549
Name: question, dtype: int64

### Length analysis for situations

## LSA and AHC

In [547]:
d = dataset[dataset.cluster=="unclassified"][["question", "cluster"]]

In [548]:
import nltk
stopwords_list = nltk.corpus.stopwords.words('english')

In [549]:
wn_lemmatizer = nltk.stem.WordNetLemmatizer()
def bulk_tokenizer(texts):
#      return [[wn_lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)] for text in texts]
     return [nltk.word_tokenize(text) for text in texts]

In [550]:
args.n_topics = 15
args.dist_thresh = 0.7
model = utils.text.representation.LSI(args, tokenizer=bulk_tokenizer)
d["embedding"] = model.generate_embedding(d.question, returnarray=False)

# Cluster
X = pd.DataFrame(d["embedding"].values.tolist(), index= d.index).to_numpy()
clustering = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, distance_threshold=args.dist_thresh).fit(X)
d["ahc_label"] = clustering.labels_

# Misc.
args.n_clusters = len(d["ahc_label"].unique())
print(f"Found {args.n_clusters} clusters")
d.groupby("ahc_label")["question"].count().sort_values()

Found 43 clusters


ahc_label
31     2
21     3
36     3
41     3
40     4
38     4
34     4
29     4
23     4
17     4
39     5
42     5
4      6
20     6
3      7
16     7
19     7
30     7
26     7
0      8
15     8
5      8
28     8
35     9
27     9
12    10
8     10
24    10
14    11
37    13
33    13
18    14
9     15
13    18
25    18
11    19
1     21
6     22
7     22
2     23
10    26
22    33
32    36
Name: question, dtype: int64

In [551]:
d.groupby("ahc_label")["question"].count().describe()

count    43.000000
mean     11.069767
std       8.209737
min       2.000000
25%       5.000000
50%       8.000000
75%      14.500000
max      36.000000
Name: question, dtype: float64

In [552]:
# for label in d.ahc_label.unique():
#     print(f"\ncluster #{label}, count - {len(d[d.ahc_label==label])}")
#     print(d[d.ahc_label==label][:10].question.tolist())

In [553]:
dataset = dataset.join(d["ahc_label"])

In [554]:
path = f"output/simpleLsa_{args.n_topics}n{args.dist_thresh}dt_{args.suffix}.csv"
dataset.to_csv(path)
print(f"AHC on top of rule based output saved to {path}")

AHC on top of rule based output saved to output/simpleLsa_15n0.7dt_2020-03-27T19_12_30.866993Z.csv


## Iterative Clustering