# Questions Clustering

## Expected
Questions should be grouped such that the same response applies to every question in a cluster.

<!--### To Do-->
Author: Sunanda Bansal  
Organization: Dataperformers  
License: CC BY-NC   
Date: 24 Mar, 2020 (Start)  

In [83]:
import re
import regex
import os
import csv
import sys
import json
import time
import scipy
import socket
import pickle
import numpy as np
import pandas as pd
import multiprocessing as mp

from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Normalizer   
from sklearn import metrics   
from sklearn.metrics import confusion_matrix
# from sklearn.metrics.pairwise import paired_distances as sklearn_paired_distances

# Plotting
import seaborn as sn
import matplotlib.pyplot as plt

# importing personal development helper classes
import utils

## Define variables here

Mostly, the code is intended to be used with arguments passed on the command line, but Jupyter notebooks don't handle `argparse` well, so the `Args` class is a temporary way to write the code, assuming the variables are attributes of an object.

In [84]:
class Args:
    """Notebook stand-in for command-line arguments.

    Each setting is stored as an instance attribute so the rest of the code
    can read ``args.<name>`` the same way it would read a parsed-arguments
    object.
    """

    def __init__(self):
        # Path to the (very large) scraped questions CSV.
        # NOTE(review): the original note asked for an absolute path outside
        # the repo, but the value below is relative — confirm which is intended.
        self.dataset = "data/query_result_2020-03-27T19_12_30.866993Z.csv"

        # NOTE(review): the remaining settings are not read anywhere in the
        # visible part of this notebook; they look like embedding-model
        # options — confirm against the code that consumes them.
        self.vector_size = 100
        self.window = 5
        self.max_vocab_size = None
        self.min_count = 5
        self.repr_method = "doc2vec"


# Single shared settings object used by the cells below.
args = Args()

In [85]:
# Silence warnings emitted by the OpenMP* run-time library while the program
# runs: KMP_WARNINGS=FALSE turns their display off.
os.environ["KMP_WARNINGS"] = "FALSE"

## Reading Dataset

In [86]:
# Load the CSV file named by args.dataset into a DataFrame.
dataset = pd.read_csv(args.dataset)

In [87]:
# Number of rows read from the CSV.
len(dataset)

4294

## Preprocess

In [128]:
# Will add later, right now, leaving it to the utils doc2vec

In [89]:
def detect_lang(text):
    """Return the language detected for ``text``, or a fallback marker.

    Delegates to ``detect`` (imported from ``langdetect`` in a later cell, so
    that import must have run before this function is called).

    Args:
        text: The value to run language detection on.

    Returns:
        Whatever ``detect(text)`` returns, or the string ``"unidentifiable"``
        when ``detect`` raises an exception for this input.
    """
    try:
        return detect(text)
    # Catch Exception instead of using a bare ``except`` so that
    # KeyboardInterrupt and SystemExit still propagate; otherwise interrupting
    # a long ``apply`` would silently label rows as "unidentifiable".
    except Exception:
        return "unidentifiable"

In [90]:
# NOTE(review): this import sits mid-notebook; detect_lang (defined in the
# previous cell) relies on it having run before the apply below.
from langdetect import detect

# Record the language detect_lang reports for every question.
dataset["detected_lang"] = dataset["question"].apply(detect_lang)

In [91]:
# Keep only the questions whose detected language is English.
# The explicit .copy() makes the result an independent DataFrame, so the column
# assignments in the following cells modify it directly instead of operating on
# a slice of the original frame (which triggers pandas' SettingWithCopyWarning).
dataset = dataset[dataset.detected_lang == "en"].copy()

In [92]:
def preprocess(text):
    """Reduce ``text`` to its word tokens joined by single spaces.

    Every maximal run of word characters is kept as a token; all other
    characters (punctuation, whitespace, ...) only act as separators, so e.g.
    an apostrophe splits a contraction into two tokens.
    """
    tokens = re.findall(r"[^\W]+", text, re.IGNORECASE)
    return " ".join(tokens)

# Replace each question with its preprocessed (word-tokens-only) form.
dataset["question"] = dataset["question"].apply(preprocess)

In [93]:
# Number of rows left after keeping only questions detected as English.
len(dataset)

1673

In [94]:
def surrounding(word, area=2):
    """Placeholder with no implementation yet; always returns ``None``."""
    return None

def fuzzy_match(word, pattern):
    """Tell whether ``pattern`` is found anywhere in ``word``, ignoring case.

    The search goes through the ``regex`` module, and callers in this notebook
    pass patterns that use its fuzzy-matching syntax (e.g. ``(?:covid){e<=2}``).

    Args:
        word: The text to search in.
        pattern: The pattern to search for.

    Returns:
        ``True`` if ``regex.search`` finds a match, ``False`` otherwise.
    """
    return regex.search(pattern, word, re.IGNORECASE) is not None

In [95]:
# 1 when the question contains a fuzzy match for "symptom" (the pattern asks
# for between 1 and 3 errors), else 0.
# NOTE(review): three edits on a seven-letter word is permissive — e.g.
# "system" is within three edits of "symptom" — confirm this is intended.
dataset["symptom"] = (
    dataset["question"]
    .apply(fuzzy_match, pattern="(?:symptom){1<=e<=3}")
    .apply(int)
)

In [96]:
# Statistics: 1 when the question contains "cases", "death" or "died"
# (case-insensitive), else 0.
dataset["statistics"] = (
    dataset["question"].str.contains("cases", case=False)
    | dataset["question"].str.contains("death", case=False)
    | dataset["question"].str.contains("died", case=False)
).apply(int)

In [97]:
# Dos and don'ts: 1 when the question has the whole word "can" and also
# "go to", "go for" or "go out" (case-insensitive), else 0.
dataset["can"] = (
    dataset["question"].str.contains(r"\bcan\b", case=False)
    & dataset["question"].str.contains("go (?:to|for|out)", case=False)
).apply(int)

  after removing the cwd from sys.path.


In [98]:
# 1 when the question has the whole word "go" and mentions either "hospital"
# or the whole word "ER" (all case-insensitive), else 0.
dataset["hospital"] = (
    dataset["question"].str.contains(r"\bgo\b", case=False)
    & (
        dataset["question"].str.contains("hospital", case=False)
        | dataset["question"].str.contains(r"\bER\b", case=False)
    )
).apply(int)

In [99]:
# 1 when the question has a word starting with "isolat" or the phrase
# "social dist..." (case-insensitive), else 0.
dataset["isolation"] = (
    dataset["question"].str.contains(r"\bisolat", case=False)
    | dataset["question"].str.contains(r"\bsocial dist", case=False)
).apply(int)

In [100]:
# 1 when the question has a "what is" form ("whats", "what is", "what s") and
# also names the disease: a fuzzy match for "covid" (at most 2 errors) or the
# substring "corona"; else 0.
dataset["about"] = (
    dataset["question"].apply(fuzzy_match, pattern="(?:whats|what (?:is|s))")
    & (
        dataset["question"].apply(fuzzy_match, pattern="(?:covid){e<=2}")
        | dataset["question"].str.contains("corona", case=False)
    )
).apply(int)

In [101]:
# 1 when the question contains "incubate" or "incubation" (case-insensitive),
# else 0.
dataset["incubation"] = (
    dataset["question"].str.contains("incubate", case=False)
    | dataset["question"].str.contains("incubation", case=False)
).apply(int)

In [102]:
# 1 when the question contains "infected" or "infection" (case-insensitive),
# else 0.
dataset["infection"] = (
    dataset["question"].str.contains("infected", case=False)
    | dataset["question"].str.contains("infection", case=False)
).apply(int)

In [103]:
# 1 when the question contains "prevent" or "protect" (case-insensitive),
# else 0.
dataset["caution"] = (
    dataset["question"].str.contains("prevent", case=False)
    | dataset["question"].str.contains("protect", case=False)
).apply(int)

In [104]:
# 1 when the question contains "treatment", "cure", "vaccine" or "medicine"
# (case-insensitive), else 0.
dataset["treatment"] = (
    dataset["question"].str.contains("treatment", case=False)
    | dataset["question"].str.contains("cure", case=False)
    | dataset["question"].str.contains("vaccine", case=False)
    | dataset["question"].str.contains("medicine", case=False)
).apply(int)

In [105]:
# 1 when the question contains all of "how", "long", "last", or all of
# "when", "will", "end" (substring matches, case-insensitive), else 0.
dataset["future"] = (
    (
        dataset["question"].str.contains("how", case=False)
        & dataset["question"].str.contains("long", case=False)
        & dataset["question"].str.contains("last", case=False)
    )
    | (
        dataset["question"].str.contains("when", case=False)
        & dataset["question"].str.contains("will", case=False)
        & dataset["question"].str.contains("end", case=False)
    )
).apply(int)

In [106]:
# 1 when the question contains "i have" (case-insensitive), else 0.
dataset["nextsteps"] = (
    dataset["question"].str.contains("i have", case=False)
).apply(int)

In [107]:
# 1 when the question contains "tested" or "test" (case-insensitive), else 0.
# NOTE(review): any text containing "tested" also contains "test", so the
# first check never changes the result.
dataset["test"] = (
    dataset["question"].str.contains("tested", case=False)
    | dataset["question"].str.contains("test", case=False)
).apply(int)

In [108]:
# 1 when the question contains the whole word "open" (case-insensitive),
# else 0.
dataset["open"] = (
    dataset["question"].str.contains(r"\bopen\b", case=False)
).apply(int)

In [109]:
# Feature names are taken by position: every column from index 5 onward,
# skipping the "total" and "cluster" columns that later cells add.
# NOTE(review): assumes exactly five non-feature columns precede the feature
# flags — confirm if the CSV schema changes.
features = [
    column
    for column in dataset.columns[5:].tolist()
    if column not in ("total", "cluster")
]

In [110]:
# Show the list of feature column names.
features

['symptom',
 'statistics',
 'can',
 'hospital',
 'isolation',
 'about',
 'incubation',
 'infection',
 'caution',
 'treatment',
 'future',
 'nextsteps',
 'test',
 'open']

In [117]:
# Resolve Multiple Features
# The names below are listed first-to-last and then reversed. A loop that
# walks `order` and assigns the cluster for each matching feature in turn
# lets later entries overwrite earlier ones, so after the reversal
# 'statistics' is applied last and 'future' first.
order = list(reversed([
    'statistics',
    'open',
    'caution',
    'treatment',
    'incubation',
    'nextsteps',
    'hospital',
    'can',
    'infection',
    'test',
    'isolation',
    'symptom',
    'about',
    'future',
]))

In [118]:
# Per-question count of matched features: row-wise sum of the 0/1 flags.
dataset["total"] = dataset[features].sum(axis="columns")

In [119]:
# Summary statistics of every numeric column, grouped by the number of matched features.
dataset.groupby("total").describe()

Unnamed: 0_level_0,symptom,symptom,symptom,symptom,symptom,symptom,symptom,symptom,statistics,statistics,...,test,test,open,open,open,open,open,open,open,open
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
total,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,780.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,780.0,0.0,...,0.0,0.0,780.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,773.0,0.28978,0.453954,0.0,0.0,0.0,1.0,1.0,773.0,0.07762,...,0.0,1.0,773.0,0.007762,0.087816,0.0,0.0,0.0,0.0,1.0
2,103.0,0.475728,0.501853,0.0,0.0,0.0,1.0,1.0,103.0,0.058252,...,0.0,1.0,103.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,15.0,0.8,0.414039,0.0,1.0,1.0,1.0,1.0,15.0,0.0,...,1.0,1.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,1.0,1.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0
7,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,1.0,1.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0


In [120]:
# Start with no cluster assigned to any question.
dataset["cluster"] = None

# For single features: a question that matched exactly one feature is
# labelled with that feature's name.
# The write goes through .loc[row_mask, column] so it lands in `dataset`
# itself. Chained indexing (dataset["cluster"][mask] = ...) assigns through an
# intermediate Series, which raises SettingWithCopyWarning and is not
# guaranteed to update the DataFrame.
for col in features:
    dataset.loc[(dataset.total == 1) & (dataset[col] == True), "cluster"] = col

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [121]:
# Inspect the questions that matched exactly two features.
dataset[dataset.total==2]

Unnamed: 0,timestamp_est,anonymous_id,language,question,detected_lang,symptom,statistics,can,hospital,isolation,...,incubation,infection,caution,treatment,future,nextsteps,test,open,total,cluster
15,2020-03-19T14:40:42.773Z,4c2c5c1b-2327-45d5-8c63-901cac28ea30,en,Can I go for a run Does running exercise compr...,en,1,0,1,0,0,...,0,0,0,0,0,0,0,0,2,
16,2020-03-19T14:17:16.25Z,6d60c897-dd96-483d-b528-b65fcae4c1b0,en,If I think I have symptoms of COVID19 should I...,en,1,0,0,0,0,...,0,0,0,0,0,1,0,0,2,
28,2020-03-20T10:06:50.556Z,267d9215-d7a2-49c7-b778-f9a0a0cbbdb2,en,if i think i have it should i got get tested,en,0,0,0,0,0,...,0,0,0,0,0,1,1,0,2,
90,2020-03-21T09:45:35.493Z,fdb9ea21-a2c4-4562-b182-b28ff01f256b,en,How can I get tested if I m asymptomatic,en,1,0,0,0,0,...,0,0,0,0,0,0,1,0,2,
129,2020-03-21T16:15:49.262Z,3eb07cc6-e9a7-40d0-9609-553a07823637,en,How do I know if I have been infected,en,0,0,0,0,0,...,0,1,0,0,0,1,0,0,2,
155,2020-03-21T16:15:49.283Z,3eb07cc6-e9a7-40d0-9609-553a07823637,en,Should I go to the ER or my physician directly...,en,0,0,0,1,0,...,0,1,0,0,0,0,0,0,2,
156,2020-03-21T16:36:51.5Z,6147e278-3c26-40e4-a016-b18f494c03a3,en,what are the symptoms of covid 19 by day of in...,en,1,0,0,0,0,...,0,1,0,0,0,0,0,0,2,
208,2020-03-21T14:45:18.753Z,22acb4b7-fe15-46a8-bff8-109707707762,en,I would like clarifications about the fact tha...,en,1,0,0,0,0,...,0,0,0,0,0,0,1,0,2,
226,2020-03-21T16:36:06.691Z,107eeb57-8b1f-481e-aff5-8ce87b1cea8a,en,How long will social distancing measures last,en,0,0,0,0,1,...,0,0,0,0,1,0,0,0,2,
270,2020-03-21T13:56:49.03Z,9ad0060c-5628-4e8c-ba12-a7167213f27e,en,My boyfriend experienced gastro intestinal sym...,en,1,0,0,0,0,...,0,0,0,0,0,0,1,0,2,


In [122]:
# Questions that matched more than one feature: walk `order` and label each
# matching row, so a feature later in `order` overwrites an earlier one.
# .loc[row_mask, column] writes into `dataset` directly; chained indexing
# (dataset["cluster"][mask] = ...) raises SettingWithCopyWarning and is not
# guaranteed to update the DataFrame.
for col in order:
    dataset.loc[(dataset.total > 1) & (dataset[col] == True), "cluster"] = col

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [123]:
# Preview the first rows, including the feature flags, total and cluster columns.
dataset.head()

Unnamed: 0,timestamp_est,anonymous_id,language,question,detected_lang,symptom,statistics,can,hospital,isolation,...,incubation,infection,caution,treatment,future,nextsteps,test,open,total,cluster
0,2020-03-18T17:57:10.427Z,7302144e-abbe-4fa4-aaf1-cf5eb674a862,en,What is Dialogue s position on sick notes,en,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
1,2020-03-19T11:16:11.354Z,5a65a4bd-05aa-4ca8-b6da-5f807bde3294,en,What are Dialogue services,en,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
2,2020-03-19T14:17:16.246Z,6d60c897-dd96-483d-b528-b65fcae4c1b0,en,Should I practice social distancing,en,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,isolation
4,2020-03-19T14:16:05.351Z,f264fd4b-de03-4dc0-8531-fa9d0a1b0633,en,I d love to know how long I can be contagious,en,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
7,2020-03-19T13:40:44.882Z,f75ca552-9cf8-4f52-8151-25c0cea0a8b0,en,Get I get infected from a dog,en,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,infection


In [124]:
# Inspect the first 30 questions that matched no feature at all.
dataset[dataset.total==0][:30]

Unnamed: 0,timestamp_est,anonymous_id,language,question,detected_lang,symptom,statistics,can,hospital,isolation,...,incubation,infection,caution,treatment,future,nextsteps,test,open,total,cluster
0,2020-03-18T17:57:10.427Z,7302144e-abbe-4fa4-aaf1-cf5eb674a862,en,What is Dialogue s position on sick notes,en,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
1,2020-03-19T11:16:11.354Z,5a65a4bd-05aa-4ca8-b6da-5f807bde3294,en,What are Dialogue services,en,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
4,2020-03-19T14:16:05.351Z,f264fd4b-de03-4dc0-8531-fa9d0a1b0633,en,I d love to know how long I can be contagious,en,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
8,2020-03-19T23:02:18.034Z,58cd9504-720d-44cd-89a2-4cc83cde7b90,en,Someone from quarantaine came to my office and...,en,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
18,2020-03-19T14:06:17.831Z,f75ca552-9cf8-4f52-8151-25c0cea0a8b0,en,I had an asthma attack when I was young but di...,en,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
19,2020-03-19T17:59:25.201Z,ce80a1dc-7cbd-4922-8b00-b491a17da62e,en,mental health,en,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
31,2020-03-20T17:02:53.112Z,a061e1ea-7076-45c7-b063-fe37c9207e8f,en,is covid 19 dangerous,en,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
36,2020-03-20T16:31:18.978Z,dc065361-3739-4ccb-b0ad-0bf81ee0da6a,en,What range is the fever at normally if COVID 1...,en,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
51,2020-03-21T11:41:51.954Z,abaf5c73-4246-463f-acdb-8acc5b349dee,en,Should I stay home,en,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
64,2020-03-21T09:58:36.672Z,cd7d48a5-e664-4b53-bfc4-0135570a8865,en,Can I visit my parents,en,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,


In [125]:
# Name the output after the input file: take the input's base name without
# its extension and drop a leading "query_result_" when present. For the
# default dataset path this gives the same file name as slicing fixed
# character offsets out of the path, but it also works when the path has a
# different directory prefix (e.g. an absolute path).
run_name = os.path.splitext(os.path.basename(args.dataset))[0]
if run_name.startswith("query_result_"):
    run_name = run_name[len("query_result_"):]

# Make sure the target directory exists before writing.
os.makedirs("output", exist_ok=True)
dataset.to_csv(f"output/simple_{run_name}.csv")

In [127]:
# Number of questions assigned to each cluster (rows without a cluster are not counted).
dataset.groupby("cluster")["question"].count()

cluster
about         102
can            15
caution        36
future         20
hospital       14
incubation     15
infection      80
isolation      32
nextsteps     131
open            6
statistics     66
symptom       235
test          104
treatment      37
Name: question, dtype: int64

### Length analysis for situations