# Questions Clustering - English

## Expected
Questions should be grouped into clusters such that every question in a cluster can be answered with the same response.

<!--### To Do-->
Author: Sunanda Bansal  
Organization: Dataperformers  
License: CC BY-NC  
Date: 24 Mar, 2020 (Start)  

In [1]:
import re
import regex
import os
import csv
import sys
import json
import time
import scipy
import socket
import pickle
import numpy as np
import pandas as pd
import multiprocessing as mp

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Normalizer   
from sklearn import metrics   
from sklearn.metrics import confusion_matrix
# from sklearn.metrics.pairwise import paired_distances as sklearn_paired_distances

# Plotting
import seaborn as sn
import matplotlib.pyplot as plt

# importing personal development helper classes
import utils

Using TensorFlow backend.


## Define variables here

Most of this code is intended to be driven by arguments passed on the command line, but Jupyter notebooks don't handle `argparse` well, so the `Args` class is a temporary stand-in that lets the code be written as if the variables were attributes of a parsed-arguments object.

In [2]:
class Args:
    """Notebook stand-in for parsed command-line arguments.

    Every setting is a plain instance attribute so the rest of the code can
    read ``args.<name>`` exactly as it would from an ``argparse`` namespace.
    """

    def __init__(self):
        # The very big scraped file, give absolute path, outside the repo
        # self.filename = "query_result_2020-03-27T19_12_30.866993Z.csv"
        self.filename = "covid_questions_2020-04-21.csv"

        # Path to the file
        self.dataset = f"data/{self.filename}"

        # Suffix used in output file names: the "_"-separated pieces of the
        # file name that are not purely alphabetic, minus the last four
        # characters (the ".csv" extension)
        non_alpha_parts = (
            part for part in self.filename.split("_") if not part.isalpha()
        )
        self.suffix = "_".join(non_alpha_parts)[:-4]

        self.vector_mode = "tfidf"
        self.n_topics = 230
        self.dist_thresh = 1.5
        self.lang = "en"


args = Args()

In [3]:
# Maps each legacy rule name (used as a column name while the rules are
# evaluated) to its label in the new scheme, so that the rule code itself
# needs minimal modification for the moment.
new_labels = dict(
    about="covid-what",
    animals="covid-animals",
    caution="personal-caution",
    dangerisk="covid-contagious",
    diff="covid-versus",
    future="situation-future",
    guideme="personal-whatif",
    incubation="covid-incubation",
    infection="covid-infection",
    isolation="personal-isolation",
    lockdown="situation-lockdown",
    nextsteps="personal-symptoms",
    past="situation-past",
    recover="covid-recovery",
    statistics="situation-stats",
    symptom="covid-symptoms",
    test="personal-testing",
    transmission="covid-transmission",
    treatment="covid-med",
    unclassified="unclassified",
    virusfight="covid-fight",
    viruskill="covid-kill",
    viruslife="covid-life",
)

In [4]:
# Silence warnings emitted by the OpenMP* run-time library during program
# execution ("FALSE" disables them).
os.environ.update({'KMP_WARNINGS': "FALSE"})

## Functions

In [5]:
# Regex functions
def surrounding(word,area=2):
    """Placeholder for extracting the text around a keyword.

    Not implemented yet: both arguments are ignored and None is returned.
    """
    return None

def fuzzy_match(word,pattern):
    """
    Report whether `pattern` is found anywhere in `word`, ignoring case.

    Written to be used with pandas' ``.apply()``: fuzzy patterns such as
    ``(?:covid){e<=2}`` are supported by the ``regex`` package but not by
    ``re``, so they are not available through pandas' own string matching
    functions.

    Returns True when the search finds a match, False otherwise.
    """
    return regex.search(pattern, word, re.IGNORECASE) is not None
    
# Language Detection
from langdetect import detect
def detect_lang(text):
    """Return the language code langdetect reports for `text`.

    Any failure inside ``detect`` is treated as "no language could be
    determined" and yields the string "unidentifiable" instead of raising,
    so the function can be applied to a whole column.
    """
    try:
        return detect(text)
    except Exception:
        # Deliberately broad best-effort fallback, but not a bare `except:`,
        # so KeyboardInterrupt / SystemExit still stop a long-running apply().
        return "unidentifiable"

### Natural Language Preprocessing Functions
   
Preprocessing done -
   1. Normalizing accents  
   2. Removing non alphabetic characters  
   3. Casefolding  

Preprocessing not done -
   1. Stopword removal - For questions, stopwords are essential and thus are retained
   2. Stemming - For rule-based analysis it might be useful to keep the words as they are; for LSA, the questions don't have enough variation in content to benefit from stemming

In [6]:
import re
import nltk
import unidecode

# Matches every character that is not an ASCII letter
# @maybe allow numbers as well
alpha_regex = re.compile('[^a-zA-Z]')

from nltk.corpus import stopwords as sw
from nltk.stem.snowball import SnowballStemmer

# Pick the stopword list and the stemmer for the configured language.
# Both names are only defined when args.lang is "en" or "fr".
if args.lang == "en":
    stopwords = sw.words('english')
    stemmer = SnowballStemmer("english")
if args.lang == "fr":
    stopwords = sw.words('french')
    stemmer = SnowballStemmer("french")

def tokenize(text):
    """
    Split `text` into lowercase, purely alphabetic tokens.

        1. Normalizes accents (transliterates each word with unidecode)
        2. Splits at every non-alphabetic character
           (@maybe need to revisit for french text)
        3. Lowercases the resulting pieces

    Returns the tokens as a flat list, in order of appearance.
    """
    tokens = []
    for sentence in nltk.sent_tokenize(text):
        for raw_token in nltk.word_tokenize(sentence):
            # Handle french accents in text
            ascii_token = unidecode.unidecode(raw_token)

            # Every non-alphabet character becomes a split point
            pieces = alpha_regex.sub(' ', ascii_token).split()

            tokens.extend(piece.lower() for piece in pieces)

    return tokens

def stem(word):
    """Return the stem of `word` from the module-level stemmer, whitespace-stripped."""
    stemmed = stemmer.stem(word)
    return stemmed.strip()

def preprocess(text):
    """Tokenize `text` and join the tokens back into one space-separated string.

    Stopword removal and stemming are not applied here (see the disabled
    steps below).
    """
    # cleaned = [word for word in tokenized if word not in stopwords and word != '']
    # stemmed = [stem(word) for word in cleaned]
    return ' '.join(tokenize(text))

## Dataset

In [88]:
# Read dataset
dataset = pd.read_csv(args.dataset)
print(f"Dataset has {len(dataset)} documents")

# Detect Language
# langdetect is non-deterministic unless seeded, so the same short question
# can be labelled differently between runs. Fixing the seed keeps the set of
# "fr" rows stable across runs; that row set decides which questions get
# translated further down.
from langdetect import DetectorFactory
DetectorFactory.seed = 0
dataset["detected_lang"] = dataset.question.apply(detect_lang)

print(f"Dataset has {len(dataset[dataset.detected_lang=='en'])} english documents and {len(dataset[dataset.detected_lang=='fr'])} french documents")

Dataset has 5054 documents
Dataset has 2030 english documents and 2719 french documents


In [89]:
# Translate questions detected as French into English.
#
# Flow:
#   1. Make sure `dataset` has a "translated_question" column.
#   2. Re-use translations cached in data/<dataset name>_fr.csv, if present.
#   3. Translate the French questions that are still missing a translation
#      with googletrans; if that fails for any reason, fall back to a manual
#      copy/paste workflow through a text file.
#   4. Write the French questions with their translations back to the cache.
if "translated_question" not in dataset.columns:
    dataset["translated_question"] = np.nan
# The column holds strings; keep it as object dtype so that writing
# translations into an all-NaN (float) column does not clash on dtype.
dataset["translated_question"] = dataset["translated_question"].astype(object)

translated_df_path = f"data/{os.path.basename(args.filename)[:-4]}_fr.csv"
fr = pd.DataFrame(columns=["question","translated_question"])
if os.path.exists(translated_df_path):
    fr_csv = pd.read_csv(translated_df_path, index_col=0)
    fr = fr.combine_first(fr_csv)
    # update() aligns on the index, so cached rows land on the same questions
    dataset.update(fr.translated_question)

# Any french questions that need to be translated?
# .copy() so the assignment further down writes to an independent frame
# rather than to a slice of `dataset`.
fr_questions = dataset[
                        (dataset.detected_lang == "fr") &
                        (dataset.translated_question.isnull())
                      ][["question","translated_question"]].copy()

if len(fr_questions)>0:
    googletrans_fail = False

    try:
        from googletrans import Translator
        translator = Translator()
        # translate() returns a result object; the translated string is its
        # .text attribute. Storing the object itself would break the text
        # preprocessing that runs on this column later.
        translations = [translator.translate(q).text for q in fr_questions.question]
    except Exception:
        # HACK for when your IP has been blocked (or googletrans is unusable
        # for any other reason): the translations are supplied by hand,
        # one per line, through a text file.
        googletrans_fail = True
        try_again = "yes"
        translations_file_path = f"data/{os.path.basename(args.filename)[:-4]}_fr_translations.txt"

        # Create the file if it does not exist yet. Append mode never
        # truncates existing content and, unlike os.mknod, works on every
        # platform.
        with open(translations_file_path, "a", encoding="utf-8"):
            pass

        while True:
            # splitlines() ignores the trailing newline most editors add,
            # which would otherwise make the line count off by one forever.
            with open(translations_file_path, "r", encoding="utf-8") as f:
                translations = f.read().splitlines()

            if len(translations) == len(fr_questions):
                break

            print(f"Translate the output french to english using gogle translate and copy this into this file - {translations_file_path} \n{'-'*100}")
            print(*fr_questions.question.tolist(),sep="\n")
            # Translate the output french to english using google translate
            # and copy this into the translations file named above
            input(f"{'-'*100}\nPress Enter to continue...")

    fr_questions["translated_question"] = translations
    dataset.update(fr_questions.translated_question)
    dataset[dataset.detected_lang == "fr"][["question","translated_question"]].to_csv(translated_df_path)

Translate the output french to english using gogle translate and copy this into this file - data/covid_questions_2020-04-21_fr_translations.txt 
----------------------------------------------------------------------------------------------------
Test 
Are you sick?
Pouvez vous me répondre en français, sinon, je suis parfaitement bilingue, je travaille dans l’industrie de solutions technologie and it would be my pleasure to help this project any way I can. Jennifer Charron at Workday Canada from Montreal. 
Combien de temps va durer la quarantaine 
Est-ce vous avez mal a la gorge?
À quand la Heineken virus? 
Bonjour!
Bonjour
Bonjour!
Combien de temps dure la phase active de la maladie
Exemple ?
----------------------------------------------------------------------------------------------------
Press Enter to continue...


In [106]:
# Filter questions by language: keep rows that are detected as English or
# that have an English translation. .copy() makes the result an independent
# frame so the column added below is not written to a slice.
dataset = dataset[
                    (dataset.detected_lang == "en") |
                    (~dataset.translated_question.isnull())
                  ].copy()

# Preprocess questions.
# English rows are preprocessed from the original question; every other
# remaining row has a translation (guaranteed by the filter above) and is
# preprocessed from it. Selecting only detected_lang == "fr" here would leave
# `text` as NaN for rows that carry a translation but are not labelled "fr".
eng = dataset[dataset.detected_lang == "en"].question.apply(preprocess).to_frame(name="text")
french = dataset[dataset.detected_lang != "en"].translated_question.apply(preprocess).to_frame(name="text")
# Assign a Series (not a one-column DataFrame) to the new column
dataset["text"] = eng["text"].combine_first(french["text"])

print(f"Dataset has {len(dataset)} english documents (translated and otherwise)")

Dataset has 4764 english documents (translated and otherwise)


In [None]:
# dataset[(dataset.language != "en") & (dataset.detected_lang == "en")]

In [None]:
# dataset[(dataset.language == "en") & (dataset.detected_lang != "en")]

## Rules
Note: The order of these rules matters in resolving conflicts

In [None]:
# Statistics: 1 when the text contains any of the keywords below, or contains
# all three of "how", "many" and "people"; 0 otherwise.
# All checks are case-insensitive substring matches.
dataset["statistics"] = (
                        dataset.text.str.contains("cases",case=False)|
                        dataset.text.str.contains("death",case=False)|
                        dataset.text.str.contains("died",case=False)|
                        dataset.text.str.contains("mortality rate",case=False)|
                        dataset.text.str.contains("death rate",case=False)|
                        dataset.text.str.contains("deadly",case=False)|
                        dataset.text.str.contains("statistic",case=False)|
                        (
                            dataset.text.str.contains("how",case=False)&
                            dataset.text.str.contains("many",case=False)&
                            dataset.text.str.contains("people",case=False)
                        )
                    ).apply(int)

In [None]:
# Animals: 1 when the text contains "animal", "bird", "cat" or "dog"
# (optionally plural) as a whole word, case-insensitively; 0 otherwise.
dataset["animals"] = (
                        dataset.text.str.contains(r"\b(?:animal|bird|cat|dog)s?\b",case=False)
                    ).apply(int)

In [None]:
# Caution: 1 when the text contains "prevent", "protect", "precaution" or
# "safety", or contains both "keep" and "safe"; 0 otherwise.
# All checks are case-insensitive substring matches.
dataset["caution"] = (
                        dataset.text.str.contains("prevent",case=False)|
                        dataset.text.str.contains("protect",case=False)|
                        dataset.text.str.contains("precaution",case=False)|
                        dataset.text.str.contains("safety",case=False)|
                        (
                            dataset.text.str.contains("keep",case=False)&
                            dataset.text.str.contains("safe",case=False)
                        )
                    ).apply(int)

In [None]:
# Virus lifetime: 1 when the text mentions the virus (fuzzy "covid" with up
# to 2 errors, or "corona", or "virus"), contains "live", "stay" or "survive",
# and contains the standalone word "on"; 0 otherwise.
dataset["viruslife"] = (
                            (
                                dataset.text.apply(fuzzy_match, pattern="(?:covid){e<=2}")|
                                dataset.text.str.contains("corona",case=False)|
                                dataset.text.str.contains("virus",case=False)
                            )&
                            (
                                dataset.text.str.contains("live|stay|survive",case=False)
                            )&
                            (
                                # Word boundaries are required here: a bare "on" also
                                # matches inside "corona" and "long", which made this
                                # condition true for almost every virus question.
                                dataset.text.str.contains(r"\bon\b",case=False)
                            )
                    ).apply(int)

In [None]:
# Virus kill: 1 when the text mentions the virus (fuzzy "covid" with up to
# 2 errors, or "corona", or "virus") and contains "kills"; 0 otherwise.
# NOTE(review): only the form "kills" is matched, not "kill"/"killed" -
# confirm this is intended.
dataset["viruskill"] = (
                            (
                                dataset.text.apply(fuzzy_match, pattern="(?:covid){e<=2}")|
                                dataset.text.str.contains("corona",case=False)|
                                dataset.text.str.contains("virus",case=False)
                            )&
                            (
                                dataset.text.str.contains("kills",case=False)
                            )
                    ).apply(int)

In [None]:
# Virus fight: 1 when either
#   - the text mentions the virus (fuzzy "covid" with up to 2 errors, or
#     "corona", or "virus") and contains both "fight" and "help", or
#   - the text contains "mask" or "glove";
# 0 otherwise. Keyword checks are case-insensitive substring matches.
dataset["virusfight"] = (
                            (
                                (
                                    dataset.text.apply(fuzzy_match, pattern="(?:covid){e<=2}")|
                                    dataset.text.str.contains("corona",case=False)|
                                    dataset.text.str.contains("virus",case=False)
                                )&
                                (
                                    dataset.text.str.contains("fight",case=False)
                                )&
                                (
                                    dataset.text.str.contains("help",case=False)
                                )
                            )|
                            (
                                dataset.text.str.contains("mask",case=False)|
                                dataset.text.str.contains("glove",case=False)
                            )
                        ).apply(int)

In [None]:
# Treatment: 1 when the text contains "treatment", "cure", "vaccine" or
# "medic" (case-insensitive substring match); 0 otherwise.
dataset["treatment"] = (
                        dataset.text.str.contains("treatment",case=False)|
                        dataset.text.str.contains("cure",case=False)|
                        dataset.text.str.contains("vaccine",case=False)|
                        dataset.text.str.contains("medic",case=False)
                    ).apply(int)

In [None]:
# Incubation: 1 when the text contains "incubate" or "incubation"
# (case-insensitive substring match); 0 otherwise.
dataset["incubation"] = (
                        dataset.text.str.contains("incubate",case=False)|
                        dataset.text.str.contains("incubation",case=False)
                    ).apply(int)

In [None]:
# Next steps: 1 when the text contains "i have" (case-insensitive substring
# match); 0 otherwise.
dataset["nextsteps"] = (
                        dataset.text.str.contains("i have",case=False) 
                    ).apply(int)

In [None]:
# Guide me ("what if" questions): 1 when the text contains the standalone
# word "if", case-insensitively; 0 otherwise.
# Word boundaries are required: a bare "if" also matches inside words such as
# "different", "life" or "specific".
dataset["guideme"] = (
                        dataset.text.str.contains(r"\bif\b",case=False)
                    ).apply(int)

In [None]:
# dataset["hospital"] = (
# #                         dataset.text.str.contains(r"\bgo\b",case=False)&
#                         (
#                             dataset.text.str.contains("hospital",case=False)|                            
#                             dataset.text.str.contains(r"\bER\b",case=False)
#                         )
#                     ).apply(int)

In [None]:
# Dos and Don'ts (lockdown): 1 when either
#   - the text contains "go on/to/for/out" or "walk", together with a
#     permission word ("allow", "can", "ok"/"okay", "should"/"shall"), or
#   - the text contains "lockdown", the whole word "open", or a word
#     starting with "close";
# 0 otherwise.
# NOTE(review): "can" and "ok" are plain substring matches, so they also hit
# inside longer words (e.g. "canada", "look") - confirm this is acceptable.
dataset["lockdown"] = (
                        (
                            (
                                dataset.text.str.contains("go (?:on|to|for|out)",case=False)|
                                dataset.text.str.contains("walk",case=False)
                            )&
                            (
                                dataset.text.str.contains("allow",case=False)|
                                dataset.text.str.contains("can",case=False)|
                                dataset.text.str.contains("ok|okay",case=False)|
                                dataset.text.str.contains("should|shall",case=False)
                            )
                        )|
                        (
                            dataset.text.str.contains("lockdown",case=False)|
                            dataset.text.str.contains(r"\bopen\b",case=False)|
                            dataset.text.str.contains(r"\bclose",case=False)
                        )
                    ).apply(int)

In [None]:
# Infection: 1 when the text contains "infected" or "infection"
# (case-insensitive substring match); 0 otherwise.
dataset["infection"] = (
                        dataset.text.str.contains("infected",case=False)|
                        dataset.text.str.contains("infection",case=False)
                    ).apply(int)

In [None]:
# Difference: 1 when the text contains "diff" (case-insensitive substring)
# or fuzzily matches "distinguish" with up to 3 errors; 0 otherwise.
dataset["diff"] = (
                        dataset.text.str.contains("diff",case=False)|
                        dataset.text.apply(fuzzy_match, pattern="(?:distinguish){e<=3}")
                    ).apply(int)

In [None]:
# Recovery: 1 when the text contains "recover" (case-insensitive substring
# match); 0 otherwise.
dataset["recover"] = (
                        dataset.text.str.contains("recover",case=False)
                    ).apply(int)

In [None]:
# Testing: 1 when the text contains "tested" or "test" (case-insensitive
# substring match); 0 otherwise. Any text containing "tested" also contains
# "test", so the first check is already covered by the second.
dataset["test"] = (
                        dataset.text.str.contains("tested",case=False)|
                        dataset.text.str.contains("test",case=False)
                    ).apply(int)

In [None]:
# Isolation: 1 when the text contains a word starting with "isolat" or the
# phrase start "social dist" (case-insensitive); 0 otherwise.
dataset["isolation"] = (
                            dataset.text.str.contains(r"\bisolat",case=False)|
                            dataset.text.str.contains(r"\bsocial dist",case=False)
                        ).apply(int)

In [None]:
# Danger / risk: 1 when the text contains "dangerous" or "risk"
# (case-insensitive substring match); 0 otherwise.
dataset["dangerisk"] = (
                            dataset.text.str.contains("dangerous",case=False)|
                            dataset.text.str.contains("risk",case=False)
                        ).apply(int)

In [None]:
# Transmission: 1 when the text contains "transmi", "contract" or "spread"
# (case-insensitive substring match), or fuzzily matches "airborne" with up
# to 3 errors; 0 otherwise.
dataset["transmission"] = (
                            dataset.text.str.contains("transmi",case=False)|
                            dataset.text.str.contains("contract",case=False)|
                            dataset.text.str.contains("spread",case=False)|
                            dataset.text.apply(fuzzy_match, pattern="(?:airborne){e<=3}")
                        ).apply(int)

In [None]:
# Symptoms: fuzzy matching of the keyword "symptom" to account for spelling
# errors; 1 when the pattern is found, 0 otherwise.
# NOTE(review): the constraint {1<=e<=3} sets a minimum of one error as well
# as a maximum of three - confirm that correctly spelled occurrences are
# still matched as intended (the other fuzzy rules use the form {e<=N}).
dataset["symptom"] = (
                        dataset.text.apply(fuzzy_match, pattern="(?:symptom){1<=e<=3}")
                    ).apply(int)

In [None]:
# About ("what is covid"): 1 when the text contains "whats", "what is" or
# "what s" and also mentions the virus (fuzzy "covid" with up to 2 errors,
# or "corona"); 0 otherwise.
dataset["about"] = (
                        (
                            dataset.text.apply(fuzzy_match, pattern="(?:whats|what (?:is|s))")
                        ) & 
                        (
                            dataset.text.apply(fuzzy_match, pattern="(?:covid){e<=2}")|
                            dataset.text.str.contains("corona",case=False)
                        )
                    ).apply(int)

In [None]:
# Future: 1 when the text contains ("how" and "long") or "when", and also
# contains "will", and also contains one of "last", "end", "over", "normal",
# "done"; 0 otherwise. All checks are case-insensitive substring matches.
dataset["future"] = (
                        (
                            (
                                dataset.text.str.contains("how",case=False) &
                                dataset.text.str.contains("long",case=False)
                            )|
                            dataset.text.str.contains("when",case=False)
                        )&
                            dataset.text.str.contains("will",case=False)&
                        (
                            dataset.text.str.contains("last|end|over|normal|done",case=False)
                        )
                    ).apply(int)

In [None]:
# Past: 1 when the text contains "how", "when" or "where", and also contains
# "did", and also contains "start", "begin" or "began"; 0 otherwise.
# All checks are case-insensitive substring matches.
dataset["past"] = (
                        (
                            dataset.text.str.contains("how|when|where",case=False) 
                        )&
                            dataset.text.str.contains("did",case=False)&
                        (
                            dataset.text.str.contains("start|begin|began",case=False)
                        )
                    ).apply(int)

In [None]:
# Rename the rule columns from their legacy names to the new label scheme
# defined in new_labels
dataset = dataset.rename(columns=new_labels)

In [None]:
# Rule columns are the ones whose name contains a "-"
features = [name for name in dataset.columns if "-" in name]

In [None]:
# Display the list of rule columns
features

In [None]:
# Total number of classes/categories each question qualifies for
# (row-wise sum of the 0/1 rule columns)
dataset["total"] = dataset[features].sum(axis=1)

In [None]:
# Shows how many questions qualify for how many classes: the "count" entry
# of describe() on one rule column, grouped by the per-question total
dataset.groupby("total")["situation-stats"].describe()["count"]

In [None]:
# Set default value
dataset["cluster"] = "unclassified"

# Questions that match exactly one rule take that rule's label.
# .loc with a row mask writes into `dataset` itself; the chained form
# dataset["cluster"][mask] = col may assign to a temporary copy and is
# reported by pandas as a SettingWithCopyWarning.
for col in features:
    dataset.loc[(dataset.total == 1) & (dataset[col] == 1), "cluster"] = col

In [None]:
# Resolving multiple classes: questions that match more than one rule.
# The rule columns are visited in reverse order and every visit overwrites
# the label, so the matching rule that comes first in `features` is the one
# that remains.
# .loc with a row mask writes into `dataset` itself; the chained form
# dataset["cluster"][mask] = col may assign to a temporary copy.
for col in features[::-1]:
    dataset.loc[(dataset.total > 1) & (dataset[col] == 1), "cluster"] = col

In [None]:
# Save the rule-based result without the helper columns (the individual rule
# columns, the preprocessed text and the per-question rule count)
path = f"output/simple_{args.suffix}_{args.lang}.csv"
dataset.drop(columns=[*features, "text", "total"]).to_csv(path)
print(f"Rules based output saved to {path}")

In [None]:
# Clustering stats: number of questions per assigned cluster label
dataset.groupby("cluster")["question"].count()

### Length analysis for situations

## LSA and AHC

In [None]:
# Questions that no rule classified; these go through LSA + agglomerative
# clustering. New columns are added to `d` afterwards, so take an explicit
# copy instead of a slice of `dataset`.
d = dataset.loc[dataset.cluster=="unclassified", ["question", "cluster"]].copy()

In [None]:
import nltk
# English stopword list from the NLTK stopwords corpus
stopwords_list = nltk.corpus.stopwords.words('english')

In [None]:
wn_lemmatizer = nltk.stem.WordNetLemmatizer()
def bulk_tokenizer(texts):
    """Tokenize every text in `texts` with NLTK's word tokenizer.

    Returns a list holding one token list per input text. Lemmatization with
    wn_lemmatizer is currently disabled.
    """
    return list(map(nltk.word_tokenize, texts))

In [None]:
# Embed the unclassified questions with LSI and cluster the embeddings with
# agglomerative hierarchical clustering (AHC).
# These two settings override the values set in Args for this stage.
args.n_topics = 15
args.dist_thresh = 0.7
model = utils.text.representation.LSI(args, tokenizer=bulk_tokenizer)
# presumably one embedding vector per question - verify against utils.text.representation.LSI
d["embedding"] = model.generate_embedding(d.question, returnarray=False)

# Cluster
# Expand the per-row embeddings into a 2-D array (one row per question)
X = pd.DataFrame(d["embedding"].values.tolist(), index= d.index).to_numpy()
# n_clusters=None with distance_threshold: the number of clusters is decided
# by the distance threshold rather than fixed up front
clustering = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, distance_threshold=args.dist_thresh).fit(X)
d["ahc_label"] = clustering.labels_

# Misc.
# Record how many distinct clusters were found and show the cluster sizes
args.n_clusters = len(d["ahc_label"].unique())
print(f"Found {args.n_clusters} clusters")
d.groupby("ahc_label")["question"].count().sort_values()

In [None]:
# Summary statistics of the cluster sizes
d.groupby("ahc_label")["question"].count().describe()

In [None]:
# for label in d.ahc_label.unique():
#     print(f"\ncluster #{label}, count - {len(d[d.ahc_label==label])}")
#     print(d[d.ahc_label==label][:10].question.tolist())

In [None]:
# Attach the AHC labels to the full dataset (index-aligned join); rows that
# are not in `d` get no label
dataset = dataset.join(d["ahc_label"])

In [None]:
# Save the combined result; the file name records the number of topics and
# the distance threshold used for the AHC stage
path = f"output/simpleLsa_{args.n_topics}n{args.dist_thresh}dt_{args.suffix}.csv"
dataset.to_csv(path)
print(f"AHC on top of rule based output saved to {path}")

## Iterative Clustering