# Questions Clustering - English

## Expected
Questions to be sorted out such that the response to the whole cluster is same.

<!--### To Do-->
Author: Sunanda Bansal  
Organization: Dataperformers  
License: CC BY-NC  
Date: 24 Mar, 2020 (Start)  

In [2]:
import re
import regex
import os
import csv
import sys
import json
import time
import scipy
import socket
import pickle
import numpy as np
import pandas as pd
import multiprocessing as mp

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Normalizer   
from sklearn import metrics   
from sklearn.metrics import confusion_matrix
# from sklearn.metrics.pairwise import paired_distances as sklearn_paired_distances

# Plotting
import seaborn as sn
import matplotlib.pyplot as plt

# importing personal development helper classes
import utils

Using TensorFlow backend.


## Define variables here

Mostly the code will be intended to use with arguments that can be passed in comman line, but jupyter notebook doesn't handle `argparse` well, so the Args class is a temporary way to write the code assumming the variables to be an attribute of an object.

In [3]:
class Args:
    def __init__(self):
        # The very big scraped file, give absolute path, outside the repo
        # self.filename = "query_result_2020-03-27T19_12_30.866993Z.csv"
        self.filename = "covid_questions_2020-04-21.csv"
        
        # path to the file
        self.dataset = f"data/{self.filename}"    
        
        # suffix used to create
        self.suffix = "_".join([word for word in self.filename.split("_") if not word.isalpha()])[:-4]
        self.vector_mode = "tfidf"
        self.n_topics = 230
        self.dist_thresh = 1.5
        self.lang = "en"
args=Args()

In [5]:
# Disable (FALSE) displaying warnings from the OpenMP* run-time library during program execution.
os.environ['KMP_WARNINGS'] = "FALSE"

## Functions

In [6]:
# Regex functions
def surrounding(word,area=2):
    # A funtion, yet to be defined, that can be used to extract text around keywords
    return

def fuzzy_match(word,pattern):
    '''
        Fuzzy matching function to be used with .apply() of pandas

        Reason - Fuzzy matching is available in regex package, not in re package, 
        therefore fuzzy matching is not a part of pandas string matching functions
    '''
 
    if regex.search(pattern, word, re.IGNORECASE):
        return True
    else:
        return False    
    
# Language Detection
from langdetect import detect
def detect_lang(text):
    # Used to detect language of the question
    try:
        return detect(text)
    except:
        return "unidentifiable"   

### Natural Lanuage Proprocessing Functions
   
Preprocessing done -
   1. Normalizing accents  
   2. Removing non alphabetic characters  
   3. Casefolding  

Preprocessing not done -
   1. Stopword removal - For questions, stopwords are essential and thus are retained
   2. Stemming - For rule based analysis it might be useful to keep the words as they are, for LSA, the questions don't have enough variation in content to benefit for stemming

In [7]:
import re
import nltk
import unidecode

# Regular expression to select all that is not alphabet
# @maybe allow numbers as well
alpha_regex = re.compile('[^a-zA-Z]')

from nltk.corpus import stopwords as sw
if args.lang == "en": stopwords = sw.words('english')
if args.lang == "fr": stopwords = sw.words('french')

from nltk.stem.snowball import SnowballStemmer
if args.lang == "en": stemmer = SnowballStemmer("english")
if args.lang == "fr": stemmer = SnowballStemmer("french")

def tokenize(text):
    '''
        1. Normalized accents
        2. Splits at non alpbhaetic character (@maybe need to revisit for french text)
        3. Caasefolds
    '''    
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            # Handle french accents in text
            word = unidecode.unidecode(word)
            
            # Split at every non alphabet character occurrence
            clean_words = alpha_regex.sub(' ', word).split()
            
            # Casefold
            tokens.extend([word.lower() for word in clean_words])
    
    # Return tokens
    return tokens

def stem(word):
    return stemmer.stem(word).strip()

def preprocess(text):    
    tokenized = tokenize(text)
    # cleaned = [word for word in tokenized if word not in stopwords and word != '']
    # stemmed = [stem(word) for word in cleaned]
    return ' '.join(tokenized)

## Dataset

In [8]:
# Read dataset
dataset = pd.read_csv(args.dataset)
print(f"Dataset has {len(dataset)} documents")

# Detect Language
dataset["detected_lang"] = dataset.question.apply(detect_lang)

print(f"Dataset has {len(dataset[dataset.detected_lang=='en'])} english documents and {len(dataset[dataset.detected_lang=='fr'])} french documents")
print(f"{len(dataset)-len(dataset[dataset.detected_lang=='en'])-len(dataset[dataset.detected_lang=='fr'])} documents will not be processed because of different language")

Dataset has 5054 documents
Dataset has 2025 english documents and 2729 french documents
300 documents will not be processed because of different language


In [10]:
# Translation Questions from French to English
if "translated_question" not in dataset.columns:
    dataset["translated_question"] = np.nan

translated_df_path = f"data/{os.path.basename(args.filename)[:-4]}_fr.csv"
fr = pd.DataFrame(columns=["question","translated_question"])
if os.path.exists(translated_df_path):
    fr_csv = pd.read_csv(translated_df_path, index_col=0)
    fr = fr.combine_first(fr_csv)
    dataset.update(fr.translated_question)
    
# Any french questions that need to be translated?
fr_questions = dataset[
                        (dataset.detected_lang == "fr") & 
                        (dataset.translated_question.isnull())
                      ][["question","translated_question"]]

if len(fr_questions)>0:    
    try:
        from googletrans import Translator
        translator = Translator()
        translations = []
        for q in list(fr_questions.question):
            translations.append(translator.translate(q).text)
        print("Using google translate.")
    except:
        # HACK for when your IP has been blocked
        translations_file_path = f"data/{os.path.basename(args.filename)[:-4]}_fr_translations.txt"
        if os.path.exists(translations_file_path):   
            # Delete if already exists
            os.remove(translations_file_path)
        
        # Create file and initialize empty
        os.mknod(translations_file_path)      
        translations = []
        
        while (len(translations) != len(fr_questions)):
            print(f"Translate the output french to english using gogle translate and copy this into this file - {translations_file_path} \n{'-'*100}")
            print(*fr_questions.question.tolist(),sep="\n")
            # Translate the output french to english using gogle translate 
            # and copy this into a text file save as {filename}_fr_translated.txt in data folder
            input(f"{'-'*100}\nPress Enter to continue...")
            
            with open(translations_file_path,"r") as f:
                translations = f.read().split("\n")  
            
    fr_questions.translated_question = translations
    dataset.update(fr_questions.translated_question)
    dataset[dataset.detected_lang == "fr"][["question","translated_question"]].to_csv(translated_df_path)

In [11]:
# Preprocess questions
eng = dataset[dataset.detected_lang == "en"].question.apply(preprocess).to_frame(name="text")
french = dataset[dataset.detected_lang == "fr"].translated_question.apply(preprocess).to_frame(name="text")
dataset["text"] = eng.combine_first(french).dropna()

# Drop
dataset = dataset.dropna(subset=['text'])

print(f"Dataset has {len(dataset)} english documents (translated and otherwise)")

Dataset has 4754 english documents (translated and otherwise)


In [112]:
# dataset[(dataset.language != "en") & (dataset.detected_lang == "en")]

In [113]:
# dataset[(dataset.language == "en") & (dataset.detected_lang != "en")]

### Length analysis for situations

In [35]:
dataset["len"] = dataset.text.apply(lambda x: len(x.split()))

In [72]:
# Set default value
dataset["cluster"] = "unclassified"

In [73]:
dataset.len.describe()

count    4754.000000
mean       10.450989
std         9.632235
min         1.000000
25%         6.000000
50%         8.000000
75%        12.000000
max       173.000000
Name: len, dtype: float64

## Rules
Note: The order of these rules matters in resolving conflicts

In [74]:
# Separate very long questions out
dataset.loc[
                    (dataset.cluster=="unclassified") & 
                    (dataset.len > 15)
                , "cluster" ] = "not-general"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [75]:
# Statistics
dataset.loc[
                    (dataset.cluster=="unclassified") & 
                    (
                        dataset.text.str.contains("cases",case=False)|
                        dataset.text.str.contains("dea(?:th|d)(?:ly)?",case=False)|
                        dataset.text.str.contains("died",case=False)|
                        dataset.text.str.contains("(?:mortality|death|fatality) rate",case=False)|
                        dataset.text.str.contains("statistic",case=False)|
                        (
                            dataset.text.str.contains("how",case=False)&
                            dataset.text.str.contains("many",case=False)&
                            dataset.text.str.contains("people",case=False)
                        )
                    )
                , "cluster" ] = "situation-stats"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [76]:
dataset.loc[
                    (dataset.cluster=="unclassified") & 
                    (dataset.text.str.contains(r"\b(?:animal|bird|cat|dog|pet)s?\b",case=False))
                , "cluster" ] = "covid-transmission-animals"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [106]:
dataset.loc[
                    (dataset.cluster=="unclassified") & 
                    (
                        dataset.text.str.contains("mask",case=False)|
                        dataset.text.str.contains("glove",case=False)
                    )
                , "cluster" ] = "covid-precaution-gear"

dataset.loc[
                    (dataset.cluster=="unclassified") & 
                    (
                        dataset.text.str.contains("wash",case=False)
                    )
                , "cluster" ] = "covid-precaution-disinfection"

dataset.loc[
                    (dataset.cluster=="unclassified") & 
                    (
                        dataset.text.str.contains(r"\bisolat",case=False)|
                        dataset.text.str.contains(r"\bsocial dist",case=False)|
                        dataset.text.str.contains(r"\bconfine",case=False)
                    )
                , "cluster" ] = "covid-precaution-isolation"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [80]:
dataset.loc[
                    (dataset.cluster=="unclassified") & 
                    (
                        (
                            (
                                dataset.text.str.contains("go (?:on|to|for|out)",case=False)|
                                dataset.text.str.contains("walk",case=False)
                            )&
                            (
                                dataset.text.str.contains("allow",case=False)|
                                dataset.text.str.contains("can",case=False)|
                                dataset.text.str.contains("ok|okay",case=False)|
                                dataset.text.str.contains("should|shall",case=False)
                            )
                        )|
                        (
                            dataset.text.str.contains("lockdown",case=False)|
                            dataset.text.str.contains(r"\bopen\b",case=False)|
                            dataset.text.str.contains(r"\bclose",case=False)
                        )
                    )
                , "cluster" ] = "situation-lockdown"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [81]:
dataset.loc[
                    (dataset.cluster=="unclassified") & 
                    (
                        dataset.text.str.contains("if i (?:have|am|m)",case=False)
                    )
                , "cluster" ] = "covid-whatif"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [117]:
dataset.loc[
                    (dataset.cluster=="unclassified") & 
                    (
                        dataset.text.str.contains(r"i (?:have|ve)",case=False)|
                        (
                            dataset.text.str.contains(r"\b(?:has|have)",case=False)&
                            dataset.text.str.contains(r"symptom",case=False)
                        )
                        # dataset.text.str.contains(r"(?:i (?:think|feel) )?i \b(?:have|ve|am|m)\b",case=False)
                    )
                , "cluster" ] = "personal-symptoms"

In [116]:
dataset.loc[
                    (dataset.cluster=="unclassified") & 
                    (
                        dataset.text.str.contains("transmi",case=False)|
                        dataset.text.str.contains("contract",case=False)|
                        dataset.text.str.contains("catch",case=False)|
                        dataset.text.str.contains("spread",case=False)|
                        dataset.text.str.contains("airborne",case=False)
                    )
                , "cluster" ] = "covid-transmission"

In [76]:
dataset.loc[
                    (dataset.cluster=="unclassified") & 
                    (
                        dataset.text.str.contains("transmi",case=False)|
                        dataset.text.str.contains("contract",case=False)|
                        dataset.text.str.contains("catch",case=False)|
                        dataset.text.str.contains("spread",case=False)|
                        dataset.text.str.contains("airborne",case=False)
                    )                
                , "cluster" ] = "covid-transmission"

dataset.loc[
                    (dataset.cluster=="covid-transmission") & 
                    (
                        dataset.text.str.contains("again",case=False)|
                        dataset.text.str.contains("twice",case=False)
                    )                
                , "cluster" ] = "covid-transmission-twice"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [86]:
dataset.loc[
                    (dataset.cluster=="unclassified") & 
                    (
                            (
                                dataset.text.apply(fuzzy_match, pattern="(?:covid){e<=2}")|
                                dataset.text.str.contains("corona",case=False)|
                                dataset.text.str.contains("virus",case=False)
                            )&
                            (
                                dataset.text.str.contains("live|stay|survive",case=False)
                            )&
                            (
                                dataset.text.str.contains("on",case=False)
                            )
                    )
                , "cluster" ] = "covid-life"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [87]:
dataset.loc[
                    (dataset.cluster=="unclassified") & 
                    (
                        dataset.text.str.contains("infected",case=False)|
                        dataset.text.str.contains("infection",case=False)
                    )                
                , "cluster" ] = "covid-infection"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [88]:
# dataset.loc[
#                     (dataset.cluster=="unclassified") & 
#                     (
#                         dataset.text.str.contains("prevent",case=False)|
#                         dataset.text.str.contains("protect",case=False)|
#                         dataset.text.str.contains("precaution",case=False)|
#                         dataset.text.str.contains("safety",case=False)|
#                         (
#                             dataset.text.str.contains("keep",case=False)&
#                             dataset.text.str.contains("safe",case=False)
#                         )
#                     )               
#                 , "cluster" ] = "covid-precaution"

In [89]:
# dataset.loc[
#                     (dataset.cluster=="unclassified") & 
#                     (
#                             (
#                                 dataset.text.apply(fuzzy_match, pattern="(?:covid){e<=2}")|
#                                 dataset.text.str.contains("corona",case=False)|
#                                 dataset.text.str.contains("virus",case=False)
#                             )&
#                             (
#                                 dataset.text.str.contains("kills",case=False)
#                             )
#                     )                
#               , "cluster" ] = "covid-kill"

In [90]:
# dataset.loc[
#                     (dataset.cluster=="unclassified") & 
#                     (
#                             (
#                                 (
#                                     dataset.text.apply(fuzzy_match, pattern="(?:covid){e<=2}")|
#                                     dataset.text.str.contains("corona",case=False)|
#                                     dataset.text.str.contains("virus",case=False)
#                                 )&
#                                 (
#                                     dataset.text.str.contains("fight",case=False)
#                                 )&
#                                 (
#                                     dataset.text.str.contains("help",case=False)
#                                 )
#                             )|
#                             (
#                                 dataset.text.str.contains("mask",case=False)|
#                                 dataset.text.str.contains("glove",case=False)
#                             )
#                         )                
#                     , "cluster" ] = "covid-fight"

In [91]:
dataset.loc[
                    (dataset.cluster=="unclassified") & 
                    (
                        dataset.text.str.contains("treatment",case=False)|
                        dataset.text.str.contains("cure",case=False)|
                        dataset.text.str.contains("vaccine",case=False)|
                        dataset.text.str.contains("medic",case=False)
                    )                
                , "cluster" ] = "covid-med"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [92]:
dataset.loc[
                    (dataset.cluster=="unclassified") & 
                    (
                        dataset.text.str.contains("incubate",case=False)|
                        dataset.text.str.contains("incubation",case=False)
                    )      
                , "cluster" ] = "covid-incubation"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [93]:
# dataset.loc[                     
#                        (dataset.cluster=="unclassified") & (
# #                         dataset.text.str.contains(r"\bgo\b",case=False)&
#                         (
#                             dataset.text.str.contains("hospital",case=False)|                            
#                             dataset.text.str.contains(r"\bER\b",case=False)
#                         )
#                     )
#                , "cluster" ] = "hospital"

In [94]:
dataset.loc[
                    (dataset.cluster=="unclassified") & 
                    (
                        dataset.text.str.contains("diff",case=False)|
                        dataset.text.apply(fuzzy_match, pattern="(?:distinguish){e<=3}")
                    )                
                , "cluster" ] = "covid-versus"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [96]:
dataset.loc[
                    (dataset.cluster=="unclassified") & 
                    (
                        dataset.text.str.contains("tested",case=False)|
                        dataset.text.str.contains("test",case=False)
                    )                
                , "cluster" ] = "personal-testing"

dataset.loc[
                    (dataset.cluster=="personal-testing") & 
                    (
                        dataset.text.str.contains("(?:tested|test)",case=False) &
                        dataset.text.str.contains("where",case=False)
                    )                
                , "cluster" ] = "personal-testing-location"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [95]:
dataset.loc[
                    (dataset.cluster=="unclassified") & 
                    (
                        dataset.text.str.contains("recover",case=False)
                    )                
                , "cluster" ] = "covid-infection-recovery"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [97]:
dataset.loc[                     
                    (dataset.cluster=="unclassified") & 
                    (
                        dataset.text.str.contains("dangerous",case=False)|
                        dataset.text.str.contains("risk",case=False)
                    )                
                , "cluster" ] = "covid-contagious"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [98]:
dataset.loc[                     
                    (dataset.cluster=="unclassified") & 
                    (dataset.text.str.contains(r"\bsymptom",case=False))                
                , "cluster" ] = "covid-symptoms"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [99]:
dataset.loc[                     
                    (dataset.cluster=="unclassified") & 
                    (
                        (
                            dataset.text.apply(fuzzy_match, pattern="(?:whats|what (?:is|s))")
                        ) & 
                        (
                            dataset.text.apply(fuzzy_match, pattern="(?:covid){e<=2}")|
                            dataset.text.str.contains("corona",case=False)
                        )
                    )                
                , "cluster" ] = "covid-what"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [100]:
dataset.loc[                     
                    (dataset.cluster=="unclassified") & 
                    (
                        (
                            (
                                dataset.text.str.contains("how",case=False) &
                                dataset.text.str.contains("long",case=False)
                            )|
                            dataset.text.str.contains("when",case=False)
                        )&
                            dataset.text.str.contains("will",case=False)&
                        (
                            dataset.text.str.contains("last|end|over|normal|done",case=False)
                        )
                    )                
                , "cluster" ] = "situation-future"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [101]:
dataset.loc[                     
                    (dataset.cluster=="unclassified") & 
                    (
                        (
                            dataset.text.str.contains("how|when|where",case=False) 
                        )&
                            dataset.text.str.contains("did",case=False)&
                        (
                            dataset.text.str.contains("start|begin|began",case=False)
                        )
                    )                
                , "cluster" ] = "situation-past"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [102]:
features = dataset.cluster.unique().tolist()

In [103]:
features

['unclassified',
 'personal-isolation',
 'personal-testing',
 'personal-symptoms',
 'covid-transmission-animals',
 'covid-contagious',
 'not-general',
 'situation-lockdown',
 'covid-symptoms',
 'covid-versus',
 'covid-whatif',
 'covid-what',
 'covid-life',
 'situation-future',
 'situation-stats',
 'covid-transmission',
 'covid-med',
 'covid-precaution-gear',
 'covid-precaution-wash',
 'covid-infection',
 'covid-recovery',
 'covid-incubation',
 'situation-past']

In [104]:
# Clustering stats
dataset.groupby("cluster")["question"].count()

cluster
covid-contagious               142
covid-incubation                28
covid-infection                 97
covid-life                     117
covid-med                      106
covid-precaution-gear           71
covid-precaution-wash           47
covid-recovery                  21
covid-symptoms                 470
covid-transmission             184
covid-transmission-animals      58
covid-versus                    43
covid-what                     147
covid-whatif                   167
not-general                    673
personal-isolation              88
personal-symptoms              220
personal-testing               207
situation-future                40
situation-lockdown             161
situation-past                   4
situation-stats                169
unclassified                  1494
Name: question, dtype: int64

In [105]:
# Saving the files
path = f"output/simpleFiltered_{args.suffix}_{args.lang}.csv"
dataset.to_csv(path)
print(f"Rules based output saved to {path}")

Rules based output saved to output/simpleFiltered_2020-04-21_en.csv


## LSA and AHC

In [329]:
eng = dataset[dataset.detected_lang == "en"].question.to_frame(name="question")
french = dataset[dataset.detected_lang == "fr"].translated_question.to_frame(name="question")
d = eng.combine_first(french).dropna()

In [330]:
d["cluster"] = dataset.cluster
# Adding length as a feature
# d["len"] = dataset.len

In [331]:
d = d[d.cluster=="unclassified"]

In [332]:
import nltk
stopwords_list = nltk.corpus.stopwords.words('english')

In [333]:
wn_lemmatizer = nltk.stem.WordNetLemmatizer()
def bulk_tokenizer(texts):
#      return [[wn_lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)] for text in texts]
     return [nltk.word_tokenize(text) for text in texts]

In [340]:
args.n_topics = 15
args.dist_thresh = 0.9
model = utils.text.representation.LSI(args, tokenizer=bulk_tokenizer)
d["embedding"] = model.generate_embedding(d.question, returnarray=False)

lsa_features = pd.DataFrame(d["embedding"].values.tolist(), index= d.index).to_numpy()

use_len = False
if use_len:
    # Add len to the feature vector
    d[["embedding","len"]].apply(lambda x: x.embedding.append(x.len), axis=1)

    from sklearn import preprocessing

    x = d.embedding.tolist() #returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    lsa_features = x_scaled
    
# Cluster
X = lsa_features
clustering = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, distance_threshold=args.dist_thresh).fit(X)
d["ahc_label"] = clustering.labels_

# Misc.
args.n_clusters = len(d["ahc_label"].unique())
print(f"Found {args.n_clusters} clusters")
d.groupby("ahc_label")["question"].count().sort_values()

Found 57 clusters


ahc_label
51      3
47      3
44      4
39      4
32      4
31      5
52      6
46      6
40      6
30      6
28      7
34      8
56      8
37      9
38      9
20     11
54     11
53     12
26     13
0      13
18     13
33     13
29     14
24     14
35     14
55     14
22     15
23     15
16     15
27     15
17     17
49     18
11     18
25     23
2      23
21     27
10     28
36     29
42     29
41     29
14     30
12     31
9      32
43     34
45     34
15     39
1      44
50     50
4      52
48     55
8      57
5      58
13     66
7      68
3      70
6      83
19    130
Name: question, dtype: int64

In [341]:
d.groupby("ahc_label")["question"].count().describe()

count     57.000000
mean      25.684211
std       24.243672
min        3.000000
25%        9.000000
50%       15.000000
75%       32.000000
max      130.000000
Name: question, dtype: float64

In [342]:
for label in d.ahc_label.unique():
    print(f"\ncluster #{label}, count - {len(d[d.ahc_label==label])}")
    print(d[d.ahc_label==label][:10].question.tolist())


cluster #13, count - 66
["What is Dialogue's position on sick notes?", 'What will happen to my job?', 'What is the danger for a newborn baby?', 'What is the% of the population affected by each country in the world?', "What's happen for country near the Ecuador with a lot of hot? ", 'What is the economic impact of this virus?', 'What is the contagion period?', 'What is the propagation time', 'What is the case fatality rate?', "What's the current count in India"]

cluster #45, count - 34
['What are Dialogue services?', 'What are the complications of the disease?', 'What are the instructions to follow?', 'What are the dangers for people over 70?', 'What are the rates of hospitalization and the need for intensive care', 'What are the barrier gestures?', 'What are the chances of getting the virus?', 'What are effective immunity boosters?', 'What are the essential services? ', 'What are the guidelines for municipal employees?']

cluster #27, count - 15
["I'd love to know how long I can be c

In [343]:
if "ahc_label" in dataset.columns:
    dataset = dataset.drop(columns=["ahc_label"])
dataset = dataset.join(d["ahc_label"])

In [348]:
path = f"output/simpleLsa_{args.n_topics}n{args.dist_thresh}dt_{args.suffix}.csv"
dataset.drop(features, axis="columns").drop(["total"], axis="columns").to_csv(path)
print(f"AHC on top of rule based output saved to {path}")

AHC on top of rule based output saved to output/simpleLsa_15n0.9dt_2020-04-21.csv


## LSA and AHC - 2nd Time

In [221]:
path = f"output/simpleLsa_{args.n_topics}n{args.dist_thresh}dt_{args.suffix}_{args.lang}_checked.csv"
print(path)

output/simpleLsa_15n0.9dt_2020-04-14_fr_checked.csv


In [222]:
d = pd.read_csv(path, index_col=0)

In [223]:
d = d[["question","text", "translation", "cluster", "tags", "ahc_label", "cluster_count"]]

In [224]:
d.head()

Unnamed: 0,question,text,translation,cluster,tags,ahc_label,cluster_count
247,Les animaux peuvent-ils transmettre le COVID-19?,les animaux peuvent ils transmettre le covid,Can animals transmit COVID-19?,covid-animals,covid-animals,,
360,"Mon chat reste libre de ces mouvements, peut-i...",mon chat reste libre de ces mouvements peut il...,"My cat remains free from these movements, can ...",covid-animals,covid-animals,,
611,Les animaux (sauvages ou domestiques) peuvent-...,les animaux sauvages ou domestiques peuvent il...,Can animals (wild or domestic) contract the vi...,covid-animals,covid-animals,,
621,Combine de temps le virus peut rester sur le p...,combine de temps le virus peut rester sur le p...,How long can the virus stay on the hair of ani...,covid-animals,"covid-animals, covid-life",,
714,Est-ce que mon chat ou mon chien peuvent attra...,est ce que mon chat ou mon chien peuvent attra...,Can my cat or dog catch covid-19?,covid-animals,covid-animals,,


In [225]:
d = create_rules_based_clusters(d, cluster_col="cluster_2")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [226]:
d.head()

Unnamed: 0,question,text,translation,cluster,tags,ahc_label,cluster_count,cluster_2
247,Les animaux peuvent-ils transmettre le COVID-19?,les animaux peuvent ils transmettre le covid,Can animals transmit COVID-19?,covid-animals,covid-animals,,,covid-animals
360,"Mon chat reste libre de ces mouvements, peut-i...",mon chat reste libre de ces mouvements peut il...,"My cat remains free from these movements, can ...",covid-animals,"covid-animals, covid-life",,,unclassified
611,Les animaux (sauvages ou domestiques) peuvent-...,les animaux sauvages ou domestiques peuvent il...,Can animals (wild or domestic) contract the vi...,covid-animals,covid-animals,,,covid-animals
621,Combine de temps le virus peut rester sur le p...,combine de temps le virus peut rester sur le p...,How long can the virus stay on the hair of ani...,covid-animals,"covid-animals, covid-life",,,unclassified
714,Est-ce que mon chat ou mon chien peuvent attra...,est ce que mon chat ou mon chien peuvent attra...,Can my cat or dog catch covid-19?,covid-animals,covid-animals,,,covid-animals


In [333]:
# import nltk
# stopwords_list = nltk.corpus.stopwords.words('english')
# wn_lemmatizer = nltk.stem.WordNetLemmatizer()

In [334]:
def bulk_tokenizer(texts):
#      return [[wn_lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)] for text in texts]
     return [text.split() for text in texts]

In [335]:
args.n_topics = 15
args.dist_thresh = 0.8
model = utils.text.representation.LSI(args, tokenizer=bulk_tokenizer)
d["embedding"] = model.generate_embedding(d.text, returnarray=False)

# Cluster
X = pd.DataFrame(d["embedding"].values.tolist(), index= d.index).to_numpy()
clustering = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, distance_threshold=args.dist_thresh).fit(X)
d["ahc_label"] = clustering.labels_

# Misc.
args.n_clusters = len(d["ahc_label"].unique())
print(f"Found {args.n_clusters} clusters")
d.groupby("ahc_label")["text"].count().sort_values()

Found 53 clusters


ahc_label
51     3
40     3
48     4
18     5
43     5
34     6
10     6
31     7
33     7
25     8
47     8
22     8
11     8
36     9
37     9
41    10
2     10
44    11
20    11
45    11
5     11
29    12
46    13
4     13
52    13
39    14
42    18
49    19
28    20
12    22
35    22
30    22
26    25
27    25
23    26
14    26
38    28
24    28
7     28
15    30
0     31
17    31
19    33
16    33
32    33
8     34
13    37
21    39
3     40
50    40
9     50
1     54
6     65
Name: text, dtype: int64

In [336]:
d.groupby("ahc_label")["text"].count().describe()

count    53.000000
mean     20.452830
std      14.209377
min       3.000000
25%       9.000000
50%      18.000000
75%      30.000000
max      65.000000
Name: text, dtype: float64

In [337]:
# To Translated
# print(*d.question.tolist(),sep="\n")

In [338]:
for label in d.ahc_label.unique():
    print(f"\ncluster #{label}, count - {len(d[d.ahc_label==label])}")
    print(*zip(d[d.ahc_label==label][:10].question.tolist(),d[d.ahc_label==label][:10].text.tolist()),sep="\n")


cluster #27, count - 25
('Est ce que la toux est sèche ou grasse?', 'est ce que la toux est seche ou grasse')
('Est ce que la question: arrivez vous de voyage, est ', 'est ce que la question arrivez vous de voyage est')
("Est ce que la question, arrivez vous de voyage est encore d'actualite?  Messeble que la propagation est au dela de ca? ", 'est ce que la question arrivez vous de voyage est encore d actualite messeble que la propagation est au dela de ca')
('Est-ce que la transmission communautaire est débuté?', 'est ce que la transmission communautaire est debute')
('Est ce que l’assurance maladie sera valide à l’extérieur de la province?', 'est ce que l assurance maladie sera valide a l exterieur de la province')
('Quand est-ce que ça va être terminé?', 'quand est ce que ca va etre termine')
('Est-ce que ca donne la diarrhée ?', 'est ce que ca donne la diarrhee')
('Est-ce vrai que 75% de la population sera infecté de toutes façons? ', 'est ce vrai que de la population sera infecte 

In [339]:
dataset.head()

Unnamed: 0,timestamp_est,anonymous_id,language,question,detected_lang,text,translation,statistics,caution,animals,...,guideme,test,isolation,dangerisk,transmission,symptom,about,future,total,cluster
3,2020-03-19T14:00:19.88Z,71a29314-2b32-43d5-b09a-1795a5380e60,fr,LP TEST question,fr,lp test question,LP TEST question,0,0,0,...,0,1,0,0,0,0,0,0,1,test
6,2020-03-19T13:47:48.848Z,0b0df84f-dcc5-42c0-980e-bafacd3bbdbe,fr,"Je fais une sarcoidose pulmonaire, je suis tu ...",fr,je fais une sarcoidose pulmonaire je suis tu p...,"I have pulmonary sarcoidosis, I'm more at risk...",0,0,0,...,0,0,0,1,0,0,0,0,1,dangerisk
11,2020-03-19T15:39:52.508Z,b3cd859f-a9f3-4985-a884-0a8387a9daa4,fr,Est ce que la toux est sèche ou grasse?,fr,est ce que la toux est seche ou grasse,Is the cough dry or oily?,0,0,0,...,0,0,0,0,0,0,0,0,0,unclassified
12,2020-03-19T14:16:03.429Z,34327a70-7e54-4f53-bdf6-6809c83a8d5e,fr,Je ne me sens pas bien,fr,je ne me sens pas bien,,0,0,0,...,0,0,0,0,0,0,0,0,0,unclassified
17,2020-03-19T13:55:41.747Z,59eafb5a-76c4-4e07-891d-6e79276fa82d,fr,"Bonjour,",fr,bonjour,"Bonjour,",0,0,0,...,0,0,0,0,0,0,0,0,0,unclassified


In [340]:
dataset = dataset.drop(features, axis="columns").drop("total", axis="columns").join(d["ahc_label"])

In [341]:
dataset.head()

Unnamed: 0,timestamp_est,anonymous_id,language,question,detected_lang,text,translation,cluster,ahc_label
3,2020-03-19T14:00:19.88Z,71a29314-2b32-43d5-b09a-1795a5380e60,fr,LP TEST question,fr,lp test question,LP TEST question,test,
6,2020-03-19T13:47:48.848Z,0b0df84f-dcc5-42c0-980e-bafacd3bbdbe,fr,"Je fais une sarcoidose pulmonaire, je suis tu ...",fr,je fais une sarcoidose pulmonaire je suis tu p...,"I have pulmonary sarcoidosis, I'm more at risk...",dangerisk,
11,2020-03-19T15:39:52.508Z,b3cd859f-a9f3-4985-a884-0a8387a9daa4,fr,Est ce que la toux est sèche ou grasse?,fr,est ce que la toux est seche ou grasse,Is the cough dry or oily?,unclassified,27.0
12,2020-03-19T14:16:03.429Z,34327a70-7e54-4f53-bdf6-6809c83a8d5e,fr,Je ne me sens pas bien,fr,je ne me sens pas bien,,unclassified,4.0
17,2020-03-19T13:55:41.747Z,59eafb5a-76c4-4e07-891d-6e79276fa82d,fr,"Bonjour,",fr,bonjour,"Bonjour,",unclassified,31.0


In [342]:
# print(*dataset.question.tolist(),sep="\n")

In [343]:
path = f"output/simpleLsa_{args.n_topics}n{args.dist_thresh}dt_{args.suffix}_{args.lang}.csv"
dataset.to_csv(path)
print(f"AHC on top of rule based output saved to {path}")

AHC on top of rule based output saved to output/simpleLsa_15n0.8dt_2020-03-27T19_12_30.866993Z_fr.csv


## Iterative Clustering