# Questions Clustering

## Expected
Questions to be sorted out such that the response to the whole cluster is same.

<!--### To Do-->
Author: Sunanda Bansal  
Organization: Dataperformers    
License: CC BY-NC   
Date: 24 Mar, 2020 (Start)  

In [1]:
import re
import regex
import os
import csv
import sys
import json
import time
import nltk
import scipy
import socket
import pickle
import numpy as np
import pandas as pd
import multiprocessing as mp

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Normalizer   
from sklearn import metrics   
from sklearn.metrics import confusion_matrix
# from sklearn.metrics.pairwise import paired_distances as sklearn_paired_distances

# Plotting
import seaborn as sn
import matplotlib.pyplot as plt

# importing personal development helper classes
import utils

Using TensorFlow backend.


## Define variables here

Mostly the code will be intended to use with arguments that can be passed in comman line, but jupyter notebook doesn't handle `argparse` well, so the Args class is a temporary way to write the code assumming the variables to be an attribute of an object.

In [2]:
class Args:
    def __init__(self):
        # The very big scraped file, give absolute path, outside the repo
        # self.filename = "query_result_2020-03-27T19_12_30.866993Z.csv"
        self.filename = "data_dump_2020-04-14.csv"
        self.dataset = f"data/{self.filename}"        
        self.suffix = "_".join([word for word in self.filename.split("_") if not word.isalpha()])[:-4]
        self.vector_mode = "tfidf"
        self.n_topics = 230
        self.dist_thresh = 1.5
        self.lang = "fr"
args=Args()

In [3]:
# Disable (FALSE) displaying warnings from the OpenMP* run-time library during program execution.
os.environ['KMP_WARNINGS'] = "FALSE"

In [4]:
def surrounding(word,area=2):
    return

def fuzzy_match(word,pattern):
    import regex
    if regex.search(pattern, word, re.IGNORECASE):
        return True
    else:
        return False    

In [5]:
import re
import nltk
import unidecode

alpha_regex = re.compile('[^a-zA-Z]')

from nltk.corpus import stopwords as sw
if args.lang == "en": stopwords = sw.words('english')
if args.lang == "fr": stopwords = sw.words('french')

from nltk.stem.snowball import SnowballStemmer
if args.lang == "en": stemmer = SnowballStemmer("english")
if args.lang == "fr": stemmer = SnowballStemmer("french")

def tokenize(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            word = unidecode.unidecode(word)
            clean_words = alpha_regex.sub(' ', word).split()
            tokens.extend([word.lower() for word in clean_words])
    return tokens


def stem(word):
    return stemmer.stem(word).strip()

def preprocess(text):    
    tokenized = tokenize(text)
    cleaned = [word for word in tokenized if word not in stopwords and word is not '']
#     stemed = [stem(word) for word in cleaned]
    #stemed = [stem(word) for word in tokenized]
    #corpus[i] = ' '.join(tokenized)
    return ' '.join(tokenized)

## Dataset

In [6]:
dataset = pd.read_csv(f"data/{args.filename}")
print(f"Dataset has {len(dataset)} documents")

# Language Detection
def detect_lang(text):
    try:
        return detect(text)
    except:
        return "unidentifiable"   

from langdetect import detect
dataset["detected_lang"] = dataset.question.apply(detect_lang)

dataset = dataset[dataset.detected_lang == args.lang]

dataset["text"] = dataset.question.apply(preprocess)
print(f"Dataset has {len(dataset)} french documents")

Dataset has 5005 documents
Dataset has 2709 french documents


In [362]:
translation = pd.read_csv("data/translated_fr_eng.csv",index_col=0)

In [363]:
dataset = dataset.join(translation["translation"])

In [7]:
dataset.head()

Unnamed: 0,timestamp_est,anonymous_id,language,question,detected_lang,text
3,2020-03-19 14:00:19.88,71a29314-2b32-43d5-b09a-1795a5380e60,fr,LP TEST question,fr,lp test question
6,2020-03-19 13:47:48.848,0b0df84f-dcc5-42c0-980e-bafacd3bbdbe,fr,"Je fais une sarcoidose pulmonaire, je suis tu ...",fr,je fais une sarcoidose pulmonaire je suis tu p...
11,2020-03-19 15:39:52.508,b3cd859f-a9f3-4985-a884-0a8387a9daa4,fr,Est ce que la toux est sèche ou grasse?,fr,est ce que la toux est seche ou grasse
12,2020-03-19 14:16:03.429,34327a70-7e54-4f53-bdf6-6809c83a8d5e,fr,Je ne me sens pas bien,fr,je ne me sens pas bien
17,2020-03-19 13:55:41.747,59eafb5a-76c4-4e07-891d-6e79276fa82d,fr,"Bonjour,",fr,bonjour


## Linguistic Analysis

In [1]:
import spacy

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load("fr_core_news_md")

In [12]:
text=["es ce facile de se faire tester si nous avons les symptomes",
    "si j ai la covid quels sont les symptomes qui devraient faire en sorte que j aille a l hopital"]

In [None]:
doc = nlp(text[0])

In [None]:
doc

In [None]:
for token in doc:
    print (token.text, token.tag_, token.head.text, token.dep_)

In [None]:
from spacy import displacy
displacy.serve(doc, style='dep')

## Rules
Note: The order of these rules matters in resolving conflicts

In [365]:
# Statistics
dataset["situation-stats"] = (
                                dataset.text.str.contains("taux de",case=False)|
                                dataset.text.str.contains("statistique",case=False)|
                                (
                                    (
                                        dataset.text.str.contains("combien",case=False)|
                                        dataset.text.str.contains("nombre",case=False)
                                    )&
                                    dataset.text.str.contains("de (?:cas|person|gen)",case=False)
                                )
                            ).apply(int)

In [366]:
# dataset["caution"] = (
#                         dataset.text.str.contains("proteger",case=False)|
#                         dataset.text.str.contains("gant",case=False)|
#                         dataset.text.str.contains("masque",case=False)|
#                         dataset.text.str.contains("masque",case=False)|
#                         dataset.text.str.contains("je peux",case=False)
#                     ).apply(int)

In [367]:
dataset["covid-animals"] = (
                                dataset.text.str.contains("animau|chat|chien",case=False)
                            ).apply(int)

In [368]:
dataset["covid-life"] = (
                            (
                                dataset.text.apply(fuzzy_match, pattern="(?:covid){e<=2}")|
                                dataset.text.str.contains("corona",case=False)|
                                dataset.text.str.contains("virus",case=False)
                            )&
                            dataset.text.str.contains("survi|reste|demeure|vie|survivre|vivre",case=False)&
                            dataset.text.str.contains("de temp",case=False)
                        ).apply(int)

In [369]:
dataset["covid-versus"] = (
                                dataset.text.str.contains("diff",case=False) &
                                dataset.text.str.contains("entre",case=False) &
                                dataset.text.str.contains("et",case=False)&
                                (
                                    dataset.text.apply(fuzzy_match, pattern="(?:covid){e<=2}")|
                                    dataset.text.str.contains("corona",case=False)|
                                    dataset.text.str.contains("virus",case=False)
                                )
                            ).apply(int)

In [370]:
dataset["covid-med"] = (
                            dataset.text.str.contains("guerir|vaccin|medic|traitement",case=False)
                        ).apply(int)

In [371]:
dataset["covid-incubation"] = (
                                    dataset.text.str.contains("incubat",case=False)
                                ).apply(int)

In [372]:
dataset["personal-symptoms"] = (
                                    dataset.text.str.contains("j ai|je tousse",case=False) 
                                ).apply(int)

In [373]:
# dataset["guideme"] = (
#                         dataset.text.str.contains("\bsi\b",case=False)
#                     ).apply(int)

In [374]:
dataset["personal-testing"] = (
                                    dataset.text.str.contains("test",case=False)
                                ).apply(int)

In [375]:
dataset["personal-isolation"] = (
                                    dataset.text.str.contains("confine|isole",case=False)|
                                    (
                                        dataset.text.str.contains("reste",case=False)&
                                        dataset.text.str.contains("chez",case=False)
                                    )
                                ).apply(int)

In [376]:
dataset["covid-contagious"] = (
                                    dataset.text.str.contains("contagieu",case=False)
                                ).apply(int)

In [377]:
# dataset["dangerisk"] = (
#                             dataset.text.str.contains("dangereu|risque|contagieu",case=False)
#                         ).apply(int)

In [378]:
dataset["covid-transmission"] = (
                                    dataset.text.str.contains("comment",case=False)&
                                    (
                                        dataset.text.str.contains("se (?:propage|tansmet|attrape|contracte)",case=False)
                                    )
                                  ).apply(int)

In [379]:
# Fuzzy Matching of 'Symptom' keyword (accounting for spelling errors)
dataset["covid-symptom"] = (
                                dataset.text.apply(fuzzy_match, pattern="(?:symptome){1<=e<=3}")
                            ).apply(int)

In [380]:
dataset["covid-what"] = (
                            (
                                dataset.text.str.contains("qu est ce que|c est quoi",case=False)
                            ) & 
                            (
                                dataset.text.apply(fuzzy_match, pattern="(?:covid){e<=2}")|
                                dataset.text.str.contains("corona",case=False)
                            )
                        ).apply(int)

In [381]:
dataset["situation-future"] = (
                                    dataset.text.str.contains("combien",case=False)&
                                    dataset.text.str.contains("de temp",case=False)&
                                    dataset.text.str.contains(r"\bdure",case=False)
                                ).apply(int)

In [394]:
features = [col_name for col_name in dataset.columns.values.tolist() if "-" in col_name]

In [426]:
features

['situation-stats',
 'covid-animals',
 'covid-life',
 'covid-versus',
 'covid-med',
 'covid-incubation',
 'personal-symptoms',
 'personal-testing',
 'personal-isolation',
 'covid-contagious',
 'covid-transmission',
 'covid-symptom',
 'covid-what',
 'situation-future']

In [396]:
dataset["total"] = dataset[features].sum(axis=1)

In [397]:
dataset.groupby("total").describe()

Unnamed: 0_level_0,situation-stats,situation-stats,situation-stats,situation-stats,situation-stats,situation-stats,situation-stats,situation-stats,covid-animals,covid-animals,...,covid-what,covid-what,situation-future,situation-future,situation-future,situation-future,situation-future,situation-future,situation-future,situation-future
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
total,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,1317.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1317.0,0.0,...,0.0,0.0,1317.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,874.0,0.05492,0.227954,0.0,0.0,0.0,0.0,1.0,874.0,0.046911,...,0.0,1.0,874.0,0.026316,0.160164,0.0,0.0,0.0,0.0,1.0
2,156.0,0.012821,0.112862,0.0,0.0,0.0,0.0,1.0,156.0,0.012821,...,0.0,1.0,156.0,0.057692,0.233912,0.0,0.0,0.0,0.0,1.0
3,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0,0.0,...,0.0,0.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0


In [429]:
def tag_from_cols(row):
    return ", ".join([f for f in features if row[f]==1])

In [437]:
# tag_from_cols(dataset.iloc[22])

In [438]:
dataset["tags"] = dataset.apply(tag_from_cols, axis=1)

In [435]:
# dataset[dataset.total > 1].head()

In [439]:
dataset["cluster"] = "unclassified"

# For single features
for col in features:
    dataset["cluster"][(dataset.total == 1) & (dataset[col] == True)] = col

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [440]:
# for col in features[::-1]:
#     dataset["cluster"][(dataset.total > 1) & (dataset[col] == True)] = col

In [441]:
path = f"output/simple_{args.suffix}_{args.lang}.csv"
dataset.to_csv(path)
print(f"Rules based output saved to {path}")

Rules based output saved to output/simple_2020-03-27T19_12_30.866993Z_fr.csv


In [442]:
dataset.groupby("cluster")["question"].count()

cluster
covid-animals           41
covid-contagious        32
covid-incubation         9
covid-life              56
covid-med               52
covid-symptom          257
covid-transmission       5
covid-versus             6
covid-what              18
personal-isolation      72
personal-symptoms      154
personal-testing       101
situation-future        23
situation-stats         48
unclassified          1498
Name: question, dtype: int64

### Length analysis for situations

## LSA and AHC

In [332]:
d = dataset[dataset.cluster=="unclassified"][["question","text", "cluster"]]

In [333]:
# import nltk
# stopwords_list = nltk.corpus.stopwords.words('english')
# wn_lemmatizer = nltk.stem.WordNetLemmatizer()

In [334]:
def bulk_tokenizer(texts):
#      return [[wn_lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)] for text in texts]
     return [text.split() for text in texts]

In [335]:
args.n_topics = 15
args.dist_thresh = 0.8
model = utils.text.representation.LSI(args, tokenizer=bulk_tokenizer)
d["embedding"] = model.generate_embedding(d.text, returnarray=False)

# Cluster
X = pd.DataFrame(d["embedding"].values.tolist(), index= d.index).to_numpy()
clustering = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, distance_threshold=args.dist_thresh).fit(X)
d["ahc_label"] = clustering.labels_

# Misc.
args.n_clusters = len(d["ahc_label"].unique())
print(f"Found {args.n_clusters} clusters")
d.groupby("ahc_label")["text"].count().sort_values()

Found 53 clusters


ahc_label
51     3
40     3
48     4
18     5
43     5
34     6
10     6
31     7
33     7
25     8
47     8
22     8
11     8
36     9
37     9
41    10
2     10
44    11
20    11
45    11
5     11
29    12
46    13
4     13
52    13
39    14
42    18
49    19
28    20
12    22
35    22
30    22
26    25
27    25
23    26
14    26
38    28
24    28
7     28
15    30
0     31
17    31
19    33
16    33
32    33
8     34
13    37
21    39
3     40
50    40
9     50
1     54
6     65
Name: text, dtype: int64

In [336]:
d.groupby("ahc_label")["text"].count().describe()

count    53.000000
mean     20.452830
std      14.209377
min       3.000000
25%       9.000000
50%      18.000000
75%      30.000000
max      65.000000
Name: text, dtype: float64

In [337]:
# To Translated
# print(*d.question.tolist(),sep="\n")

In [338]:
for label in d.ahc_label.unique():
    print(f"\ncluster #{label}, count - {len(d[d.ahc_label==label])}")
    print(*zip(d[d.ahc_label==label][:10].question.tolist(),d[d.ahc_label==label][:10].text.tolist()),sep="\n")


cluster #27, count - 25
('Est ce que la toux est sèche ou grasse?', 'est ce que la toux est seche ou grasse')
('Est ce que la question: arrivez vous de voyage, est ', 'est ce que la question arrivez vous de voyage est')
("Est ce que la question, arrivez vous de voyage est encore d'actualite?  Messeble que la propagation est au dela de ca? ", 'est ce que la question arrivez vous de voyage est encore d actualite messeble que la propagation est au dela de ca')
('Est-ce que la transmission communautaire est débuté?', 'est ce que la transmission communautaire est debute')
('Est ce que l’assurance maladie sera valide à l’extérieur de la province?', 'est ce que l assurance maladie sera valide a l exterieur de la province')
('Quand est-ce que ça va être terminé?', 'quand est ce que ca va etre termine')
('Est-ce que ca donne la diarrhée ?', 'est ce que ca donne la diarrhee')
('Est-ce vrai que 75% de la population sera infecté de toutes façons? ', 'est ce vrai que de la population sera infecte 

In [339]:
dataset.head()

Unnamed: 0,timestamp_est,anonymous_id,language,question,detected_lang,text,translation,statistics,caution,animals,...,guideme,test,isolation,dangerisk,transmission,symptom,about,future,total,cluster
3,2020-03-19T14:00:19.88Z,71a29314-2b32-43d5-b09a-1795a5380e60,fr,LP TEST question,fr,lp test question,LP TEST question,0,0,0,...,0,1,0,0,0,0,0,0,1,test
6,2020-03-19T13:47:48.848Z,0b0df84f-dcc5-42c0-980e-bafacd3bbdbe,fr,"Je fais une sarcoidose pulmonaire, je suis tu ...",fr,je fais une sarcoidose pulmonaire je suis tu p...,"I have pulmonary sarcoidosis, I'm more at risk...",0,0,0,...,0,0,0,1,0,0,0,0,1,dangerisk
11,2020-03-19T15:39:52.508Z,b3cd859f-a9f3-4985-a884-0a8387a9daa4,fr,Est ce que la toux est sèche ou grasse?,fr,est ce que la toux est seche ou grasse,Is the cough dry or oily?,0,0,0,...,0,0,0,0,0,0,0,0,0,unclassified
12,2020-03-19T14:16:03.429Z,34327a70-7e54-4f53-bdf6-6809c83a8d5e,fr,Je ne me sens pas bien,fr,je ne me sens pas bien,,0,0,0,...,0,0,0,0,0,0,0,0,0,unclassified
17,2020-03-19T13:55:41.747Z,59eafb5a-76c4-4e07-891d-6e79276fa82d,fr,"Bonjour,",fr,bonjour,"Bonjour,",0,0,0,...,0,0,0,0,0,0,0,0,0,unclassified


In [340]:
dataset = dataset.drop(features, axis="columns").drop("total", axis="columns").join(d["ahc_label"])

In [341]:
dataset.head()

Unnamed: 0,timestamp_est,anonymous_id,language,question,detected_lang,text,translation,cluster,ahc_label
3,2020-03-19T14:00:19.88Z,71a29314-2b32-43d5-b09a-1795a5380e60,fr,LP TEST question,fr,lp test question,LP TEST question,test,
6,2020-03-19T13:47:48.848Z,0b0df84f-dcc5-42c0-980e-bafacd3bbdbe,fr,"Je fais une sarcoidose pulmonaire, je suis tu ...",fr,je fais une sarcoidose pulmonaire je suis tu p...,"I have pulmonary sarcoidosis, I'm more at risk...",dangerisk,
11,2020-03-19T15:39:52.508Z,b3cd859f-a9f3-4985-a884-0a8387a9daa4,fr,Est ce que la toux est sèche ou grasse?,fr,est ce que la toux est seche ou grasse,Is the cough dry or oily?,unclassified,27.0
12,2020-03-19T14:16:03.429Z,34327a70-7e54-4f53-bdf6-6809c83a8d5e,fr,Je ne me sens pas bien,fr,je ne me sens pas bien,,unclassified,4.0
17,2020-03-19T13:55:41.747Z,59eafb5a-76c4-4e07-891d-6e79276fa82d,fr,"Bonjour,",fr,bonjour,"Bonjour,",unclassified,31.0


In [342]:
# print(*dataset.question.tolist(),sep="\n")

In [343]:
path = f"output/simpleLsa_{args.n_topics}n{args.dist_thresh}dt_{args.suffix}_{args.lang}.csv"
dataset.to_csv(path)
print(f"AHC on top of rule based output saved to {path}")

AHC on top of rule based output saved to output/simpleLsa_15n0.8dt_2020-03-27T19_12_30.866993Z_fr.csv


## LSA and AHC - 2nd Time

In [None]:
path = f"output/simpleLsa_{args.n_topics}n{args.dist_thresh}dt_{args.suffix}_{args.lang}_checked.csv"

In [None]:
d = pd.read_csv(path, index_col=0)

In [332]:
d = dataset[dataset.cluster=="unclassified"][["question","text", "translation", "cluster", "ahc_label"]]

In [333]:
# import nltk
# stopwords_list = nltk.corpus.stopwords.words('english')
# wn_lemmatizer = nltk.stem.WordNetLemmatizer()

In [334]:
def bulk_tokenizer(texts):
#      return [[wn_lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)] for text in texts]
     return [text.split() for text in texts]

In [335]:
args.n_topics = 15
args.dist_thresh = 0.8
model = utils.text.representation.LSI(args, tokenizer=bulk_tokenizer)
d["embedding"] = model.generate_embedding(d.text, returnarray=False)

# Cluster
X = pd.DataFrame(d["embedding"].values.tolist(), index= d.index).to_numpy()
clustering = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, distance_threshold=args.dist_thresh).fit(X)
d["ahc_label"] = clustering.labels_

# Misc.
args.n_clusters = len(d["ahc_label"].unique())
print(f"Found {args.n_clusters} clusters")
d.groupby("ahc_label")["text"].count().sort_values()

Found 53 clusters


ahc_label
51     3
40     3
48     4
18     5
43     5
34     6
10     6
31     7
33     7
25     8
47     8
22     8
11     8
36     9
37     9
41    10
2     10
44    11
20    11
45    11
5     11
29    12
46    13
4     13
52    13
39    14
42    18
49    19
28    20
12    22
35    22
30    22
26    25
27    25
23    26
14    26
38    28
24    28
7     28
15    30
0     31
17    31
19    33
16    33
32    33
8     34
13    37
21    39
3     40
50    40
9     50
1     54
6     65
Name: text, dtype: int64

In [336]:
d.groupby("ahc_label")["text"].count().describe()

count    53.000000
mean     20.452830
std      14.209377
min       3.000000
25%       9.000000
50%      18.000000
75%      30.000000
max      65.000000
Name: text, dtype: float64

In [337]:
# To Translated
# print(*d.question.tolist(),sep="\n")

In [338]:
for label in d.ahc_label.unique():
    print(f"\ncluster #{label}, count - {len(d[d.ahc_label==label])}")
    print(*zip(d[d.ahc_label==label][:10].question.tolist(),d[d.ahc_label==label][:10].text.tolist()),sep="\n")


cluster #27, count - 25
('Est ce que la toux est sèche ou grasse?', 'est ce que la toux est seche ou grasse')
('Est ce que la question: arrivez vous de voyage, est ', 'est ce que la question arrivez vous de voyage est')
("Est ce que la question, arrivez vous de voyage est encore d'actualite?  Messeble que la propagation est au dela de ca? ", 'est ce que la question arrivez vous de voyage est encore d actualite messeble que la propagation est au dela de ca')
('Est-ce que la transmission communautaire est débuté?', 'est ce que la transmission communautaire est debute')
('Est ce que l’assurance maladie sera valide à l’extérieur de la province?', 'est ce que l assurance maladie sera valide a l exterieur de la province')
('Quand est-ce que ça va être terminé?', 'quand est ce que ca va etre termine')
('Est-ce que ca donne la diarrhée ?', 'est ce que ca donne la diarrhee')
('Est-ce vrai que 75% de la population sera infecté de toutes façons? ', 'est ce vrai que de la population sera infecte 

In [339]:
dataset.head()

Unnamed: 0,timestamp_est,anonymous_id,language,question,detected_lang,text,translation,statistics,caution,animals,...,guideme,test,isolation,dangerisk,transmission,symptom,about,future,total,cluster
3,2020-03-19T14:00:19.88Z,71a29314-2b32-43d5-b09a-1795a5380e60,fr,LP TEST question,fr,lp test question,LP TEST question,0,0,0,...,0,1,0,0,0,0,0,0,1,test
6,2020-03-19T13:47:48.848Z,0b0df84f-dcc5-42c0-980e-bafacd3bbdbe,fr,"Je fais une sarcoidose pulmonaire, je suis tu ...",fr,je fais une sarcoidose pulmonaire je suis tu p...,"I have pulmonary sarcoidosis, I'm more at risk...",0,0,0,...,0,0,0,1,0,0,0,0,1,dangerisk
11,2020-03-19T15:39:52.508Z,b3cd859f-a9f3-4985-a884-0a8387a9daa4,fr,Est ce que la toux est sèche ou grasse?,fr,est ce que la toux est seche ou grasse,Is the cough dry or oily?,0,0,0,...,0,0,0,0,0,0,0,0,0,unclassified
12,2020-03-19T14:16:03.429Z,34327a70-7e54-4f53-bdf6-6809c83a8d5e,fr,Je ne me sens pas bien,fr,je ne me sens pas bien,,0,0,0,...,0,0,0,0,0,0,0,0,0,unclassified
17,2020-03-19T13:55:41.747Z,59eafb5a-76c4-4e07-891d-6e79276fa82d,fr,"Bonjour,",fr,bonjour,"Bonjour,",0,0,0,...,0,0,0,0,0,0,0,0,0,unclassified


In [340]:
dataset = dataset.drop(features, axis="columns").drop("total", axis="columns").join(d["ahc_label"])

In [341]:
dataset.head()

Unnamed: 0,timestamp_est,anonymous_id,language,question,detected_lang,text,translation,cluster,ahc_label
3,2020-03-19T14:00:19.88Z,71a29314-2b32-43d5-b09a-1795a5380e60,fr,LP TEST question,fr,lp test question,LP TEST question,test,
6,2020-03-19T13:47:48.848Z,0b0df84f-dcc5-42c0-980e-bafacd3bbdbe,fr,"Je fais une sarcoidose pulmonaire, je suis tu ...",fr,je fais une sarcoidose pulmonaire je suis tu p...,"I have pulmonary sarcoidosis, I'm more at risk...",dangerisk,
11,2020-03-19T15:39:52.508Z,b3cd859f-a9f3-4985-a884-0a8387a9daa4,fr,Est ce que la toux est sèche ou grasse?,fr,est ce que la toux est seche ou grasse,Is the cough dry or oily?,unclassified,27.0
12,2020-03-19T14:16:03.429Z,34327a70-7e54-4f53-bdf6-6809c83a8d5e,fr,Je ne me sens pas bien,fr,je ne me sens pas bien,,unclassified,4.0
17,2020-03-19T13:55:41.747Z,59eafb5a-76c4-4e07-891d-6e79276fa82d,fr,"Bonjour,",fr,bonjour,"Bonjour,",unclassified,31.0


In [342]:
# print(*dataset.question.tolist(),sep="\n")

In [343]:
path = f"output/simpleLsa_{args.n_topics}n{args.dist_thresh}dt_{args.suffix}_{args.lang}.csv"
dataset.to_csv(path)
print(f"AHC on top of rule based output saved to {path}")

AHC on top of rule based output saved to output/simpleLsa_15n0.8dt_2020-03-27T19_12_30.866993Z_fr.csv


## Iterative Clustering