### This notebook cleans text data from .csv files and exports it as .jsonl files for annotation

In [12]:
import numpy as np
import pandas as pd
import datetime as dt
# other modules, code
from html import unescape
import unicodedata
import ast

import spacy
import srsly
from spacy import displacy
from spacy.training import docs_to_json, offsets_to_biluo_tags, biluo_tags_to_spans
from spacy.pipeline import EntityRuler
from spacy.tokens import DocBin

from spacy.training import Example
from spacy.util import minibatch, compounding
from spacy.tokens import Span
from toolz import partition_all

In [13]:
def clean_df(df0):
    '''
    Clean an original posts dataframe and build the combined `text` column.

    Missing `selftext` becomes an empty string, missing `flair_text` becomes
    'no_flair', rows whose `selftext` is '[deleted]' or '[removed]' are dropped,
    and `text` is set to "<title> <selftext>". The index is reset to 0..n-1.

    Parameters
    ----------
    df0 : pandas.DataFrame
        Must contain the columns `title`, `selftext` and `flair_text`.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the extra `text` column; `df0` is left untouched.
    '''
    # work on a copy so the caller's dataframe is never modified
    df = df0.copy()

    # fill nan with empty string / placeholder flair
    df['selftext'] = df['selftext'].fillna('')
    df['flair_text'] = df['flair_text'].fillna('no_flair')

    # drop rows where selftext was deleted or removed
    keep = ~df['selftext'].isin(['[deleted]', '[removed]'])

    # reset the index; drop=True discards the old index whatever its name is
    # (reset_index() + drop(columns='index') raises KeyError for a named index),
    # and it returns a fresh frame, so the assignment below is not made on a slice
    df = df.loc[keep].reset_index(drop=True)

    # a missing title is treated as empty, otherwise the whole text would be NaN
    df['text'] = df['title'].fillna('') + ' ' + df['selftext']

    return df

In [14]:
# https://towardsdatascience.com/a-practitioners-guide-to-natural-language-processing-part-i-processing-understanding-text-9f4abfd13e72
def remove_accented_chars(text):
    '''
    Fold a string down to plain ASCII.

    The text is decomposed with Unicode NFKD normalization, so an accented
    letter becomes its base letter followed by combining marks; every
    character that cannot be encoded as ASCII (the marks, but also any other
    non-ASCII symbol such as curly quotes) is then dropped.
    '''
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

# quick sanity check of the helper
# remove_accented_chars('Sómě Áccěntěd těxt')
remove_accented_chars('naïve')

'naive'

In [15]:
# code taken and modified from https://github.com/hundredblocks/concrete_NLP_tutorial/blob/master/NLP_notebook.ipynb
def standardize_text(df, text_field):
    '''
    Clean model text using regular expressions.

    Steps, in order: unescape HTML entities, turn curly single quotes into "'",
    fold the text to ASCII (accents removed), strip URLs and @mentions,
    collapse runs of blank lines, replace every character outside the allowed
    set with a space, and expand a few acronyms. Case is preserved.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe holding the text; it is modified in place.
    text_field : str
        Name of the column to clean. Every value must be a string (no NaN).

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with `text_field` replaced by the cleaned text.
    '''
    text = df[text_field]

    # unescape html characters first, so that decoded entities (e.g. '&eacute;',
    # '&rsquo;') go through the same normalization as literal characters
    text = text.map(unescape)

    # straighten curly single quotes BEFORE the ascii folding below, which would
    # otherwise delete them and glue the word together ("don’t" -> "dont")
    text = text.str.replace(r"[‘’]", "'", regex=True)

    # remove accented characters (drops every remaining non-ascii character)
    text = text.map(remove_accented_chars)

    # strip urls, leftover 'http' fragments and @mentions; collapse blank lines
    text = text.str.replace(r"http\S+", "", regex=True)
    text = text.str.replace(r"http", "", regex=True)
    text = text.str.replace(r"www\S+", "", regex=True)
    text = text.str.replace(r"@\S+", "", regex=True)
    text = text.str.replace(r"\n\n+", "\n", regex=True)

    # anything outside the allowed character set becomes a space; a stand-alone
    # '@' (the only kind left after the mention removal above) is spelled out
    text = text.str.replace(r"[^A-Za-z0-9(),.!?@'`\"_\n]", " ", regex=True)
    text = text.str.replace(r"@", "at", regex=True)

    # expand some acronyms; the text is not lower-cased in this function, so the
    # patterns are matched case-insensitively to also catch 'DAE' / 'GAD'
    text = text.str.replace(r"\bdae\b", "does anyone else", case=False, regex=True)
    ## TODO: NEED TO INCLUDE doctor terms in the text; Should help with model performance.
    ## text = text.str.replace(r"\b(doctor|doc|dr|dr\.)\b", " ", regex=True)
    text = text.str.replace(r"\bgad\b", "generalized anxiety disorder", case=False, regex=True)

    df[text_field] = text
    return df

In [16]:
def assign_ranks(grp):
    '''
    Split the rows of one group into a 'train' part and a 'test' part.

    Rows are ranked by their `rnd` value; rows whose zero-based rank falls in
    the lower half get trt='train', the others trt='test'. With an odd number
    of rows the extra row goes to 'train'.

    Parameters
    ----------
    grp : pandas.DataFrame
        Rows of a single group; must contain a numeric `rnd` column.

    Returns
    -------
    pandas.DataFrame
        A copy of `grp` with two added columns, `rank` and `trt`.
    '''
    # work on a copy: pandas does not support functions that mutate the group
    # object handed to them by groupby.apply
    grp = grp.copy()
    n_rows = grp.shape[0]
    grp["rank"] = grp["rnd"].rank()
    # zero-based rank floor-divided by half the group size -> 0 (first half) or 1
    grp["trt"] = (grp["rank"] - 1).astype(int) // (n_rows / 2)
    grp["trt"] = grp["trt"].map({0: 'train', 1: 'test'})
    return grp

In [17]:
def create_clean_text(csv_file):
    '''
    Clean one raw csv file and save the cleaned text next to it.

    The file is read with its first column as index, `author_flair_text` is
    renamed to `flair_text`, the rows are cleaned with `clean_df`, the `text`
    column is normalized with `standardize_text`, and only `text` is written
    to "<csv_file without .csv>_clean.csv".

    Parameters
    ----------
    csv_file : str
        Path of the raw csv file.

    Returns
    -------
    pandas.DataFrame
        Dataframe with the single column `text` (same content as the saved file).
    '''
    df = pd.read_csv(csv_file, index_col=0)
    df = df.rename(columns={'author_flair_text': 'flair_text'})
    df = clean_df(df)
    df = standardize_text(df, 'text')
    df = df[['text']]

    # strip only a trailing '.csv'; split('.csv')[0] would cut the path at the
    # first '.csv' found anywhere in it (e.g. inside a directory name)
    stem = csv_file[:-len('.csv')] if csv_file.endswith('.csv') else csv_file
    clean_file = stem + '_clean.csv'
    df.to_csv(clean_file, index=False)
    return df

### Testing

In [18]:
# filename = './Data/AskDocs_Oct_2020.csv'
# df = pd.read_csv(filename)
# df.rename(columns = {'author_flair_text': 'flair_text'}, inplace=True)
# df = clean_df(df)
# df['text'].loc[0]
# # df = create_clean_text(filename)

In [19]:
# df = standardize_text(df, 'text')
# df['text'].loc[0]

In [26]:
def create_combined_df(months):
    '''
    Load the cleaned monthly csv files and stack them into one dataframe.

    For every entry of `months` the file
    './data/askdocs_csv/AskDocs_<month>_2021_clean.csv' is read and tagged with
    a `month` column. The frames are concatenated as they are, so the row
    index of each file is kept and index values repeat across months.
    '''
    monthly_frames = [
        pd.read_csv('./data/askdocs_csv/AskDocs_' + month + '_2021_clean.csv').assign(month=month)
        for month in months
    ]
    return pd.concat(monthly_frames)

In [21]:
def select_from_df(df_combined, grp_name, n_start, n_stop, months):
    '''
    Pick a slice of rows from every month for one sub-group.

    For each month (in the order given by `months`) the rows whose `month`
    matches and whose `trt` equals `grp_name` are taken, and rows
    n_start..n_stop-1 of that selection are kept. The per-month slices are
    concatenated and only the `text` column is returned.
    '''
    per_month = [
        df_combined[
            (df_combined['month'] == month) & (df_combined['trt'] == grp_name)
        ][n_start:n_stop]
        for month in months
    ]
    stacked = pd.concat(per_month)
    return stacked[['text']]

### Initialize basic arguments, Clean Original dataset and save to files

In [38]:
# Set seed of numpy's global random generator (used later for the train/test shuffle)
np.random.seed(42)
# months to process; these strings are used to build the csv file names
months = ['May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct']
# slice of rows taken from every month's sub-group: positions n_start .. n_end - 1
n_start = 40
n_end = 80 #40
# number of months * number of examples from each month (n_end  - n_start); n_end excluded
# (used as a suffix in the names of the generated files)
model_suffix = len(months) * (n_end - n_start)
# which half of the per-month split to export (the split labels rows 'train' or 'test')
sub_group = 'test'

### Annotated files generated from Doccano & rehearsal data notebook

In [29]:
# SET this flag to False if not using SYMPTOM as part of NER model
keep_symptom_ent = False

# Tag the names of files produced without the SYMPTOM entity. The endswith
# guard keeps this cell idempotent: re-running it without re-running the
# configuration cell no longer appends '_NS_v2' a second time.
if not keep_symptom_ent and not str(model_suffix).endswith('_NS_v2'):
    model_suffix = str(model_suffix) + '_NS_v2'

# final annotated output from Doccano
train_filename = './data/json/nsamples_480_v2_2021_6m_doccano.jsonl' #'./nsamples_240_2021_6m_doccano.jsonl'
val_filename = './data/json/val_nsamples_240_doccano.jsonl'

# annotated rehearsal filenames from generate_rehearsal_data.ipynb
rehearsal_train_filename = './data/json/nlp_rehearsal_1000.json'
rehearsal_val_filename = './data/json/test_nlp_rehearsal_1000.json'

# pattern files: the existing one to read, and the one named after the current model_suffix
# NOTE(review): the "new" file name also starts with 'old_patterns_' - confirm this is intended
old_ptrns_fname = './data/patterns/old_patterns_240.csv' #f'./old_patterns_60.csv'
new_ptrns_fname = f'./data/patterns/old_patterns_{model_suffix}.csv'

### Read in original csv files, clean, save

In [30]:
# Clean every monthly raw file; create_clean_text writes '<name>_clean.csv' next to
# the input. The returned dataframe is overwritten on each iteration and only the
# saved files are used afterwards.
for month in months:
    df = create_clean_text(f'./data/askdocs_csv/AskDocs_{month}_2021.csv')

### Read in cleaned csv files, concatenate into single df

In [31]:
# Stack the cleaned monthly files into one dataframe (columns: text, month)
df_combined = create_combined_df(months)
# (rows, columns) of the combined data
df_combined.shape

(82680, 2)

### Shuffle randomly, split each month into 2 parts: train, test

In [32]:
# One uniform random number per row; assign_ranks ranks these within each month.
# The draw uses numpy's global generator, so run the configuration cell (seed)
# first to get a reproducible split.
df_combined['rnd'] = np.random.rand(df_combined.shape[0])

# Split every month separately into 'train' / 'test'. The groups are iterated
# explicitly instead of using groupby.apply: apply's handling of the grouping
# column and of the result index changed between pandas versions, while this
# form always keeps the 'month' column and the original row index.
# Rows come out ordered by month name; the order inside a month is unchanged.
df_combined = pd.concat(
    [assign_ranks(month_rows.copy()) for _, month_rows in df_combined.groupby("month")]
)

### Select data within subgroup (train / test) from combined dataset

In [33]:
# From every month keep rows n_start .. n_end - 1 of the chosen sub-group (text column only)
df_clean_text = select_from_df(df_combined, sub_group, n_start, n_end, months)

In [34]:
# Sanity check: len(months) * (n_end - n_start) rows are expected when every month
# has at least n_end rows in the sub-group; there is a single column, `text`
df_clean_text.shape

(240, 1)

### IF NOT PRE-ANNOTATING: Write cleaned text to .JSONL for import into Doccano for annotation:

In [34]:
# export data as jsonl file for import into doccano for annotation
# (orient='records' with lines=True writes one JSON object per line, i.e. JSON Lines,
# even though the file name ends in .json)
df_clean_text.to_json(f'./outputs/{sub_group}_n_{model_suffix}.json', orient='records', lines=True)

## Pre-Annotate text using existing patterns before importing into Doccano

### Use the previously identified patterns (MODEL 0) to pre-annotate data before importing it into Doccano

In [35]:
# load entity_ruler model created using patterns from all the training data so far
# (the pipeline is read from a local directory, which must exist before this cell runs)
model_0 = spacy.load('./models/model_0_n_480_NS/')

In [36]:
# Pre-annotate the selected texts with the existing annotation rules (model_0).
# Every record holds the text and a list of [start_char, end_char, label] entries,
# one per entity found; the list is empty when the model finds no entity.
pre_annotated_eg = []
for eg in df_clean_text['text']:
    doc = model_0(eg)
    entity_spans = [[ent.start_char, ent.end_char, ent.label_] for ent in doc.ents]
    pre_annotated_eg.append({'text': doc.text, 'label': entity_spans})

In [39]:
#convert to dataframe (one row per example, columns: text, label)
pre_annotated_eg_df = pd.DataFrame(pre_annotated_eg)
# save in json format for import into doccano (lines=True: one JSON object per line)
# NOTE(review): the output name is hard-coded and does not follow sub_group / model_suffix -
# update it by hand when the configuration changes
pre_annotated_eg_df.to_json(f'./outputs/pre_annotated_val_240_480.json', orient='records', lines=True)