## COVID19 related papers that published during the first week of May, 2020

In [1]:
import pandas as pd
import numpy as np
import os
import json
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import parse_cord as cord

[nltk_data] Downloading package punkt to /Users/elif/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import matplotlib.pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf

In [3]:
# For Notebooks
init_notebook_mode(connected=True)

# For offline use
cf.go_offline()

### Define a global variable for the folder name that contains the json files of the papers

In [4]:
DATA_DIR = 'data/archive'

Call the module to read in the data

In [5]:
# New version from module
data_frame = cord.read_json_files(cord.json_files(DATA_DIR), DATA_DIR)



### Extracting sentences that contain symptoms

Defining the symptoms that we will extract from our sentences. The list was taken from: https://www.kaggle.com/davidbetancur8/symptoms-word-cloud

The list was modified by Max to have more up to date symptoms and I added in the following symptoms which were taken from CDC's website or I observed them frequintly in the papers:

    -"difficulty breathing"
    -"muscle ache"
    -"congestion"
    -"runny nose"
    -"trouble breathing"
    -"persistent pain"
    -"pressure in the chest"
    -"inability to wake"
    -"stay awake"
    -"bluish lips"
    -"bluish face"
    -"fevers"
    -"decreased appetite"

In [6]:
symptoms = [
    "weight loss","chills","shivering","convulsions","deformity","discharge","dizziness", "lymphopenia", "sneezing",
    "vertigo","fatigue","malaise","asthenia","hypothermia","jaundice","muscle weakness", "chest discomfort",
    "pyrexia","sweats","swelling","swollen","painful lymph node","weight gain","arrhythmia", "loss of smell", 
    "loss of appetite", "loss of taste", "bradycardia","chest pain","claudication","palpitations","tachycardia",
    "dry mouth","epistaxis", "dysgeusia", "hypersomnia", "taste loss", "halitosis","hearing loss","nasal discharge", 
    "nasal inflammation", "otalgia","otorrhea","sore throat","toothache","tinnitus", "dysphonia",
    "trismus","abdominal pain","fever","bloating","belching","bleeding","bloody stool","melena","hematochezia", 
    "burning sensation in the chest", "constipation","diarrhea","dysphagia","dyspepsia","fecal incontinence",
    "flatulence", "heartburn", "chest tightness", "chest pressure","nausea","odynophagia","proctalgia fugax",
    "pyrosis","steatorrhea","vomiting","alopecia","hirsutism", "tachypnoea", "nasal obstruction",
    "hypertrichosis","abrasion","anasarca","bleeding into skin","petechia","purpura","ecchymosis", "bruising", 
    "blister","edema","itching","laceration","rash","urticaria","abnormal posturing","acalculia","agnosia","alexia",
    "amnesia","anomia","anosognosia","aphasia","apraxia","ataxia","cataplexy","confusion","dysarthria", 
    "nasal congestion","dysdiadochokinesia","dysgraphia","hallucination","headache","akinesia","bradykinesia",
    "ballismus","blepharospasm","chorea","dystonia","fasciculation","muscle cramps","myoclonus","opsoclonus",
    "tremor","flapping tremor","insomnia","loss of consciousness","syncope","neck stiffness","opisthotonus",
    "paralysis","paresis","paresthesia","prosopagnosia","somnolence","abnormal vaginal bleeding", "neuralgia",
    "vaginal bleeding in early pregnancy", "miscarriage","vaginal bleeding in late pregnancy","amenorrhea", "body aches",
    "infertility","painful intercourse","pelvic pain","vaginal discharge","amaurosis fugax","amaurosis", "skin lesions",
    "blurred vision","double vision","exophthalmos","mydriasis","miosis","nystagmus","amusia","anhedonia",
    "anxiety","apathy","confabulation","depression","delusion","euphoria","homicidal ideation","irritability",
    "mania","paranoid ideation","suicidal ideation","apnea","hypopnea","cough","dyspnea","bradypnea","tachypnea",
    "orthopnea","platypnea","trepopnea","hemoptysis","pleuritic chest pain","sputum production","arthralgia",
    "back pain","sciatica","urologic","dysuria","hematospermia","hematuria","impotence","polyuria",
    "retrograde ejaculation","strangury","urethral discharge","urinary frequency","urinary incontinence", 
    "anosmia", "myalgia", "rhinorrhea", "shortness of breath", "difficulty breathing", "muscle ache", "congestion",
    "runny nose", "trouble breathing", "persistent pain", "pressure in the chest", "inability to wake", "stay awake",
    "bluish lips", "bluish face","akathisia","athetosis", "urinary retention", "fevers", 
    "decreased appetite"]

In [7]:
len(symptoms)

209

## Check the papers that contain the words in  our list of symptoms and then extract those sentences only

### 2. Encountered problem: Not being able to use the data frame with title column

I had to create a seperate data frame without the title column to work on as it kept getting mixed with the full text when using nltk.tokenize. 

I spent quite long time on this figure out why it was happening therefore, decided to use the data frame with paper id and fulltext only.


In [8]:
df_no_title = data_frame[['paper_id', 'full_text']]

## nltk.tokenize

     I used nltk.tokenize to split the full text into one sentence per row - Initial solutoin
     - I changed the code to regular expression to split the sentences which seem to be performing better.

In [13]:
def split_sentences(df):
    '''
    Takes in a data frame where there are multiple sentences in one row and splits them into one row per sentence
    
    :param df: data frame that we would like to split
    :return a data frame with one sentence per row and what paper that sentence belongs to
    
    '''
    sentences = []
    sentence_pattern = r'(?<=[^A-Z].[.?]) +(?=[A-Z])' #splittng criteria 

    for row in df.itertuples():            
         for sentence in re.split(sentence_pattern, row[2]):
                sentences.append((row[1], sentence))

    new_df = pd.DataFrame(sentences, columns=['Paper_Id', 'Sentence'])
    return new_df

In [14]:
#Data frame that has one sentence per row 
new_df = split_sentences(df_no_title)

In [15]:
new_df.head()

Unnamed: 0,Paper_Id,Sentence
0,b72d2b2b61a0334faef085bdae3262394a1742d2,"During a virtual news conference in Geneva, Sw..."
1,b72d2b2b61a0334faef085bdae3262394a1742d2,Cantrell and 212 other mayors sounded the alar...
2,b72d2b2b61a0334faef085bdae3262394a1742d2,Some healthcare workers have resorted to searc...
3,b72d2b2b61a0334faef085bdae3262394a1742d2,"From a warehouse in Baltimore, Maryland, owned..."
4,b72d2b2b61a0334faef085bdae3262394a1742d2,What can be washed and reused?



Added in the Sentence ID column to keep track of which sentences will be retrieved when we check for the symptoms in each sentence. Started the id values from 1 thus the increment of 1 on the existing data frame index.

In [16]:
new_df['Sentence_ID'] = new_df.index + 1

### Final data frame that only includes sentences from each text that contains any of the symptoms from our list of symptoms

In [17]:
def sentence_w_symptoms(df, sym):
    '''
    Takes in a data frame and list of symptoms, returns a new data frame with sentences that include any of the symptoms
    
    :param df: data frame with multiple senteces per row
    :param sym: the symtoms that we would like to check in the data frame
    :return data frame of sentences that include one or more of the symptoms 
    
    '''
    # match beginning of words and end of words for symptoms, not partial
    pattern = '\\b' + '\\b|\\b'.join(sym) + '\\b'
    final_df = df[df['Sentence'].str.contains(pattern, flags = re.IGNORECASE)].copy()
    return final_df


In [9]:
import split_text as sp

final_df = sp.sentence_w_symptoms(sp.split_sentences(df_no_title), symptoms)

In [10]:
final_df['Sentence'].head()

96     Subjects who died in hospital were significant...
202    We retrospectively analyzed medical charts of ...
203    The patient first had 4-5 episodes of watery d...
205    However, she returned to the ED the next day w...
217    The patient reported an overall improvement in...
Name: Sentence, dtype: object

### Rearranged the display of the columns

In [20]:
columnsTitles = ['Paper_Id', 'Sentence_ID', 'Sentence']

covid_df = final_df.reindex(columns=columnsTitles)

## Characteristics of data

Create a program that will take a csv file and report:
 - number of sentences
 - number of sentences with terms
 - individual term counts (how many times each term appears
 - given a collection of symptom terms X in a csv file, for each term in X, its count in the collection, sorted in descending order

## Function that produces total number of sentences, papers and the ones that include our symptoms 

In [21]:
def char_of_data(df):
    '''
    functiono produces data characteristics such as number of sentences in the data frame, how many of those have 
    any of the symptoms and how many times each smyptom accour in these sentences. 
    
    :param df: data frame that includes the sentence and paper_id for our data
    returns sorted in descending order of the symptoms and the summary 
    '''
    print('Total number of papers published in the month of May', len(new_df['Paper_Id'].unique()))
    print('Total number of sentences from the papers that are published in the given time frame is', len(new_df['Sentence']))
    print('Total number of sentences in the final data frame with symptoms is', len(df['Sentence']))
    print('Total number of unique papers in the final data frame is', len(df['Paper_Id'].unique()))


In [22]:
char_of_data(covid_df)

Total number of papers published in the month of May 12228
Total number of sentences from the papers that are published in the given time frame is 917341
Total number of sentences in the final data frame with symptoms is 27265
Total number of unique papers in the final data frame is 5970


## Function that produces the count of symptoms in the given data frame

In [23]:
def symptoms_df(df_sym, symp = symptoms):
    
    df_test= df_sym.Sentence.str.extractall('({})'.format('|'.join(symp)), flags = re.IGNORECASE)\
                           .iloc[:, 0].str.get_dummies().sum(level=0)
    sum_column = df_test.sum(axis=0)

    df_symptom_count = pd.DataFrame({'Symptoms':sum_column.index, 'Counts':sum_column.values})
    df_symptom_count['Symptoms'] = df_symptom_count['Symptoms'].str.lower()
    sym_df = df_symptom_count.groupby('Symptoms').sum().sort_values(['Counts'],ascending=False)
    return sym_df

In [24]:
sym_df = symptoms_df(covid_df)

In [25]:
sym_df.head()

Unnamed: 0_level_0,Counts
Symptoms,Unnamed: 1_level_1
fever,7145
anxiety,5130
cough,4775
depression,2990
discharge,2601


### Visualization of symptom counts

In [26]:
sym_df.iplot(kind='scatter',y='Counts',mode='markers',size=10)

Coonver symptoms to dictionary with their counts

In [27]:
sym_df.to_dict()

{'Counts': {'fever': 7145,
  'anxiety': 5130,
  'cough': 4775,
  'depression': 2990,
  'discharge': 2601,
  'diarrhea': 1582,
  'fatigue': 1393,
  'dyspnea': 1358,
  'bleeding': 1207,
  'headache': 1138,
  'edema': 1092,
  'lymphopenia': 1048,
  'shortness of breath': 1026,
  'anosmia': 945,
  'vomiting': 909,
  'confusion': 787,
  'nausea': 733,
  'urologic': 727,
  'myalgia': 721,
  'sore throat': 651,
  'rash': 585,
  'sneezing': 501,
  'abdominal pain': 490,
  'arrhythmia': 457,
  'congestion': 440,
  'chest pain': 409,
  'weight loss': 398,
  'tachycardia': 325,
  'insomnia': 294,
  'dizziness': 245,
  'swelling': 238,
  'dysgeusia': 222,
  'chills': 222,
  'loss of smell': 214,
  'urticaria': 207,
  'malaise': 200,
  'paralysis': 185,
  'dysphagia': 175,
  'rhinorrhea': 174,
  'nasal congestion': 158,
  'ataxia': 154,
  'suicidal ideation': 153,
  'skin lesions': 151,
  'apnea': 144,
  'weight gain': 143,
  'loss of taste': 142,
  'runny nose': 129,
  'bradycardia': 127,
  'nasal

### List of symptoms that appeared in the published papers that were extracted with our list of symptoms

In [28]:
counted_symptoms = sym_df.index.to_list()

In [29]:
# counted_symptoms.sort()
# print(counted_symptoms)

In [30]:
# symptoms.sort()
# print(symptoms)

### Extracting the symptoms that are present in our original list of symptoms but were not found in the papers

In [31]:
main_list = np.setdiff1d(symptoms,counted_symptoms)
# yields the elements in `list_2` that are NOT in `list_1`
len(main_list)

28

It appears that that are 28 symptoms on our original symptoms list that do not occur in the papers

In [32]:
df_test1= covid_df.Sentence.str.extractall('({})'.format('|'.join(main_list)), flags = re.IGNORECASE)\
                           .iloc[:, 0].str.get_dummies().sum(level=0)

### 3. Encountered problem: 'fevers' symptom appears on the list of symptoms that does not occur in the papers. 

I am not able to find out why. It is clear that it does appear over 200 times

In [33]:
print(df_test1.sum(axis=0))

Fevers      4
fevers    192
dtype: int64


In [34]:
#verifies that those 27 symptoms do not appear in any of the papers except fevers symptom
# for symptom in main_list:
#      print(symptom, len(covid_df[covid_df.Sentence.str.contains(symptom) == True]))

### Save the sentences in a csv file where there are 3 columns: paper ID, sentence ID, and sentence text to be our testing data.

I did not save the indexes as it would create multiple index columns when we reread in the file.

In [35]:
covid_df.to_csv('/Users/elif/Desktop/covid_testing_data_May.csv', index = False, encoding= 'utf-8')

###   Create another spreadsheet with three columns: sentence #, word, and tag. 


    Place one word in a row and label symptoms words in the tag column: 
    mark the beginning (B-Sym) and inside (I-Sym) of each symptom term. 
    If a term consists of only one word, simply mark it as B-Sym with no I-Sym.  
    Label all other words as O. 

In [36]:
def repl(m):
    '''
    regular expression function that replaces symtoms with BSYM if .group(0) is one word
    or BSYM + ISYM if multiple words - this function looks for spaces in the given words
    
    m.group(0) - firts match
    '''
    return ' '.join(['BSYM']+['ISYM']*(m.group(0).count(' ')))

In [39]:
pattern = '\\b' + '\\b|\\b'.join(symptoms) + '\\b'
covid_df['Token'] = covid_df['Sentence'].str.replace(pattern, repl, flags = re.IGNORECASE)

Token column is the duplicate of the Sentence column. The only difference is that I replaced the existing symptoms in each sentence with BSYM or ISYM based on our regular expression function. 

re.IGNORECASE is used to ignore the words that are upper case and treat them all as lower case. This way, I was able to retain the original version of sentences without using .lower().

In [40]:
covid_df.head()

Unnamed: 0,Paper_Id,Sentence_ID,Sentence,Token
96,84d22b71f6df277a11824433ccf14137303f55f5,97,Subjects who died in hospital were significant...,Subjects who died in hospital were significant...
202,b382ff1b00757c3cb6a7408d8e993aa6d94d3e28,203,We retrospectively analyzed medical charts of ...,We retrospectively analyzed medical charts of ...
203,b382ff1b00757c3cb6a7408d8e993aa6d94d3e28,204,The patient first had 4-5 episodes of watery d...,The patient first had 4-5 episodes of watery B...
205,b382ff1b00757c3cb6a7408d8e993aa6d94d3e28,206,"However, she returned to the ED the next day w...","However, she returned to the ED the next day w..."
217,b382ff1b00757c3cb6a7408d8e993aa6d94d3e28,218,The patient reported an overall improvement in...,The patient reported an overall improvement in...


Reset the index column to start from 0 and assigned the Sentece number to the Sentence_ID column 

In [41]:
covid_df.reset_index(drop=True, inplace=True)

In [42]:
covid_df['Sentence_ID'] = ['Sentence #%s' %i for i in range(1, len(covid_df) + 1)]

### Tokenize sentences

In this part of the task, I tokenized words and tokens columns and then replaces the tokens
with the actual display of B-SYM and I-SYM with the dashes added in the middle.

Then, provided a dataframe with three columns only

In [43]:
def tokenize_sentences(frame):
    words = []
    i = 0
    for j, row in frame.iterrows():
        for word, temptag in zip(word_tokenize(row['Sentence']), word_tokenize(row['Token'])):
    #         print(word, temptag)
            if temptag == 'BSYM':
                tag = 'B-SYM'
            elif temptag == 'ISYM':
                tag = 'I-SYM'
            else:
                tag = 'O'
            words.append((row['Sentence_ID'], word, tag))

    tag_df = pd.DataFrame(words, columns=['Sentence_ID', 'Words', 'Tag'])
    return tag_df

In [44]:
tag_df = tokenize_sentences(covid_df)

In [45]:
# tag_df.head(25)

Double ckecing the tag column to see if we were able to tag I-SYM symptom words

In [46]:
tag_df[tag_df['Tag']== 'I-SYM'].head(5)

Unnamed: 0,Sentence_ID,Words,Tag
181,Sentence #3,pain,I-SYM
213,Sentence #4,pain,I-SYM
223,Sentence #4,appetite,I-SYM
271,Sentence #5,pain,I-SYM
285,Sentence #6,chest,I-SYM


#### This part of the code is for sanity checking on our sentences that contain multi word symtomps. 

    Firts extract only the symptoms with multi words
    Then use our pattern to check which sentences have multi word symptoms presents
    Export them to csv file, if needed

In [47]:
# #Extract the list of multi symptom words
# multi_word_symptoms = ([symptom for symptom in symptoms if len(symptom.split())>1])

# #pattern to check for words that have space before and after 
# #this is to prevent our search from getting distracted by words like 'Statistics' when we are
# #looking for 'tic' which is a problem that I encountered.
# pattern_multi = '\\b' + '\\b|\\b'.join(multi_word_symptoms) + '\\b'

# #Get the dataframe that contains multi words
# covid_df[covid_df['Sentence'].str.contains(pattern_multi)]

# #Export them to a csv file if needed
# #multi_word_sentences.to_csv('/Users/elif/Desktop/multi_word_sentences.csv', index = False)

###  Here the sentence # should be consecutive integers (similar to a surrogate key in a database table) and is not the same as sentence ID in the first spreadsheet.

    Styling of sentence numbers ~ instead of duplicating the same number over and over again, we leave out the duplicated ones by replacing them with space

In [48]:
#tag_df['Sentence_ID'] = tag_df['Sentence']

In [49]:
def remove_duplicate_sentence_ids(df):
    '''
    Takes in the data frame where the same sentence numbers are repeated multiple times and returns a df
    that repeats sentence number only once for the entire row of words that it has
    
    '''
    is_duplicate = df['Sentence_ID'].duplicated()

    df['Sentence_ID'] = df['Sentence_ID'].where(~is_duplicate, ' ')
    tagged_data = df[['Sentence_ID', 'Words', 'Tag']]
    return tagged_data

In [50]:
tagged_data = remove_duplicate_sentence_ids(tag_df)

In [51]:
tagged_data.head()

Unnamed: 0,Sentence_ID,Words,Tag
0,Sentence #1,Subjects,O
1,,who,O
2,,died,O
3,,in,O
4,,hospital,O


In [52]:
#Final tagged covid data
tagged_data.to_csv('/Users/elif/Desktop/covid_tagged_data_May.csv', index = False, encoding= 'utf-8')




## Working with Colloquial datasets

    Read in the existing 2 data sets and analyze the number of symptoms on each papaer and create a one merged list of symptoms from both data sets. 

In [53]:
paSi_df = pd.read_excel('/Users/elif/Downloads/OneDrive_1_10-29-2020/patient-site_lableled.xlsx', sheet_name='in', usecols="A,B,C")

In [54]:
plm_df = pd.read_excel('/Users/elif/Downloads/OneDrive_1_10-29-2020/plm_dataset_labeled.xlsx', sheet_name='plm_dataset', usecols="A,B,C")

In [55]:
def symAndCount(dataframe):
    values_dict = {}
    sym_str = ''
    for i,row in enumerate(dataframe.values):
    
        if row[2]=='B-SYM'and dataframe.values[i+1][2] == 'O':
            if row[1].lower() in values_dict:
                values_dict[row[1].lower()]+=1
            else:
                values_dict[row[1].lower()] = 1
        if (row[2]=='B-SYM'and dataframe.values[i+1][2] == 'I-SYM') or row[2]=='I-SYM':
            sym_str +=str(row[1]).lower()+' '
            if dataframe.values[i+1][2] == 'O':
                sym_str = sym_str.strip()
                if sym_str in values_dict:
                    values_dict[sym_str]+=1
                else:
                    values_dict[sym_str] = 1
                sym_str = ''
    return values_dict


In [56]:
from collections import Counter

plm_sym_dict = symAndCount(plm_df)
paSi_sym_dict = symAndCount(paSi_df)

new_dict = dict(Counter(plm_sym_dict) + Counter(paSi_sym_dict))

In [57]:
#Sort the dictionary of symptoms along with their value counts
import operator
sorted_d = dict( sorted(new_dict.items(), key=operator.itemgetter(1),reverse=True))
# sorted_d

In [58]:
df_sym_freq = pd.DataFrame(list((dict(new_dict)).items()), columns = ['Symptom', 'Frequency'])
df_sym_freq.to_csv('sym_freq.csv',index=False)
only_syms = list((dict(new_dict)).keys())

In [59]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [60]:
#Wordcloud of symptoms 

# wc = WordCloud(background_color = "black", width = 1000, height= 1000).generate_from_frequencies(new_dict)
# fig = plt.figure(figsize = (15,15))
# plt.imshow(wc, interpolation = "bilinear")
# plt.axis("off")
# plt.show()

In [61]:
df_sym_freq.groupby('Symptom').sum().sort_values(['Frequency'],ascending=False).head()

Unnamed: 0_level_0,Frequency
Symptom,Unnamed: 1_level_1
cough,41
fever,32
sore throat,16
fatigue,15
headache,15


In [62]:
# only_syms
print('Total number of symptoms retrived from both data sets is:', len(only_syms))

Total number of symptoms retrived from both data sets is: 400


In [63]:
# yields the elements in `list_2` that are NOT in `list_1`
final_list = np.setdiff1d(symptoms,only_syms)
print('Total number of symptoms that are retrived from both data sets that do not exist in the original symptom list is:', len(final_list))

Total number of symptoms that are retrived from both data sets that do not exist in the original symptom list is: 177


In [64]:
def intersection(lst1, lst2): 
    lst3 = [value for value in lst1 if value in lst2] 
    return lst3 

In [65]:
print('Total number of symptoms that appear in both the original list and the list created from colloquial data sets is:', len(intersection(symptoms, only_syms)))

Total number of symptoms that appear in both the original list and the list created from colloquial data sets is: 32


### Create a version of the COLL-DATA that only contains *sentences* with symptom terms

In [66]:
def colloqual_data_processing(tagged_data, symptom_list):
    
    '''
    Takes in the tagged data with 3 columns and the list of symptoms to search from,
    joins them back together to do a search of sentences that include any of the symptoms, 
    eliminates the sentences that have 1 or less than 1 character, then splits those sentences into words
    and tokenizes them 
    
    return data frame that includes only the sentences that mention one or more of the symptoms
    
    '''
    #Replace NaN values with emtpy spaces to match the original format 
    tagged_data['Sentence'] = tagged_data['Sentence'].replace(np.nan, ' ')
    
    #Take in the data frame and create a list of sentences
    sent_value = []
    sentence = []
    for name, values in tagged_data.iterrows():

        #print(values)
        if values['Sentence'].startswith('Sentence') == True:
            sent_value.append(sentence)
            sentence = []

        sentence.append(str(values['Words']))        
    
    #Put the full list together to have the full text to extract only the sentences that include symptoms 

    sentence_list = []
    for sentence in sent_value:
        if len(sentence) <= 1: #remove the sentences that have . or space as the only character 
            continue

        full_txt = " ".join(sentence).strip()
        sentence_list.append(str(full_txt))
    #join the sentences
    full_Sent = " ".join(sentence_list)

    #create a dataframe with the full text of words
    d = {'Sentence' : [full_Sent]}
    col_df = pd.DataFrame(data=d)
    
    #Tokenize the sentences to have one sentence per row 
    sentences = []
    sentence_pattern = r'(?<=[^A-Z].[.?]) +(?=[A-Z])'
    for row in col_df.itertuples():            
         for sentence in re.split(sentence_pattern, row[1]):
                sentences.append((row[0], sentence))
                

    collo_df = pd.DataFrame(sentences, columns=['Index','Sentence'])
#     print(collo_df.head())
    
    collo_df.drop('Index', axis = 1, inplace = True)
    
    #Create a pattern to check for sentences that contain the symtoms 
    new_pattern = '\\b' + '\\b|\\b'.join(symptom_list) + '\\b'
    colloqual_df = collo_df[collo_df['Sentence'].str.contains(new_pattern, flags = re.IGNORECASE)].copy()
#     print(colloqual_df.head())

    return sent_value, colloqual_df


In [92]:
plm_df.head()

Unnamed: 0,Sentence,Words,Tag
0,Sentence #1,I,O
1,,was,O
2,,diagnosed,O
3,,3,O
4,,days,O


In [67]:
sent_value, colloqual_df = colloqual_data_processing(plm_df, only_syms)

In [68]:
def colloqual_df_info(sent, df):
    print('Total number of sentences in this file is:', len(sent))
    print('Number of sentences that include any of the provided symtoms is:', len(df))
    print('Number of sentences that do NOT include any of the provided symptoms is:', (len(sent) - len(df)))
    
    

In [69]:
colloqual_df_info(sent_value, colloqual_df)

Total number of sentences in this file is: 531
Number of sentences that include any of the provided symtoms is: 168
Number of sentences that do NOT include any of the provided symptoms is: 363


In [70]:
sent_value_, colloqual_df_n = colloqual_data_processing(paSi_df, only_syms)

In [71]:
colloqual_df_info(sent_value_, colloqual_df_n)

Total number of sentences in this file is: 853
Number of sentences that include any of the provided symtoms is: 301
Number of sentences that do NOT include any of the provided symptoms is: 552


## Put the colloquial data prior to Nov5 sentences into tagged format

In [72]:
colloqual_df.reset_index(drop=True, inplace=True)

In [73]:
plm_peS_df = sentence_w_symptoms(colloqual_df, symptoms)

In [74]:
plm_peS_df['Token'] = plm_peS_df['Sentence'].str.replace(pattern, repl, flags = re.IGNORECASE)
plm_peS_df['Sentence_ID'] = ['Sentence #%s' %i for i in range(1, len(plm_peS_df) + 1)]


In [75]:
df5 = tokenize_sentences(plm_peS_df)
plm_peS_df_out=remove_duplicate_sentence_ids(df5)

In [76]:
plm_peS_df_out=remove_duplicate_sentence_ids(df5)

In [77]:
plm_peS_df_out.to_csv('/Users/elif/Desktop/Existing_Colloquial_data_prior_Nov5.csv', index = False, encoding= 'utf-8')

## Scraped Data from the sites

    Created one csv file where all the text data fromo both sites is stored
    Loaded the csv file to select the sentences that include symptoms
    Output the tagged file for symptoms 

In [78]:
scraped_data = pd.read_csv('/Users/elif/Downloads/combined_csv-4.csv')

In [79]:
#Count of original symptoms
symptoms_df(scraped_data)#, symp = symptoms)

Unnamed: 0_level_0,Counts
Symptoms,Unnamed: 1_level_1
cough,61
fever,49
headache,33
fatigue,29
anxiety,28
sore throat,17
shortness of breath,16
dizziness,12
vertigo,10
congestion,9


In [80]:
len(symptoms_df(scraped_data))

43

In [81]:
#count of symptomps from the patience like me files
symptoms_df(scraped_data, only_syms)

Unnamed: 0_level_0,Counts
Symptoms,Unnamed: 1_level_1
symptoms,146
feeling,63
breath,49
cough,46
taste,43
...,...
hives,1
high temperatures,1
hayfever,1
get blurry,1


In [89]:
#Make sure there is only one sentence per row
sentences = []
sentence_pattern = r'(?<=[^A-Z].[.?]) +(?=[A-Z])'
for row in scraped_data.itertuples():            
    for sentence in re.split(sentence_pattern, row[1]):
        sentences.append((row[0], sentence))
                

collo_df_new = pd.DataFrame(sentences, columns=['Index','Sentence'])

collo_df_new.drop('Index', axis = 1, inplace = True)
print('Number of sentences retrived from the sites for the past 3 months is: ', len(collo_df_new))

Number of sentences retrived from the sites for the past 3 months is:  1292


In [91]:
colloqual_new_data = sentence_w_symptoms(collo_df_new, symptoms)
print('Of those sentences ', len(colloqual_new_data), 'of them include our original Symptoms')

Of those sentences  203 of them include our original Symptoms


In [90]:
colloqual_new_data_only_symp = sentence_w_symptoms(collo_df_new, only_syms)
print('Of those sentences ', len(colloqual_new_data_only_symp), 'of them include Symptoms collected from previous colloquial datasets.')

Of those sentences  471 of them include Symptoms collected from previous colloquial datasets.


In [84]:
colloqual_new_data['Token'] = colloqual_new_data['Sentence'].str.replace(pattern, repl, flags = re.IGNORECASE)
colloqual_new_data['Sentence_ID'] = ['Sentence #%s' %i for i in range(1, len(colloqual_new_data) + 1)]
df6 = tokenize_sentences(colloqual_new_data)
tagged_scraped_data = remove_duplicate_sentence_ids(df6)

In [85]:
tagged_scraped_data.to_csv('/Users/elif/Desktop/New_Colloquial_data_Nov5.csv', index = False, encoding= 'utf-8')

## Things to address - Nov 6 Meeting

    - Changed the way we split sentences into one sentence per row format, thus, we now have a different testing set for the MAY papers which is saved in MAX&ELIF folder
    
    - The existing tagged colloquial data sets were split in a different fashion thus the quality of it might be different that the other files - such as the split of I've or numbers split in different cells
    
    - There were not many posts on those 2 sites that were covid related as they had many posts about just updates on recent changes in the vaccine process
    
    - Scraped data was saved in different csv files and we merged them into one csv file to capture all the information 
    
    - Some of the posts on the site had the edit date as - 2 weeks or months - but some of the comments dated back to 4-5 months ago so not sure about what part of that post is actually recently edited
    
    - The sites do not have acrtual time stamps on the posts thus, I went back 3 months to collect all the information that we have right now.