<a href="https://colab.research.google.com/github/elif-tr/COVID19-Text-Processing/blob/main/Covid19_Project_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## COVID19 related papers that published during the first week of May, 2020

In [None]:
import pandas as pd
import numpy as np
import os
import json
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

In [None]:
import matplotlib.pyplot as plt
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf

In [None]:
# For Notebooks
init_notebook_mode(connected=True)

# For offline use
cf.go_offline()

### Define a global variable for the folder name that contains the json files of the papers

In [None]:
DATA_DIR = 'archive'

### Working with meta data file

First read in the meta data file to filter the papers for the time frame we want.

In [None]:
def meta_data(folder_name, metadata = "metadata.csv"):
    ''' Function that takes in the folder name and returns the data fram fro the meta data
    
    :param folder_name: name of folder where covid data is located
    :param metadata: file name for where metadata is saved
    :return: data frame of the metadata
    
    '''
    
    file_name = os.path.join(os.getcwd(),folder_name, metadata )
    data = pd.read_csv((file_name), usecols=['pdf_json_files', 'publish_time'], encoding= 'utf-8')
    return data

In [None]:
def json_files(start_date = '2020-05-01', end_date = '2020-05-31'):
    '''Function that filters the meta data file for a time frame we want. 
    If nothing specified, first week of may will be used
    
    :param start_date: reading in the json files from when they were published 
    :param end_date: reading in the json files until when they were published 
    :return: lisf of json file names that are within the specified publication date
    
    '''
    
    global DATA_DIR
    
    file = meta_data(DATA_DIR)
    file['publication_date'] = pd.to_datetime(file['publish_time'])
    may_first_week = file[(file['publication_date'] > start_date) & (file['publication_date'] <= end_date)]
    
    return list(may_first_week['pdf_json_files'].dropna())

### 1. Encountered Problem: Files that contain multiple papers

I observed that some of the files contain more than one paper which makes it harder for us to read them in individually. For that, we will bring all our json files into same format of containing 1 file per document.

In [None]:
#Those files were separated by ; sign instead of , sign.

all_files = []
for file in json_files():
    all_files.extend(map(str.strip, file.split(";")))


### Extracting only the columns needed for our analysis 

Some of the code was take from: https://www.kaggle.com/davidbetancur8/symptoms-word-cloud

In [None]:
def read_json_files(file_list = all_files):
    
    '''Function that takes in date filtered json files and outputs a data frame with only 3 columns:
    paper_id, title and body text of the paper
    
    :param file_list: list of json files that will be read in by locating in the directory 
    :return: return a data frame of those json files with three columns only "paper_id", "title", "full_text"
    
    '''
    docs = []
    for file in file_list:
        file_name = os.path.join(os.getcwd(),DATA_DIR, file)
        with open(file_name) as f:
            data_json = json.load(f)
                        
            
        title = data_json["metadata"]["title"]
        paper_id = data_json['paper_id']
        

        full_text = ""
        i = 1
        for text in data_json["body_text"]:
            i+=1
            full_text += text["text"]
        docs.append([paper_id, title, full_text])

    df = pd.DataFrame(docs, columns=["paper_id", "title", "full_text"])

    return df
    

### Extracting sentences that contain systoms 

Defining the symptoms that we will extract from our sentences. The list was taken from: https://www.kaggle.com/davidbetancur8/symptoms-word-cloud

The list was modified by Max to have more up to date symptoms and I added in the following symptoms which were taken from CDC's website or I observed them frequintly in the papers:

    -"difficulty breathing"
    -"muscle ache"
    -"congestion"
    -"runny nose"
    -"trouble breathing"
    -"persistent pain"
    -"pressure in the chest"
    -"inability to wake"
    -"stay awake"
    -"bluish lips"
    -"bluish face"
    -"fevers"
    -"decreased appetite"

In [None]:
symptoms = [
    "weight loss","chills","shivering","convulsions","deformity","discharge","dizziness", "lymphopenia", "sneezing",
    "vertigo","fatigue","malaise","asthenia","hypothermia","jaundice","muscle weakness", "chest discomfort",
    "pyrexia","sweats","swelling","swollen","painful lymph node","weight gain","arrhythmia", "loss of smell", 
    "loss of appetite", "loss of taste", "bradycardia","chest pain","claudication","palpitations","tachycardia",
    "dry mouth","epistaxis", "dysgeusia", "hypersomnia", "taste loss", "halitosis","hearing loss","nasal discharge", 
    "nasal inflammation", "otalgia","otorrhea","sore throat","toothache","tinnitus", "dysphonia",
    "trismus","abdominal pain","fever","bloating","belching","bleeding","bloody stool","melena","hematochezia", 
    "burning sensation in the chest", "constipation","diarrhea","dysphagia","dyspepsia","fecal incontinence",
    "flatulence", "heartburn", "chest tightness", "chest pressure","nausea","odynophagia","proctalgia fugax",
    "pyrosis","steatorrhea","vomiting","alopecia","hirsutism", "tachypnoea", "nasal obstruction",
    "hypertrichosis","abrasion","anasarca","bleeding into skin","petechia","purpura","ecchymosis", "bruising", 
    "blister","edema","itching","laceration","rash","urticaria","abnormal posturing","acalculia","agnosia","alexia",
    "amnesia","anomia","anosognosia","aphasia","apraxia","ataxia","cataplexy","confusion","dysarthria", 
    "nasal congestion","dysdiadochokinesia","dysgraphia","hallucination","headache","akinesia","bradykinesia",
    "ballismus","blepharospasm","chorea","dystonia","fasciculation","muscle cramps","myoclonus","opsoclonus",
    "tremor","flapping tremor","insomnia","loss of consciousness","syncope","neck stiffness","opisthotonus",
    "paralysis","paresis","paresthesia","prosopagnosia","somnolence","abnormal vaginal bleeding", "neuralgia",
    "vaginal bleeding in early pregnancy", "miscarriage","vaginal bleeding in late pregnancy","amenorrhea", "body aches",
    "infertility","painful intercourse","pelvic pain","vaginal discharge","amaurosis fugax","amaurosis", "skin lesions",
    "blurred vision","double vision","exophthalmos","mydriasis","miosis","nystagmus","amusia","anhedonia",
    "anxiety","apathy","confabulation","depression","delusion","euphoria","homicidal ideation","irritability",
    "mania","paranoid ideation","suicidal ideation","apnea","hypopnea","cough","dyspnea","bradypnea","tachypnea",
    "orthopnea","platypnea","trepopnea","hemoptysis","pleuritic chest pain","sputum production","arthralgia",
    "back pain","sciatica","urologic","dysuria","hematospermia","hematuria","impotence","polyuria",
    "retrograde ejaculation","strangury","urethral discharge","urinary frequency","urinary incontinence", 
    "anosmia", "myalgia", "rhinorrhea", "shortness of breath", "difficulty breathing", "muscle ache", "congestion",
    "runny nose", "trouble breathing", "persistent pain", "pressure in the chest", "inability to wake", "stay awake",
    "bluish lips", "bluish face","akathisia","athetosis", "urinary retention", "fevers", 
    "decreased appetite"]

In [None]:
len(symptoms)

209

## Check the papers that contain the words in  our list of symptoms and then extract those sentences only

### 2. Encountered problem: Not being able to use the data frame with title column

I had to create a seperate data frame without the title column to work on as it kept getting mixed with the full text when using nltk.tokenize. 

I spent quite long time on this figure out why it was happening therefore, decided to use the data frame with paper id and fulltext only.

In [None]:
data_frame = read_json_files()

In [None]:
df_no_title = data_frame[['paper_id', 'full_text']]

## nltk.tokenize

I used nltk.tokenize to split the full text into one sentence per row

In [None]:
# sentences = []

# m = re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', s)
# for row in df_no_title.itertuples():            
#      for sentence in sent_tokenize(row[2]):
#             sentences.append((row[1], sentence))
    
# new_df = pd.DataFrame(sentences, columns=['Paper_Id', 'Sentence'])

In [None]:
def split_sentences(df):
    sentences = []
    sentence_pattern = r'(?<=[^A-Z].[.?]) +(?=[A-Z])'

    for row in df.itertuples():            
         for sentence in re.split(sentence_pattern, row[2]):
                sentences.append((row[1], sentence))

    new_df = pd.DataFrame(sentences, columns=['Paper_Id', 'Sentence'])
    return new_df

In [None]:
new_df = split_sentences(df_no_title)

In [None]:
new_df.head()

Unnamed: 0,Paper_Id,Sentence
0,b72d2b2b61a0334faef085bdae3262394a1742d2,"During a virtual news conference in Geneva, Sw..."
1,b72d2b2b61a0334faef085bdae3262394a1742d2,Cantrell and 212 other mayors sounded the alar...
2,b72d2b2b61a0334faef085bdae3262394a1742d2,Some healthcare workers have resorted to searc...
3,b72d2b2b61a0334faef085bdae3262394a1742d2,"From a warehouse in Baltimore, Maryland, owned..."
4,b72d2b2b61a0334faef085bdae3262394a1742d2,What can be washed and reused?



Added in the Sentence ID column to keep track of which sentences will be retreived when we check for the symptoms in each sentence. Started the id values from 1 thus the increment of 1 on the existing data frame index.

In [None]:
new_df['Sentence_ID'] = new_df.index + 1

### Final data frame that only includes sentences from each text that contains any of the symptoms from our list of symptoms

In [None]:
def sentence_w_symptoms(df, sym):
    # match beginning of words and end of words for symptoms, not partial
    pattern = '\\b' + '\\b|\\b'.join(sym) + '\\b'
    final_df = df[df['Sentence'].str.contains(pattern, flags = re.IGNORECASE)].copy()
    return final_df


In [None]:
final_df = sentence_w_symptoms(new_df, symptoms)

In [None]:
final_df['Sentence'].head()

96     Subjects who died in hospital were significant...
202    We retrospectively analyzed medical charts of ...
203    The patient first had 4-5 episodes of watery d...
205    However, she returned to the ED the next day w...
217    The patient reported an overall improvement in...
Name: Sentence, dtype: object

### Rearranged the display of the columns

In [None]:
columnsTitles = ['Paper_Id', 'Sentence_ID', 'Sentence']

covid_df = final_df.reindex(columns=columnsTitles)

## Characteristics of data

Create a program that will take a csv file and report:
 - number of sentences
 - number of sentences with terms
 - individual term counts (how many times each term appears
 - given a collection of symptom terms X in a csv file, for each term in X, its count in the collection, sorted in descending order

## Function that produces total number of sentences, papers and the ones that include our symptoms 

In [None]:
def char_of_data(df):
    '''
    functiono produces data characteristics such as number of sentences in the data frame, how many of those have 
    any of the symptoms and how many times each smyptom accour in these sentences. 
    
    :param df: data frame that includes the sentence and paper_id for our data
    returns sorted in descending order of the symptoms and the summary 
    '''
    print('Total number of papers published in the month of May', len(new_df['Paper_Id'].unique()))
    print('Total number of sentences from the papers that are published in the given time frame is', len(new_df['Sentence']))
    print('Total number of sentences in the final data frame with symptoms is', len(df['Sentence']))
    print('Total number of unique papers in the final data frame is', len(df['Paper_Id'].unique()))


In [None]:
char_of_data(covid_df)

Total number of papers published in the month of May 12228
Total number of sentences from the papers that are published in the given time frame is 1117912
Total number of sentences in the final data frame with symptoms is 28279
Total number of unique papers in the final data frame is 5970


## Function that produces the count of symptoms in the given data frame

In [None]:
def symptoms_df(df_sym, symp = symptoms):
    
    df_test= df_sym.Sentence.str.extractall('({})'.format('|'.join(symp)), flags = re.IGNORECASE)\
                           .iloc[:, 0].str.get_dummies().sum(level=0)
    sum_column = df_test.sum(axis=0)

    df_symptom_count = pd.DataFrame({'Symptoms':sum_column.index, 'Counts':sum_column.values})
    df_symptom_count['Symptoms'] = df_symptom_count['Symptoms'].str.lower()
    sym_df = df_symptom_count.groupby('Symptoms').sum().sort_values(['Counts'],ascending=False)
    return sym_df

In [None]:
sym_df = symptoms_df(covid_df)

In [None]:
sym_df.head()

Unnamed: 0_level_0,Counts
Symptoms,Unnamed: 1_level_1
fever,7145
anxiety,5129
cough,4759
depression,2988
discharge,2560


### Visualization of symptom counts

In [None]:
sym_df.iplot(kind='scatter',y='Counts',mode='markers',size=10)

Coonver symptoms to dictionary with their counts

In [None]:
sym_df.to_dict()

{'Counts': {'fever': 7145,
  'anxiety': 5129,
  'cough': 4759,
  'depression': 2988,
  'discharge': 2560,
  'diarrhea': 1583,
  'fatigue': 1393,
  'dyspnea': 1358,
  'bleeding': 1206,
  'headache': 1133,
  'edema': 1074,
  'lymphopenia': 1048,
  'shortness of breath': 1026,
  'anosmia': 944,
  'vomiting': 909,
  'confusion': 786,
  'nausea': 733,
  'myalgia': 719,
  'sore throat': 651,
  'urologic': 603,
  'rash': 579,
  'sneezing': 501,
  'abdominal pain': 490,
  'congestion': 439,
  'arrhythmia': 415,
  'chest pain': 409,
  'weight loss': 398,
  'tachycardia': 325,
  'insomnia': 294,
  'dizziness': 245,
  'swelling': 238,
  'dysgeusia': 222,
  'chills': 222,
  'loss of smell': 213,
  'urticaria': 205,
  'malaise': 200,
  'paralysis': 185,
  'dysphagia': 175,
  'rhinorrhea': 174,
  'nasal congestion': 158,
  'ataxia': 154,
  'suicidal ideation': 153,
  'skin lesions': 151,
  'apnea': 144,
  'weight gain': 143,
  'loss of taste': 142,
  'runny nose': 129,
  'bradycardia': 127,
  'nasal

### List of symptoms that appeared in the published papers that were extracted with our list of symptoms

In [None]:
counted_symptoms = sym_df.index.to_list()

In [None]:
# counted_symptoms.sort()
# print(counted_symptoms)

In [None]:
# symptoms.sort()
# print(symptoms)

### Extracting the symptoms that are present in our original list of symptoms but were not found in the papers

In [None]:
main_list = np.setdiff1d(symptoms,counted_symptoms)
# yields the elements in `list_2` that are NOT in `list_1`
len(main_list)

28

It appears that that are 28 symptoms on our original symptoms list that do not occur in the papers

In [None]:
df_test1= covid_df.Sentence.str.extractall('({})'.format('|'.join(main_list)), flags = re.IGNORECASE)\
                           .iloc[:, 0].str.get_dummies().sum(level=0)

### 3. Encountered problem: 'fevers' symptom appears on the list of symptoms that does not occur in the papers. 

I am not able to find out why. It is clear that it does appear over 200 times

In [None]:
print(df_test1.sum(axis=0))

Fevers      4
fevers    192
dtype: int64


In [None]:
for symptom in main_list:
     print(symptom, len(covid_df[covid_df.Sentence.str.contains(symptom) == True]))

abnormal posturing 0
abnormal vaginal bleeding 0
acalculia 0
agnosia 0
alexia 0
amusia 0
athetosis 0
ballismus 0
bleeding into skin 0
bluish face 0
cataplexy 0
confabulation 0
fecal incontinence 0
fevers 189
flapping tremor 0
hematospermia 0
homicidal ideation 0
hypertrichosis 0
opsoclonus 0
painful intercourse 0
painful lymph node 0
proctalgia fugax 0
prosopagnosia 0
pyrosis 0
strangury 0
trepopnea 0
vaginal bleeding in early pregnancy 0
vaginal bleeding in late pregnancy 0


In [None]:
# word counts in the papers if we need them 

# from sklearn.feature_extraction.text import CountVectorizer

# vectorizer = CountVectorizer()
# vectorizer.fit(covid_df.Sentence)
# dict_voca = vectorizer.vocabulary_

In [None]:
# sort_orders = sorted(dict_voca.items(), key=lambda x: x[1], reverse=True)

# for i in sort_orders:
#     print(i[0], i[1])

In [None]:
# sort_orders

### Save the sentences in a csv file where there are 3 columns: paper ID, sentence ID, and sentence text to be our testing data.

I did not save the indexes as it would create multiple index columns when we reread in the file.

In [None]:
covid_df.to_csv('/Users/elif/Desktop/covid_testing_data_May.csv', index = False, encoding= 'utf-8')

###   Create another spreadsheet with three columns: sentence #, word, and tag. 


    Place one word in a row and label symptoms words in the tag column: 
    mark the beginning (B-Sym) and inside (I-Sym) of each symptom term. 
    If a term consists of only one word, simply mark it as B-Sym with no I-Sym.  
    Label all other words as O. 

In [None]:
def repl(m):
    '''
    regular expression function that replaces symtoms with BSYM if .group(0) is one word
    or BSYM + ISYM if multiple words - this function looks for spaces in the given words
    
    m.group(0) - firts match
    '''
    return ' '.join(['BSYM']+['ISYM']*(m.group(0).count(' ')))

In [None]:
covid_df['Token'] = covid_df['Sentence'].str.replace(pattern, repl, flags = re.IGNORECASE)

Token column is the duplicate of the Sentence column. The only difference is that I replaced the existing symptoms in each sentence with BSYM or ISYM based on our regular expression function. 

re.IGNORECASE is used to ignore the words that are upper case and treat them all as lower case. This way, I was able to retain the original version of sentences without using .lower().

In [None]:
covid_df.head()

Unnamed: 0,Paper_Id,Sentence_ID,Sentence,Token
96,84d22b71f6df277a11824433ccf14137303f55f5,97,Subjects who died in hospital were significant...,Subjects who died in hospital were significant...
202,b382ff1b00757c3cb6a7408d8e993aa6d94d3e28,203,We retrospectively analyzed medical charts of ...,We retrospectively analyzed medical charts of ...
203,b382ff1b00757c3cb6a7408d8e993aa6d94d3e28,204,The patient first had 4-5 episodes of watery d...,The patient first had 4-5 episodes of watery B...
205,b382ff1b00757c3cb6a7408d8e993aa6d94d3e28,206,"However, she returned to the ED the next day w...","However, she returned to the ED the next day w..."
217,b382ff1b00757c3cb6a7408d8e993aa6d94d3e28,218,The patient reported an overall improvement in...,The patient reported an overall improvement in...


Reset the index column to start from 0 and assigned the Sentece number to the Sentence_ID column 

In [None]:
covid_df.reset_index(drop=True, inplace=True)

In [None]:
covid_df['Sentence_ID'] = ['Sentence #%s' %i for i in range(1, len(covid_df) + 1)]

### Tokenize sentences

In this part of the task, I tokenized words and tokens columns and then replaces the tokens
with the actual display of B-SYM and I-SYM with the dashes added in the middle.

Then, provided a dataframe with three columns only

In [None]:
def tokenize_sentences(frame):
    words = []
    i = 0
    for j, row in frame.iterrows():
        for word, temptag in zip(word_tokenize(row['Sentence']), word_tokenize(row['Token'])):
    #         print(word, temptag)
            if temptag == 'BSYM':
                tag = 'B-SYM'
            elif temptag == 'ISYM':
                tag = 'I-SYM'
            else:
                tag = 'O'
            words.append((row['Sentence_ID'], word, tag))

    tag_df = pd.DataFrame(words, columns=['Sentence_ID', 'Words', 'Tag'])
    return tag_df

In [None]:
tag_df = tokenize_sentences(covid_df)

In [None]:
# tag_df.head(25)

Double ckecing the tag column to see if we were able to tag I-SYM symptom words

In [None]:
tag_df[tag_df['Tag']== 'I-SYM'].head(5)

Unnamed: 0,Sentence_ID,Words,Tag
181,Sentence #3,pain,I-SYM
213,Sentence #4,pain,I-SYM
223,Sentence #4,appetite,I-SYM
271,Sentence #5,pain,I-SYM
285,Sentence #6,chest,I-SYM


#### This part of the code is for sanity checking on our sentences that contain multi word symtomps. 

    Firts extract only the symptoms with multi words
    Then use our pattern to check which sentences have multi word symptoms presents
    Export them to csv file, if needed

In [None]:
# #Extract the list of multi symptom words
# multi_word_symptoms = ([symptom for symptom in symptoms if len(symptom.split())>1])

# #pattern to check for words that have space before and after 
# #this is to prevent our search from getting distracted by words like 'Statistics' when we are
# #looking for 'tic' which is a problem that I encountered.
# pattern_multi = '\\b' + '\\b|\\b'.join(multi_word_symptoms) + '\\b'

# #Get the dataframe that contains multi words
# covid_df[covid_df['Sentence'].str.contains(pattern_multi)]

# #Export them to a csv file if needed
# #multi_word_sentences.to_csv('/Users/elif/Desktop/multi_word_sentences.csv', index = False)

###  Here the sentence # should be consecutive integers (similar to a surrogate key in a database table) and is not the same as sentence ID in the first spreadsheet.

    Styling of sentence numbers ~ instead of duplicating the same number over and over again, we leave out the duplicated ones by replacing them with space

In [None]:
#tag_df['Sentence_ID'] = tag_df['Sentence']

In [None]:
def remove_duplicate_sentence_ids(df):
    is_duplicate = df['Sentence_ID'].duplicated()

    df['Sentence_ID'] = df['Sentence_ID'].where(~is_duplicate, ' ')
    tagged_data = df[['Sentence_ID', 'Words', 'Tag']]
    return tagged_data

In [None]:
tagged_data = remove_duplicate_sentence_ids(tag_df)

In [None]:
tagged_data.head()

Unnamed: 0,Sentence_ID,Words,Tag
0,Sentence #1,Subjects,O
1,,who,O
2,,died,O
3,,in,O
4,,hospital,O


In [None]:
tagged_data.to_csv('/Users/elif/Desktop/covid_tagged_data_May.csv', index = False, encoding= 'utf-8')

## Working with COVID data that is scraped from the web

In [None]:
paSi_df = pd.read_excel('/Users/elif/Downloads/OneDrive_1_10-29-2020/patient-site_lableled.xlsx', sheet_name='in', usecols="A,B,C")

In [None]:
plm_df.head()

Unnamed: 0,Sentence,Words,Tag
0,Sentence #1,I,O
1,,',O
2,,ve,O
3,,had,O
4,,Covid,B-DIS


In [None]:
plm_df = pd.read_excel('/Users/elif/Downloads/OneDrive_1_10-29-2020/plm_dataset_labeled.xlsx', sheet_name='plm_dataset', usecols="A,B,C")

In [None]:
paSi_df.head()

Unnamed: 0,Sentence,Words,Tag
0,Sentence #1,I,O
1,,',O
2,,ve,O
3,,had,O
4,,Covid,B-DIS


In [None]:
def symAndCount(dataframe):
    values_dict = {}
    sym_str = ''
    for i,row in enumerate(dataframe.values):
    
        if row[2]=='B-SYM'and dataframe.values[i+1][2] == 'O':
            if row[1].lower() in values_dict:
                values_dict[row[1].lower()]+=1
            else:
                values_dict[row[1].lower()] = 1
        if (row[2]=='B-SYM'and dataframe.values[i+1][2] == 'I-SYM') or row[2]=='I-SYM':
            sym_str +=str(row[1]).lower()+' '
            if dataframe.values[i+1][2] == 'O':
                sym_str = sym_str.strip()
                if sym_str in values_dict:
                    values_dict[sym_str]+=1
                else:
                    values_dict[sym_str] = 1
                sym_str = ''
    return values_dict


In [None]:
from collections import Counter

plm_sym_dict = symAndCount(plm_df)
paSi_sym_dict = symAndCount(paSi_df)

new_dict = dict(Counter(plm_sym_dict) + Counter(paSi_sym_dict))

In [None]:
import operator
sorted_d = dict( sorted(new_dict.items(), key=operator.itemgetter(1),reverse=True))

In [None]:
sorted_d

{'cough': 41,
 'fever': 32,
 'sore throat': 16,
 'headache': 15,
 'fatigue': 15,
 'headaches': 10,
 'shortness of breath': 9,
 'nausea': 9,
 'breathlessness': 8,
 'dry cough': 6,
 'fatigued': 5,
 'coughing': 5,
 'extreme fatigue': 5,
 'tired': 5,
 'chest pain': 5,
 'nauseous': 5,
 'tiredness': 5,
 'gastric issues': 5,
 'chest pains': 5,
 'anxiety': 4,
 'chills': 4,
 'burning sensation': 4,
 'temperature': 4,
 'temp': 4,
 'chest tightness': 4,
 'diarrhea': 4,
 'feverish': 4,
 'short of breath': 3,
 'shortness': 3,
 'high temperatures': 3,
 'weakness': 3,
 'dizzy': 3,
 'mild cough': 3,
 'gi symptoms': 3,
 'dizziness': 3,
 'muscle pain': 3,
 'tight chest': 3,
 'burning': 3,
 'body aches': 3,
 'slight cough': 2,
 'malaise': 2,
 'loss of smell': 2,
 'sense of smell has returned': 2,
 'throat': 2,
 'fevers': 2,
 'exhaustion': 2,
 'infection': 2,
 'mild headache': 2,
 'aches': 2,
 'upset stomach': 2,
 'night sweats': 2,
 'numbness': 2,
 'sinus pain': 2,
 'nasal congestion': 2,
 'dry nose': 2,

In [None]:
df_sym_freq = pd.DataFrame(list((dict(new_dict)).items()), columns = ['Symptom', 'Frequency'])
df_sym_freq.to_csv('sym_freq.csv',index=False)


only_syms = list((dict(new_dict)).keys())

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
# wc = WordCloud(background_color = "black", width = 1000, height= 1000).generate_from_frequencies(new_dict)
# fig = plt.figure(figsize = (15,15))
# plt.imshow(wc, interpolation = "bilinear")
# plt.axis("off")
# plt.show()

In [None]:
df_sym_freq.groupby('Symptom').sum().sort_values(['Frequency'],ascending=False).head(25)

Unnamed: 0_level_0,Frequency
Symptom,Unnamed: 1_level_1
cough,41
fever,32
sore throat,16
fatigue,15
headache,15
headaches,10
shortness of breath,9
nausea,9
breathlessness,8
dry cough,6


In [None]:
sym_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 182 entries, fever to platypnea
Data columns (total 1 columns):
Counts    182 non-null int64
dtypes: int64(1)
memory usage: 2.8+ KB


In [None]:
only_syms

['relapse',
 'anxiety',
 'fatigued',
 'wanting to sleep',
 'weird pressure around my head',
 'feels like',
 'pressure around my head and temples and even underneath my jaw and above my nose bridge',
 'water in my ear',
 'tension headache',
 'head hurts',
 'constant pressure all around rather than pain',
 'pressure in your head',
 'head rarely hurts',
 'brain inflammation',
 'headache',
 'coughing',
 'anxious',
 'breath a little better',
 'no fever',
 'no chills',
 'slight cough',
 'cough',
 'oxygen levels were good',
 'i was stable',
 'i was released',
 'helped me with my breathing',
 'helps breathing',
 'suffering from shortness of breath',
 'breathing gets better',
 'very difficult time breathing',
 'choking cough',
 'super weak',
 'could not breathe',
 'collapsed',
 'totally exhausts',
 'drop my oxygen sat rates',
 'fatigues',
 'weak',
 'short of breath',
 'got so weak',
 'lower immune status',
 'malaise',
 'extreme fatigue',
 'loss of smell',
 'poor cognitive thinking',
 'shortness

In [None]:
len(only_syms)

400

In [None]:
# yields the elements in `list_2` that are NOT in `list_1`
final_list = np.setdiff1d(symptoms,only_syms)

In [None]:
len(symptoms)

209

In [None]:
len(final_list)

177

In [None]:
def intersection(lst1, lst2): 
    lst3 = [value for value in lst1 if value in lst2] 
    return lst3 

In [None]:
intersection(symptoms, only_syms)

['chills',
 'dizziness',
 'sneezing',
 'vertigo',
 'fatigue',
 'malaise',
 'sweats',
 'swelling',
 'loss of smell',
 'loss of appetite',
 'chest pain',
 'dry mouth',
 'sore throat',
 'tinnitus',
 'fever',
 'diarrhea',
 'dyspepsia',
 'chest tightness',
 'nausea',
 'vomiting',
 'rash',
 'nasal congestion',
 'headache',
 'insomnia',
 'body aches',
 'anxiety',
 'cough',
 'shortness of breath',
 'congestion',
 'runny nose',
 'trouble breathing',
 'fevers']

In [None]:
len(intersection(symptoms, only_syms))

32

In [None]:
# import nltk
# nltk.download('perluniprops')

# from nltk.tokenize.moses import MosesDetokenizer


In [None]:
# plm_df['detoken']=plm_df['Sentence'].apply(lambda x: detokenizer.detokenize(x, return_str=True))

### Create a version of the COLL-DATA that only contains *sentences* with symptom terms

In [None]:
plm_df

Unnamed: 0,Sentence,Words,Tag
0,Sentence #1,I,O
1,,was,O
2,,diagnosed,O
3,,3,O
4,,days,O
...,...,...,...
8929,,support,O
8930,,and,O
8931,,emerging,O
8932,,information,O


In [None]:
def colloqual_data_processing(tagged_data, symptom_list):
    #Replace NaN values with emtpy spaces to match the original format 
    tagged_data['Sentence'] = tagged_data['Sentence'].replace(np.nan, ' ')
    
    #Take in the data frame and create a list of sentences
    sent_value = []
    sentence = []
    for name, values in tagged_data.iterrows():

        #print(values)
        if values['Sentence'].startswith('Sentence') == True:
            sent_value.append(sentence)
            sentence = []

        sentence.append(str(values['Words']))        
    
    #Put the full list together to have the full text to extract only the sentences that include symptoms 

    sentence_list = []
    for sentence in sent_value:
        if len(sentence) <= 1: #remove the sentences that have . or space as the only character 
            continue

        full_txt = " ".join(sentence).strip()
        sentence_list.append(str(full_txt))
    #join the sentences
    full_Sent = " ".join(sentence_list)

    #create a dataframe with the full text of words
    d = {'Sentence' : [full_Sent]}
    col_df = pd.DataFrame(data=d)
    
    #Tokenize the sentences to have one sentence per row 
    sentences = []
    sentence_pattern = r'(?<=[^A-Z].[.?]) +(?=[A-Z])'
    for row in col_df.itertuples():            
         for sentence in re.split(sentence_pattern, row[1]):
                sentences.append((row[0], sentence))
                

    collo_df = pd.DataFrame(sentences, columns=['Index','Sentence'])
#     print(collo_df.head())
    
    collo_df.drop('Index', axis = 1, inplace = True)
    
    #Create a pattern to check for sentences that contain the symtoms 
    new_pattern = '\\b' + '\\b|\\b'.join(symptom_list) + '\\b'
    colloqual_df = collo_df[collo_df['Sentence'].str.contains(new_pattern, flags = re.IGNORECASE)].copy()
#     print(colloqual_df.head())

    return sent_value, colloqual_df


In [None]:
sent_value, colloqual_df = colloqual_data_processing(plm_df, only_syms)

In [None]:
colloqual_df.head()

Unnamed: 0,Sentence
0,I was diagnosed 3 days ago however symptoms st...
1,Isolated starting 8 days ago but as symptoms m...
2,Take the time to recover even as symptoms ligh...
3,"I ' ve been extremely fatigued , wanting to sl..."
4,Also my biggest worry I have is this weird pre...


In [None]:
def colloqual_df_info(sent, df):
    print('Total number of sentences in this file is:', len(sent))
    print('Number of sentences that include any of the provided symtoms is:', len(df))
    print('Number of sentences that do NOT include any of the provided symptoms is:', (len(sent) - len(df)))
    
    

In [None]:
colloqual_df_info(sent_value, colloqual_df)

Total number of sentences in this file is: 531
Number of sentences that include any of the provided symtoms is: 168
Number of sentences that do NOT include any of the provided symptoms is: 363


## Now lets put those sentences into tagged format

In [None]:
colloqual_df.head()

Unnamed: 0,Sentence
0,I was diagnosed 3 days ago however symptoms st...
1,Isolated starting 8 days ago but as symptoms m...
3,Take the time to recover even as symptoms ligh...
6,"I ' ve been extremely fatigued , wanting to sl..."
8,Also my biggest worry I have is this weird pre...


In [None]:
colloqual_df.reset_index(drop=True, inplace=True)

In [None]:
colloqual_df.head()

Unnamed: 0,Sentence
0,I was diagnosed 3 days ago however symptoms st...
1,Isolated starting 8 days ago but as symptoms m...
2,Take the time to recover even as symptoms ligh...
3,"I ' ve been extremely fatigued , wanting to sl..."
4,Also my biggest worry I have is this weird pre...


In [None]:
plm_peS_df = sentence_w_symptoms(colloqual_df, symptoms)

In [None]:
plm_peS_df.head()

Unnamed: 0,Sentence
2,Take the time to recover even as symptoms ligh...
7,I spoke to a doctor at Kaiser and they suggest...
15,Tylenol for headache to see if it helps .
20,No fever .
21,No chills .


In [None]:
plm_peS_df['Token'] = plm_peS_df['Sentence'].str.replace(pattern, repl, flags = re.IGNORECASE)
plm_peS_df['Sentence_ID'] = ['Sentence #%s' %i for i in range(1, len(plm_peS_df) + 1)]


In [None]:
plm_peS_df.head()

Unnamed: 0,Sentence,Token,Sentence_ID
2,Take the time to recover even as symptoms ligh...,Take the time to recover even as symptoms ligh...,Sentence #1
7,I spoke to a doctor at Kaiser and they suggest...,I spoke to a doctor at Kaiser and they suggest...,Sentence #2
15,Tylenol for headache to see if it helps .,Tylenol for BSYM to see if it helps .,Sentence #3
20,No fever .,No BSYM .,Sentence #4
21,No chills .,No BSYM .,Sentence #5


In [None]:
df5 = tokenize_sentences(plm_peS_df)
plm_peS_df_out=remove_duplicate_sentence_ids(df5)

In [None]:
plm_peS_df_out=remove_duplicate_sentence_ids(df5)

In [None]:
plm_peS_df_out.to_csv('/Users/elif/Desktop/Existing_Colloquial_data_prior_Nov5.csv', index = False, encoding= 'utf-8')

## Scraped Data from the sites

In [None]:
scraped_data = pd.read_csv('/Users/elif/Downloads/combined_csv-4.csv')

In [None]:
scraped_data['Sentence'][8]

"Every cooked food smells like garlic and when I cook something, all I can smell is garlic. It's frustrating."

In [None]:
#Count of original symptoms
symptoms_df(scraped_data)#, symp = symptoms)

Unnamed: 0_level_0,Counts
Symptoms,Unnamed: 1_level_1
cough,61
fever,49
headache,33
fatigue,29
anxiety,28
sore throat,17
shortness of breath,16
dizziness,12
vertigo,10
congestion,9


In [None]:
len(symptoms_df(scraped_data))

43

In [None]:
#count of symptomps from the patience like me files
symptoms_df(scraped_data, only_syms)

Unnamed: 0_level_0,Counts
Symptoms,Unnamed: 1_level_1
symptoms,146
feeling,63
breath,49
cough,46
taste,43
...,...
hives,1
high temperatures,1
hayfever,1
get blurry,1


In [None]:
sentences = []
sentence_pattern = r'(?<=[^A-Z].[.?]) +(?=[A-Z])'
for row in scraped_data.itertuples():            
    for sentence in re.split(sentence_pattern, row[1]):
        sentences.append((row[0], sentence))
                

collo_df_new = pd.DataFrame(sentences, columns=['Index','Sentence'])

collo_df_new.drop('Index', axis = 1, inplace = True)

In [None]:
collo_df_new['Sentence'][25]

'That should help you'

In [None]:
colloqual_new_data = sentence_w_symptoms(collo_df_new, symptoms)

In [None]:
colloqual_new_data.head()

Unnamed: 0,Sentence
3,I've been feeling some fatigue and pelvic pain...
30,I am waiting on a procedure to help with back ...
33,I was convinced I had Covid in early March of ...
41,My husband tested negative..I'm beginning to w...
68,I first became unwell on 03.05.20 with a cough...


In [None]:
colloqual_new_data['Token'] = colloqual_new_data['Sentence'].str.replace(pattern, repl, flags = re.IGNORECASE)
colloqual_new_data['Sentence_ID'] = ['Sentence #%s' %i for i in range(1, len(colloqual_new_data) + 1)]
df6 = tokenize_sentences(colloqual_new_data)
tagged_scraped_data = remove_duplicate_sentence_ids(df6)

In [None]:
tagged_scraped_data.head()

Unnamed: 0,Sentence_ID,Words,Tag
0,Sentence #1,I,O
1,,'ve,O
2,,been,O
3,,feeling,O
4,,some,O


In [None]:
tagged_scraped_data.to_csv('/Users/elif/Desktop/New_Colloquial_data_Nov5.csv', index = False, encoding= 'utf-8')