## Scraping Coronavirus Data on MedHelp: [MedHelp Coronavirus Communities](https://www.medhelp.org/forums/Coronavirus/show/2203)
#### Contributor: Grace Pham

In [1]:
#import libraries and packages
import urllib.request
from bs4 import BeautifulSoup
from html.parser import HTMLParser
import requests
import pandas as pd

In [2]:
#load the saved forum listing page and parse it for web scraping

urls = "file:///c:/users/vrts/info212/medhelp/medhelp0.html" #data retrieved at 12:00 AM 6/18/2020
#the with-block closes the file handle as soon as the page has been parsed
with urllib.request.urlopen(urls) as htmlfile:
    soup = BeautifulSoup(htmlfile,"html.parser")
#every 'subj_entry' div on the page; later cells read a topic and a username out of each one
p11 = soup.find_all('div', attrs={'class':'subj_entry'})

In [17]:
#collect one (topic, user) pair per thread entry: the thread's subject line and the user who created it
#[1:-1] drops the first and the last character of each extracted text
records = [
    (
        entry.find('h2', attrs={'class':'subj_title'}).text[1:-1],
        entry.find('div', attrs={'class':'username'}).text[1:-1],
    )
    for entry in p11
]

In [18]:
#build the thread table: one row per scraped record, with the thread subject (topic) and the username (user)
thread_columns = ['topic', 'user']
df = pd.DataFrame(data=records, columns=thread_columns)
df

Unnamed: 0,topic,user
0,Concerns about Covid-19 coronavirus,genoolli
1,Fevers,YSI
2,Partner and I symptomatic at nearly the same t...,cj_102
3,I'm going nuts because of coronavirus. Help me,A_khan25
4,"Sore throat, some dry cough and weird feeling ...",A_khan25
...,...,...
78,Just having cold is that symtom of corona covi...,amolshinde
79,corona?,Tom13345
80,This thread is for questions about the age of ...,AnxiousNoMore
81,Am I High Risk for Covid19? I am a diabetic wh...,CathyUssery


In [6]:
#gather the href of the link inside every thread title; these links lead to the full post contents and responses
p12 = soup.find_all('h2', attrs={'class':'subj_title'})
linkposts = [title.find('a')['href'] for title in p12]

In [7]:
#prefix every collected link with the MedHelp site address to get the URLs requested in the next step
URLposts = ["https://www.medhelp.org"+path for path in linkposts]

In [8]:
#scrape each thread page for the original post's full content and the responses to the thread
headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    }
#seconds to wait for the server before giving up; without a timeout requests.get can block indefinitely
REQUEST_TIMEOUT = 30
post_content = []

for URL in URLposts:
    r2 = requests.get(URL, headers=headers, timeout=REQUEST_TIMEOUT)
    #stop with the failing status code and URL instead of trying to parse an error page
    r2.raise_for_status()

    soup2 = BeautifulSoup(r2.text, 'html.parser')
    p2 = soup2.find_all('div', attrs={'class':'mh_vit_mid'})

    #the first 'mh_vit_mid' block is used as the thread's original post; [1:-1] drops the first and last character of its text
    fullcontent = p2[0].find('div', attrs={'class':'subj_body'}).text[1:-1]
    #despite the name, this is the collection of 'resp_body' div tags themselves, not a number;
    #it is stored as-is because later cells convert it to a string and clean it with regexes
    replies_count = soup2.find_all('div', attrs={'class':'resp_body'})

    post_content.append((fullcontent, replies_count))

In [19]:
#build a dataframe from the scraped thread pages: each thread's full content (fullcontent) and the responses to it (response)
post_columns = ['fullcontent','response']
dftest = pd.DataFrame(data=post_content, columns=post_columns)
dftest

Unnamed: 0,fullcontent,response
0,\n I'm pretty nervous about cor...,[[\n It's understandable that you'r...
1,\n I may have been exposed to C...,[[\n Only way to know is to get a t...
2,\n For the past three weeks my ...,"[[\n I assume you meant episodes 1,..."
3,\n I don't know if this is the ...,"[[\n I know it's scary, and I can't..."
4,\n I have this weird feeling in...,[[\n If the lump feeling in the thr...
...,...,...
78,\n I am suffering from cold due...,"[[\n The symptoms are Dry cough, Fe..."
79,\n Does the normal Flu have SOB...,"[[\n It can, but it doesn't typical..."
80,\n Anyone who reads a lot of ar...,"[[\n Yeah, without getting politica..."
81,\n I had a septic blood disease...,[[\n Those who are higher risk of n...


In [10]:
#make "df2", a copy of the dataframe "dftest" with every column cast to string
df2 = dftest.astype(dtype=str)

In [12]:
#cleaning response column
#each step is (list of regex patterns, replacement text); the steps are applied in the order listed
response_cleanup_steps = [
    #a closing div followed by a comma becomes a '|' delimiter
    (['</div>,'], '|'),
    #drop line breaks and the closing '</div>]'
    (["<br/> <br/>", "\n","\r","</div>]"], ''),
    #drop the opening resp_body div (with or without a leading '[') and runs of 12 or 8 spaces
    ([r'\[<div class="resp_body" itemprop="text">','<div class="resp_body" itemprop="text">','            ','        '], ''),
    #double non-breaking spaces and remaining <br/> tags become a single space
    (["\xa0\xa0","\r<br/>","<br/>"], ' '),
    #"\'" is just an apostrophe, so only the second pattern (4 spaces before an apostrophe) changes anything
    (["\'","    '"], "'"),
]
response_clean = df2['response']
for patterns, replacement in response_cleanup_steps:
    response_clean = response_clean.replace(patterns, replacement, regex=True)
#assign the cleaned column back explicitly: calling replace(..., inplace=True) on df2.response is a
#chained in-place update that pandas warns about and that leaves df2 untouched under Copy-on-Write
df2['response'] = response_clean
df2

Unnamed: 0,fullcontent,response
0,\n I'm pretty nervous about cor...,It's understandable that you're confused - the...
1,\n I may have been exposed to C...,Only way to know is to get a test that actuall...
2,\n For the past three weeks my ...,"I assume you meant episodes 1, 2 and 3 not day..."
3,\n I don't know if this is the ...,"I know it's scary, and I can't imagine having ..."
4,\n I have this weird feeling in...,If the lump feeling in the throat goes away ri...
...,...,...
78,\n I am suffering from cold due...,"The symptoms are Dry cough, Fever, and shortne..."
79,\n Does the normal Flu have SOB...,"It can, but it doesn't typically. https://www...."
80,\n Anyone who reads a lot of ar...,"Yeah, without getting political, I think we ne..."
81,\n I had a septic blood disease...,Those who are higher risk of novel coronavirus...


In [14]:
#cleaning fullcontent column
#remove carriage returns and the newline-plus-indentation runs left in the scraped post text
#the result is assigned back explicitly: calling replace(..., inplace=True) on df2.fullcontent is a
#chained in-place update that pandas warns about and that leaves df2 untouched under Copy-on-Write
df2['fullcontent'] = df2['fullcontent'].replace(["\r","\n                ", "\n          "],'', regex=True)
df2

Unnamed: 0,fullcontent,response
0,I'm pretty nervous about coronavirus COVID-19....,It's understandable that you're confused - the...
1,I may have been exposed to Covid on March 13. ...,Only way to know is to get a test that actuall...
2,For the past three weeks my partner and I have...,"I assume you meant episodes 1, 2 and 3 not day..."
3,I don't know if this is the right place place ...,"I know it's scary, and I can't imagine having ..."
4,I have this weird feeling in my throat. When I...,If the lump feeling in the throat goes away ri...
...,...,...
78,I am suffering from cold due to weather change...,"The symptoms are Dry cough, Fever, and shortne..."
79,Does the normal Flu have SOB?,"It can, but it doesn't typically. https://www...."
80,Anyone who reads a lot of articles about Covid...,"Yeah, without getting political, I think we ne..."
81,I had a septic blood disease 5 years ago which...,Those who are higher risk of novel coronavirus...


In [15]:
#place the columns of the two dataframes "df" and "df2" side by side in a new dataframe named "info"
frames_to_join = [df, df2]
info = pd.concat(frames_to_join, axis='columns')
info

Unnamed: 0,topic,user,fullcontent,response
0,Concerns about Covid-19 coronavirus,genoolli,I'm pretty nervous about coronavirus COVID-19....,It's understandable that you're confused - the...
1,Fevers,YSI,I may have been exposed to Covid on March 13. ...,Only way to know is to get a test that actuall...
2,Partner and I symptomatic at nearly the same t...,cj_102,For the past three weeks my partner and I have...,"I assume you meant episodes 1, 2 and 3 not day..."
3,I'm going nuts because of coronavirus. Help me,A_khan25,I don't know if this is the right place place ...,"I know it's scary, and I can't imagine having ..."
4,"Sore throat, some dry cough and weird feeling ...",A_khan25,I have this weird feeling in my throat. When I...,If the lump feeling in the throat goes away ri...
...,...,...,...,...
78,Just having cold is that symtom of corona covi...,amolshinde,I am suffering from cold due to weather change...,"The symptoms are Dry cough, Fever, and shortne..."
79,corona?,Tom13345,Does the normal Flu have SOB?,"It can, but it doesn't typically. https://www...."
80,This thread is for questions about the age of ...,AnxiousNoMore,Anyone who reads a lot of articles about Covid...,"Yeah, without getting political, I think we ne..."
81,Am I High Risk for Covid19? I am a diabetic wh...,CathyUssery,I had a septic blood disease 5 years ago which...,Those who are higher risk of novel coronavirus...


In [16]:
#write the dataframe "info" out as a csv file
output_csv = '05_25.csv'
info.to_csv(output_csv)