# Scraping StackOverflow

## Testing code from SO Question
https://stackoverflow.com/questions/65611633/scraping-dynamic-website-with-filters-python

In [None]:
import bs4
import requests
import pandas as pd

url = "https://www.feedtables.com/fr/content/table-dry-matter"
headers = {"user-agent": "Mozilla/5.0"}

page = requests.get(url, headers=headers)
soup = bs4.BeautifulSoup(page.text,'lxml')

df_list = []

for url in [url+'?feed_cat='+option['value']+'&parameter_cat=All' for option in soup.find('select',attrs={'name': 'feed_cat'}).find_all('option')][1:3]:
    df_list.append(pd.read_html(url)[0])

df = df_list[0].dropna(how='all')
df

## YouTube Tutorial
https://www.youtube.com/watch?v=BFAQCDr6Qvc


In [85]:
import requests
from requests_html import HTML
import pandas as pd 
import time
import re

In [2]:
base_url = "https://stackoverflow.com/questions/tagged/"
tag = "dask"
query_filter = "Newest"
url = f"{base_url}{tag}?tab={query_filter}"
url

'https://stackoverflow.com/questions/tagged/dask?tab=Newest'

In [3]:
r = requests.get(url)
html_str = r.text
html = HTML(html=html_str)

In [24]:
question_elements = html.find(".s-post-summary")

In [25]:
print(question_elements[0].text)

1 vote
0 answers
17 views
Dask Dataframe read_sql_table & to_sql method throwing TypeError - Provided index column is of type "object"
Problem/Context: The problem i am facing is i am trying to insert the dask dataframe to postgres db but it seems i am going through bad luck. I tried almost 20-30 times by changing param also ...
python
postgresql
sqlalchemy
dask
dask-dataframe
simpleboi
91
asked 3 hours ago


## Get Data for 1 Question

In [8]:
# get most recent question element
this_question_element = question_summaries[0]

In [15]:
# inspect element
this_question_element.text

'1 vote\n0 answers\n17 views\nDask Dataframe read_sql_table & to_sql method throwing TypeError - Provided index column is of type "object"\nProblem/Context: The problem i am facing is i am trying to insert the dask dataframe to postgres db but it seems i am going through bad luck. I tried almost 20-30 times by changing param also ...\npython\npostgresql\nsqlalchemy\ndask\ndask-dataframe\nsimpleboi\n91\nasked 3 hours ago'

In [9]:
# get question title
this_question_element.find('.s-link', first=True).text

'Dask Dataframe read_sql_table & to_sql method throwing TypeError - Provided index column is of type "object"'

In [13]:
# get stats (formatted more nicely)
this_question_element.find('.s-post-summary--stats', first=True).text.replace('\n', ' ')

'1 vote 0 answers 17 views'

In [17]:
# get metadata
this_question_element.find('.s-post-summary--meta', first=True).text.replace('\n', ' ')

'python postgresql sqlalchemy dask dask-dataframe simpleboi 91 asked 3 hours ago'

In [23]:
# get question excerpt
this_question_element.find('.s-post-summary--content-excerpt', first=True).text

'Problem/Context: The problem i am facing is i am trying to insert the dask dataframe to postgres db but it seems i am going through bad luck. I tried almost 20-30 times by changing param also ...'

In [33]:
# get question title
this_question_element.find('.s-post-summary--content-title', first=True).text

'Dask Dataframe read_sql_table & to_sql method throwing TypeError - Provided index column is of type "object"'

In [32]:
# get question hyperlink
this_question_element.find('.s-link', first=True)

<Element 'a' href='/questions/74299918/dask-dataframe-read-sql-table-to-sql-method-throwing-typeerror-provided-inde' class=('s-link',)>

## Get Data for All Questions

## What do I want

I want to ideally get:
- the question title
- answered / unanswered status ( get from n_answers )
- number of answers
- number of votes
- number of views
- tags (this one might be tricky)
- the question hyperlink (unsure yet how to do exactly)
- timestamp would be nice
- 

In [66]:
# define keynames and quetions needed
keynames = ['title', 'stats', 'tags']
classes_needed = ['.s-post-summary--content-title', '.s-post-summary--stats', '.s-post-summary--meta-tags',]

In [67]:
datas = []

for q_el in question_elements:
    q_data = {}
    for i, _class in enumerate(classes_needed):
        sub_el = q_el.find(_class, first=True)
        keyname = keynames[i]
        q_data[keyname] = sub_el.text 
    datas.append(q_data)

In [68]:
df = pd.DataFrame(datas)
df.head(3)

Unnamed: 0,title,stats,tags
0,Dask Dataframe read_sql_table & to_sql method ...,1 vote\n0 answers\n17 views,python\npostgresql\nsqlalchemy\ndask\ndask-dat...
1,Dask running out of memory even when partition...,0 votes\n0 answers\n23 views,dask\ndask-distributed\ndask-dataframe
2,Disable pure function assumption in dask distr...,0 votes\n0 answers\n11 views,python\ndask\ndask-distributed


OK, the basics are working here.

Let's now refine:
- Separate votes, answers, and views into separate columns
- Separate tags into separate colum

In [47]:
# get stats and split
this_question_element.find('.s-post-summary--stats', first=True).text.split("\n")

['1 vote', '0 answers', '17 views']

In [108]:
# get only numbers
stats_test = re.findall(r'\d+', this_question_element.find('.s-post-summary--stats', first=True).text)
stats_test

['1', '0', '17']

In [65]:
# get just tags
this_question_element.find('.s-post-summary--meta-tags', first=True).text.split('\n')

['python', 'postgresql', 'sqlalchemy', 'dask', 'dask-dataframe']

OK, we now have clean tags and stats. Let's try this again:

In [92]:
# function that will clean the scraped data



def clean_scraped_data(text, keyname=None):
    if keyname == 'stats':
        return re.findall(r'\b\d+\b', text)
    elif keyname == 'tags':
        return text.split("\n")
    return text

In [93]:
datas = []

for q_el in question_elements:
    q_data = {}
    for i, _class in enumerate(classes_needed):
        sub_el = q_el.find(_class, first=True)
        keyname = keynames[i]
        q_data[keyname] = clean_scraped_data(sub_el.text, keyname=keyname) 
    datas.append(q_data)

In [94]:
df = pd.DataFrame(datas)
df.head()

Unnamed: 0,title,stats,tags
0,Dask Dataframe read_sql_table & to_sql method ...,"[1, 0, 17]","[python, postgresql, sqlalchemy, dask, dask-da..."
1,Dask running out of memory even when partition...,"[0, 0, 23]","[dask, dask-distributed, dask-dataframe]"
2,Disable pure function assumption in dask distr...,"[0, 0, 11]","[python, dask, dask-distributed]"
3,Drop rows from Dask DataFrame where column cou...,"[2, 3, 52]","[python, pandas, dataframe, dask, dask-dataframe]"
4,Python Dask max() function on a column not wor...,"[1, 1, 16]","[python, dask]"


Sweet. This is working for the first 50 results!

In [187]:
# new function that will clean stats better
def clean_scraped_data2(text, keyname=None):
    if keyname == 'stats':
        rep = {" votes": "", " answer": "", "s": "", " views": ""} # define desired replacements here
        # use these three lines to do the replacement
        rep = dict((re.escape(k), v) for k, v in rep.items()) 
        pattern = re.compile("|".join(rep.keys()))
        text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
        text = text.split('\n')
        new_text = []
        for n in text:
            res = re.sub('k', '000', n)
            n = res
            new_text.append(n)
        return new_text
    
    elif keyname == 'tags':
        return text.split("\n")
    return text

In [188]:
# define function that will parse a single page

def parse_tagged_page(html):
    question_elements = html.find(".s-post-summary")
    keynames = ['title', 'stats', 'tags']
    classes_needed = ['.s-post-summary--content-title', '.s-post-summary--stats', '.s-post-summary--meta-tags',]
    datas = []
    for q_el in question_elements:
        q_data = {}
        for i, _class in enumerate(classes_needed):
            sub_el = q_el.find(_class, first=True)
            keyname = keynames[i]
            q_data[keyname] = clean_scraped_data2(sub_el.text, keyname=keyname) 
        datas.append(q_data)
    return datas

In [189]:
# define function that will extract data from url
def extract_data_from_url(url):
    r = requests.get(url)
    if r.status_code not in range(200, 299):
        return []
    html_str = r.text
    html = HTML(html=html_str)
    datas = parse_tagged_page(html)
    return datas

In [193]:
# function that will scrape the entire tag
def scrape_tag(tag = "python", query_filter = "Newest", max_pages=100, pagesize=50):
    base_url = 'https://stackoverflow.com/questions/tagged/'
    datas = []
    for p in range(max_pages):
        page_num = p + 1
        url = f"{base_url}{tag}?tab={query_filter}&page={page_num}&pagesize={pagesize}"
        datas += extract_data_from_url(url)
        time.sleep(1.2)
    return datas

In [None]:
%%time 
datas = scrape_tag(tag='dask')

In [None]:
df = pd.DataFrame(datas)
df.head()

In [None]:
len(df)

## Clean up dataframe

In [106]:
# get stats into separate columns
df[['votes', 'answers', 'views']] = pd.DataFrame(df['stats'].to_list())
df = df.drop(columns=['stats'])
df.head()

Unnamed: 0,title,tags,votes,answers,views
0,Dask Dataframe read_sql_table & to_sql method ...,"[python, postgresql, sqlalchemy, dask, dask-da...",1,0,18
1,Dask running out of memory even when partition...,"[dask, dask-distributed, dask-dataframe]",0,0,23
2,Disable pure function assumption in dask distr...,"[python, dask, dask-distributed]",0,0,11
3,Drop rows from Dask DataFrame where column cou...,"[python, pandas, dataframe, dask, dask-dataframe]",2,3,52
4,Python Dask max() function on a column not wor...,"[python, dask]",1,1,16


In [None]:
#TO DO
# 1. Fetch question URL 
# 2. Fetch question timestamp
# 3. Create "answered" true/false column
# 4. For fun: use Futures to fetch data in parallel

## Run some analyses

In [107]:
# most-upvoted questions
df.sort_values('votes', ascending=False).head(10)

Unnamed: 0,title,tags,votes,answers,views
3706,"ValueError: Not all divisions are known, can't...","[python, dataframe, dask, dask-distributed]",9,1,
2115,Only a column name can be used for the key in ...,"[python, pandas, dask]",9,1,
4085,Item assignment to Python dask array objects,"[python-2.7, dask]",9,1,
3575,dask.multiprocessing or pandas + multiprocessi...,"[python, multithreading, pandas, multiprocessi...",9,1,
3577,dask apply: AttributeError: 'DataFrame' object...,"[python, dask]",9,1,
3453,add a dask.array column to a dask.dataframe,"[python, dataframe, dask]",9,2,
3433,Summarize categorical data in Dask DataFrame,"[python, dask]",9,1,747.0
4049,How can I select data from a dask dataframe by...,"[python, indexing, dask]",9,2,
3151,dask: specify number of processes,"[python, dask]",9,2,
3407,Dask delayed object of unspecified length not ...,"[python, dictionary, dask, dask-delayed]",9,1,


hmm.. something wrong with the "views" column here. It's because we've used the regex and SO subs thousands with "K", e.g. 3.1K views. That'll need a more fine-grained regex to pull it out.

In [124]:
base_url = "https://stackoverflow.com/questions/tagged/"
tag = "dask"
query_filter = "Votes"
url = f"{base_url}{tag}?tab={query_filter}"
url

'https://stackoverflow.com/questions/tagged/dask?tab=Votes'

In [125]:
# test with specific question that has 3.1K views
r = requests.get(url)
html_str = r.text
html = HTML(html=html_str)
q_elements = html.find(".s-post-summary")

In [129]:
test_element = q_elements[0]

In [138]:
out = test_element.find('.s-post-summary--stats', first=True).text
out = out.replace(' votes', '')
out = out.replace(' answers', '')
out = out.replace(' views', '')
out = out.split('\n')
out

['170', '12', '126k']

In [141]:
rep = {" votes": "", " answers": "", " views": ""} # define desired replacements here

# use these three lines to do the replacement
rep = dict((re.escape(k), v) for k, v in rep.items()) 
#Python 3 renamed dict.iteritems to dict.items so use rep.items() for latest versions
pattern = re.compile("|".join(rep.keys()))
text = pattern.sub(lambda m: rep[re.escape(m.group(0))], test_element.find('.s-post-summary--stats', first=True).text)
text = text.split('\n')


In [142]:
text

['170', '12', '126k']

OK, getting close. Just need to substitute "k" with "000"

In [157]:
new = []
for n in text:
    res = re.sub('k', '000', n)
    n = res
    new.append(n)

In [158]:
new

['170', '12', '126000']

Yes, that looks like it worked.
Now just have to rework this into the script above.

In [None]:
# most-viewed questions


In [None]:
# most common tags