In [None]:
import json
import os
import time
from calendar import timegm

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

def strf(s):
    """Render a Unix timestamp (seconds since the epoch) as a UTC 'YYYY-MM-DD HH:MM' string."""
    utc_fields = time.gmtime(s)
    return time.strftime('%Y-%m-%d %H:%M', utc_fields)

In [None]:
def fetch_all_pages(method, params, max_pages=1000):
    """Fetch every page of a Stack Exchange API 2.3 listing and concatenate the items.

    Parameters
    ----------
    method : str
        API path appended to ``https://api.stackexchange.com/2.3/``.
    params : dict
        Query parameters. A private copy is used, so the caller's dict is
        left untouched (the ``page`` parameter is added to the copy only).
    max_pages : int
        Upper bound on the number of pages requested.

    Returns
    -------
    list
        The ``items`` arrays of all fetched pages, concatenated in page order.

    Raises
    ------
    Exception
        If a request fails or a response lacks ``items``/``has_more``. The
        message shows the last successfully decoded response (``None`` if
        there was none) and the original error is chained as ``__cause__``.
    """
    query = dict(params)  # never mutate the caller's dict
    data = []
    has_more = True
    page = 1
    resp = None  # bound up front so the error path can always report it
    while has_more and page <= max_pages:
        query['page'] = page
        try:
            resp = requests.get('https://api.stackexchange.com/2.3/' + method, params=query).json()
            data += resp['items']
            has_more = resp['has_more']
        except Exception as e:
            raise Exception(f'last response: {resp}') from e
        page += 1
        # The API may ask clients to pause before calling the same method again.
        backoff = resp.get('backoff')
        if backoff:
            time.sleep(backoff)
    return data

In [None]:
def fetch_related(acc_token, key, qid, 
                  from_date : str | int = None, 
                  to_date : str | int = None,
                 ):
    params = {
        'site': 'stats',
        'pagesize': 100,
        'sort': 'creation',
        'order': 'asc',
        'key': key,
        'access_token': acc_token,
       # 'filter': '!nNPvSNPI7A', # include body
    }
    for arg,key in zip([from_date, to_date], ['fromdate', 'todate']):
        if arg is not None:
            if isinstance(arg, int):
                params[key] = arg
            elif isinstance(arg, str):
                params[key] = int(timegm(time.strptime(arg, '%Y-%m-%d %H:%M')))
            else:
                raise ValueError(f'unsupported type `{arg}`: {type(arg)}')
        
    method = f"questions/{qid}/related"
    data = fetch_all_pages(method, params)
    return data

### API Retrieval

First retrieve the questions, then retrieve the related questions for each of them.

In [1]:
# Stack Exchange API credentials. They are read from the environment so that
# real secrets never have to be typed into (or committed with) the notebook;
# the redacted placeholders are only a fallback and will not authenticate.
acc_token = os.environ.get('STACKEXCHANGE_ACCESS_TOKEN', '##########')
key = os.environ.get('STACKEXCHANGE_KEY', '########')

In [None]:
# Query for the 'stats' site: questions created from 2014-01-01 00:00 up to
# 2015-01-01 00:00 (both parsed as UTC), sorted by creation date, oldest first.
params = dict(
    site='stats',
    pagesize=100,
    sort='creation',
    order='asc',
    key=key,
    access_token=acc_token,
    fromdate=int(timegm(time.strptime('2014-01-01 00:00', '%Y-%m-%d %H:%M'))),
    todate=int(timegm(time.strptime('2015-01-01 00:00', '%Y-%m-%d %H:%M'))),
    # filter='!nNPvSNPI7A',
)

method = 'questions'
questions = fetch_all_pages(method, params)

In [None]:
# Accumulator filled by the retrieval loops below, keyed by question id.
# Re-running this cell discards everything collected so far.
related = dict()

In [None]:
def _caused_by_connect_timeout(err):
    """Return True if `err`, or any exception chained to it, is a requests ConnectTimeout."""
    seen = set()
    while err is not None and id(err) not in seen:
        if isinstance(err, requests.exceptions.ConnectTimeout):
            return True
        seen.add(id(err))
        err = err.__cause__ or err.__context__
    return False


# Fetch the related questions of every question not processed yet.
#
# The cell is resumable: it starts at offset len(related), so it relies on
# `related` holding exactly one entry per already-processed question, in order.
# For that reason a question is never skipped: connection timeouts are retried
# for the same question, and any other failure stops the loop.
#
# fetch_all_pages re-raises every failure as a plain Exception, so the timeout
# has to be looked for in the exception chain rather than on the error itself.
#
# NOTE(review): the keys 'Id' and 'uxtime' are read from each question here;
# confirm that `questions` really carries these keys at this point.
e = None  # last exception seen, kept for inspection after the cell finishes
max_timeout_retries = 5
for q in tqdm(questions[len(related):]):
    fetched = False
    for attempt in range(1 + max_timeout_retries):
        try:
            data = fetch_related(acc_token, key, q['Id'], to_date=int(q['uxtime']))
            related[q['Id']] = data
            fetched = True
            break
        except Exception as err:
            e = err
            if not _caused_by_connect_timeout(err):
                break
            time.sleep(3)  # brief pause before retrying the same question
    if not fetched:
        # Stop instead of moving on, so the resume offset stays valid.
        break

In [None]:
# Persist the collected `related` mapping under both output names.
# NOTE(review): both files receive the same object; confirm whether the
# second file was meant to hold something else (e.g. `questions`).
for out_path in (
    'related_15835_2014-01-01_2014-12-31.json',
    'questions_15835_2014-01-01_2014-12-31_related.json',
):
    with open(out_path, 'w') as f:
        json.dump(related, f)

### Scraping

Note: requesting a deleted question returns a 404 page. That page still lists related questions (related to the original, now-deleted question), so it is scraped as well.

In [None]:
# Request headers for the scraper: a browser-like User-Agent and a plain
# bot-style alternative.
headers = dict([
    ('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0'),
])
headers2 = dict([('User-Agent', 'bot 1.1')])

In [None]:
from collections import defaultdict

In [None]:
# Bookkeeping for the scraping loop below.
resps, redirected = [], []            # responses kept for debugging; ids that hit TooManyRedirects
redirect_attempts = defaultdict(int)  # per-id count of retries after an HTTP 429 response
ids404 = dict()                       # id -> response, for pages that returned 404

In [None]:
# Scrape the related-question ids from each question page of
# stats.stackexchange.com, writing them into `related[id]`.
#
# The loop is resumable: it continues from the current value of `i`, and `i`
# only advances once an id has been handled.
# NOTE(review): `ids`, `i` and `start_ix` are not defined in the cells shown
# here; presumably `i` starts at `start_ix` - confirm before running.
#
# Per-status handling:
#   200 -> ids taken from the 'related js-gps-related-questions' box
#   404 -> ids taken from the 'question-not-found' links on the error page
#   429 -> retried after a 10 s pause, at most 10 retries per id; an id that
#          is still throttled afterwards is passed over without an entry
#   any other status -> the response is kept in `resps` and ValueError is raised
exc = None
with tqdm(total=len(ids)-start_ix) as pbar:
    while i<len(ids):
        qid = ids[i]
        # Reset per question so the error path below can never report the
        # response of a previous question (or hit an unbound name).
        resp = None
        try:
            status = 429
            while status==429:
                resp = requests.get(f'https://stats.stackexchange.com/questions/{qid}',
                            headers=headers)
                status = resp.status_code
                if status==429:
                    if redirect_attempts[qid]==10:
                        break
                    redirect_attempts[qid]+=1
                    time.sleep(10)

        except requests.exceptions.TooManyRedirects:
            # Record the id and move on; nothing is written to `related`.
            redirected.append(qid)
            i+=1
            pbar.update(1)
            continue
        except Exception as err:
            # Keep the error (and the response, if one was received for this
            # id) for inspection, then let the failure stop the cell.
            exc = err
            if resp is not None:
                resps.append(resp)
            raise
        soup = BeautifulSoup(resp.content, 'lxml')

        if status==404:
            ids404[qid] = resp
            links = soup.find_all('a', {'class': 'question-not-found'})
            # hrefs are split on '/' and the third segment is parsed as the id
            related[qid] = [ int(l['href'].split('/')[2]) for l in links]
        elif status==200:
            rel = soup.find_all('div', {'class':'related js-gps-related-questions'})
            if len(rel)==0:
                related[qid] = []
            elif len(rel)==1:
                links = rel[0].find_all('a', {'class': 'question-hyperlink'})
                related[qid] = [ int(l['href'].split('/')[2]) for l in links]
            else:
                raise ValueError(f'rel sz: {len(rel)}, id: {qid}')

            #lin = soup.find_all('div', {'class':'linked'})
            #if len(lin)==0:
            #    linked[ids[i]] = []
            #elif len(lin)==1:
            #    links = lin[0].find_all('a', {'class':'question-hyperlink'})
            #    linked[ids[i]] = [ int(l['href'].split('/')[2]) for l in links]
            #else:
            #    raise ValueError(f'linked sz: {len(lin)}, id: {ids[i]}')
        elif status==429:
            # Still throttled after the retry budget: leave this id without an entry.
            pass
        else:
            resps.append(resp)
            raise ValueError(f'weird status: {status}')
        i+=1
        pbar.update(1)

### Reputation history

In [None]:
# Fetch the reputation history in batches of up to 100 ids per request,
# appending the results to `hist`. Resumable: batches before `ix` are skipped.
# NOTE(review): `ids`, `ix`, `hist` and `params` come from cells not shown
# here; confirm `params` holds the query intended for this endpoint.
batch_size = 100
# Ceiling division, so a final batch with fewer than `batch_size` ids is
# fetched too instead of being silently dropped.
n_batches = -(-len(ids) // batch_size)
for i in tqdm(range(ix, n_batches)):
    # str() lets integer ids be joined as well as string ids
    id_batch = ';'.join(str(x) for x in ids[i*batch_size : (i+1)*batch_size])
    method = f'users/{id_batch}/reputation-history'
    hist += fetch_all_pages(method, params)
    time.sleep(3)  # pause between batches

In [None]:
# Write the collected reputation history to disk, wrapped under an 'items' key.
with open('reputation_hist_5.json', mode='w') as f:
    payload = {'items': hist}
    json.dump(payload, f)