# Fetch and process data using Jupyter Notebook

Jupyter Notebook: a multi-language environment for getting things done

In [1]:
import os
import re
import shutil
from collections import Counter
from glob import glob

import requests
import xmltodict

# Fetching Badge Data

SOURCE: https://archive.org/download/stackexchange

DATA EXPLORER: http://data.stackexchange.com/stackoverflow/queries (https://github.com/StackExchange/StackExchange.DataExplorer)

Stack Overflow badge data was split into chunk files of approximately 12 MB each (each containing around 100k badge rows). They are all accessible at `https://s3.eu-central-1.amazonaws.com/learning.big.data/stackoverflow-badges/<i>.xml`, where `i` takes values from `0` to `286`, adding up to more than 3 GB of data.

In [4]:
%%bash
# Print the helper script that downloads the badge dump and splits it into chunk files
cat tools/download_and_chunks_badges.sh

#!/bin/bash
#
# Download the Stack Overflow badges dump from archive.org, strip the first
# two lines and the last line of Badges.xml, and split what is left into
# chunk files of 100000 lines each inside a fresh temporary directory.
# Uses wget, dtrx, sed and split (the --additional-suffix option is a GNU
# split feature).

# Create a fresh temporary directory and work inside it.
# NOTE(review): $TEMP_DIR is unquoted and the result of `cd` is not checked;
# if `cd` fails, everything below runs in the caller's current directory.
TEMP_DIR=$(mktemp -d)
cd $TEMP_DIR

echo ">> DOWNLOADING BADGES"
wget https://archive.org/download/stackexchange/stackoverflow.com-Badges.7z

echo ">> EXTRACTING BADGES"
# The following `cd` relies on the archive being unpacked into a
# stackoverflow.com-Badges/ directory.
dtrx -o -n stackoverflow.com-Badges.7z

# -- remove last and 1st two lines
# Presumably these are the XML declaration plus the opening and closing root
# tags, leaving one <row .../> element per line -- TODO confirm against the dump.
cd stackoverflow.com-Badges
echo ">> CLEAN BADGES"
# '$ d' deletes the last line; the result goes to a temporary file because
# sed cannot safely redirect into the file it is reading.
sed '$ d' Badges.xml > TempBadges.xml
# '1,2d' deletes the first two lines; the cleaned content replaces Badges.xml.
sed '1,2d' TempBadges.xml > Badges.xml

# -- split into manageable chunks
echo ">> SPLIT BADGES"
# No output prefix is given, so split falls back to its default file names
# (xaa, xab, ...), each with ".xml" appended.
split --lines 100000 --additional-suffix=.xml Badges.xml

# -- remove temporary stuff
echo ">> CLEAN ARTEFACTS"
cd ..
# Drop the intermediate file, the cleaned full file and the downloaded
# archive; the chunk files are kept.
rm stackoverflow.com-Badges/TempBadges.xml stackoverflow.com-Badges/Badges.xml stackoverflow.com-Badges.7z

echo "files saved in ${TEMP_DIR}"


In [6]:
# -- rename files to prepare them for the S3 upload
BADGES_DIR = '/tmp/tmp.gEG1iKNNPu/stackoverflow.com-Badges'


def rename_chunks_numerically(directory):
    """Rename the three-letter chunk files in `directory` to `0.xml`, `1.xml`, ...

    Only files named with three lowercase letters plus `.xml` (7 characters,
    e.g. `xaa.xml`) are touched. Candidates are sorted by name before being
    numbered, so the numeric index follows the alphabetical order of the
    original names instead of the arbitrary order `glob` returns.

    Args:
        directory: Path of the directory holding the chunk files.

    Returns:
        The list of new paths, in index order.

    Raises:
        FileExistsError: If a target name such as `0.xml` already exists,
            e.g. after a partially completed earlier run. Nothing is
            overwritten in that case.
    """
    # Filter first, then sort, then enumerate: indices are contiguous and
    # deterministic regardless of what else lives in the directory.
    chunk_paths = sorted(
        path
        for path in glob(os.path.join(directory, '*.xml'))
        if re.fullmatch(r'[a-z]{3}\.xml', os.path.basename(path))
    )

    renamed = []
    for i, path in enumerate(chunk_paths):
        # Build the target from the directory, not by regex-substituting
        # inside the full path.
        numerical_path = os.path.join(directory, f'{i}.xml')
        if os.path.exists(numerical_path):
            raise FileExistsError(
                f'refusing to overwrite existing chunk: {numerical_path}')
        shutil.move(path, numerical_path)
        renamed.append(numerical_path)
    return renamed


renamed_chunks = rename_chunks_numerically(BADGES_DIR)

In [5]:
# use s3cmd for uploading

In [7]:
# First badge chunk; the other chunks differ only in the numeric file name.
BADGES_CHUNK_URL = 'https://s3.eu-central-1.amazonaws.com/learning.big.data/stackoverflow-badges/0.xml'

badges = []

# Stream the chunk line by line. The context manager releases the connection
# even if parsing fails midway, and the timeout keeps the cell from hanging
# forever on a stalled connection.
with requests.get(BADGES_CHUNK_URL, stream=True, timeout=30) as response:
    # Fail here on a 4xx/5xx response rather than handing an error body
    # to the XML parser below.
    response.raise_for_status()

    for line in response.iter_lines():

        # -- filter out keep-alive new lines
        if line:
            # each line is parsed on its own as a single <row .../> element
            badges.append(dict(xmltodict.parse(line.decode('utf-8'))['row']))

In [8]:
# Number of badge rows parsed from the downloaded chunk
len(badges)

100000

In [9]:
# Peek at the first two parsed rows (XML attributes show up as '@'-prefixed keys)
badges[:2]

[{'@Id': '26066242',
  '@UserId': '8125167',
  '@Name': 'Supporter',
  '@Date': '2017-11-28T19:34:25.047',
  '@Class': '3',
  '@TagBased': 'False'},
 {'@Id': '26066243',
  '@UserId': '9006638',
  '@Name': 'Supporter',
  '@Date': '2017-11-28T19:34:25.047',
  '@Class': '3',
  '@TagBased': 'False'}]

## What Badge Names do we have?

In [15]:
# Tally how many times each badge name occurs in the loaded chunk.
# Counter is a dict subclass, so `counts` can still be used like a plain dict.
counts = Counter(badge['@Name'] for badge in badges)

# Ten most frequent badge names as (name, count) pairs, most common first
counts.most_common(10)

[('Popular Question', 14687),
 ('Informed', 9557),
 ('Notable Question', 7466),
 ('Editor', 7230),
 ('Yearling', 5903),
 ('Student', 5782),
 ('Scholar', 4900),
 ('Custodian', 4357),
 ('Nice Answer', 3570),
 ('Teacher', 3568)]