# DEMO: Stackoverflow dump

## CURARE data collection model

#### Type your dataset's URL or leave default value

In [1]:
!pip install pyspark



In [2]:
import shutil
import os
from os import listdir

from pyspark.sql import SQLContext
from pyspark import SparkContext
sc = SparkContext()
sqlContext = SQLContext(sc)

In [3]:
import ipywidgets as widgets

In [4]:
url_tb = widgets.Text(value='https://data.stackexchange.com/stackoverflow/query/new', description='Dataset url:', disabled=False); url_tb

Text(value='https://data.stackexchange.com/stackoverflow/query/new', description='Dataset url:')

#### Type additional information about your dataset or leve default values

In [5]:
dbprovider_tb = widgets.Text(value='Stack Exchange, Inc.', description='provider:', disabled=False); dbprovider_tb

Text(value='Stack Exchange, Inc.', description='provider:')

In [6]:
dblicense_tb = widgets.Text(value='CC BY-SA 3.0', description='license:', disabled=False); dblicense_tb

Text(value='CC BY-SA 3.0', description='license:')

In [7]:
dbauthor_tb = widgets.Text(value='Stack Exchange Community', description='author:', disabled=False); dbauthor_tb

Text(value='Stack Exchange Community', description='author:')

In [8]:
dbdesc_tb = widgets.Text(value='This is an anonymized dump of all user-contributed content on the Stack Exchange network. Each site is formatted as a separate archive consisting of XML files zipped via 7-zip using bzip2 compression. Each site archive includes Posts, Users, Votes, Comments, PostHistory and PostLinks.', 
                            description='description:', disabled=False); dbdesc_tb

Text(value='This is an anonymized dump of all user-contributed content on the Stack Exchange network. Each sit…

## Data Collection Model class creation

In [9]:
%run ../libs/dataCollectionModel.py

DataCollection class created!
Release class created!
Item class created!


## Harvest Data 

In [10]:
dblocal_tb = widgets.Text(value='../releases/', description='path:', disabled=False); dblocal_tb

Text(value='../releases/', description='path:')

### Create item objects and append them to a list

In [11]:
releaseList = []
releaseNum = 0
#for r in getReleaseFolders(dblocal_tb.value):
for r in os.listdir(dblocal_tb.value): 
    r = dblocal_tb.value + r
    itemList = []
    for f in os.listdir(r): 
        f = r + "/" + f
        print (f)
        df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load(f) # note: custom schema should be defined in deduction phase (later)
        content = df.dtypes # list N-tuple <attribute, data type>
        size = os.path.getsize(f) # get file size in Bytes
        ## Instance
        itemList.append(Item(url_tb.value, f, content, size)) # _id, name, content, size                            
    
    # Get release size 
    releaseSize = 0
    releaseNum += 1
    for i in itemList:
        releaseSize = i.size + releaseSize
    releaseList.append(Release(url_tb.value, releaseNum, None, itemList, releaseSize)) #_id, releaseNum, publicationDate, itemList, size
    releaseList[releaseNum-1].printInfo()

../releases/jan-03-04_2018/USERS_jan-03-04_2018.csv
../releases/jan-03-04_2018/POSTS_jan-03-04_2018.csv
../releases/jan-03-04_2018/COMMENTS_jan-03-04_2018.csv
../releases/jan-03-04_2018/BADGES_jan-03-04_2018.csv
../releases/jan-03-04_2018/VOTES_jan-03-04_2018.csv
_id  = https://data.stackexchange.com/stackoverflow/query/new
releaseNum  = 1
publicationDate  = None
List of items = [<__main__.Item object at 0x7f7cc1c7e358>, <__main__.Item object at 0x7f7cc1c7e898>, <__main__.Item object at 0x7f7cc19c4320>, <__main__.Item object at 0x7f7cc19c4710>, <__main__.Item object at 0x7f7cc19c4f60>]
size = 30307950 Bytes
../releases/jan-01-02_2018/POSTS_jan-01-02_2018.csv
../releases/jan-01-02_2018/COMMENTS_jan-01-02_2018.csv
../releases/jan-01-02_2018/USERS_jan-01-02_2018.csv
../releases/jan-01-02_2018/BADGES_jan-01-02_2018.csv
../releases/jan-01-02_2018/VOTES_jan-01-02_2018.csv
_id  = https://data.stackexchange.com/stackoverflow/query/new
releaseNum  = 2
publicationDate  = None
List of items = [<_

### Create release objects and append them in a list 

### Create data collection object
Using release size since both release and collection have the same data <br />
Adding parameter manually, consider extracting them programmatically

In [12]:
#----------- COLLECTION ---------------------------------------------------------------------------------------------
# Ask for database name, provider, licence, author and description
# _id = url maybe change for a hdfs url
                                #_id, name, provider, licence, size, author, description, releaseList
dataCollection = DataCollection(url_tb.value, "stats.stackexchange", dbprovider_tb.value, dblicense_tb.value, None, dbauthor_tb.value, dbdesc_tb.value, releaseList) 
dataCollection.printInfo()
#--------------------------------------------------------------------------------------------------------------------

_id  = https://data.stackexchange.com/stackoverflow/query/new
name  = stats.stackexchange
provider  = Stack Exchange, Inc.
licence  = CC BY-SA 3.0
size = None Bytes
author  = Stack Exchange Community
description  = This is an anonymized dump of all user-contributed content on the Stack Exchange network. Each site is formatted as a separate archive consisting of XML files zipped via 7-zip using bzip2 compression. Each site archive includes Posts, Users, Votes, Comments, PostHistory and PostLinks.
list of releases = [<__main__.Release object at 0x7f7cc1c7bd68>, <__main__.Release object at 0x7f7cc19c4f98>, <__main__.Release object at 0x7f7cc19c4be0>]


### Serialize data collection objet to JSON

In [13]:
import json

dataCollectionJson = json.dumps(dataCollection, default=lambda x: x.__dict__, indent=3)
dataCollectionJson = json.loads(dataCollectionJson)
print (dataCollectionJson)

{'author': 'Stack Exchange Community', '_id': 'https://data.stackexchange.com/stackoverflow/query/new', 'size': None, 'releaseList': [{'itemList': [{'content': [['Id', 'string'], ['Reputation', 'string'], ['CreationDate', 'string'], ['DisplayName', 'string'], ['LastAccessDate', 'string'], ['WebsiteUrl', 'string'], ['Location', 'string'], ['AboutMe', 'string'], ['Views', 'string'], ['UpVotes', 'string'], ['DownVotes', 'string'], ['ProfileImageUrl', 'string'], ['EmailHash', 'string'], ['AccountId', 'string']], '_id': 'https://data.stackexchange.com/stackoverflow/query/new', 'size': 935450, 'name': '../releases/jan-03-04_2018/USERS_jan-03-04_2018.csv'}, {'content': [['Id', 'string'], ['PostTypeId', 'string'], ['AcceptedAnswerId', 'string'], ['ParentId', 'string'], ['CreationDate', 'string'], ['DeletionDate', 'string'], ['Score', 'string'], ['ViewCount', 'string'], ['Body', 'string'], ['OwnerUserId', 'string'], ['OwnerDisplayName', 'string'], ['LastEditorUserId', 'string'], ['LastEditorDis

### Store meta-data file in a MongoDB Atlas Cluster

In [14]:
import json
import pymongo
from pymongo import MongoClient
import pprint

import urllib.parse

# Creates a client for the primary sandbox from cluster host cluster0-nlbcx.mongodb.net
client = MongoClient("mongodb://adminUser:xpass@cluster0-shard-00-00-nlbcx.mongodb.net:27017/?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin")

db = client.test 
db = client['stackoverflow-dump-db']
collection = db['stackoverflow-stats-metadata-5']

pid = collection.insert_one(dataCollectionJson).inserted_id  # this id can replace the url in datacollectionmodel class

pprint.pprint(collection.find_one())

client.close()

{'_id': 'https://data.stackexchange.com/stackoverflow/query/new',
 'author': 'Stack Exchange Community',
 'description': 'This is an anonymized dump of all user-contributed content on '
                'the Stack Exchange network. Each site is formatted as a '
                'separate archive consisting of XML files zipped via 7-zip '
                'using bzip2 compression. Each site archive includes Posts, '
                'Users, Votes, Comments, PostHistory and PostLinks.',
 'licence': 'CC BY-SA 3.0',
 'name': 'stats.stackexchange',
 'provider': 'Stack Exchange, Inc.',
 'releaseList': [{'_id': 'https://data.stackexchange.com/stackoverflow/query/new',
                  'itemList': [{'_id': 'https://data.stackexchange.com/stackoverflow/query/new',
                                'content': [['Id', 'string'],
                                            ['Reputation', 'string'],
                                            ['CreationDate', 'string'],
                                 