# DEMO: Stackoverflow dump

## CURARE data collection model

#### Type your dataset's URL or leave the default value

In [None]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/88/01/a37e827c2d80c6a754e40e99b9826d978b55254cc6c6672b5b08f2e18a7f/pyspark-2.4.0.tar.gz (213.4MB)
[K    14% |████▊                           | 31.7MB 1.3MB/s eta 0:02:196

In [None]:
import shutil
import os
from os import listdir

from pyspark.sql import SQLContext
from pyspark import SparkContext
# Reuse the SparkContext if one is already running in this kernel: calling
# SparkContext() a second time (e.g. when this cell is re-run) raises
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [None]:
import ipywidgets as widgets

In [None]:
# Text box for the dataset URL; later cells read it through url_tb.value.
url_tb = widgets.Text(
    value='https://data.stackexchange.com/stackoverflow/query/new',
    description='Dataset url:',
    disabled=False,
)
url_tb

#### Type additional information about your dataset or leave the default values

In [None]:
# Text box for the dataset provider; read later through dbprovider_tb.value.
dbprovider_tb = widgets.Text(
    value='Stack Exchange, Inc.',
    description='provider:',
    disabled=False,
)
dbprovider_tb

In [None]:
# Text box for the dataset license; read later through dblicense_tb.value.
dblicense_tb = widgets.Text(
    value='CC BY-SA 3.0',
    description='license:',
    disabled=False,
)
dblicense_tb

In [None]:
# Text box for the dataset author; read later through dbauthor_tb.value.
dbauthor_tb = widgets.Text(
    value='Stack Exchange Community',
    description='author:',
    disabled=False,
)
dbauthor_tb

In [None]:
# Text box for the free-text dataset description; read later through dbdesc_tb.value.
dbdesc_tb = widgets.Text(
    value='This is an anonymized dump of all user-contributed content on the Stack Exchange network. Each site is formatted as a separate archive consisting of XML files zipped via 7-zip using bzip2 compression. Each site archive includes Posts, Users, Votes, Comments, PostHistory and PostLinks.',
    description='description:',
    disabled=False,
)
dbdesc_tb

## Data Collection Model class creation

In [None]:
# Execute the data collection model script inside this notebook's namespace.
# The cells below call Item, Release and DataCollection, none of which is
# defined in the notebook itself -- presumably this script provides them
# (TODO confirm against ../libs/dataCollectionModel.py).
%run ../libs/dataCollectionModel.py

## Harvest Data 

In [None]:
# Text box for the local folder that holds the release sub-folders to harvest;
# read later through dblocal_tb.value.
dblocal_tb = widgets.Text(
    value='../releases/',
    description='path:',
    disabled=False,
)
dblocal_tb

### Create item objects and append them to a list

In [None]:
# Walk <path>/<release>/<file>: every sub-directory of the harvest path is one
# release, and every entry inside a release is one item (loaded as CSV).
releaseList = []
releaseNum = 0
harvestRoot = dblocal_tb.value

# Sort the listing: os.listdir() returns entries in arbitrary order, which would
# make the release numbers differ from one run to the next.
for releaseName in sorted(os.listdir(harvestRoot)):
    # os.path.join works whether or not the configured path ends with a separator.
    releasePath = os.path.join(harvestRoot, releaseName)
    if not os.path.isdir(releasePath):
        # Stray files next to the release folders (e.g. .DS_Store) are not releases.
        continue

    itemList = []
    for fileName in sorted(os.listdir(releasePath)):
        filePath = os.path.join(releasePath, fileName)
        print(filePath)
        # note: custom schema should be defined in deduction phase (later)
        df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load(filePath)
        content = df.dtypes  # list of (attribute, data type) tuples
        size = os.path.getsize(filePath)  # file size in bytes
        itemList.append(Item(url_tb.value, filePath, content, size))  # _id, name, content, size

    # The release size is the sum of the sizes of its items.
    releaseNum += 1
    releaseSize = sum(item.size for item in itemList)
    release = Release(url_tb.value, releaseNum, None, itemList, releaseSize)  # _id, releaseNum, publicationDate, itemList, size
    releaseList.append(release)
    release.printInfo()

### Create release objects and append them to a list

### Create data collection object
Uses the release size, since both the release and the collection hold the same data. <br />
Parameters are added manually; consider extracting them programmatically.

In [None]:
#----------- COLLECTION ---------------------------------------------------------------------------------------------
# Build the top-level collection record from the widget values (name, provider,
# licence, author, description) and the releases harvested above.
# The _id is the dataset URL; it may later be replaced by an HDFS URL.
# NOTE(review): size is passed as None here although the accompanying text
# mentions using the release size -- confirm which one is intended.
dataCollection = DataCollection(
    url_tb.value,           # _id
    "stats.stackexchange",  # name
    dbprovider_tb.value,    # provider
    dblicense_tb.value,     # licence
    None,                   # size
    dbauthor_tb.value,      # author
    dbdesc_tb.value,        # description
    releaseList,            # releaseList
)
dataCollection.printInfo()
#--------------------------------------------------------------------------------------------------------------------

### Serialize data collection object to JSON

In [None]:
import json

def _attributes_of(obj):
    """Fallback hook for json.dumps: represent an object by its attribute dictionary."""
    return obj.__dict__


# Round-trip through a JSON string so the nested model objects end up as plain
# dicts and lists.
dataCollectionJson = json.loads(json.dumps(dataCollection, default=_attributes_of))
print(dataCollectionJson)

### Store meta-data file in a MongoDB Atlas Cluster

In [None]:
import json
import os
import pymongo
from pymongo import MongoClient
import pprint

import urllib.parse
from getpass import getpass

# Credentials must not live in the notebook: the user name comes from
# MONGODB_USER (defaulting to the account used so far) and the password from
# MONGODB_PASSWORD, falling back to an interactive prompt when it is unset.
# NOTE(review): the password that used to be hard-coded here should be rotated.
mongoUser = os.environ.get("MONGODB_USER", "adminUser")
mongoPassword = os.environ.get("MONGODB_PASSWORD") or getpass("MongoDB password: ")

# Creates a client for the primary sandbox from cluster host cluster0-nlbcx.mongodb.net
# User name and password are percent-escaped so that reserved characters
# (":", "/", "@", ...) cannot break the connection URI.
mongoUri = (
    "mongodb://{user}:{password}@cluster0-shard-00-00-nlbcx.mongodb.net:27017/"
    "?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin"
).format(
    user=urllib.parse.quote_plus(mongoUser),
    password=urllib.parse.quote_plus(mongoPassword),
)
client = MongoClient(mongoUri)

try:
    db = client['stackoverflow-dump-db']
    collection = db['stackoverflow-stats-metadata-5']

    pid = collection.insert_one(dataCollectionJson).inserted_id  # this id can replace the url in datacollectionmodel class

    # Print the document that was just stored; find_one() without a filter
    # could return some other document of the collection.
    pprint.pprint(collection.find_one({'_id': pid}))
finally:
    # Release the connection even when the insert or the lookup fails.
    client.close()