In [1]:
import configparser
import logging
import os
from datetime import datetime

import boto3
import requests
from boto3.exceptions import S3UploadFailedError
from botocore.exceptions import ClientError
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, explode
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format, monotonically_increasing_id
from pyspark.sql.types import TimestampType

### Get Data

In [2]:
# Read the AWS credentials from dwh.cfg and export them as environment
# variables (presumably so boto3 and the S3 connector can pick them up).
config = configparser.ConfigParser()
config.read('dwh.cfg')

# (environment variable, option in the [AWS] section) pairs, exported in order.
for env_name, option in (
    ('AWS_ACCESS_KEY_ID', 'KEY'),
    ('AWS_SECRET_ACCESS_KEY', 'SECRET'),
):
    os.environ[env_name] = config['AWS'][option]


In [3]:
# Local roots for the raw input and the Parquet output, plus the bucket name
# handed to upload_file ("<S3>" looks like a redacted placeholder).
input_data = "data/"
output_data = "data/output_data/"
output_data_s3 = "<S3>"

# One glob per raw dataset: <input_data><dataset>/*.json
questions_data, tags_data, users_data = (
    '{}{}/*.json'.format(input_data, dataset_dir)
    for dataset_dir in ('questions', 'tags', 'users')
)

In [4]:
# Create (or reuse) the Spark session, requesting the hadoop-aws package
# through spark.jars.packages.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

## Questions Data

In [5]:
# Load every raw questions JSON file into a DataFrame (no explicit schema is
# passed, so Spark infers it) and print the resulting schema.
df_questions = spark.read.json(questions_data)
df_questions.printSchema()

root
 |-- accepted_answer_id: long (nullable = true)
 |-- answer_count: long (nullable = true)
 |-- bounty_amount: long (nullable = true)
 |-- bounty_closes_date: long (nullable = true)
 |-- closed_date: long (nullable = true)
 |-- closed_reason: string (nullable = true)
 |-- collectives: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- description: string (nullable = true)
 |    |    |-- external_links: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- link: string (nullable = true)
 |    |    |    |    |-- type: string (nullable = true)
 |    |    |-- link: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- slug: string (nullable = true)
 |    |    |-- tags: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |-- community_owned_date: long (nullable = true)
 |-- content_license: string (nullable = true)
 |-- creation_date: lo

In [6]:
# Peek at the first raw question record.
df_questions.head(1)

[Row(accepted_answer_id=70271, answer_count=6, bounty_amount=None, bounty_closes_date=None, closed_date=None, closed_reason=None, collectives=[Row(description='Google Cloud provides organizations with leading infrastructure, platform capabilities and industry solutions to help them solve their most critical business problems.', external_links=[Row(link='https://cloud.google.com/developers', type='website'), Row(link='https://googlecloud-community.slack.com/', type='support'), Row(link='https://twitter.com/googlecloudtech', type='twitter'), Row(link='https://facebook.com/googlecloud', type='facebook'), Row(link='https://instagram.com/googlecloud', type='instagram')], link='/collectives/google-cloud', name='Google Cloud', slug='google-cloud', tags=['google-cloud-python', 'google-app-engine', 'google-cloud-platform', 'google-cloud-save', 'google-cloud-nl', 'google-cloud-resource-manager', 'google-cloud-memorystore', 'google-cloud-ml-engine', 'google-cloud-spanner-emulator', 'maven-jib', '

In [7]:
# Columns kept for the questions table. explode("tags") emits one output row
# per element of `tags`, so a question appears once per tag (column "tag").
question_fields = ["question_id", "title", "view_count", "creation_date", "owner.user_id", "is_answered", explode("tags").alias("tag")]
# Project the selected columns and drop rows that are exact duplicates.
questions_table = df_questions.select(question_fields).dropDuplicates()

In [8]:
# Preview the first five rows of the flattened questions table.
questions_table.head(5)

[Row(question_id=48668, title='How should anonymous types be used in C#?', view_count=5255, creation_date=1220812472, owner.user_id=4883, is_answered=True, tag='c#'),
 Row(question_id=48733, title='How to maintain Hibernate cache consistency running two Java applications?', view_count=11783, creation_date=1220818151, owner.user_id=48310, is_answered=True, tag='caching'),
 Row(question_id=48872, title='Why/when should you use nested classes in .net? Or shouldn&#39;t you?', view_count=47127, creation_date=1220828480, owner.user_id=100, is_answered=True, tag='fxcop'),
 Row(question_id=47786, title='Google App Engine: Is it possible to do a Gql LIKE query?', view_count=42056, creation_date=1220731524, owner.user_id=366, is_answered=True, tag='google-app-engine'),
 Row(question_id=48053, title='Is there any alternative to using % (modulus) in C/C++?', view_count=56888, creation_date=1220754689, owner.user_id=445087, is_answered=True, tag='modulo')]

In [96]:
# Persist the questions table as Parquet under <output_data>questions/,
# replacing any previous output (mode='overwrite').
questions_table.write.parquet(output_data + 'questions/', mode='overwrite')

## Data Quality

In [47]:
def check_data_quality(dataset):
    """Run basic data quality checks on one of the prepared tables.

    A table passes ("OK") only when it has at least one row AND its key
    column contains no NULL or empty-string values; otherwise it is "NOK".

    :param dataset: Name of the dataset to check: 'questions' or 'users'
    :return: dict with 'table_count' (number of rows) and 'table' ("OK"/"NOK")
    :raises ValueError: If dataset is not one of the supported names
    """
    if dataset == 'questions':
        table_field = 'question_id'
        df_table = questions_table
    elif dataset == 'users':
        table_field = 'user_id'
        df_table = users_table
    else:
        # Fail fast with a clear message instead of an UnboundLocalError later.
        raise ValueError(
            "Unknown dataset {!r}; expected 'questions' or 'users'".format(dataset)
        )

    print("Start data quality checks...")

    # Check table
    print("Checking {} table...".format(dataset))
    # The view name is shared between datasets; each call replaces it.
    df_table.createOrReplaceTempView("df_table")
    query_nulls = ("""
        SELECT  COUNT(*)
        FROM df_table
        WHERE {} IS NULL OR {} == ""
    """).format(table_field, table_field)
    table_check_nulls = spark.sql(query_nulls)

    # Check that table has > 0 rows
    table_check_count = spark.sql("""
        SELECT  COUNT(*)
        FROM df_table
    """)

    # Collect each count once and reuse the plain integers below.
    null_count = table_check_nulls.collect()[0][0]
    row_count = table_check_count.collect()[0][0]

    # Either problem alone makes the table "NOK". The previous condition
    # `nulls > 0 & rows < 1` parsed as `nulls > (0 & rows) < 1` because `&`
    # binds tighter than comparisons, so the row count was never checked.
    has_problem = null_count > 0 or row_count < 1
    quality_results = {
        "table_count": row_count,
        "table": "NOK" if has_problem else "OK",
    }

    print("NULLS:")
    table_check_nulls.show(1)
    print("ROWS:")
    table_check_count.show(1)

    return quality_results

In [48]:
# Run the quality checks against the questions table (key: question_id).
check_data_quality('questions')

Start data quality checks...
Checking questions table...
NULLS:
+--------+
|count(1)|
+--------+
|       0|
+--------+

ROWS:
+--------+
|count(1)|
+--------+
|   28602|
+--------+



{'table_count': 28602, 'table': 'OK'}

## Users Data

In [50]:
# Load every raw users JSON file into a DataFrame (schema inferred by Spark)
# and print the resulting schema.
df_users = spark.read.json(users_data)
df_users.printSchema()

root
 |-- accept_rate: long (nullable = true)
 |-- account_id: long (nullable = true)
 |-- badge_counts: struct (nullable = true)
 |    |-- bronze: long (nullable = true)
 |    |-- gold: long (nullable = true)
 |    |-- silver: long (nullable = true)
 |-- collectives: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- collective: struct (nullable = true)
 |    |    |    |-- description: string (nullable = true)
 |    |    |    |-- external_links: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- link: string (nullable = true)
 |    |    |    |    |    |-- type: string (nullable = true)
 |    |    |    |-- link: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- slug: string (nullable = true)
 |    |    |    |-- tags: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |-- role: string (nullable 

In [51]:
# Peek at the first raw user record.
df_users.head(1)

[Row(accept_rate=None, account_id=-1, badge_counts=Row(bronze=0, gold=0, silver=1), collectives=None, creation_date=1217462400, display_name='Community', is_employee=False, last_access_date=1219709813, last_modified_date=1623827663, link='https://stackoverflow.com/users/-1/community', location='on the server farm', profile_image='https://www.gravatar.com/avatar/a007be5a61f6aa8f3e85ae2fc18dd66e?s=128&d=identicon&r=PG', reputation=1, reputation_change_day=0, reputation_change_month=0, reputation_change_quarter=0, reputation_change_week=0, reputation_change_year=0, timed_penalty_date=None, user_id=-1, user_type='moderator', website_url='https://meta.stackexchange.com')]

In [52]:
# Columns kept for the users table.
users_fields = ["user_id", "display_name", "reputation", "user_type", "location"]
# Project the selected columns and drop rows that are exact duplicates.
users_table = df_users.select(users_fields).dropDuplicates()

In [93]:
# Preview the first five rows of the users table.
users_table.head(5)

[Row(user_id=23, display_name='Jax', reputation=6467, user_type='registered', location='Charlotte, NC, United States'),
 Row(user_id=25, display_name='CodingWithoutComments', reputation=33690, user_type='registered', location='Seattle, Washington United States'),
 Row(user_id=1038, display_name='maerch', reputation=1985, user_type='registered', location='Europe'),
 Row(user_id=9832, display_name='Dave Rolsky', reputation=4464, user_type='registered', location='United States'),
 Row(user_id=845, display_name='Barry', reputation=2043, user_type='registered', location='Daytona Beach, Florida, United States')]

In [26]:
# Persist the users table as Parquet under <output_data>users/,
# replacing any previous output (mode='overwrite').
users_table.write.parquet(output_data + 'users/', mode='overwrite')

In [53]:
# Run the quality checks against the users table (key: user_id).
check_data_quality('users')

Start data quality checks...
Checking users table...
NULLS:
+--------+
|count(1)|
+--------+
|       0|
+--------+

ROWS:
+--------+
|count(1)|
+--------+
|    9900|
+--------+



{'table_count': 9900, 'table': 'OK'}

## Tags Data

In [27]:
# Load every raw tags JSON file into a DataFrame (schema inferred by Spark)
# and print the resulting schema.
df_tags = spark.read.json(tags_data)
df_tags.printSchema()

root
 |-- collectives: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- description: string (nullable = true)
 |    |    |-- external_links: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- link: string (nullable = true)
 |    |    |    |    |-- type: string (nullable = true)
 |    |    |-- link: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- slug: string (nullable = true)
 |    |    |-- tags: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |-- count: long (nullable = true)
 |-- has_synonyms: boolean (nullable = true)
 |-- is_moderator_only: boolean (nullable = true)
 |-- is_required: boolean (nullable = true)
 |-- name: string (nullable = true)



In [28]:
# Peek at the first raw tag record.
df_tags.head(1)

[Row(collectives=None, count=38665, has_synonyms=True, is_moderator_only=False, is_required=False, name='join')]

In [30]:
# Columns kept for the tags table: the tag's `count` value and its name.
tags_fields = ["count", "name"]
# Project the selected columns and drop rows that are exact duplicates.
tags_table = df_tags.select(tags_fields).dropDuplicates()

In [31]:
# Preview the first five rows of the tags table.
tags_table.head(5)

[Row(count=1189, name='sha'),
 Row(count=1179, name='immutable.js'),
 Row(count=1160, name='onsen-ui'),
 Row(count=546, name='angular-bootstrap'),
 Row(count=541, name='jsplumb')]

In [32]:
# Persist the tags table as Parquet under <output_data>tags/,
# replacing any previous output (mode='overwrite').
tags_table.write.parquet(output_data + 'tags/', mode='overwrite')

## Upload data to S3

In [33]:
def upload_file(file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :return: True if file was uploaded, else False
    """

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Upload the file
    s3_client = boto3.client('s3')
    try:
        s3_client.upload_file(file_name, bucket, object_name)
    except (ClientError, S3UploadFailedError) as e:
        # boto3's managed transfer reports a failed upload as
        # S3UploadFailedError, so catch it alongside ClientError; both are
        # logged and turned into the documented False return value.
        logging.error(e)
        return False
    return True

In [100]:
# Upload the locally written questions Parquet output to S3 under questions/.
path = output_data + 'questions/'

files = os.listdir(path)

# upload_file() signals failure through its return value rather than raising,
# so remember which files did not make it instead of ignoring the result.
failed_uploads = []
for filename in files:
    # Skip hidden (dot-prefixed) entries in the output directory.
    if not filename.startswith('.'):
        if not upload_file(path + filename, output_data_s3, 'questions/{}'.format(filename)):
            failed_uploads.append(filename)

if failed_uploads:
    print("Failed to upload {} questions file(s): {}".format(len(failed_uploads), failed_uploads))

In [43]:
# Upload the locally written users Parquet output to S3 under users/.
path = output_data + 'users/'

files = os.listdir(path)

# upload_file() signals failure through its return value rather than raising,
# so remember which files did not make it instead of ignoring the result.
failed_uploads = []
for filename in files:
    # Skip hidden (dot-prefixed) entries in the output directory.
    if not filename.startswith('.'):
        if not upload_file(path + filename, output_data_s3, 'users/{}'.format(filename)):
            failed_uploads.append(filename)

if failed_uploads:
    print("Failed to upload {} users file(s): {}".format(len(failed_uploads), failed_uploads))

In [44]:
# Upload the locally written tags Parquet output to S3 under tags/.
path = output_data + 'tags/'

files = os.listdir(path)

# upload_file() signals failure through its return value rather than raising,
# so remember which files did not make it instead of ignoring the result.
failed_uploads = []
for filename in files:
    # Skip hidden (dot-prefixed) entries in the output directory.
    if not filename.startswith('.'):
        if not upload_file(path + filename, output_data_s3, 'tags/{}'.format(filename)):
            failed_uploads.append(filename)

if failed_uploads:
    print("Failed to upload {} tags file(s): {}".format(len(failed_uploads), failed_uploads))