# Basic Extract Transform and Load(ETL) pipeline using spark and postgres
### Data is scrapped from Stack overflow into the following tables:
- Questions
- Answers
- Users

#### This script is divided into 4 steps to illustrate the process


### Step 1: Data Extraction

In [1]:
import pyspark

from pyspark.sql.types import BooleanType
from pyspark.sql.functions import udf
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Window

In [2]:
#initializing spark a spark session
spark = ( 
    SparkSession.builder
                .appName("Stack Overflow Data Wrangling")
                .config("spark.jars", "../jars/")
                .getOrCreate()
)

In [77]:
#loading the data in dataframes
answers = spark.read.csv("../data/stackoverflow/answers.csv",  header=True, inferSchema=True,multiLine=True)
questions = spark.read.csv("../data/stackoverflow/questions.csv", header=True, inferSchema=True,multiLine=True)
users = spark.read.csv("../data/stackoverflow/users.csv", header=True, inferSchema=True,multiLine=True)

In [78]:
#transforming the id column names as well as the timestamps too to avoid the ambiguity
answers = answers.withColumnRenamed('id', 'answer_id').withColumnRenamed('created_at', 'answer_created_at').withColumnRenamed('body','answer_body')
questions = questions.withColumnRenamed('id', 'question_id').withColumnRenamed('created_at', 'question_created_at').withColumnRenamed('body','question_body')
users = users.withColumnRenamed('id', 'user_id').withColumnRenamed('created_at', 'user_created_at').withColumnRenamed('updated_at', 'user_updated_at')

#### Step 2: Data Transformation

In [79]:
#getting only indian users..
india_users = users.filter(users.location.contains('India'))

In [80]:
india_users.show(10)

+--------+------------------+----------+--------------------+--------------------+--------------------+-----+--------+----------+--------------------+-------------------+-------------------+
| user_id|      display_name|reputation|         website_url|            location|            about_me|views|up_votes|down_votes|           image_url|    user_created_at|    user_updated_at|
+--------+------------------+----------+--------------------+--------------------+--------------------+-----+--------+----------+--------------------+-------------------+-------------------+
| 8357266|            suryan|         7|https://twitter.c...|Bangalore, Karnat...|                null|    8|       0|         0|https://www.grava...|2017-07-24 10:55:23|2019-06-19 05:00:16|
| 6504306|             A.Raw|         4|                null|New Delhi, Delhi,...|                null|   10|       0|         0|https://i.stack.i...|2016-06-23 12:58:03|2019-10-12 06:59:32|
|10260743|     Kartik Juneja|         3|     

In [81]:
#extracting the country and city into new columns
cols = F.split(india_users['location'], ',' )
india_users = india_users.withColumn('city', cols.getItem(0))
india_users = india_users.withColumn('country', cols.getItem(2))

In [82]:
# rows where city was not quoted the country are being taken as None
india_users.take(50)

[Row(user_id='8357266', display_name='suryan', reputation='7', website_url='https://twitter.com/suryan989', location='Bangalore, Karnataka, India', about_me=None, views='8', up_votes='0', down_votes='0', image_url='https://www.gravatar.com/avatar/73f771bf22784bb7f317b3309f48741a?s=128&d=identicon&r=PG&f=1', user_created_at='2017-07-24 10:55:23', user_updated_at='2019-06-19 05:00:16', city='Bangalore', country=' India'),
 Row(user_id='6504306', display_name='A.Raw', reputation='4', website_url=None, location='New Delhi, Delhi, India', about_me=None, views='10', up_votes='0', down_votes='0', image_url='https://i.stack.imgur.com/QxJgz.jpg?s=128&g=1', user_created_at='2016-06-23 12:58:03', user_updated_at='2019-10-12 06:59:32', city='New Delhi', country=' India'),
 Row(user_id='10260743', display_name='Kartik Juneja', reputation='3', website_url=None, location='Gharaunda, Haryana, India', about_me='<p>4th year Btech Student\nlooking for my codes solutions.</p>\n', views='2', up_votes='0', 

In [83]:
# an inner join of the filtered users df with the question df
df = india_users.join(questions, on='user_id', how='left')
df.columns

['user_id',
 'display_name',
 'reputation',
 'website_url',
 'location',
 'about_me',
 'views',
 'up_votes',
 'down_votes',
 'image_url',
 'user_created_at',
 'user_updated_at',
 'city',
 'country',
 'question_id',
 'title',
 'question_body',
 'accepted_answer_id',
 'score',
 'view_count',
 'comment_count',
 'question_created_at']

In [84]:
# selecting only questions with at least 20 view counts
df = df.filter(df['view_count'] >= 20)

In [87]:
# joining the resultant table to the users answers table
df = df.join(answers, on='user_id', how='left')
df.columns

['user_id',
 'question_id',
 'display_name',
 'reputation',
 'website_url',
 'location',
 'about_me',
 'views',
 'up_votes',
 'down_votes',
 'image_url',
 'user_created_at',
 'user_updated_at',
 'city',
 'country',
 'title',
 'question_body',
 'accepted_answer_id',
 'score',
 'view_count',
 'comment_count',
 'question_created_at',
 'answer_id',
 'question_id',
 'answer_body',
 'score',
 'comment_count',
 'answer_created_at',
 'answer_id',
 'user_id',
 'answer_body',
 'score',
 'comment_count',
 'answer_created_at',
 'answer_id',
 'question_id',
 'answer_body',
 'score',
 'comment_count',
 'answer_created_at']

### Step 3: Data Loading