# Basic Extract Transform and Load(ETL) pipeline using spark and postgres
### Data is scrapped from Stack overflow into the following tables:
- Questions
- Answers
- Users

#### This script is divided into 4 steps to illustrate the process


### Step 1: Data Extraction

In [1]:
import pyspark

from pyspark.sql.types import BooleanType
from pyspark.sql.functions import udf
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Window

In [2]:
#initializing spark a spark session
spark = ( 
    SparkSession.builder
                .appName("Stack Overflow Data Wrangling")
                .config("spark.jars", "../jars/")
                .getOrCreate()
)

In [3]:
#loading the data in dataframes
answers = spark.read.csv("../data/stackoverflow/answers.csv",  header=True, inferSchema=True,multiLine=True)
questions = spark.read.csv("../data/stackoverflow/questions.csv", header=True, inferSchema=True,multiLine=True)
users = spark.read.csv("../data/stackoverflow/users.csv", header=True, inferSchema=True,multiLine=True)

In [4]:
answers.columns

['id',
 'user_id',
 'question_id',
 'body',
 'score',
 'comment_count',
 'created_at']

In [5]:
answers.take(2)

[Row(id='53999517', user_id='1771994', question_id='53999275', body='"<p>The <code>for..of</code> loop you have in your code isn\'t needed. Just use the code you already have and <code>num</code> as the <code>quotes</code> array index value. I added button to demonstrate how the function will only return a single value:</p>\n\n<p><div class=""snippet"" data-lang=""js"" data-hide=""false"" data-console=""true"" data-babel=""false"">', score=None, comment_count=None, created_at=None),
 Row(id='<div class=""snippet-code"">', user_id=None, question_id=None, body=None, score=None, comment_count=None, created_at=None)]

In [6]:
users.columns

['id',
 'display_name',
 'reputation',
 'website_url',
 'location',
 'about_me',
 'views',
 'up_votes',
 'down_votes',
 'image_url',
 'created_at',
 'updated_at']

In [7]:
questions.columns

['id',
 'user_id',
 'title',
 'body',
 'accepted_answer_id',
 'score',
 'view_count',
 'comment_count',
 'created_at']

In [8]:
#transforming the id column names as well as the timestamps too to avoid the ambiguity
answers = answers.withColumnRenamed('id', 'answer_id').withColumnRenamed('created_at', 'answer_created_at').withColumnRenamed('body','answer_body').withColumnRenamed('score','answer_score').withColumnRenamed('comment_count','answer_comment_count')
questions = questions.withColumnRenamed('id', 'question_id').withColumnRenamed('created_at', 'question_created_at').withColumnRenamed('body','question_body').withColumnRenamed('score','question_score')
users = users.withColumnRenamed('id', 'user_id').withColumnRenamed('created_at', 'user_created_at').withColumnRenamed('updated_at', 'user_updated_at')

#### Step 2: Data Transformation

In [9]:
#getting only indian users..
india_users = users.filter(users.location.contains('India'))

In [10]:
india_users.select("location").show()

+--------------------+
|            location|
+--------------------+
|Bangalore, Karnat...|
|Jalandhar, Punjab...|
|Indore, Madhya Pr...|
|Chennai, Tamil Na...|
|Kolkata, West Ben...|
|Bangalore, Karnat...|
|Pune, Maharashtra...|
|Chandan Nagar, Pu...|
|Poonamallee, Chen...|
|Vellore, Tamil Na...|
|Bengaluru, Karnat...|
|Chennai, Tamil Na...|
|        Delhi, India|
|Coimbatore, Tamil...|
|Pune, Maharashtra...|
|Chennai, Tamil Na...|
|Mumbai, Maharasht...|
|Pune, Maharashtra...|
|Indore, Madhya Pr...|
|Coimbatore, Tamil...|
+--------------------+
only showing top 20 rows



In [11]:
#extracting the country and city into new columns
cols = F.split(india_users['location'], ',' )
india_users = india_users.withColumn('city', cols.getItem(0))
india_users = india_users.withColumn('country', cols.getItem(2))

In [12]:
# rows where city was not quoted the country are being taken as None
india_users.select(['location','city','country']).show()

+--------------------+-------------+------------+
|            location|         city|     country|
+--------------------+-------------+------------+
|Bangalore, Karnat...|    Bangalore|       India|
|Jalandhar, Punjab...|    Jalandhar|       India|
|Indore, Madhya Pr...|       Indore|       India|
|Chennai, Tamil Na...|      Chennai|       India|
|Kolkata, West Ben...|      Kolkata|       India|
|Bangalore, Karnat...|    Bangalore|       India|
|Pune, Maharashtra...|         Pune|       India|
|Chandan Nagar, Pu...|Chandan Nagar| Maharashtra|
|Poonamallee, Chen...|  Poonamallee|  Tamil Nadu|
|Vellore, Tamil Na...|      Vellore|       India|
|Bengaluru, Karnat...|    Bengaluru|       India|
|Chennai, Tamil Na...|      Chennai|       India|
|        Delhi, India|        Delhi|        null|
|Coimbatore, Tamil...|   Coimbatore|       India|
|Pune, Maharashtra...|         Pune|       India|
|Chennai, Tamil Na...|      Chennai|       India|
|Mumbai, Maharasht...|       Mumbai|       India|


In [13]:
# an inner join of the filtered users df with the question df
df = india_users.join(questions, on='user_id', how='left')
df.columns

['user_id',
 'display_name',
 'reputation',
 'website_url',
 'location',
 'about_me',
 'views',
 'up_votes',
 'down_votes',
 'image_url',
 'user_created_at',
 'user_updated_at',
 'city',
 'country',
 'question_id',
 'title',
 'question_body',
 'accepted_answer_id',
 'question_score',
 'view_count',
 'comment_count',
 'question_created_at']

In [14]:
# selecting only questions with at least 20 view counts
df = df.filter(df['view_count'] >= 20)

In [15]:
# joining the resultant table to the users answers table
df = df.join(answers, on=['question_id','user_id'], how='left')
df.columns

['question_id',
 'user_id',
 'display_name',
 'reputation',
 'website_url',
 'location',
 'about_me',
 'views',
 'up_votes',
 'down_votes',
 'image_url',
 'user_created_at',
 'user_updated_at',
 'city',
 'country',
 'title',
 'question_body',
 'accepted_answer_id',
 'question_score',
 'view_count',
 'comment_count',
 'question_created_at',
 'answer_id',
 'answer_body',
 'answer_score',
 'answer_comment_count',
 'answer_created_at']

In [16]:
df.count()

26

In [17]:
df.dtypes

[('question_id', 'string'),
 ('user_id', 'string'),
 ('display_name', 'string'),
 ('reputation', 'string'),
 ('website_url', 'string'),
 ('location', 'string'),
 ('about_me', 'string'),
 ('views', 'string'),
 ('up_votes', 'string'),
 ('down_votes', 'string'),
 ('image_url', 'string'),
 ('user_created_at', 'string'),
 ('user_updated_at', 'string'),
 ('city', 'string'),
 ('country', 'string'),
 ('title', 'string'),
 ('question_body', 'string'),
 ('accepted_answer_id', 'string'),
 ('question_score', 'string'),
 ('view_count', 'string'),
 ('comment_count', 'string'),
 ('question_created_at', 'string'),
 ('answer_id', 'string'),
 ('answer_body', 'string'),
 ('answer_score', 'string'),
 ('answer_comment_count', 'string'),
 ('answer_created_at', 'string')]

### Step 3: Data Loading

In [21]:
#writing to the database
df.write.format("jdbc").options(
    url='jdbc:postgresql://localhost:5433/postgres',
    driver='org.postgresql.Driver',
    user='postgres',
    password='cl1f4d',
    dbtable='stackoverflow_filtered.results'
).save('mode')

<pyspark.sql.readwriter.DataFrameWriter at 0x206546c6240>