# Creating a basic end to end Extract Transform and Load(ETL) pipeline using spark and postgres
### Data is scrapped from Stack overflow into the following tables:
- Questions
- Answers
- Users

In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F


# Extract stage:

using pyspark 


In [2]:
# creating a spark session

spark = (SparkSession.builder
                     .appName("Stack Overflow Data Wrangling")
                     .config("spark.jars","jars/postgresql-42.2.14.jar")#this should aid with database connection
                     .getOrCreate()
        )



In [3]:
users_df = spark.read.csv('../data/stackoverflow/users.csv', inferSchema=True, header=True, escape='"',multiLine=True)
questions_df = spark.read.csv('../data/stackoverflow/questions.csv',inferSchema=True, header=True, escape= '"', multiLine=True)
answers_df = spark.read.csv('../data/stackoverflow/answers.csv',inferSchema=True, header=True, escape= '"', multiLine=True)

# Data Transform Stage:

- Step 1: Select users from one country only
- Step 2: Extract the country and city into new columns.
- Step 3: Join the result with the questions and select only questions with at least 20 view counts
- Step 4: Join the answers to the result

In [4]:
users_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- display_name: string (nullable = true)
 |-- reputation: integer (nullable = true)
 |-- website_url: string (nullable = true)
 |-- location: string (nullable = true)
 |-- about_me: string (nullable = true)
 |-- views: integer (nullable = true)
 |-- up_votes: integer (nullable = true)
 |-- down_votes: integer (nullable = true)
 |-- image_url: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- updated_at: string (nullable = true)



In [5]:
users_df.count()

243028

In [6]:
# renaming the id column because of a join.. later on
users_df = users_df.withColumnRenamed("id","user_id")\
                   .withColumnRenamed("created_at","user_created_at")\

In [7]:
users_df.columns

['user_id',
 'display_name',
 'reputation',
 'website_url',
 'location',
 'about_me',
 'views',
 'up_votes',
 'down_votes',
 'image_url',
 'user_created_at',
 'updated_at']

In [8]:
users_df[['location']].show(5)

+--------------------+
|            location|
+--------------------+
|Bangalore, Karnat...|
|              Canada|
|Pennsylvania, Uni...|
|                null|
|New Delhi, Delhi,...|
+--------------------+
only showing top 5 rows



### Step 1: Select users from one country only

In [9]:
# filtering in spark is soooooo cooooooool
india_users = users_df.filter(users_df.location.contains('India'))

In [10]:
india_users.count()

21754

### Step 2: Extract the country and city into new columns.

In [11]:
# spliting the location to extract the City value...
india_users_city_added = india_users.withColumn("City", F.split(india_users.location, ',')[0].alias("City"))

In [12]:
india_users_city_added.show(2)

+-------+------------+----------+--------------------+--------------------+--------+-----+--------+----------+--------------------+-------------------+-------------------+---------+
|user_id|display_name|reputation|         website_url|            location|about_me|views|up_votes|down_votes|           image_url|    user_created_at|         updated_at|     City|
+-------+------------+----------+--------------------+--------------------+--------+-----+--------+----------+--------------------+-------------------+-------------------+---------+
|8357266|      suryan|         7|https://twitter.c...|Bangalore, Karnat...|    null|    8|       0|         0|https://www.grava...|2017-07-24 10:55:23|2019-06-19 05:00:16|Bangalore|
|6504306|       A.Raw|         4|                null|New Delhi, Delhi,...|    null|   10|       0|         0|https://i.stack.i...|2016-06-23 12:58:03|2019-10-12 06:59:32|New Delhi|
+-------+------------+----------+--------------------+--------------------+--------+-----+

In [13]:
# Doing the same thing to get the Column value.
# ok so after splitting the [-1] wasn't working to get the last index so i had to reverse the list before accessing the first value which would be the last in this case 

india_users_country_added = india_users_city_added.withColumn("Country",F.reverse(F.split(india_users_city_added.location, ','))[0].alias("Country"))

In [14]:
india_users_country_added.show(2)

+-------+------------+----------+--------------------+--------------------+--------+-----+--------+----------+--------------------+-------------------+-------------------+---------+-------+
|user_id|display_name|reputation|         website_url|            location|about_me|views|up_votes|down_votes|           image_url|    user_created_at|         updated_at|     City|Country|
+-------+------------+----------+--------------------+--------------------+--------+-----+--------+----------+--------------------+-------------------+-------------------+---------+-------+
|8357266|      suryan|         7|https://twitter.c...|Bangalore, Karnat...|    null|    8|       0|         0|https://www.grava...|2017-07-24 10:55:23|2019-06-19 05:00:16|Bangalore|  India|
|6504306|       A.Raw|         4|                null|New Delhi, Delhi,...|    null|   10|       0|         0|https://i.stack.i...|2016-06-23 12:58:03|2019-10-12 06:59:32|New Delhi|  India|
+-------+------------+----------+-----------------

In [15]:
india_users_country_added.select('location','City','Country').show(5)

+--------------------+---------+-------+
|            location|     City|Country|
+--------------------+---------+-------+
|Bangalore, Karnat...|Bangalore|  India|
|New Delhi, Delhi,...|New Delhi|  India|
|Gharaunda, Haryan...|Gharaunda|  India|
|    New Delhi, India|New Delhi|  India|
|Jalandhar, Punjab...|Jalandhar|  India|
+--------------------+---------+-------+
only showing top 5 rows



In [16]:
india_users_country_added.columns

['user_id',
 'display_name',
 'reputation',
 'website_url',
 'location',
 'about_me',
 'views',
 'up_votes',
 'down_votes',
 'image_url',
 'user_created_at',
 'updated_at',
 'City',
 'Country']

In [17]:
india_users_country_added.count()

21754

### Step 3: Join the result with the questions and select only questions with at least 20 view counts

In [18]:
questions_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- body: string (nullable = true)
 |-- accepted_answer_id: integer (nullable = true)
 |-- score: integer (nullable = true)
 |-- view_count: integer (nullable = true)
 |-- comment_count: integer (nullable = true)
 |-- created_at: string (nullable = true)



In [19]:
questions_df= questions_df.withColumnRenamed("id","question_id")\
                            .withColumnRenamed("created_at","question_created_at")\
                            .withColumnRenamed("comment_count","question_comment_count")\
                            .withColumnRenamed("score","question_score")\
                            .withColumnRenamed("body","question_body")

In [20]:
questions_users_df = questions_df.join(india_users_country_added, on=['user_id'],how='inner')

In [21]:
questions_users_df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- question_id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- question_body: string (nullable = true)
 |-- accepted_answer_id: integer (nullable = true)
 |-- question_score: integer (nullable = true)
 |-- view_count: integer (nullable = true)
 |-- question_comment_count: integer (nullable = true)
 |-- question_created_at: string (nullable = true)
 |-- display_name: string (nullable = true)
 |-- reputation: integer (nullable = true)
 |-- website_url: string (nullable = true)
 |-- location: string (nullable = true)
 |-- about_me: string (nullable = true)
 |-- views: integer (nullable = true)
 |-- up_votes: integer (nullable = true)
 |-- down_votes: integer (nullable = true)
 |-- image_url: string (nullable = true)
 |-- user_created_at: string (nullable = true)
 |-- updated_at: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)



In [22]:
questions_at_least_20_views = questions_users_df.filter(questions_users_df['view_count'] > 3)

### Step 4: Join the answers to the result

In [23]:
answers_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- question_id: integer (nullable = true)
 |-- body: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- comment_count: integer (nullable = true)
 |-- created_at: string (nullable = true)



In [24]:
# renaming for the sake of the joining....
answers_df= answers_df.withColumnRenamed("id","answer_id")\
                        .withColumnRenamed("score","answer_score")\
                        .withColumnRenamed("comment_count","answer_comment_count")\
                        .withColumnRenamed("created_at","answer_created_at")\
                        .withColumnRenamed("body","answer_body")

In [25]:
answers_added = answers_df.join(questions_at_least_20_views, on=['question_id', 'user_id'], how='inner')

In [26]:
answers_added.count()

2411

In [27]:
answers_added.printSchema()

root
 |-- question_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- answer_id: integer (nullable = true)
 |-- answer_body: string (nullable = true)
 |-- answer_score: integer (nullable = true)
 |-- answer_comment_count: integer (nullable = true)
 |-- answer_created_at: string (nullable = true)
 |-- title: string (nullable = true)
 |-- question_body: string (nullable = true)
 |-- accepted_answer_id: integer (nullable = true)
 |-- question_score: integer (nullable = true)
 |-- view_count: integer (nullable = true)
 |-- question_comment_count: integer (nullable = true)
 |-- question_created_at: string (nullable = true)
 |-- display_name: string (nullable = true)
 |-- reputation: integer (nullable = true)
 |-- website_url: string (nullable = true)
 |-- location: string (nullable = true)
 |-- about_me: string (nullable = true)
 |-- views: integer (nullable = true)
 |-- up_votes: integer (nullable = true)
 |-- down_votes: integer (nullable = true)
 |-- image_url: st

In [28]:
final_df = answers_added

In [29]:
answers_added.registerTempTable('new_df')
spark.sql("Select min(updated_at) as minimum_updated_time from new_df").show()

+--------------------+
|minimum_updated_time|
+--------------------+
| 2019-01-11 05:02:30|
+--------------------+



#### Data Definition part

Using SQL
- Creating a schema based on the resultant dataframe
- Create a table called results

# Data loading part

- using spark write the resultant dataframe to the table created in the database:

In [32]:
final_df.write.format("jdbc").options(
    url='jdbc:postgresql://localhost:5432/postgres',
    driver='org.postgresql.Driver',
    user='postgres',
    password='pass',
    dbtable='stackoverflow_filtered.results'
).save(mode="append")

### Scratch part

In [97]:
# ok so what if i want to ge the length of each array in the splitted column appended to this as a new column
# the pseudocode im looking at is
# if the length of the array is 2 the first index is the city and the other is the country
# if the length is 1 then that's the country
# for others i don't know yet....

# ok a rewrite split the location column
# assume the first index is the city and the last index is the country


In [67]:
df.count()

243028

In [83]:
# ok so filtering based on the condition that the length column is greater than 3
filtered_df = df.where(F.size(F.col("splitted")) >= 2)

In [None]:
# ok so now 
filtered_df

In [78]:
result.count()

88554

In [81]:
type(result)

pyspark.sql.dataframe.DataFrame

In [82]:
# writing to a csv
result.toPandas().to_csv('result.csv')

In [126]:
# Ok i'm saving this code till i fully understand how it works  :)))))))

india_users_country_added.select([F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in india_users_country_added.columns]).show()

+---+------------+----------+-----------+--------+--------+-----+--------+----------+---------+----------+----------+----+-------+
| id|display_name|reputation|website_url|location|about_me|views|up_votes|down_votes|image_url|created_at|updated_at|City|Country|
+---+------------+----------+-----------+--------+--------+-----+--------+----------+---------+----------+----------+----+-------+
|  0|           0|         0|      17654|       0|   11683|    0|       0|         0|     1936|         0|         0|   0|      0|
+---+------------+----------+-----------+--------+--------+-----+--------+----------+---------+----------+----------+----+-------+

