# Task

Investigate a problem related to reading data with a schema.

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *


import os

In [None]:
# Get the active SparkSession, or create one named after this exercise.
spark = SparkSession.builder.appName('Debugging-II').getOrCreate()

In [None]:
# Display the Spark version the session is running on.
spark.version

In [None]:
# The project root is three directory levels above the notebook's working directory.
base_path = os.getcwd()

path_segments = base_path.split('/')
project_path = '/'.join(path_segments[:-3])

# Location of the input data used in this exercise.
sample_data_with_metadata = os.path.join(project_path, 'data/sample_data_with_metadata')

1. Read the data from `sample_data_with_metadata`; the files are in `JSON` format and the schema is provided below.
2. Find out how many records have the value 0 in the `user_metadata.reputation` column.
3. Read the data twice, first with the schema and then without the schema
4. Can you see a problem? If yes, can you explain what happened and fix it?

In [None]:
# Schema supplied with the task; `user_metadata` is a nested struct.
data_schema = (
    StructType()
    .add('question_id', LongType())
    .add('user_id', LongType())
    .add('creation_date', TimestampType())
    .add('comments', LongType())
    .add(
        'user_metadata',
        StructType()
        .add('has_location', BooleanType())
        .add('upvotes', LongType())
        .add('reputation', LongType()),
    )
)

In [None]:
# Read the data with the explicit schema and count the rows where user_metadata.upvotes == 0.
# The input is taken from `sample_data_with_metadata` (derived from the working directory)
# rather than from an absolute path that exists on a single machine only.
# NOTE(review): the task text asks about `user_metadata.reputation`, while this cell filters
# on `upvotes` - confirm which column is intended.

(
    spark.read
    .schema(data_schema)
    .format('json')
    .load(sample_data_with_metadata)
    .filter(col('user_metadata.upvotes') == 0)
).count()

In [None]:
# Read the data without a schema (Spark infers it from the files) and count the rows
# where user_metadata.upvotes == 0.
# The input is taken from `sample_data_with_metadata` (derived from the working directory)
# rather than from an absolute path that exists on a single machine only.

(
    spark.read.format('json')
    .load(sample_data_with_metadata)
    .filter(col('user_metadata.upvotes') == 0)
).count()

In [None]:
# The problem: the record with user_id = 3416503 has the value 0 instead of False in the
# `has_location` column. That record does not follow the provided schema, so the entire
# `user_metadata` struct becomes NULL when the schema is applied.
# If Spark infers the schema instead, it infers `has_location` as a string and is able to
# read the record, which is what this cell shows.
# The input is taken from `sample_data_with_metadata` (derived from the working directory)
# rather than from an absolute path that exists on a single machine only.

(
    spark.read.format('json')
    .load(sample_data_with_metadata)
    .filter(col('user_id') == 3416503)
).show(n=50)

In [None]:
# To keep `has_location` as a boolean, read it as a string (inferred schema) and cast it
# to boolean afterwards; this is possible for the values 0/1.
#
# Each field is cast on its own and looked up by name. Casting the whole struct with
# `col('user_metadata').cast(metadata_schema)` maps fields by position, and the inferred
# struct lists its fields alphabetically (has_location, reputation, upvotes), so that cast
# would silently swap the `upvotes` and `reputation` values.
#
# The input is taken from `sample_data_with_metadata` (derived from the working directory)
# rather than from an absolute path that exists on a single machine only.

metadata_schema = StructType([
    StructField('has_location', BooleanType()),
    StructField('upvotes', LongType()),
    StructField('reputation', LongType()),
])

(
    spark.read.format('json')
    .load(sample_data_with_metadata)
    .withColumn(
        'user_metadata',
        # Rows without metadata stay NULL, just as a struct-level cast would leave them.
        when(
            col('user_metadata').isNotNull(),
            struct(*[
                col('user_metadata.' + field.name).cast(field.dataType).alias(field.name)
                for field in metadata_schema.fields
            ]),
        ),
    )
).show(n=50)