In [17]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import isnan, when, count, col, lit, trim, avg, ceil
from pyspark.sql.types import StringType
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [18]:
spark = SparkSession.builder.appName("ReadWriteVal").getOrCreate()

In [19]:
spark

In [4]:
!jupyter-themer --show color  

3024-day
3024-night
abcdef
ambiance
base16-dark
base16-light
blackboard
cobalt
colorforth
dracula
eclipse
elegant
erlang-dark
icecoder
lesser-dark
liquibyte
material
mbo
mdn-like
midnight
monokai
neat
neo
night
paraiso-dark
paraiso-light
pastel-on-dark
rubyblue
seti
solarized-dark
solarized-light
the-matrix
tomorrow-night-bright
tomorrow-night-eighties
ttcn
twilight
vibrant-ink
xq-dark
xq-light
yeti
zenburn


In [6]:

!jupyter-themer -c seti

Custom jupyter notebook theme created - refresh any open jupyter notebooks to apply theme.


In [20]:
# To check number of cores used

# cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size()
# cores


1

In [21]:
path = "/home/iron/Documents/1.Learning/2. PYSPARK_BASICS/Read Write Validate Datasets/"
students = spark.read.csv(path+"students.csv", inferSchema=True, header=True)

In [22]:
students.show(5)

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|female|       group B|          bachelor's degree|    standard|                   none|        72|           72|           74|
|female|       group C|               some college|    standard|              completed|        69|           90|           88|
|female|       group B|            master's degree|    standard|                   none|        90|           95|           93|
|  male|       group A|         associate's degree|free/reduced|                   none|        47|           57|           44|
|  male|       group C|               some college|    standard|                   none|        76|     

In [23]:
students.limit(5).toPandas()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [24]:
parquet = spark.read.parquet(path+"users1.parquet")
parquet.limit(4).toPandas()

Unnamed: 0,registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments
0,2016-02-03 13:25:29,1,Amanda,Jordan,ajordan0@com.com,Female,1.197.201.2,6759521864920116.0,Indonesia,3/8/1971,49756.53,Internal Auditor,100.0
1,2016-02-03 22:34:03,2,Albert,Freeman,afreeman1@is.gd,Male,218.111.175.34,,Canada,1/16/1968,150280.17,Accountant IV,
2,2016-02-03 06:39:31,3,Evelyn,Morgan,emorgan2@altervista.org,Female,7.161.136.94,6767119071901597.0,Russia,2/1/1960,144972.51,Structural Engineer,
3,2016-02-03 06:06:21,4,Denise,Riley,driley3@gmpg.org,Female,140.35.109.83,3576031598965625.0,China,4/8/1997,90263.05,Senior Cost Accountant,


In [9]:
# reading only limited files, for all files with similar names use wildcard
usrs_1_2 = spark.read.option("basepath", path).parquet(path+"users1.parquet", path+"users2.parquet")
usrs_1_2.limit(2).toPandas()

Unnamed: 0,registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments
0,2016-02-03 13:25:29,1,Amanda,Jordan,ajordan0@com.com,Female,1.197.201.2,6759521864920116.0,Indonesia,3/8/1971,49756.53,Internal Auditor,100.0
1,2016-02-03 22:34:03,2,Albert,Freeman,afreeman1@is.gd,Male,218.111.175.34,,Canada,1/16/1968,150280.17,Accountant IV,


In [10]:
# for s3 bucket
# bucket = "my_bucket"
# key1 = "partition_test/Table1/CREATED_YEAR=2015/*"
# key2 = "partition_test/Table1/CREATED_YEAR=2016/*"
# key3 = "partition_test/Table1/CREATED_YEAR=2017/*"

# test_df = spark.read.parquet('s3://'+bucket+'/'+key1,\
#                                 's3://'+bucket+'/'+key2,
#                                 's3://'+bucket+'/'+key3)

# test_df.show()



- Validating data

In [25]:
students.printSchema()

root
 |-- gender: string (nullable = true)
 |-- race/ethnicity: string (nullable = true)
 |-- parental level of education: string (nullable = true)
 |-- lunch: string (nullable = true)
 |-- test preparation course: string (nullable = true)
 |-- math score: integer (nullable = true)
 |-- reading score: integer (nullable = true)
 |-- writing score: integer (nullable = true)



In [26]:
students.columns

['gender',
 'race/ethnicity',
 'parental level of education',
 'lunch',
 'test preparation course',
 'math score',
 'reading score',
 'writing score']

In [27]:
students.describe()

DataFrame[summary: string, gender: string, race/ethnicity: string, parental level of education: string, lunch: string, test preparation course: string, math score: string, reading score: string, writing score: string]

In [28]:
students = students.withColumnRenamed("parental level of education","Parental_level_of_education").withColumnRenamed("test preparation course","test_prepartion_course").withColumnRenamed("math score","maths_score").withColumnRenamed("writing score","writing_score").printSchema()

root
 |-- gender: string (nullable = true)
 |-- race/ethnicity: string (nullable = true)
 |-- Parental_level_of_education: string (nullable = true)
 |-- lunch: string (nullable = true)
 |-- test_prepartion_course: string (nullable = true)
 |-- maths_score: integer (nullable = true)
 |-- reading score: integer (nullable = true)
 |-- writing_score: integer (nullable = true)

