In [3]:
import pyspark
from pyspark.sql import SparkSession

# Obtain the notebook's SparkSession under the application name "ReadWriteVal";
# getOrCreate() hands back an already-running session when one exists.
spark = (
    SparkSession.builder
    .appName("ReadWriteVal")
    .getOrCreate()
)
spark

In [4]:
# Check how much parallelism (cores) this session has to work with.
#
# The size of getExecutorMemoryStatus() is NOT a core count: that map has one
# entry per block manager (executors plus the driver), and reaching it requires
# the private `_jsc` handle. defaultParallelism is the public API; in local
# mode it equals the number of worker threads requested in the master URL
# (local[4] -> 4, local[*] -> every core on the machine).

cores = spark.sparkContext.defaultParallelism
cores

1

In [11]:
# Folder holding the sample files, relative to the notebook's working directory.
path = "datasets/"

# Treat the first CSV line as column names and let Spark infer each column's
# type from the data.
students = (
    spark.read
    .options(inferSchema=True, header=True)
    .csv(path + 'students.csv')
)

In [12]:
# Preview the first five rows to confirm the CSV parsed as expected.
students.show(n=5)

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|female|       group B|          bachelor's degree|    standard|                   none|        72|           72|           74|
|female|       group C|               some college|    standard|              completed|        69|           90|           88|
|female|       group B|            master's degree|    standard|                   none|        90|           95|           93|
|  male|       group A|         associate's degree|free/reduced|                   none|        47|           57|           44|
|  male|       group C|               some college|    standard|                   none|        76|     

In [13]:
# Load a single parquet file; the column names and types come from the file itself.
df_parquet = spark.read.parquet(
    path + 'users1.parquet'
)

In [14]:
# Preview the first four rows of the single-file read.
df_parquet.show(n=4)

+-------------------+---+----------+---------+--------------------+------+--------------+----------------+---------+---------+---------+--------------------+--------+
|  registration_dttm| id|first_name|last_name|               email|gender|    ip_address|              cc|  country|birthdate|   salary|               title|comments|
+-------------------+---+----------+---------+--------------------+------+--------------+----------------+---------+---------+---------+--------------------+--------+
|2016-02-03 05:55:29|  1|    Amanda|   Jordan|    ajordan0@com.com|Female|   1.197.201.2|6759521864920116|Indonesia| 3/8/1971| 49756.53|    Internal Auditor|   1E+02|
|2016-02-03 15:04:03|  2|    Albert|  Freeman|     afreeman1@is.gd|  Male|218.111.175.34|                |   Canada|1/16/1968|150280.17|       Accountant IV|        |
|2016-02-02 23:09:31|  3|    Evelyn|   Morgan|emorgan2@altervis...|Female|  7.161.136.94|6767119071901597|   Russia| 2/1/1960|144972.51| Structural Engineer|        

In [15]:
# The '*' wildcard pulls every file matching users*.parquet into one DataFrame.
partitions = spark.read.parquet(
    path + 'users*.parquet'
)

In [16]:
# Preview the first four rows of the wildcard read.
partitions.show(n=4)

+-------------------+---+----------+---------+--------------------+------+--------------+----------------+---------+---------+---------+--------------------+--------+
|  registration_dttm| id|first_name|last_name|               email|gender|    ip_address|              cc|  country|birthdate|   salary|               title|comments|
+-------------------+---+----------+---------+--------------------+------+--------------+----------------+---------+---------+---------+--------------------+--------+
|2016-02-03 05:55:29|  1|    Amanda|   Jordan|    ajordan0@com.com|Female|   1.197.201.2|6759521864920116|Indonesia| 3/8/1971| 49756.53|    Internal Auditor|   1E+02|
|2016-02-03 15:04:03|  2|    Albert|  Freeman|     afreeman1@is.gd|  Male|218.111.175.34|                |   Canada|1/16/1968|150280.17|       Accountant IV|        |
|2016-02-02 23:09:31|  3|    Evelyn|   Morgan|emorgan2@altervis...|Female|  7.161.136.94|6767119071901597|   Russia| 2/1/1960|144972.51| Structural Engineer|        

In [17]:
# Read an explicit set of parquet files by passing each path as its own argument.
users_1_2 = spark.read.parquet(
    path + 'users1.parquet',
    path + 'users2.parquet',
)

In [18]:
# Echo the combined DataFrame; the notebook displays its repr, which lists the
# column names and types rather than any rows.
users_1_2

DataFrame[registration_dttm: timestamp, id: int, first_name: string, last_name: string, email: string, gender: string, ip_address: string, cc: string, country: string, birthdate: string, salary: double, title: string, comments: string]

### Validating Data

In [19]:
# Print the column tree (name, type, nullability) to check the types that
# were assigned when the CSV was loaded.
students.printSchema()

root
 |-- gender: string (nullable = true)
 |-- race/ethnicity: string (nullable = true)
 |-- parental level of education: string (nullable = true)
 |-- lunch: string (nullable = true)
 |-- test preparation course: string (nullable = true)
 |-- math score: integer (nullable = true)
 |-- reading score: integer (nullable = true)
 |-- writing score: integer (nullable = true)



In [20]:
# Column names as a plain Python list; note that several contain spaces or a '/'.
students.columns

['gender',
 'race/ethnicity',
 'parental level of education',
 'lunch',
 'test preparation course',
 'math score',
 'reading score',
 'writing score']

In [21]:
# describe() only builds a new DataFrame of summary statistics; as a bare
# expression the cell echoes that DataFrame's column listing, not the numbers.
# Append .show() to print the statistics themselves.
students.describe()

DataFrame[summary: string, gender: string, race/ethnicity: string, parental level of education: string, lunch: string, test preparation course: string, math score: string, reading score: string, writing score: string]

In [22]:
# Look up one field in the schema by column name and return its Spark data type.
students.schema['math score'].dataType

IntegerType

In [23]:
# Range check on two score columns: request only the count, minimum and maximum.
(
    students
    .select("math score", "reading score")
    .summary("count", "min", "max")
    .show()
)

+-------+----------+-------------+
|summary|math score|reading score|
+-------+----------+-------------+
|  count|      1000|         1000|
|    min|         0|           17|
|    max|       100|          100|
+-------+----------+-------------+



### How to specify data types

In [24]:
from pyspark.sql.types import *

In [25]:
# One StructField per column: (column name, Spark type, nullable flag).
# Every column is declared nullable; 'timestamp' is read as a date, the rest as strings.
data_schema = [
    StructField('name', StringType(), nullable=True),
    StructField('email', StringType(), nullable=True),
    StructField('city', StringType(), nullable=True),
    StructField('mac', StringType(), nullable=True),
    StructField('timestamp', DateType(), nullable=True),
    StructField('creditcard', StringType(), nullable=True),
]

In [27]:
# Wrap the list of fields in a StructType so it can be handed to a reader as a schema.
final_struc = StructType(data_schema)

In [29]:
# Read the JSON file with the explicit schema instead of letting Spark infer one.
people = (
    spark.read
    .schema(final_struc)
    .json(path + 'people.json')
)

In [30]:
# Preview the first four rows read with the explicit schema.
# NOTE(review): the first row displayed is entirely null - presumably a line in
# people.json that does not parse as a record under this schema; confirm
# against the file.
people.show(n=4)

+--------------------+--------------------+---------------+-----------------+----------+-------------------+
|                name|               email|           city|              mac| timestamp|         creditcard|
+--------------------+--------------------+---------------+-----------------+----------+-------------------+
|                null|                null|           null|             null|      null|               null|
|        Keeley Bosco|katlyn@jenkinsmag...|Lake Gladysberg|08:fd:0b:cd:77:f7|2015-04-25|1228-1221-1221-1431|
|         Rubye Jerde|juvenal@johnston....|           null|90:4d:fa:42:63:a2|2015-04-25|1228-1221-1221-1431|
|Miss Darian Breit...|                null|           null|f9:0e:d3:40:cb:e9|2015-04-25|               null|
+--------------------+--------------------+---------------+-----------------+----------+-------------------+
only showing top 4 rows

