In [1]:
# Echo the `sc` handle as the cell result.
# NOTE(review): presumably the SparkContext pre-created by the PySpark kernel/shell — confirm.
sc

In [2]:
# Echo the `spark` object whose `.read` interface is used by the loading cells below.
# NOTE(review): presumably the SparkSession pre-created by the PySpark kernel/shell — confirm.
spark

In [3]:
# Load People.json without an explicit schema (Spark infers the columns),
# then return the first Row as the cell result.
people_df = spark.read.format("json").load("file:///home/hadoop/Downloads/People.json")
people_df.head()

Row(city='Mulyosari', country='Indonesia', first_name='Valma', gender='Female', id=1, last_name='Sans', salary=983107)

In [4]:
# Print the leading rows of the inferred-schema DataFrame as a text table.
people_df.show()

+------------------+------------+----------+------+---+---------+------+
|              city|     country|first_name|gender| id|last_name|salary|
+------------------+------------+----------+------+---+---------+------+
|         Mulyosari|   Indonesia|     Valma|Female|  1|     Sans|983107|
|           Niihama|       Japan|     Paolo|  Male|  2|   Kiddie|649173|
|         Dū Qal‘ah| Afghanistan|    Miltie|  Male|  3| De Zuani|352898|
|            Iberia|        Peru|    Jarrid|  Male|  4| Dalziell|170398|
|          La Ronge|      Canada| Reinaldos|  Male|  5|   Keeffe|440989|
|      Kuala Lumpur|    Malaysia|        Eb|  Male|  6|Schwanden|274126|
|         Al Qurayn|Saudi Arabia|    Alleyn|  Male|  7|   Paddon|681914|
|           Jixiang|       China|   Baryram|  Male|  8|     Yell|250748|
|Thị Trấn Phong Thổ|     Vietnam|     Cammy|Female|  9|     Axel|221750|
|        Kotatengah|   Indonesia|       Erl|  Male| 10|  Caldera|680801|
|        Roldanillo|    Colombia|    Miguel|  Male|

In [5]:
# Load bank_edited.json with the multiLine reader option switched on
# (lets a single JSON record span several lines), then return the first Row.
bank_data = spark.read.option("multiLine", True).json("file:///home/hadoop/Downloads/bank_edited.json")
bank_data.head()

Row(age=58, balance=2143, campaign=1, contact='unknown', day=5, default='no', duration=261, education='tertiary', housing='yes', job='management', loan='no', marital='married', month='may', pdays=-1, poutcome='unknown', previous=0, y='no')

In [7]:
# Print the leading rows of the bank data as a text table.
bank_data.show()

+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+
|age|balance|campaign|contact|day|default|duration|education|housing|         job|loan| marital|month|pdays|poutcome|previous|  y|
+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+
| 58|   2143|       1|unknown|  5|     no|     261| tertiary|    yes|  management|  no| married|  may|   -1| unknown|       0| no|
| 44|     29|       1|unknown|  5|     no|     151|secondary|    yes|  technician|  no|  single|  may|   -1| unknown|       0| no|
| 33|      2|       1|unknown|  5|     no|      76|secondary|    yes|entrepreneur| yes| married|  may|   -1| unknown|       0| no|
| 47|   1506|       1|unknown|  5|     no|      92|  unknown|    yes| blue-collar|  no| married|  may|   -1| unknown|       0| no|
| 33|      1|       1|unknown|  5|     no|     198|  unknown|     no|     unknown| 

In [8]:
# Print the inferred schema as a tree: one line per column with its type and nullability.
bank_data.printSchema()

root
 |-- age: long (nullable = true)
 |-- balance: long (nullable = true)
 |-- campaign: long (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: long (nullable = true)
 |-- default: string (nullable = true)
 |-- duration: long (nullable = true)
 |-- education: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- job: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- month: string (nullable = true)
 |-- pdays: long (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- previous: long (nullable = true)
 |-- y: string (nullable = true)



#### 1. Define a schema that lists the fields in the desired order with the desired data types.

In [11]:
# NOTE(review): the two wildcard imports pull every public name of
# pyspark.sql.functions and pyspark.sql.types into the notebook namespace.
# - LongType (used when the People schema is built) is only in scope through the
#   types wildcard; the explicit import list does not name it.
# - pyspark.sql.functions exports names such as sum/min/max/round that shadow the
#   Python builtins after this cell runs — confirm no cell relies on the builtins.
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType, StructType, StructField, StringType, FloatType
from pyspark.sql.types import *

In [24]:
# Explicit schema for People.json. Fields are appended in the column order the
# DataFrame should expose; every field is nullable.
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("first_name", StringType(), True)
    .add("last_name", StringType(), True)
    .add("gender", StringType(), True)
    .add("salary", LongType(), True)
    .add("city", StringType(), True)
    .add("country", StringType(), True)
)

In [25]:
# Re-read People.json, this time applying the explicit `schema` instead of inferring one.
people_df = spark.read.json("file:///home/hadoop/Downloads/People.json", schema=schema)

In [26]:
# Print the leading rows again; columns now follow the order declared in `schema`.
people_df.show()

+---+----------+---------+------+------+------------------+------------+
| id|first_name|last_name|gender|salary|              city|     country|
+---+----------+---------+------+------+------------------+------------+
|  1|     Valma|     Sans|Female|983107|         Mulyosari|   Indonesia|
|  2|     Paolo|   Kiddie|  Male|649173|           Niihama|       Japan|
|  3|    Miltie| De Zuani|  Male|352898|         Dū Qal‘ah| Afghanistan|
|  4|    Jarrid| Dalziell|  Male|170398|            Iberia|        Peru|
|  5| Reinaldos|   Keeffe|  Male|440989|          La Ronge|      Canada|
|  6|        Eb|Schwanden|  Male|274126|      Kuala Lumpur|    Malaysia|
|  7|    Alleyn|   Paddon|  Male|681914|         Al Qurayn|Saudi Arabia|
|  8|   Baryram|     Yell|  Male|250748|           Jixiang|       China|
|  9|     Cammy|     Axel|Female|221750|Thị Trấn Phong Thổ|     Vietnam|
| 10|       Erl|  Caldera|  Male|680801|        Kotatengah|   Indonesia|
| 11|    Miguel|   Moules|  Male|819771|        Rol

In [27]:
# Print the schema tree to confirm the declared column order and types were applied.
people_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)



#### 2. withColumn()

In [28]:
# withColumn() returns a DataFrame with an extra column, "new_salary",
# computed as the current salary plus 10% of it.
people_df = people_df.withColumn(
    'new_salary',
    people_df.salary + people_df.salary * 0.1,
)

In [29]:
# Print the leading rows, now including the derived new_salary column.
people_df.show()

+---+----------+---------+------+------+------------------+------------+----------+
| id|first_name|last_name|gender|salary|              city|     country|new_salary|
+---+----------+---------+------+------+------------------+------------+----------+
|  1|     Valma|     Sans|Female|983107|         Mulyosari|   Indonesia| 1081417.7|
|  2|     Paolo|   Kiddie|  Male|649173|           Niihama|       Japan|  714090.3|
|  3|    Miltie| De Zuani|  Male|352898|         Dū Qal‘ah| Afghanistan|  388187.8|
|  4|    Jarrid| Dalziell|  Male|170398|            Iberia|        Peru|  187437.8|
|  5| Reinaldos|   Keeffe|  Male|440989|          La Ronge|      Canada|  485087.9|
|  6|        Eb|Schwanden|  Male|274126|      Kuala Lumpur|    Malaysia|  301538.6|
|  7|    Alleyn|   Paddon|  Male|681914|         Al Qurayn|Saudi Arabia|  750105.4|
|  8|   Baryram|     Yell|  Male|250748|           Jixiang|       China|  275822.8|
|  9|     Cammy|     Axel|Female|221750|Thị Trấn Phong Thổ|     Vietnam|  24

In [30]:
# limit(5) keeps only the first 5 rows of the DataFrame; show() then prints them.
people_df.limit(5).show()

+---+----------+---------+------+------+---------+-----------+----------+
| id|first_name|last_name|gender|salary|     city|    country|new_salary|
+---+----------+---------+------+------+---------+-----------+----------+
|  1|     Valma|     Sans|Female|983107|Mulyosari|  Indonesia| 1081417.7|
|  2|     Paolo|   Kiddie|  Male|649173|  Niihama|      Japan|  714090.3|
|  3|    Miltie| De Zuani|  Male|352898|Dū Qal‘ah|Afghanistan|  388187.8|
|  4|    Jarrid| Dalziell|  Male|170398|   Iberia|       Peru|  187437.8|
|  5| Reinaldos|   Keeffe|  Male|440989| La Ronge|     Canada|  485087.9|
+---+----------+---------+------+------+---------+-----------+----------+



#### 3. Random sampling of data with the sample() method

In [37]:
# Draw a random sample of roughly 10% of the rows without replacement;
# the fixed seed keeps the draw repeatable across runs.
(
    people_df
    .sample(withReplacement=False, fraction=0.1, seed=123)
    .show()
)

+---+----------+---------+------+------+---------------+--------------+----------+
| id|first_name|last_name|gender|salary|           city|       country|new_salary|
+---+----------+---------+------+------+---------------+--------------+----------+
| 14|     Diana|  Lawfull|Female|397683|        Ciangir|     Indonesia|  437451.3|
| 35|  Germayne|  Tremeer|  Male|920086|     eSikhawini|  South Africa| 1012094.6|
| 47|   Olivier|   Shewon|  Male|244655|         Kolaka|     Indonesia|  269120.5|
| 61|      Farr|  Dymocke|  Male|616185|      Lenningen|    Luxembourg|  677803.5|
| 69|Ermentrude|  Teffrey|Female| 96039|          Tiron|     Indonesia|  105642.9|
| 87|  Maryanna|  Petchey|Female|853991|       Subotica|        Serbia|  939390.1|
| 89|     Jayme| Brunotti|Female|881570|        Marston|United Kingdom|  969727.0|
|101|    Fredia| Reinhard|Female|644592|Belén de Umbría|      Colombia|  709051.2|
|108|     Meier|  Lavalle|  Male|635366|        Midrand|  South Africa|  698902.6|
|138

#### orderBy()
* Arranges the rows in ascending or descending order of one or more columns.

In [38]:
# Sort by salary, breaking ties on first_name, both in ascending order.
people_df.orderBy('salary', 'first_name', ascending=True).show()

+---+----------+----------+------+------+------------+--------------------+----------+
| id|first_name| last_name|gender|salary|        city|             country|new_salary|
+---+----------+----------+------+------+------------+--------------------+----------+
| 93|      Cory|     Prigg|  Male| 12876|     Gondang|           Indonesia|   14163.6|
|590|      Flem|  Tumielli|  Male| 13347| Debre Zeyit|            Ethiopia|   14681.7|
|192|       Odo|   Conyers|  Male| 15555|  Raffingora|            Zimbabwe|   17110.5|
|407|  Barbabas| Ballingal|  Male| 18598|Beringinjaya|           Indonesia|   20457.8|
|297|     Daron|    Melato|Female| 19881|      Phayao|            Thailand|   21869.1|
| 24|   Avigdor|   Goddman|  Male| 20216|       Gujun|               China|   22237.6|
|315|    Alayne|    Foskin|Female| 20390|     Siluman|           Indonesia|   22429.0|
|199|     Niles| Atcherley|  Male| 22529|Nova Venécia|              Brazil|   24781.9|
|294|     Terri|    Holton|Female| 23934|  

In [40]:
# Sort by first_name, breaking ties on salary, both in descending order.
(
    people_df
    .orderBy('first_name', 'salary', ascending=False)
    .show()
)

+---+----------+-------------+------+------+--------------------+-----------+----------+
| id|first_name|    last_name|gender|salary|                city|    country|new_salary|
+---+----------+-------------+------+------+--------------------+-----------+----------+
| 99|    Zondra|      Gisburn|Female|698909|       Крива Паланка|  Macedonia|  768799.9|
|572|      Zerk|        Bohin|  Male|853249|           Purwodadi|  Indonesia|  938573.9|
|874| Zackariah|     Trebbett|  Male|643036|          Las Palmas|     Mexico|  707339.6|
|620|  Zacharie|Grzegorzewski|  Male|934644|Santa Cruz de Guacas|  Venezuela| 1028108.4|
|879|  Zaccaria|       Leagas|  Male|695307|             Xam Nua|       Laos|  764837.7|
|448|      Yuri|      Duggary|  Male|414107|       Sang-e Māshah|Afghanistan|  455517.7|
|310|      Yuma|      Brogden|  Male|363880|              Ottawa|     Canada|  400268.0|
|911|  Yovonnda|        Adran|Female|482705|        Buturlinovka|     Russia|  530975.5|
|202|      Yard|     