In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all modules before each cell runs,
# so edits to local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from modules.pyspark import CPySpark
from pyspark.sql import Row, SQLContext

In [3]:
# Instantiate the project's CPySpark wrapper.
# NOTE(review): session/sql presumably enable a SparkSession and SQL support — confirm in modules/pyspark.
spark = CPySpark(session=True, sql=True)

In [4]:
# Display the wrapper's context attribute (presumably the SparkContext — confirm in CPySpark).
spark.context

In [6]:
# Sample (name, marks) records, handed to the CPySpark helper to build an RDD.
lst = [('John', 10), ('Lyna', 9), ('Samathan', 8), ('Tony', 10)]
rdd = spark.rdd(data=lst)

In [7]:
# Turn each (name, marks) tuple into a Row with named fields; marks is coerced to int.
people = rdd.map(lambda record: Row(name=record[0], marks=int(record[1])))

In [8]:
# Build a DataFrame from the RDD of Rows through the CPySpark helper; the returned
# object is the project wrapper (it exposes schema() and getHead(), used below).
schema_people = spark.dataframe(people)

In [9]:
# Show the column names and types of the new DataFrame.
schema_people.schema()

DataFrame[name: string, marks: bigint]

In [10]:
# Preview the first 3 rows.
schema_people.getHead(3)

+--------+-----+
|    name|marks|
+--------+-----+
|    John|   10|
|    Lyna|    9|
|Samathan|    8|
+--------+-----+
only showing top 3 rows



In [11]:
# HDFS location of the people CSV file.
file_name = "hdfs://bigdata.laptrinhpython.net:19000/people.csv"

In [12]:
# Load the CSV through the CPySpark reader.
# Note: this rebinds `people`, which previously held the RDD of Rows.
people = spark.read(file_name)

In [13]:
# Preview the first rows (5 in the recorded output).
people.getHead()

+---+---------+--------------+------+-------------+
|_c0|person_id|          name|   sex|date of birth|
+---+---------+--------------+------+-------------+
|  0|      100|Penelope Lewis|female|   1990-08-31|
|  1|      101| David Anthony|  male|   1971-10-14|
|  2|      102|     Ida Shipp|female|   1962-05-24|
|  3|      103|  Joanna Moore|female|   2017-03-10|
|  4|      104|Lisandra Ortiz|female|   2020-08-05|
+---+---------+--------------+------+-------------+
only showing top 5 rows



In [18]:
# Check the type of the wrapper's .dataframe attribute (a native Spark DataFrame in the recorded run).
type(people.dataframe)

pyspark.sql.dataframe.DataFrame

In [19]:
# HDFS location of the JSON data file (rebinds file_name).
file_name = "hdfs://bigdata.laptrinhpython.net:19000/data.json"

In [20]:
# Load the file, passing 'json' as the second argument
# (presumably the input format — confirm against CPySpark.read).
data = spark.read(file_name, 'json')

In [None]:
# Preview the first rows of the JSON data.
data.getHead()

In [5]:
# Load the obesity dataset from a path relative to the notebook's working directory.
df = spark.read(r'./data/Obesity_data.csv')

In [7]:
# Print the schema tree (column name, type, nullability).
df.schema()

root
 |-- id: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- height: integer (nullable = true)
 |-- weight: integer (nullable = true)
 |-- bmi: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- bmc: integer (nullable = true)
 |-- bmd: double (nullable = true)
 |-- fat: integer (nullable = true)
 |-- lean: integer (nullable = true)
 |-- pcfat: double (nullable = true)



In [8]:
# Display the underlying Spark DataFrame held by the wrapper.
df.dataframe

DataFrame[id: int, gender: string, height: int, weight: int, bmi: double, age: int, bmc: int, bmd: double, fat: int, lean: int, pcfat: double]

In [9]:
# Preview the first rows (5 in the recorded output).
df.getHead()

+---+------+------+------+----+---+----+----+-----+-----+-----+
| id|gender|height|weight| bmi|age| bmc| bmd|  fat| lean|pcfat|
+---+------+------+------+----+---+----+----+-----+-----+-----+
|  1|     F|   150|    49|21.8| 53|1312|0.88|17802|28600| 37.3|
|  2|     M|   165|    52|19.1| 65|1309|0.84| 8381|40229| 16.8|
|  3|     F|   157|    57|23.1| 64|1230|0.84|19221|36057| 34.0|
|  4|     F|   156|    53|21.8| 56|1171| 0.8|17472|33094| 33.8|
|  5|     M|   160|    51|19.9| 54|1681|0.98| 7336|40621| 14.8|
+---+------+------+------+----+---+----+----+-----+-----+-----+
only showing top 5 rows



In [10]:
# Row count via the wrapper's len() support.
len(df)

1217

In [11]:
# Summary statistics (count, mean, ...) for every column.
df.dataframe.describe().show()

+-------+-----------------+------+------------------+-----------------+------------------+-----------------+------------------+------------------+------------------+-----------------+------------------+
|summary|               id|gender|            height|           weight|               bmi|              age|               bmc|               bmd|               fat|             lean|             pcfat|
+-------+-----------------+------+------------------+-----------------+------------------+-----------------+------------------+------------------+------------------+-----------------+------------------+
|  count|             1217|  1217|              1217|             1217|              1217|             1217|              1217|              1217|              1217|             1217|              1217|
|   mean| 614.518488085456|  null| 156.7239112571898|55.14379622021364| 22.39539852095314|47.15201314708299|1724.9145439605588|1.0087428101889888|17288.436318816763| 35463.1133935908|31.60

In [12]:
# Summary statistics restricted to the height and weight columns.
df.dataframe.describe('height', 'weight').show()

+-------+------------------+-----------------+
|summary|            height|           weight|
+-------+------------------+-----------------+
|  count|              1217|             1217|
|   mean| 156.7239112571898|55.14379622021364|
| stddev|7.9777256820417035|9.404988688010084|
|    min|               136|               34|
|    max|               185|               95|
+-------+------------------+-----------------+



In [13]:
# Convert the Spark DataFrame to pandas for display.
# Note: toPandas() collects every row to the driver, so it is only suitable for small data.
df.dataframe.toPandas()

Unnamed: 0,id,gender,height,weight,bmi,age,bmc,bmd,fat,lean,pcfat
0,1,F,150,49,21.8,53,1312,0.88,17802,28600,37.3
1,2,M,165,52,19.1,65,1309,0.84,8381,40229,16.8
2,3,F,157,57,23.1,64,1230,0.84,19221,36057,34.0
3,4,F,156,53,21.8,56,1171,0.80,17472,33094,33.8
4,5,M,160,51,19.9,54,1681,0.98,7336,40621,14.8
...,...,...,...,...,...,...,...,...,...,...,...
1212,1223,F,150,44,19.6,44,1474,0.95,12906,28534,30.1
1213,1224,F,148,51,23.3,58,1522,0.97,14938,33931,29.6
1214,1225,F,149,50,22.5,57,1409,0.93,16777,30598,34.4
1215,1226,F,144,49,23.6,67,1266,0.90,20094,27272,41.3


In [14]:
# Confirm the converted object is a pandas DataFrame.
type(df.dataframe.toPandas())

pandas.core.frame.DataFrame

In [15]:
# Keep only the identifier, gender and body-measurement columns for the examples below.
df_sub = df.dataframe.select(['id', 'gender', 'height', 'weight'])

In [16]:
# Preview the first 3 rows of the column subset.
df_sub.show(3)

+---+------+------+------+
| id|gender|height|weight|
+---+------+------+------+
|  1|     F|   150|    49|
|  2|     M|   165|    52|
|  3|     F|   157|    57|
+---+------+------+------+
only showing top 3 rows



In [17]:
# Contingency table of counts: one row per height value, one column per gender.
df_sub.crosstab('height', 'gender').show(10)

+-------------+---+---+
|height_gender|  F|  M|
+-------------+---+---+
|          138|  2|  0|
|          170|  1| 24|
|          142| 11|  0|
|          153| 73|  1|
|          174|  0|  2|
|          185|  0|  1|
|          157| 31|  5|
|          152| 54|  2|
|          164| 11| 17|
|          179|  0|  1|
+-------------+---+---+
only showing top 10 rows



In [18]:
# Same counts transposed: one row per gender, one column per height value.
df_sub.crosstab('gender', 'height').show(10)

+-------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|gender_height|136|137|138|139|140|141|142|143|144|145|146|147|148|149|150|151|152|153|154|155|156|157|158|159|160|161|162|163|164|165|166|167|168|169|170|171|172|173|174|175|176|177|178|179|180|182|183|185|
+-------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|            M|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  1|  0|  3|  0|  6|  1|  2|  1|  3| 11|  6|  5| 14|  8| 33|  9| 23| 17| 17| 37| 13| 17| 30| 11| 24|  7| 12|  8|  2| 11|  4|  2|  8|  1|  4|  2|  1|  1|
|            F|  1|  1|  2|  2|  7|  6| 11| 11| 13| 24| 20| 18| 35| 33| 95| 34| 54| 73| 45|104| 45| 31| 53| 29| 45|  8| 19| 13| 11| 10|  0|  3|  4|  1|  1|  0|  0|  0| 

In [19]:
# Mean weight per gender.
(
    df_sub
    .groupby('gender')
    .agg({'weight': 'mean'})
    .show()
)

+------+-----------------+
|gender|      avg(weight)|
+------+-----------------+
|     F|52.31090487238979|
|     M|62.02253521126761|
+------+-----------------+



In [22]:
# Minimum weight and height per gender, collected into pandas for display.
(
    df_sub
    .groupby('gender')
    .agg({'weight': 'min', 'height': 'min'})
    .toPandas()
)

Unnamed: 0,gender,min(weight),min(height)
0,F,34,136
1,M,38,146


In [24]:
# Distinct gender values, collected into a pandas DataFrame for display.
df_sub.select('gender').distinct().toPandas()

Unnamed: 0,gender
0,F
1,M


In [26]:
# Three oldest records: sort by age, highest first.
df().orderBy(df()['age'].desc()).show(3)

+---+------+------+------+----+---+----+----+-----+-----+-----+
| id|gender|height|weight| bmi|age| bmc| bmd|  fat| lean|pcfat|
+---+------+------+------+----+---+----+----+-----+-----+-----+
|963|     M|   158|    61|24.4| 88|1670|1.02|18038|39526| 30.5|
|712|     M|   161|    47|18.1| 87|1678|1.01|12328|32725| 26.4|
|588|     F|   142|    46|22.8| 85| 875|0.68|13946|31026| 30.4|
+---+------+------+------+----+---+----+----+-----+-----+-----+
only showing top 3 rows



In [28]:
# Three youngest records: sort by age, lowest first.
df().orderBy(df()['age'].asc()).show(3)

+----+------+------+------+----+---+----+----+-----+-----+-----+
|  id|gender|height|weight| bmi|age| bmc| bmd|  fat| lean|pcfat|
+----+------+------+------+----+---+----+----+-----+-----+-----+
| 514|     M|   167|    67|24.0| 13|1440|0.78|29264|44366| 39.0|
| 270|     F|   155|    42|17.5| 14|1615|1.04|11493|28607| 27.6|
|1156|     F|   160|    56|21.9| 14|1810|1.05|20941|34178| 36.8|
+----+------+------+------+----+---+----+----+-----+-----+-----+
only showing top 3 rows



In [29]:
# Recompute BMI as weight / height^2, dividing height by 100 first
# (height appears to be stored in centimetres — the results match the dataset's bmi column).
df_sub.withColumn('bmi', df_sub['weight'] / (df_sub['height'] / 100) ** 2).show(5)

+---+------+------+------+------------------+
| id|gender|height|weight|               bmi|
+---+------+------+------+------------------+
|  1|     F|   150|    49| 21.77777777777778|
|  2|     M|   165|    52|19.100091827364558|
|  3|     F|   157|    57|23.124670372023203|
|  4|     F|   156|    53|  21.7784352399737|
|  5|     M|   160|    51|19.921874999999996|
+---+------+------+------+------------------+
only showing top 5 rows



In [30]:
# Preview the subset with the gender column renamed to sex; the result is not assigned,
# so df_sub itself keeps its original column name.
df_sub.withColumnRenamed('gender', 'sex').show(3)

+---+---+------+------+
| id|sex|height|weight|
+---+---+------+------+
|  1|  F|   150|    49|
|  2|  M|   165|    52|
|  3|  F|   157|    57|
+---+---+------+------+
only showing top 3 rows



In [31]:
# List the column names of the obesity DataFrame.
df().columns

['id',
 'gender',
 'height',
 'weight',
 'bmi',
 'age',
 'bmc',
 'bmd',
 'fat',
 'lean',
 'pcfat']

In [32]:
# Column names that remain after dropping bmi and lean.
df().drop('bmi', 'lean').columns

['id', 'gender', 'height', 'weight', 'age', 'bmc', 'bmd', 'fat', 'pcfat']

In [34]:
# Total number of rows.
df().count()

1217

In [35]:
# Row count after removing duplicate rows (same as the total in the recorded run,
# i.e. the dataset had no duplicates).
df().dropDuplicates().count()

1217

In [36]:
df().dropna().count() # drop rows containing missing values

1217

In [37]:
# Load the local people CSV and immediately call the wrapper to obtain the object used
# below (filter/where/withColumn are invoked on it). Note: this rebinds `people`.
people = spark.read(r'./data/people.csv')()

In [38]:
# Names that start with "I" and end with "p" (SQL LIKE pattern), then count the matches.
people_sub = people.filter(people['name'].like('I%p'))
people_sub.count()

7

In [39]:
# Display the matching rows.
people_sub.show()

+-----+---------+-------------+------+-------------+
|  _c0|person_id|         name|   sex|date of birth|
+-----+---------+-------------+------+-------------+
|    2|      102|    Ida Shipp|female|   1962-05-24|
| 5791|     5891| Isaias Stepp|  male|   1974-04-13|
| 9965|    10065| Isabel Sharp|female|   1967-11-28|
|58576|    58676|Isadora Tripp|female|   1971-06-05|
|68125|    68225|   Irene Kulp|female|   1986-07-06|
|71572|    71672|    Ivan Ropp|  male|   1975-07-08|
|92061|    92161|   Inez Estep|female|   1925-09-03|
+-----+---------+-------------+------+-------------+



In [40]:
# Names containing the substring "na" anywhere (SQL LIKE pattern), then count the matches.
people_na = people.filter(people['name'].like('%na%'))
people_na.count()

6075

In [41]:
# Display the first matching rows.
people_na.show()

+---+---------+-----------------+------+-------------+
|_c0|person_id|             name|   sex|date of birth|
+---+---------+-----------------+------+-------------+
|  3|      103|     Joanna Moore|female|   2017-03-10|
|  8|      108| Leonard Cavender|  male|   1958-08-08|
| 11|      111|Annabelle Rosseau|female|   1989-07-13|
| 14|      114|Alejandro Brennan|  male|   1980-12-22|
| 18|      118|     Tina Gaskins|female|   1966-12-05|
| 24|      124|  Charles Leonard|  male|   1972-03-09|
| 27|      127|       Devona Kay|female|   2009-12-30|
| 69|      169| Leonarda Johnson|female|   2006-06-01|
| 73|      173|      Regina Lamb|female|   1977-07-22|
|110|      210|   Deborah Gerena|female|   2001-11-18|
|126|      226|    Gracie Hannah|female|   1948-08-07|
|130|      230|  Sabrina Metzger|female|   2019-12-21|
|156|      256|Jennifer Mcanally|female|   1981-08-13|
|170|      270|     John Barnard|  male|   2015-07-31|
|197|      297|   Ronald Elliott|  male|   1992-01-14|
|210|     

In [42]:
# Rows whose name contains "Moore", then count them.
people_Moore = people.where(people['name'].contains('Moore'))
people_Moore.count()

352

In [43]:
# Complement of the previous filter: rows whose name does NOT contain "Moore", then count them.
people_not_Moore = people.where(~people['name'].contains('Moore'))
people_not_Moore.count()

99648

In [44]:
# Display the first non-matching rows.
people_not_Moore.show()

+---+---------+-----------------+------+-------------+
|_c0|person_id|             name|   sex|date of birth|
+---+---------+-----------------+------+-------------+
|  0|      100|   Penelope Lewis|female|   1990-08-31|
|  1|      101|    David Anthony|  male|   1971-10-14|
|  2|      102|        Ida Shipp|female|   1962-05-24|
|  4|      104|   Lisandra Ortiz|female|   2020-08-05|
|  5|      105|    David Simmons|  male|   1999-12-30|
|  6|      106|    Edward Hudson|  male|   1983-05-09|
|  7|      107|     Albert Jones|  male|   1990-09-13|
|  8|      108| Leonard Cavender|  male|   1958-08-08|
|  9|      109|   Everett Vadala|  male|   2005-05-24|
| 10|      110| Freddie Claridge|  male|   2002-05-07|
| 11|      111|Annabelle Rosseau|female|   1989-07-13|
| 12|      112|    Eulah Emanuel|female|   1976-01-19|
| 13|      113|       Shaun Love|  male|   1970-05-26|
| 14|      114|Alejandro Brennan|  male|   1980-12-22|
| 15|      115|Robert Mcreynolds|  male|   1973-12-27|
| 16|     

In [45]:
# People born in 1999: date of birth within [1999-01-01, 1999-12-31], both ends inclusive.
people_1999 = people.filter(people['date of birth'].between('1999-01-01', '1999-12-31'))

In [46]:
# Number of people born in 1999.
people_1999.count()

1366

In [47]:
# Same 1999 selection expressed with where(), which is an alias of filter().
people_1999 = people.where(people['date of birth'].between('1999-01-01', '1999-12-31'))

In [48]:
# Same count as the filter() version above, confirming both spellings agree.
people_1999.count()

1366

In [56]:
# Bring the Spark SQL column functions (col, upper, substring, ...) and type classes
# (IntegerType, ...) into the notebook namespace.
# NOTE(review): the functions wildcard also exports names such as sum/min/max/abs/round,
# which shadow the Python builtins for every later cell — consider explicit imports.
from pyspark.sql.functions import *
from pyspark.sql.types import *

String transformation

In [50]:
# Add an upper-cased copy of the name column and preview the first 3 rows.
(
    people
    .withColumn('upper_name', upper(col('name')))
    .show(3)
)

+---+---------+--------------+------+-------------+--------------+
|_c0|person_id|          name|   sex|date of birth|    upper_name|
+---+---------+--------------+------+-------------+--------------+
|  0|      100|Penelope Lewis|female|   1990-08-31|PENELOPE LEWIS|
|  1|      101| David Anthony|  male|   1971-10-14| DAVID ANTHONY|
|  2|      102|     Ida Shipp|female|   1962-05-24|     IDA SHIPP|
+---+---------+--------------+------+-------------+--------------+
only showing top 3 rows



Casting

In [70]:
# Derive an integer birth year from the first four characters of 'date of birth'.
# Spark's substring() uses 1-based positions, so the slice starts at pos=1 (not 0).
# withColumn replaces an existing 'year' column, so re-running this cell is harmless.
people = people.withColumn('year', substring('date of birth', pos=1, len=4).cast(IntegerType()))

In [71]:
# Preview the first 3 rows, now including the derived year column.
people.show(3)

+---+---------+--------------+------+-------------+----+
|_c0|person_id|          name|   sex|date of birth|year|
+---+---------+--------------+------+-------------+----+
|  0|      100|Penelope Lewis|female|   1990-08-31|1990|
|  1|      101| David Anthony|  male|   1971-10-14|1971|
|  2|      102|     Ida Shipp|female|   1962-05-24|1962|
+---+---------+--------------+------+-------------+----+
only showing top 3 rows



In [73]:
# Everyone born in 1989 or later (despite the name, this is not restricted to 1989 only).
people_1989 = people.where(col('year') >= 1989)

In [74]:
# Number of people with year >= 1989.
people_1989.count()

39593

In [75]:
# Preview the first 5 matching rows.
people_1989.show(5)

+---+---------+--------------+------+-------------+----+
|_c0|person_id|          name|   sex|date of birth|year|
+---+---------+--------------+------+-------------+----+
|  0|      100|Penelope Lewis|female|   1990-08-31|1990|
|  3|      103|  Joanna Moore|female|   2017-03-10|2017|
|  4|      104|Lisandra Ortiz|female|   2020-08-05|2020|
|  5|      105| David Simmons|  male|   1999-12-30|1999|
|  7|      107|  Albert Jones|  male|   1990-09-13|1990|
+---+---------+--------------+------+-------------+----+
only showing top 5 rows



In [78]:
# Unwrap the CPySpark wrapper into its Spark DataFrame and keep it under the same name.
# The callable() guard makes the cell safe to re-run: the wrapper is callable, whereas
# a Spark DataFrame is not, so an already-unwrapped df is left untouched.
# Note: after this cell, earlier cells that use df.dataframe / df.getHead() / df() no
# longer work without reloading the data.
df = df() if callable(df) else df

In [79]:
# Display the first rows of the DataFrame (show() prints up to 20 rows by default).
df.show()

+---+------+------+------+----+---+----+----+-----+-----+-----+
| id|gender|height|weight| bmi|age| bmc| bmd|  fat| lean|pcfat|
+---+------+------+------+----+---+----+----+-----+-----+-----+
|  1|     F|   150|    49|21.8| 53|1312|0.88|17802|28600| 37.3|
|  2|     M|   165|    52|19.1| 65|1309|0.84| 8381|40229| 16.8|
|  3|     F|   157|    57|23.1| 64|1230|0.84|19221|36057| 34.0|
|  4|     F|   156|    53|21.8| 56|1171| 0.8|17472|33094| 33.8|
|  5|     M|   160|    51|19.9| 54|1681|0.98| 7336|40621| 14.8|
|  6|     F|   153|    47|20.1| 52|1358|0.91|14904|30068| 32.2|
|  7|     F|   155|    58|24.1| 66|1546|0.96|20233|35599| 35.3|
|  8|     M|   167|    65|23.3| 50|2276|1.11|17749|43301| 28.0|
|  9|     M|   165|    54|19.8| 61|1778|0.96|10795|38613| 21.1|
| 10|     F|   158|    60|24.0| 58|1404|0.86|21365|35534| 36.6|
| 11|     F|   155|    48|20.0| 36|1889|1.06|13458|32261| 28.3|
| 12|     M|   165|    65|23.9| 50|1878|0.97|18100|43391| 28.6|
| 13|     F|   155|    40|16.6| 78|1001|