In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from modules.pyspark import CPySpark, CRDD, CSparkFrame
from pyspark.sql import Row

In [3]:
# Create the project's Spark wrapper. The two flags presumably switch on a
# SparkSession and SQL support -- TODO confirm against modules.pyspark.
spark = CPySpark(session=True, sql=True)
# Last expression of the cell: evaluates the wrapper's `context` attribute.
spark.context

Tạo ra dataframe từ rdd

In [4]:
# Sample (name, mark) records used to build the demo DataFrame.
lst = [('John', 10), ('Lyna', 9), ('Samathan', 8), ('Tony', 10)]

# Wrap the list in an RDD through the project helper, then convert every
# tuple into a Row with named fields `name` and `mark`.
rdd = spark.rdd(data=lst)
people: CRDD = rdd.map(
    lambda record: Row(name=record[0], mark=int(record[1]))
)

# Build the DataFrame from the RDD of Rows.
people_df: CSparkFrame = spark.dataframe(people)

In [5]:
# Print up to 5 rows as a text table (the frame only holds 4 rows).
people_df.getHead(5)

+--------+----+
|    name|mark|
+--------+----+
|    John|  10|
|    Lyna|   9|
|Samathan|   8|
|    Tony|  10|
+--------+----+



Xem schema _(các feature của dataframe và kiểu dữ liệu của chúng)_ của `people_df`

In [6]:
# Option 1: print the schema as a tree (column name, type, nullability).
people_df.schema()

root
 |-- name: string (nullable = true)
 |-- mark: long (nullable = true)



In [7]:
# Option 2: evaluate the `dataframe` attribute; its repr lists each column
# name with its type.
people_df.dataframe

DataFrame[name: string, mark: bigint]

Đọc dữ liệu từ file **CSV**

In [8]:
# Location of the CSV file on HDFS.
file_name = "hdfs://bigdata.laptrinhpython.net:19000/people.csv" # HDFS URL
# No format argument is passed, so the wrapper's default reader is used.
people1 = spark.read(file_name)

In [9]:
# Preview the first 5 rows. The leading `_c0` column has no header in the
# file -- presumably a saved row index; verify before relying on it.
people1.getHead(5)

+---+---------+--------------+------+-------------+
|_c0|person_id|          name|   sex|date of birth|
+---+---------+--------------+------+-------------+
|  0|      100|Penelope Lewis|female|   1990-08-31|
|  1|      101| David Anthony|  male|   1971-10-14|
|  2|      102|     Ida Shipp|female|   1962-05-24|
|  3|      103|  Joanna Moore|female|   2017-03-10|
|  4|      104|Lisandra Ortiz|female|   2020-08-05|
+---+---------+--------------+------+-------------+
only showing top 5 rows



Đọc dữ liệu từ file **JSON**

In [10]:
# Location of the JSON file on HDFS.
file_name1 = "hdfs://bigdata.laptrinhpython.net:19000/data.json"
# The second positional argument ('json') presumably selects the reader
# format -- TODO confirm against CPySpark.read.
data = spark.read(file_name1, 'json')

In [11]:
# Preview the first 5 rows; long nested values are truncated with "..." in
# the printed table.
data.getHead(5)

+----------+--------------------+-------------+--------------------+--------------------+-------------------+
|        id|            location|sampling_rate|              sensor|    sensordatavalues|          timestamp|
+----------+--------------------+-------------+--------------------+--------------------+-------------------+
|5810744647|{112.6, FR, 0, 11...|         null|{22349, 1, {14, N...|[{12340422762, 1....|2019-12-20 03:22:01|
|5810744646|{35.2, DE, 0, 107...|         null|{21149, 7, {9, va...|[{12340422760, 9....|2019-12-20 03:22:01|
|5810744645|{51.0, DE, 0, 293...|         null|{5811, 1, {14, No...|[{12340422757, 6....|2019-12-20 03:22:01|
|5810744644|{34.4, BE, 0, 441...|         null|{8765, 1, {14, No...|[{12340422756, 3....|2019-12-20 03:22:01|
|5810744643|{5.0, FR, 0, 1100...|         null|{21693, 1, {14, N...|[{12340422754, 7....|2019-12-20 03:22:01|
+----------+--------------------+-------------+--------------------+--------------------+-------------------+
only showing top 5 rows

Đọc dữ liệu `./data/Obesity_data.csv`

In [12]:
# Relative path to the obesity dataset CSV.
file_name2 = './data/Obesity_data.csv'
# Load it with the wrapper's default reader (no format argument).
df = spark.read(file_name2)

In [13]:
# Preview the first 5 rows of the obesity data.
df.getHead(5)

+---+------+------+------+----+---+----+----+-----+-----+-----+
| id|gender|height|weight| bmi|age| bmc| bmd|  fat| lean|pcfat|
+---+------+------+------+----+---+----+----+-----+-----+-----+
|  1|     F|   150|    49|21.8| 53|1312|0.88|17802|28600| 37.3|
|  2|     M|   165|    52|19.1| 65|1309|0.84| 8381|40229| 16.8|
|  3|     F|   157|    57|23.1| 64|1230|0.84|19221|36057| 34.0|
|  4|     F|   156|    53|21.8| 56|1171| 0.8|17472|33094| 33.8|
|  5|     M|   160|    51|19.9| 54|1681|0.98| 7336|40621| 14.8|
+---+------+------+------+----+---+----+----+-----+-----+-----+
only showing top 5 rows



Kiểm tra `df` có bao nhiêu dòng

In [14]:
# Number of rows; relies on the wrapper supporting len().
len(df)

1217

Describe cho `df`

In [19]:
# Summary statistics (count, mean, stddev, min, max) for every column.
df.describe()

Unnamed: 0,summary,id,gender,height,weight,bmi,age,bmc,bmd,fat,lean,pcfat
0,count,1217.0,1217,1217.0,1217.0,1217.0,1217.0,1217.0,1217.0,1217.0,1217.0,1217.0
1,mean,614.518488085456,,156.7239112571898,55.14379622021364,22.39539852095314,47.15201314708299,1724.9145439605588,1.0087428101889888,17288.436318816763,35463.1133935908,31.604785903401805
2,stddev,354.4705719473191,,7.977725682041703,9.404988688010084,3.056441944747136,17.27550739904804,363.3490251436472,0.1131224610778956,5214.398664940806,7027.546493084218,7.182861527055848
3,min,1.0,F,136.0,34.0,14.5,13.0,695.0,0.65,4277.0,19136.0,9.2
4,max,1227.0,M,185.0,95.0,37.1,88.0,3040.0,1.35,40825.0,63059.0,48.4


Describe dựa trên cột được chỉ định

_Describe trên hai feature là `height` và `weight`_

In [17]:
# Same summary statistics, restricted to the listed columns.
df.describe(['height', 'weight'])

Unnamed: 0,summary,height,weight
0,count,1217.0,1217.0
1,mean,156.7239112571898,55.14379622021364
2,stddev,7.977725682041703,9.404988688010084
3,min,136.0,34.0
4,max,185.0,95.0


Hiển thị dữ liệu thống kê theo crosstab

In [21]:
# Height x weight contingency table; only the first 10 rows are displayed
# to keep the output small.
df.crosstab(['height', 'weight']).head(10)

Unnamed: 0,height_weight,34,35,36,37,38,39,40,41,42,...,79,80,82,85,86,88,90,91,93,95
0,138,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,170,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,142,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,153,0,0,0,0,1,1,0,1,2,...,0,0,0,1,0,0,0,0,0,0
4,174,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,185,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
6,157,0,0,0,0,0,0,0,1,2,...,0,0,0,0,0,0,0,0,0,0
7,152,0,0,0,0,0,0,2,4,2,...,0,0,0,0,0,0,0,0,0,0
8,164,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,179,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Tạo ra **Sub Dataframe**

In [22]:
# Indexing with a list of column names yields a sub-frame; preview 3 rows.
df[['id', 'gender', 'height', 'weight']].getHead(3)

+---+------+------+------+
| id|gender|height|weight|
+---+------+------+------+
|  1|     F|   150|    49|
|  2|     M|   165|    52|
|  3|     F|   157|    57|
+---+------+------+------+
only showing top 3 rows



Lấy ra các giá trị unique của các features

In [25]:
# Distinct values of a single column.
df['gender'].unique().getHead(5)

+------+
|gender|
+------+
|     F|
|     M|
+------+



In [26]:
# Distinct (gender, weight) combinations; only the first 3 are printed.
df[['gender', 'weight']].unique().getHead(3)

+------+------+
|gender|weight|
+------+------+
|     M|    70|
|     M|    67|
|     F|    61|
+------+------+
only showing top 3 rows



Sắp xếp dữ liệu dựa trên cột

In [28]:
# Calling the wrapper (`df()`) presumably returns the underlying Spark
# DataFrame -- TODO confirm. Sort ascending by `age` and print 3 rows.
df().orderBy(df().age.asc()).show(3)

+----+------+------+------+----+---+----+----+-----+-----+-----+
|  id|gender|height|weight| bmi|age| bmc| bmd|  fat| lean|pcfat|
+----+------+------+------+----+---+----+----+-----+-----+-----+
| 514|     M|   167|    67|24.0| 13|1440|0.78|29264|44366| 39.0|
| 270|     F|   155|    42|17.5| 14|1615|1.04|11493|28607| 27.6|
|1156|     F|   160|    56|21.9| 14|1810|1.05|20941|34178| 36.8|
+----+------+------+------+----+---+----+----+-----+-----+-----+
only showing top 3 rows



Tạo cột mới

In [39]:
# Keep four columns as the working frame for the following cells.
df_sub = df[['id', 'gender', 'height', 'weight']]

In [40]:
# Add a `bmi` column. `**` binds tighter than `/`, so this evaluates
# weight / ((height / 100) ** 2). The /100 suggests height is stored in
# centimetres -- verify the units.
df_sub = df_sub.withColumn('bmi', (df_sub.weight / (df_sub.height/100) ** 2))

In [41]:
# Print the first 5 rows, now including the computed `bmi` column.
df_sub.show(5)

+---+------+------+------+------------------+
| id|gender|height|weight|               bmi|
+---+------+------+------+------------------+
|  1|     F|   150|    49| 21.77777777777778|
|  2|     M|   165|    52|19.100091827364558|
|  3|     F|   157|    57|23.124670372023203|
|  4|     F|   156|    53|  21.7784352399737|
|  5|     M|   160|    51|19.921874999999996|
+---+------+------+------+------------------+
only showing top 5 rows



Đổi tên cột

In [44]:
# Rename column `gender` to `sex`; the result is assigned back to df_sub.
df_sub = df_sub.withColumnRenamed('gender', 'sex')

In [45]:
# Print the first 5 rows to check the renamed column.
df_sub.show(5)

+---+---+------+------+------------------+
| id|sex|height|weight|               bmi|
+---+---+------+------+------------------+
|  1|  F|   150|    49| 21.77777777777778|
|  2|  M|   165|    52|19.100091827364558|
|  3|  F|   157|    57|23.124670372023203|
|  4|  F|   156|    53|  21.7784352399737|
|  5|  M|   160|    51|19.921874999999996|
+---+---+------+------+------------------+
only showing top 5 rows



Xóa cột

In [46]:
# List the current column names.
df_sub.columns

['id', 'sex', 'height', 'weight', 'bmi']

In [47]:
# Drop two columns and preview the result. Nothing is assigned here, so
# df_sub itself is not rebound.
df_sub.drop('bmi', 'id').show(5)

+---+------+------+
|sex|height|weight|
+---+------+------+
|  F|   150|    49|
|  M|   165|    52|
|  F|   157|    57|
|  F|   156|    53|
|  M|   160|    51|
+---+------+------+
only showing top 5 rows



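> **Lưu ý**: `drop` trả về một DataFrame mới chứ không sửa trực tiếp `df_sub`; nếu muốn giữ kết quả (tương tự inplace) thì nhớ gán lại, ví dụ `df_sub = df_sub.drop('bmi', 'id')`.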

In [49]:
# Column names of the frame returned by drop().
df_sub.drop('bmi', 'id').columns

['sex', 'height', 'weight']

Nhóm dữ liệu

In [51]:
# Mean weight per sex, using the grouped-data `mean` method.
df_sub.groupBy('sex').mean('weight').show(5)

+---+-----------------+
|sex|      avg(weight)|
+---+-----------------+
|  F|52.31090487238979|
|  M|62.02253521126761|
+---+-----------------+



In [52]:
# Same aggregation written as a {column: function} dict; the output column
# is still named avg(weight).
df_sub.groupBy('sex').agg({'weight': 'mean'}).show(5)

+---+-----------------+
|sex|      avg(weight)|
+---+-----------------+
|  F|52.31090487238979|
|  M|62.02253521126761|
+---+-----------------+



In [53]:
# Number of rows per sex.
df_sub.groupBy('sex').count().show(5)

+---+-----+
|sex|count|
+---+-----+
|  F|  862|
|  M|  355|
+---+-----+



In [54]:
# Several aggregations in one call: minimum weight and minimum height per sex.
df_sub.groupBy('sex').agg({'weight': 'min', 'height': 'min'}).show(5)

+---+-----------+-----------+
|sex|min(weight)|min(height)|
+---+-----------+-----------+
|  F|         34|        136|
|  M|         38|        146|
+---+-----------+-----------+

