In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to the local `modules` package take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from modules.pyspark import CPySpark, CRDD, CSparkFrame
from pyspark.sql import Row

In [3]:
# Create the project's Spark wrapper. The `session` and `sql` flags presumably
# turn on a Spark session and SQL support — TODO confirm in modules.pyspark.
spark = CPySpark(session=True, sql=True)
# Bare last expression: shows the wrapper's `context` attribute as the cell output.
spark.context

Tạo ra dataframe từ rdd

In [4]:
# Sample (name, mark) records for the demo.
lst = [('John', 10), ('Lyna', 9), ('Samathan', 8), ('Tony', 10)]

# Hand the records to the project's wrapper, turn every tuple into a Row with
# an integer `mark`, then convert the result into the DataFrame wrapper.
rdd = spark.rdd(data=lst)
people: CRDD = rdd.map(
    lambda record: Row(name=record[0], mark=int(record[1]))
)
people_df: CSparkFrame = spark.dataframe(people)

In [5]:
# Preview people_df as a text table, requesting 5 rows (the frame holds only 4).
people_df.getHead(5)

+--------+----+
|    name|mark|
+--------+----+
|    John|  10|
|    Lyna|   9|
|Samathan|   8|
|    Tony|  10|
+--------+----+



Xem schema _(các feature của dataframe và kiểu dữ liệu của chúng)_ của `people_df`

In [6]:
# Approach 1: print the schema as a tree of column names, types and nullability.
people_df.schema()

root
 |-- name: string (nullable = true)
 |-- mark: long (nullable = true)



In [7]:
# Approach 2: display the wrapper's `dataframe` attribute, whose repr lists
# each column together with its type.
people_df.dataframe

DataFrame[name: string, mark: bigint]

Đọc dữ liệu từ file **CSV**

In [8]:
file_name = "hdfs://bigdata.laptrinhpython.net:19000/people.csv" # HDFS URL of the CSV file
# No format argument is given here; presumably spark.read defaults to CSV —
# TODO confirm in modules.pyspark.
people1 = spark.read(file_name)

In [9]:
# Preview the first 5 rows of the data read from people.csv.
people1.getHead(5)

+---+---------+--------------+------+-------------+
|_c0|person_id|          name|   sex|date of birth|
+---+---------+--------------+------+-------------+
|  0|      100|Penelope Lewis|female|   1990-08-31|
|  1|      101| David Anthony|  male|   1971-10-14|
|  2|      102|     Ida Shipp|female|   1962-05-24|
|  3|      103|  Joanna Moore|female|   2017-03-10|
|  4|      104|Lisandra Ortiz|female|   2020-08-05|
+---+---------+--------------+------+-------------+
only showing top 5 rows



Đọc dữ liệu từ file **JSON**

In [10]:
# HDFS URL of the JSON file.
file_name1 = "hdfs://bigdata.laptrinhpython.net:19000/data.json"
# The second positional argument presumably names the input format —
# TODO confirm the parameter's meaning in modules.pyspark.
data = spark.read(file_name1, 'json')

In [11]:
# Preview the first 5 rows of the data read from data.json.
data.getHead(5)

+----------+--------------------+-------------+--------------------+--------------------+-------------------+
|        id|            location|sampling_rate|              sensor|    sensordatavalues|          timestamp|
+----------+--------------------+-------------+--------------------+--------------------+-------------------+
|5810744647|{112.6, FR, 0, 11...|         null|{22349, 1, {14, N...|[{12340422762, 1....|2019-12-20 03:22:01|
|5810744646|{35.2, DE, 0, 107...|         null|{21149, 7, {9, va...|[{12340422760, 9....|2019-12-20 03:22:01|
|5810744645|{51.0, DE, 0, 293...|         null|{5811, 1, {14, No...|[{12340422757, 6....|2019-12-20 03:22:01|
|5810744644|{34.4, BE, 0, 441...|         null|{8765, 1, {14, No...|[{12340422756, 3....|2019-12-20 03:22:01|
|5810744643|{5.0, FR, 0, 1100...|         null|{21693, 1, {14, N...|[{12340422754, 7....|2019-12-20 03:22:01|
+----------+--------------------+-------------+--------------------+--------------------+-------------------+
only showing top 5 rows

Đọc dữ liệu `./data/Obesity_data.csv`

In [13]:
# Path to the obesity dataset, relative to the notebook's working directory.
file_name2 = './data/Obesity_data.csv'
# Read with the same single-argument call used for people.csv above.
df = spark.read(file_name2)

In [14]:
# Preview the first 5 rows of the obesity dataset.
df.getHead(5)

+---+------+------+------+----+---+----+----+-----+-----+-----+
| id|gender|height|weight| bmi|age| bmc| bmd|  fat| lean|pcfat|
+---+------+------+------+----+---+----+----+-----+-----+-----+
|  1|     F|   150|    49|21.8| 53|1312|0.88|17802|28600| 37.3|
|  2|     M|   165|    52|19.1| 65|1309|0.84| 8381|40229| 16.8|
|  3|     F|   157|    57|23.1| 64|1230|0.84|19221|36057| 34.0|
|  4|     F|   156|    53|21.8| 56|1171| 0.8|17472|33094| 33.8|
|  5|     M|   160|    51|19.9| 54|1681|0.98| 7336|40621| 14.8|
+---+------+------+------+----+---+----+----+-----+-----+-----+
only showing top 5 rows



Kiểm tra `df` có bao nhiêu dòng

In [15]:
# Row count of df; relies on the object returned by spark.read supporting
# len() (presumably via __len__ on the project wrapper — confirm).
len(df)

1217

Describe cho `df`

In [17]:
# Summary statistics (count, mean, stddev, min, max) for each column of df.
df.describe()

Unnamed: 0,summary,id,gender,height,weight,bmi,age,bmc,bmd,fat,lean,pcfat
0,count,1217.0,1217,1217.0,1217.0,1217.0,1217.0,1217.0,1217.0,1217.0,1217.0,1217.0
1,mean,614.518488085456,,156.7239112571898,55.14379622021364,22.39539852095314,47.15201314708299,1724.9145439605588,1.0087428101889888,17288.436318816763,35463.1133935908,31.604785903401805
2,stddev,354.4705719473191,,7.977725682041703,9.404988688010084,3.056441944747136,17.27550739904804,363.3490251436472,0.1131224610778956,5214.398664940806,7027.546493084218,7.182861527055848
3,min,1.0,F,136.0,34.0,14.5,13.0,695.0,0.65,4277.0,19136.0,9.2
4,max,1227.0,M,185.0,95.0,37.1,88.0,3040.0,1.35,40825.0,63059.0,48.4
