In [4]:
import pyspark
from pyspark.sql import SparkSession

In [7]:
# Build the SparkSession used by every cell below. getOrCreate() hands back the
# already-active session if one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("PracticePySpark")
    .getOrCreate()
)

In [8]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

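### Đọc dữ liệu:
  1. spark.read.csv(...): không đọc dòng tiêu đề, tên cột mặc định là _c0, _c1, ...
  2. spark.read.option('header', 'true').csv(...): dùng dòng đầu làm tên cột, mọi cột vẫn là kiểu string
  3. spark.read.csv(..., header=True, inferSchema=True): dùng dòng đầu làm tên cột và tự suy ra kiểu dữ liệu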

### Xem dữ liệu
  1. Xem qua: df.show()
  2. Xem kỹ
    - Dạng dữ liệu: .printSchema()
    - Thống kê tổng quát: .describe()

### Xử lý cột
  1. Thêm cột: .withColumn()
  2. Xóa cột: .drop()
  3. Đổi tên cột: .withColumnRenamed()
  4. Chọn cột: .select(['cot1', 'cot2',...])

### Lọc dữ liệu: .filter()
  1. So sánh: <, >, <=, >=, ==
  2. hoặc: | (mỗi điều kiện phải đặt trong ngoặc đơn)
  3. và: & (mỗi điều kiện phải đặt trong ngoặc đơn)

### Tổng hợp dữ liệu
  1. .mean()
  2. .sum()
  3. .count()
  4. .max()
  5. .min()
  6. .groupBy('nganh').mean()

In [11]:
# Read the CSV with default options: the header row is NOT recognised, so it
# shows up as the first data row and the columns are named _c0, _c1, ...
df_pyspark = spark.read.csv("test1.csv")
df_pyspark.show()
# the header is lost

+---------+---+----------+------+
|      _c0|_c1|       _c2|   _c3|
+---------+---+----------+------+
|     Name|age|Experience|Salary|
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [14]:
# Re-read the file, telling the reader that the first line holds the column
# names. No schema is inferred, so every column is still loaded as a string.
df_pyspark = (
    spark.read
    .option("header", "true")
    .csv("test1.csv")
)
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [15]:
# Print each column's name, data type and nullability.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- Experience: string (nullable = true)
 |-- Salary: string (nullable = true)



In [17]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# NOTE: the columns were loaded as strings, so min/max are compared as text,
# not as numbers -- e.g. max(Experience) comes out as 8 even though the data
# contains 10. Reading the file with inferSchema=True gives numeric min/max.
df_pyspark.describe().show()

+-------+------+------------------+-----------------+------------------+
|summary|  Name|               age|       Experience|            Salary|
+-------+------+------------------+-----------------+------------------+
|  count|     6|                 6|                6|                 6|
|   mean|  NULL|26.333333333333332|4.666666666666667|21333.333333333332|
| stddev|  NULL| 4.179314138308661|3.559026084010437| 5354.126134736337|
|    min|Harsha|                21|                1|             15000|
|    max| Sunny|                31|                8|             30000|
+-------+------+------------------+-----------------+------------------+



# xử lý cột

In [19]:
# Add a derived column holding Experience plus two. Experience is a string
# column here, and the displayed result is floating point (12.0, 10.0, ...).
df_pyspark = df_pyspark.withColumn(
    "Experience after 2 year",
    df_pyspark["Experience"] + 2,
)
df_pyspark.show()

+---------+---+----------+------+-----------------------+
|     Name|age|Experience|Salary|Experience after 2 year|
+---------+---+----------+------+-----------------------+
|    Krish| 31|        10| 30000|                   12.0|
|Sudhanshu| 30|         8| 25000|                   10.0|
|    Sunny| 29|         4| 20000|                    6.0|
|     Paul| 24|         3| 20000|                    5.0|
|   Harsha| 21|         1| 15000|                    3.0|
|  Shubham| 23|         2| 18000|                    4.0|
+---------+---+----------+------+-----------------------+



# xóa cột

In [20]:
# Remove the derived column again, leaving the four columns read from the file.
df_pyspark = df_pyspark.drop("Experience after 2 year")
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



# đổi tên cột

In [21]:
# Rename the lower-case "age" column to "Age"; the other columns are untouched.
df_pyspark = df_pyspark.withColumnRenamed("age", "Age")
df_pyspark.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



# Chọn cột

In [23]:
# Project only the Name and Age columns. select() is the explicit form of the
# tuple-indexing shorthand df['Name', 'Age'] and yields the same DataFrame.
df_pyspark.select(["Name", "Age"]).show()

+---------+---+
|     Name|Age|
+---------+---+
|    Krish| 31|
|Sudhanshu| 30|
|    Sunny| 29|
|     Paul| 24|
|   Harsha| 21|
|  Shubham| 23|
+---------+---+



# lọc dữ liệu

In [25]:
# Keep only the rows whose Salary is at most 20000.
df_pyspark.filter(df_pyspark['Salary'] <= 20000).show()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|  Sunny| 29|         4| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [26]:
# Filter with OR (|): Salary <= 20000 or Salary >= 15000.
# Each condition needs its own parentheses because | binds tighter than the
# comparison operators. Every numeric salary satisfies at least one of the two
# bounds, so this returns all rows; use & to select the 15000..20000 range.
df_pyspark.filter(
    (df_pyspark["Salary"] <= 20000) | (df_pyspark["Salary"] >= 15000)
).show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



# Tổng hợp dữ liệu

In [34]:
# Read with a header row and let Spark infer each column's data type,
# so the columns get proper types instead of all being strings.
df = spark.read.csv(
    "test3.csv",
    header=True,
    inferSchema=True,
)
df.show()

+---------+------------+------+
|     Name| Departments|salary|
+---------+------------+------+
|    Krish|Data Science| 10000|
|    Krish|         IOT|  5000|
|   Mahesh|    Big Data|  4000|
|    Krish|    Big Data|  4000|
|   Mahesh|Data Science|  3000|
|Sudhanshu|Data Science| 20000|
|Sudhanshu|         IOT| 10000|
|Sudhanshu|    Big Data|  5000|
|    Sunny|Data Science| 10000|
|    Sunny|    Big Data|  2000|
+---------+------------+------+



In [36]:
# Average of the numeric columns per department (here only salary is numeric).
(
    df.groupBy("Departments")
    .mean()
    .show()
)

+------------+-----------+
| Departments|avg(salary)|
+------------+-----------+
|         IOT|     7500.0|
|    Big Data|     3750.0|
|Data Science|    10750.0|
+------------+-----------+



In [38]:
# Aggregate over the whole DataFrame (no grouping): mean of the salary column.
df.agg({"salary": "mean"}).show()

+-----------+
|avg(salary)|
+-----------+
|     7300.0|
+-----------+

