In [1]:
from pyspark.sql import SparkSession
# Start (or reuse) a Spark session named "spark-sql" with master set to "local".
spark = (
    SparkSession.builder
    .master("local")
    .appName("spark-sql")
    .getOrCreate()
)

# 2015-summary.json

In [3]:
# Load 2015-summary.json into a DataFrame.
df = (
    spark.read
    .format('json')
    .load('learning_spark_data/2015-summary.json')
)

In [5]:
# The dataset is small, so running a full count() is fine here.
df.count()

256

In [6]:
# List each column name together with its type name.
df.dtypes

[('DEST_COUNTRY_NAME', 'string'),
 ('ORIGIN_COUNTRY_NAME', 'string'),
 ('count', 'bigint')]

In [7]:
# Print the schema as a tree (column, type, nullability).
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [9]:
# df.collect()  # left disabled on purpose; take()/show() are used instead to peek at a few rows

In [10]:
# Fetch the first 5 rows as a list of Row objects.
df.take(5)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344),
 Row(DEST_COUNTRY_NAME='Egypt', ORIGIN_COUNTRY_NAME='United States', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='India', count=62)]

In [11]:
# Project only the `count` column and display 5 rows.
df.select('count').show(5)

+-----+
|count|
+-----+
|   15|
|    1|
|  344|
|   15|
|   62|
+-----+
only showing top 5 rows



In [15]:
# Distinct destination countries, marked for caching before the count action runs.
df1 = (
    df.select('DEST_COUNTRY_NAME')
    .distinct()
    .cache()
)
df1.count()

132

In [16]:
# Create a single record with the Row class
from pyspark.sql import Row
myRow = Row('hello', None, 1, False)
myRow

<Row('hello', None, 1, False)>

In [23]:
# Add a new column
from pyspark.sql.functions import expr

# expr() takes a SQL expression string and builds a Column from it;
# here the column is true when origin and destination are the same country.
df3 = df.withColumn(
    'withinCountry',
    expr('ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME'),
)
df3

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint, withinCountry: boolean]

In [24]:
# Preview 3 rows, including the withinCountry column.
df3.show(3)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|        false|
|    United States|            Croatia|    1|        false|
|    United States|            Ireland|  344|        false|
+-----------------+-------------------+-----+-------------+
only showing top 3 rows



In [25]:
# Use the boolean withinCountry column directly as the filter predicate.
df3.filter(df3['withinCountry']).show(5)

+-----------------+-------------------+------+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|withinCountry|
+-----------------+-------------------+------+-------------+
|    United States|      United States|370002|         true|
+-----------------+-------------------+------+-------------+



CASE WHEN: count < 10 → 'under', count >= 10 → 'upper' => category 컬럼 추가

In [33]:
# Add a category column: 'under' when count < 10, 'upper' when count >= 10.
df4 = df.withColumn(
    'category',
    expr("CASE WHEN count < 10 THEN 'under' WHEN count >= 10 THEN 'upper' END"),
)
df4

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint, category: string]

In [35]:
# Preview 5 rows, including the category column.
df4.show(5)

+-----------------+-------------------+-----+--------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|category|
+-----------------+-------------------+-----+--------+
|    United States|            Romania|   15|   upper|
|    United States|            Croatia|    1|   under|
|    United States|            Ireland|  344|   upper|
|            Egypt|      United States|   15|   upper|
|    United States|              India|   62|   upper|
+-----------------+-------------------+-----+--------+
only showing top 5 rows



In [36]:
# DataFrame select(), where(), filter() => transformations
# show(), count() => actions

In [37]:
# Stop the SparkSession used for the 2015-summary.json examples.
spark.stop()

# 집계함수

In [39]:
from pyspark.sql import SparkSession
# Create a fresh "spark-sql" session (master "local") for the aggregation examples.
spark = (
    SparkSession.builder
    .master("local")
    .appName("spark-sql")
    .getOrCreate()
)

In [49]:
# Load emp_df and dept_df from CSV with the "header" option enabled.
emp_df = spark.read.option("header", "true").csv("learning_spark_data/emp.csv")
dept_df = spark.read.option("header", "true").csv("learning_spark_data/dept.csv")

In [50]:
# Preview the first 3 employee rows.
emp_df.show(3)

+-----+-----+--------+----+----------+----+----+------+
|empno|ename|     job| mgr|  hiredate| sal|comm|deptno|
+-----+-----+--------+----+----------+----+----+------+
| 7369|SMITH|   CLERK|7902|1980-12-17| 800|NULL|    20|
| 7499|ALLEN|SALESMAN|7698|1981-02-20|1600| 300|    30|
| 7521| WARD|SALESMAN|7698|1981-02-22|1250| 500|    30|
+-----+-----+--------+----+----------+----+----+------+
only showing top 3 rows



In [51]:
# Preview the first 3 department rows.
dept_df.show(3)

+------+----------+--------+
|deptno|     dname|     loc|
+------+----------+--------+
|    10|ACCOUNTING|NEW YORK|
|    20|  RESEARCH|  DALLAS|
|    30|     SALES| CHICAGO|
+------+----------+--------+
only showing top 3 rows



In [52]:
# Number of employee rows.
emp_df.count()

15

In [53]:
# Number of department rows.
dept_df.count()

4

In [56]:
# Column names are not case-sensitive: upper-case names select the lower-case columns.
emp_df.select('ENAME', 'DEPTNO').show()

+------+------+
| ENAME|DEPTNO|
+------+------+
| SMITH|    20|
| ALLEN|    30|
|  WARD|    30|
| JONES|    20|
|MARTIN|    30|
| BLAKE|    30|
| CLARK|    10|
| SCOTT|    20|
|  KING|    10|
|TURNER|    30|
| ADAMS|    20|
| JAMES|    30|
|  FORD|    20|
|MILLER|    10|
|  JACK|    70|
+------+------+



In [58]:
# where() behaves the same as filter()!
(
    emp_df.select('*')
    .where('DEPTNO=20')
    .show()
)

+-----+-----+-------+----+----------+----+----+------+
|empno|ename|    job| mgr|  hiredate| sal|comm|deptno|
+-----+-----+-------+----+----------+----+----+------+
| 7369|SMITH|  CLERK|7902|1980-12-17| 800|NULL|    20|
| 7566|JONES|MANAGER|7839|1981-04-02|2975|NULL|    20|
| 7788|SCOTT|ANALYST|7566|1987-04-19|3000|NULL|    20|
| 7876|ADAMS|  CLERK|7788|1987-05-23|1100|NULL|    20|
| 7902| FORD|ANALYST|7566|1981-12-03|3000|NULL|    20|
+-----+-----+-------+----+----------+----+----+------+



In [65]:
# Use selectExpr when the projection is a computed SQL expression.
emp_df.selectExpr('count(*)').show()

+--------+
|count(1)|
+--------+
|      15|
+--------+



In [64]:
# Count the distinct values in the job column.
from pyspark.sql.functions import countDistinct
emp_df.select(countDistinct('job')).show()

+-------------------+
|count(DISTINCT job)|
+-------------------+
|                  5|
+-------------------+



In [67]:
from pyspark.sql.functions import approx_count_distinct

# Approximate distinct count; rsd specifies the allowed error rate (useful when the data is large).
emp_df.select(approx_count_distinct('job', rsd=0.1)).show()

+--------------------------+
|approx_count_distinct(job)|
+--------------------------+
|                         5|
+--------------------------+



In [127]:
# first, last, min, max, sum, avg -> used as functions here rather than as SQL strings passed to expr
# NOTE: this import rebinds the Python built-ins min, max and sum to the Spark column
# functions for every later cell in the notebook.
from pyspark.sql.functions import first, last, min, max, sum, avg, col, mean, count

In [79]:
# first() aggregate over the sal column.
emp_df.select(first('sal')).show()

+----------+
|first(sal)|
+----------+
|       800|
+----------+



In [137]:
# last() aggregate over the sal column.
emp_df.select(last('sal')).show()

+---------+
|last(sal)|
+---------+
|     3200|
+---------+



In [138]:
# Replace sal with an int-typed version of itself before running numeric aggregates.
# NOTE(review): sal is presumably a string at this point because the CSV was read
# without a schema — confirm.
emp_df = emp_df.withColumn('sal', col('sal').cast('int'))

In [140]:
# Smallest sal (min here is the Spark function, not the Python built-in).
emp_df.select(min('sal')).show()

+--------+
|min(sal)|
+--------+
|     800|
+--------+



In [141]:
# Largest sal (max here is the Spark function, not the Python built-in).
emp_df.select(max('sal')).show()

+--------+
|max(sal)|
+--------+
|    5000|
+--------+



In [142]:
# Total of sal (sum here is the Spark function, not the Python built-in).
emp_df.select(sum('sal')).show()

+--------+
|sum(sal)|
+--------+
|   32225|
+--------+



In [143]:
# Average sal.
emp_df.select(avg('sal')).show()

+------------------+
|          avg(sal)|
+------------------+
|2148.3333333333335|
+------------------+



In [145]:
# total_salary / total_transaction, avg_salary, mean_salary
# Step 1: compute the aliased aggregates in a single select().
# Step 2: selectExpr() divides the total by the row count so the manual ratio can be
#         compared side by side with avg() and mean().
emp_df.select(
    sum('sal').alias('total_salary'),
    count('*').alias('total_transaction'),
    avg('sal').alias('avg_salary'),
    mean('sal').alias('mean_salary')
).selectExpr(
    'total_salary/total_transaction',
    'avg_salary',
    'mean_salary'
).show()

+----------------------------------+------------------+------------------+
|(total_salary / total_transaction)|        avg_salary|       mean_salary|
+----------------------------------+------------------+------------------+
|                2148.3333333333335|2148.3333333333335|2148.3333333333335|
+----------------------------------+------------------+------------------+



In [115]:
# Grouping: number of rows per job
emp_df.groupBy('job').count().show()

+---------+-----+
|      job|count|
+---------+-----+
|  ANALYST|    2|
| SALESMAN|    4|
|    CLERK|    5|
|  MANAGER|    3|
|PRESIDENT|    1|
+---------+-----+



In [119]:
# SQL-style sketch of the aggregation below:
# select job,
#        count(job),
#        sum(sal)
# group by job

group_df = emp_df.groupBy('job').agg(
    count('job').alias('qty'),  # count through the functions API, renamed to qty
    expr('count(job)'),         # the same count written as a SQL expression string
    sum('sal')
)

group_df.show()

+---------+---+----------+--------+
|      job|qty|count(job)|sum(sal)|
+---------+---+----------+--------+
|  ANALYST|  2|         2|  6000.0|
| SALESMAN|  4|         4|  5600.0|
|    CLERK|  5|         5|  7350.0|
|  MANAGER|  3|         3|  8275.0|
|PRESIDENT|  1|         1|  5000.0|
+---------+---+----------+--------+



In [129]:
# Compute the mean and standard deviation of sal per job, rounded to 2 decimal places
# Output columns: SAL_AVG, SAL_STDEV
# NOTE: importing round rebinds the Python built-in round to the Spark column function.
from pyspark.sql.functions import avg, stddev, round

emp_df.groupBy('job').agg(
    round(avg('sal'), 2).alias('SAL_AVG'),
    round(stddev('sal'), 2).alias('SAL_STDEV')
).show()

+---------+-------+---------+
|      job|SAL_AVG|SAL_STDEV|
+---------+-------+---------+
|  ANALYST| 3000.0|      0.0|
| SALESMAN| 1400.0|   177.95|
|    CLERK| 1470.0|   984.63|
|  MANAGER|2758.33|   274.24|
|PRESIDENT| 5000.0|     NULL|
+---------+-------+---------+



In [146]:
# Top 10 salaries
from pyspark.sql.functions import desc

(
    emp_df.select('sal')
    .orderBy(desc('sal'))
    .limit(10)
    .show()
)

+----+
| sal|
+----+
|5000|
|3200|
|3000|
|3000|
|2975|
|2850|
|2450|
|1600|
|1500|
|1300|
+----+



In [148]:
# Window functions
from pyspark.sql.window import Window
from pyspark.sql.functions import rank
# Order all rows by sal, highest first.
# NOTE(review): there is no partitionBy() here, so the ranking presumably runs over one
# global partition — fine for this small table, confirm before reusing on large data.
windowspec = Window.orderBy(desc('sal'))
salAllRank = rank().over(windowspec)
# Display the (not yet evaluated) Column expression.
salAllRank

Column<'RANK() OVER (ORDER BY sal DESC NULLS LAST unspecifiedframe$())'>

In [160]:
# Attach the overall salary rank as salary_rank and preview 5 rows.
emp_df.withColumn('salary_rank', salAllRank).show(5)

+-----+-----+---------+----+----------+----+----+------+-----------+
|empno|ename|      job| mgr|  hiredate| sal|comm|deptno|salary_rank|
+-----+-----+---------+----+----------+----+----+------+-----------+
| 7839| KING|PRESIDENT|NULL|1981-11-17|5000|NULL|    10|          1|
| 9292| JACK|    CLERK|7782|1982-01-23|3200|NULL|    70|          2|
| 7788|SCOTT|  ANALYST|7566|1987-04-19|3000|NULL|    20|          3|
| 7902| FORD|  ANALYST|7566|1981-12-03|3000|NULL|    20|          3|
| 7566|JONES|  MANAGER|7839|1981-04-02|2975|NULL|    20|          5|
+-----+-----+---------+----+----------+----+----+------+-----------+
only showing top 5 rows



In [151]:
# Rank salaries within each job
# Window.partitionBy() restricts the ranking to rows that share the same job
# Result is stored in job_rank_df

windowspec1 = (
    Window
    .partitionBy('job')
    .orderBy(desc('sal'))
)
job_rank_df = emp_df.withColumn(
    'job_salary_rank',
    rank().over(windowspec1),
)

In [158]:
# Preview the per-job salary ranking.
job_rank_df.show(5)

+-----+------+-------+----+----------+----+----+------+---------------+
|empno| ename|    job| mgr|  hiredate| sal|comm|deptno|job_salary_rank|
+-----+------+-------+----+----------+----+----+------+---------------+
| 7788| SCOTT|ANALYST|7566|1987-04-19|3000|NULL|    20|              1|
| 7902|  FORD|ANALYST|7566|1981-12-03|3000|NULL|    20|              1|
| 9292|  JACK|  CLERK|7782|1982-01-23|3200|NULL|    70|              1|
| 7934|MILLER|  CLERK|7782|1982-01-23|1300|NULL|    10|              2|
| 7876| ADAMS|  CLERK|7788|1987-05-23|1100|NULL|    20|              3|
+-----+------+-------+----+----------+----+----+------+---------------+
only showing top 5 rows



In [159]:
# Salary rank within each department
dept_window_spec = (
    Window
    .partitionBy('deptno')
    .orderBy(desc('sal'))
)
dept_rank_df = emp_df.withColumn(
    'dept_salary_rank',
    rank().over(dept_window_spec),
)
dept_rank_df.show(5)

+-----+------+---------+----+----------+----+----+------+----------------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|dept_salary_rank|
+-----+------+---------+----+----------+----+----+------+----------------+
| 7839|  KING|PRESIDENT|NULL|1981-11-17|5000|NULL|    10|               1|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450|NULL|    10|               2|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300|NULL|    10|               3|
| 7788| SCOTT|  ANALYST|7566|1987-04-19|3000|NULL|    20|               1|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000|NULL|    20|               1|
+-----+------+---------+----+----------+----+----+------+----------------+
only showing top 5 rows



In [171]:
# Running total of salaries: sum('sal').over(<window>), accumulated in descending empno order.
# The frame is spelled out as ROWS from the start of the window up to the current row so
# that every row adds exactly one salary. With orderBy() alone the default frame is a
# RANGE frame, under which rows that tie on the ordering key would all receive the same
# combined total.
window_sum = (
    Window.orderBy(desc('empno'))
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
sum_df = emp_df.withColumn('sal_sum', sum('sal').over(window_sum))
sum_df.show(5)

+-----+------+-------+----+----------+----+----+------+-------+
|empno| ename|    job| mgr|  hiredate| sal|comm|deptno|sal_sum|
+-----+------+-------+----+----------+----+----+------+-------+
| 9292|  JACK|  CLERK|7782|1982-01-23|3200|NULL|    70|   3200|
| 7934|MILLER|  CLERK|7782|1982-01-23|1300|NULL|    10|   4500|
| 7902|  FORD|ANALYST|7566|1981-12-03|3000|NULL|    20|   7500|
| 7900| JAMES|  CLERK|7698|1981-12-03| 950|NULL|    30|   8450|
| 7876| ADAMS|  CLERK|7788|1987-05-23|1100|NULL|    20|   9550|
+-----+------+-------+----+----------+----+----+------+-------+
only showing top 5 rows



In [167]:
# Running total of salaries within each department, accumulated in descending empno order.
# The ROWS frame is explicit so each row contributes exactly one salary; with orderBy()
# alone the default RANGE frame would give rows that tie on the ordering key the same
# combined total.
window_spec_sum = (
    Window.partitionBy('deptno')
    .orderBy(desc('empno'))
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
dept_sum_df = emp_df.withColumn('dept_sal_sum', sum('sal').over(window_spec_sum))
dept_sum_df.show(5)

+-----+------+---------+----+----------+----+----+------+------------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|dept_sal_sum|
+-----+------+---------+----+----------+----+----+------+------------+
| 7934|MILLER|    CLERK|7782|1982-01-23|1300|NULL|    10|        1300|
| 7839|  KING|PRESIDENT|NULL|1981-11-17|5000|NULL|    10|        6300|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450|NULL|    10|        8750|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000|NULL|    20|        3000|
| 7876| ADAMS|    CLERK|7788|1987-05-23|1100|NULL|    20|        4100|
+-----+------+---------+----+----------+----+----+------+------------+
only showing top 5 rows



In [173]:
# Put each department's average salary next to every employee's own salary for comparison
window_spec_avg = Window.partitionBy('deptno')
dept_avg_df = emp_df.withColumn(
    'dept_avg',
    avg('sal').over(window_spec_avg),
)
dept_avg_df.show(5)

+-----+------+---------+----+----------+----+----+------+------------------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|          dept_avg|
+-----+------+---------+----+----------+----+----+------+------------------+
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450|NULL|    10|2916.6666666666665|
| 7839|  KING|PRESIDENT|NULL|1981-11-17|5000|NULL|    10|2916.6666666666665|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300|NULL|    10|2916.6666666666665|
| 7369| SMITH|    CLERK|7902|1980-12-17| 800|NULL|    20|            2175.0|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975|NULL|    20|            2175.0|
+-----+------+---------+----+----------+----+----+------+------------------+
only showing top 5 rows



In [180]:
# Subtotals per department and job
# Note: groupBy puts a heavy load on Spark
(
    emp_df.groupBy('deptno', 'job')
    .agg(count('*'), sum('sal'))
    .orderBy('deptno', 'job')
    .show()
)

+------+---------+--------+--------+
|deptno|      job|count(1)|sum(sal)|
+------+---------+--------+--------+
|    10|    CLERK|       1|    1300|
|    10|  MANAGER|       1|    2450|
|    10|PRESIDENT|       1|    5000|
|    20|  ANALYST|       2|    6000|
|    20|    CLERK|       2|    1900|
|    20|  MANAGER|       1|    2975|
|    30|    CLERK|       1|     950|
|    30|  MANAGER|       1|    2850|
|    30| SALESMAN|       4|    5600|
|    70|    CLERK|       1|    3200|
+------+---------+--------+--------+



In [181]:
# cube() over deptno and job: the result also contains subtotal rows, shown with NULL
# in the column(s) being rolled up.
(
    emp_df.cube('deptno', 'job')
    .agg(count('*'), sum('sal'))
    .orderBy('deptno', 'job')
    .show()
)

+------+---------+--------+--------+
|deptno|      job|count(1)|sum(sal)|
+------+---------+--------+--------+
|  NULL|     NULL|      15|   32225|
|  NULL|  ANALYST|       2|    6000|
|  NULL|    CLERK|       5|    7350|
|  NULL|  MANAGER|       3|    8275|
|  NULL|PRESIDENT|       1|    5000|
|  NULL| SALESMAN|       4|    5600|
|    10|     NULL|       3|    8750|
|    10|    CLERK|       1|    1300|
|    10|  MANAGER|       1|    2450|
|    10|PRESIDENT|       1|    5000|
|    20|     NULL|       5|   10875|
|    20|  ANALYST|       2|    6000|
|    20|    CLERK|       2|    1900|
|    20|  MANAGER|       1|    2975|
|    30|     NULL|       6|    9400|
|    30|    CLERK|       1|     950|
|    30|  MANAGER|       1|    2850|
|    30| SALESMAN|       4|    5600|
|    70|     NULL|       1|    3200|
|    70|    CLERK|       1|    3200|
+------+---------+--------+--------+



In [182]:
# Average, maximum and minimum salary per job (plus the overall row produced by cube)
(
    emp_df.cube('job')
    .agg(avg('sal'), max('sal'), min('sal'))
    .orderBy('job')
    .show()
)

+---------+------------------+--------+--------+
|      job|          avg(sal)|max(sal)|min(sal)|
+---------+------------------+--------+--------+
|     NULL|2148.3333333333335|    5000|     800|
|  ANALYST|            3000.0|    3000|    3000|
|    CLERK|            1470.0|    3200|     800|
|  MANAGER|2758.3333333333335|    2975|    2450|
|PRESIDENT|            5000.0|    5000|    5000|
| SALESMAN|            1400.0|    1600|    1250|
+---------+------------------+--------+--------+



# join

In [183]:
# Join employees with departments
# Joining on an explicit column-equality condition keeps the deptno column from both
# sides, so the result contains two deptno columns.
emp_dept_df = emp_df.join(dept_df, emp_df['deptno']==dept_df['deptno'])
emp_dept_df.show()

+-----+------+---------+----+----------+----+----+------+------+----------+--------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|deptno|     dname|     loc|
+-----+------+---------+----+----------+----+----+------+------+----------+--------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800|NULL|    20|    20|  RESEARCH|  DALLAS|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600| 300|    30|    30|     SALES| CHICAGO|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250| 500|    30|    30|     SALES| CHICAGO|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975|NULL|    20|    20|  RESEARCH|  DALLAS|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250|1400|    30|    30|     SALES| CHICAGO|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850|NULL|    30|    30|     SALES| CHICAGO|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450|NULL|    10|    10|ACCOUNTING|NEW YORK|
| 7788| SCOTT|  ANALYST|7566|1987-04-19|3000|NULL|    20|    20|  RESEARCH|  DALLAS|
| 7839|  KING|PRESIDENT|NULL|1981-11-17|5000|NULL|    10|    10|A

In [185]:
# Inner join keyed by the shared column name, so deptno can be selected unambiguously.
join_df = emp_df.join(dept_df, 'deptno', 'inner')
(
    join_df
    .select('ename', 'deptno', 'dname')
    .show()
)

+------+------+----------+
| ename|deptno|     dname|
+------+------+----------+
| SMITH|    20|  RESEARCH|
| ALLEN|    30|     SALES|
|  WARD|    30|     SALES|
| JONES|    20|  RESEARCH|
|MARTIN|    30|     SALES|
| BLAKE|    30|     SALES|
| CLARK|    10|ACCOUNTING|
| SCOTT|    20|  RESEARCH|
|  KING|    10|ACCOUNTING|
|TURNER|    30|     SALES|
| ADAMS|    20|  RESEARCH|
| JAMES|    30|     SALES|
|  FORD|    20|  RESEARCH|
|MILLER|    10|ACCOUNTING|
+------+------+----------+



In [186]:
# Stop the SparkSession used for the emp/dept examples.
spark.stop()

In [1]:
from pyspark.sql import SparkSession
# Start (or reuse) a Spark session named "YellowTaxiTrips"; no master is set explicitly here.
spark = (
    SparkSession.builder
    .appName("YellowTaxiTrips")
    .getOrCreate()
)

In [5]:
# Read the CSV data under ./learning_spark_data/trips/ with the "header" option enabled.
trips_df = (
    spark.read
    .option("header", "true")
    .csv("./learning_spark_data/trips/")
)
trips_df.show(5)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+
|       1| 2021-06-01 00:13:26|  2021-06-01 00:17:14|              1|          .90|         1|                 N|         186|          50|           1|          5|    3|    0.5|       2.2|           0|                  0.3

In [4]:
# Read the taxi zone lookup CSV with the "header" option enabled.
zone_df = (
    spark.read
    .option("header", "true")
    .csv("./learning_spark_data/taxi+_zone_lookup.csv")
)
zone_df.show(5)

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
+----------+-------------+--------------------+------------+
only showing top 5 rows



In [6]:
# Stop the "YellowTaxiTrips" SparkSession.
spark.stop()