In [1]:
from pyspark import SparkConf, SparkContext
import pyspark

conf = SparkConf().setAppName('myApp2').setMaster('local')
sc = SparkContext(conf=conf)


In [2]:
from pyspark.sql import SparkSession
spark = SparkSession(sc)

In [4]:
df = spark.read.format('json').load('C:\\Users\\daesi\\Downloads\\2015-summary.json')
df

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

In [5]:
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [6]:
df.collect()[:3]

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344)]

In [7]:
df.show(3)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
+-----------------+-------------------+-----+
only showing top 3 rows



In [8]:
df.count()


256

In [10]:
df.select(df['count']).show(5)

+-----+
|count|
+-----+
|   15|
|    1|
|  344|
|   15|
|   62|
+-----+
only showing top 5 rows



In [11]:
df2= df.select('DEST_COUNTRY_NAME')
df2.show()

+--------------------+
|   DEST_COUNTRY_NAME|
+--------------------+
|       United States|
|       United States|
|       United States|
|               Egypt|
|       United States|
|       United States|
|       United States|
|          Costa Rica|
|             Senegal|
|             Moldova|
|       United States|
|       United States|
|              Guyana|
|               Malta|
|            Anguilla|
|             Bolivia|
|       United States|
|             Algeria|
|Turks and Caicos ...|
|       United States|
+--------------------+
only showing top 20 rows



In [19]:
df2= df.select('DEST_COUNTRY_NAME').dropDuplicates()
df2.show()

+--------------------+
|   DEST_COUNTRY_NAME|
+--------------------+
|            Anguilla|
|              Russia|
|            Paraguay|
|             Senegal|
|              Sweden|
|            Kiribati|
|              Guyana|
|         Philippines|
|            Djibouti|
|            Malaysia|
|           Singapore|
|                Fiji|
|              Turkey|
|                Iraq|
|             Germany|
|              Jordan|
|               Palau|
|Turks and Caicos ...|
|              France|
|              Greece|
+--------------------+
only showing top 20 rows



In [14]:
df2.distinct().count()

132

In [15]:
df.show(df.count())

+--------------------+--------------------+------+
|   DEST_COUNTRY_NAME| ORIGIN_COUNTRY_NAME| count|
+--------------------+--------------------+------+
|       United States|             Romania|    15|
|       United States|             Croatia|     1|
|       United States|             Ireland|   344|
|               Egypt|       United States|    15|
|       United States|               India|    62|
|       United States|           Singapore|     1|
|       United States|             Grenada|    62|
|          Costa Rica|       United States|   588|
|             Senegal|       United States|    40|
|             Moldova|       United States|     1|
|       United States|        Sint Maarten|   325|
|       United States|    Marshall Islands|    39|
|              Guyana|       United States|    64|
|               Malta|       United States|     1|
|            Anguilla|       United States|    41|
|             Bolivia|       United States|    30|
|       United States|         

In [17]:
df2.sort('count')

DataFrame[DEST_COUNTRY_NAME: string]

In [18]:
df2.sort('count').show()

+--------------------+
|   DEST_COUNTRY_NAME|
+--------------------+
|       United States|
|              Kosovo|
|              Zambia|
|       United States|
|               Malta|
|       United States|
|            Suriname|
|       United States|
|            Djibouti|
|        Burkina Faso|
|Saint Vincent and...|
|       United States|
|       United States|
|             Moldova|
|              Cyprus|
|       United States|
|       United States|
|       United States|
|       United States|
|       Cote d'Ivoire|
+--------------------+
only showing top 20 rows



In [23]:
from pyspark.sql.functions import expr

# 컬럼 추가

df3 = df.withColumn('withinCountry', expr('DEST_COUNTRY_NAME == ORIGIN_COUNTRY_NAME'))

In [24]:
df3.groupBy('withinCountry').count().show()

+-------------+-----+
|withinCountry|count|
+-------------+-----+
|         true|    1|
|        false|  255|
+-------------+-----+



In [25]:
df

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

In [27]:
df.where('count<2').show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Croatia|    1|
|       United States|          Singapore|    1|
|             Moldova|      United States|    1|
|               Malta|      United States|    1|
|       United States|          Gibraltar|    1|
|Saint Vincent and...|      United States|    1|
|            Suriname|      United States|    1|
|       United States|             Cyprus|    1|
|        Burkina Faso|      United States|    1|
|            Djibouti|      United States|    1|
|       United States|            Estonia|    1|
|              Zambia|      United States|    1|
|              Cyprus|      United States|    1|
|       United States|          Lithuania|    1|
|       United States|           Bulgaria|    1|
|       United States|            Georgia|    1|
|       United States|            Bahrain|    1|
|       Cote d'Ivoir

In [28]:
df.where('count<2').where('ORIGIN_COUNTRY_NAME != "Croatia"').show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|          Singapore|    1|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [29]:
df.filter('count<2').show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Croatia|    1|
|       United States|          Singapore|    1|
|             Moldova|      United States|    1|
|               Malta|      United States|    1|
|       United States|          Gibraltar|    1|
|Saint Vincent and...|      United States|    1|
|            Suriname|      United States|    1|
|       United States|             Cyprus|    1|
|        Burkina Faso|      United States|    1|
|            Djibouti|      United States|    1|
|       United States|            Estonia|    1|
|              Zambia|      United States|    1|
|              Cyprus|      United States|    1|
|       United States|          Lithuania|    1|
|       United States|           Bulgaria|    1|
|       United States|            Georgia|    1|
|       United States|            Bahrain|    1|
|       Cote d'Ivoir

In [43]:
emp_df=spark.read.format('csv').option('header','true').option('interSchema','true').load('C:\\Users\\daesi\\Downloads\\emp.csv')
emp_df.printSchema()

root
 |-- empno: string (nullable = true)
 |-- ename: string (nullable = true)
 |-- job: string (nullable = true)
 |-- mgr: string (nullable = true)
 |-- hiredate: string (nullable = true)
 |-- sal: string (nullable = true)
 |-- comm: string (nullable = true)
 |-- deptno: string (nullable = true)



In [44]:
emp_df.select('*').where('deptno=20').show()

+-----+-----+-------+----+----------+----+----+------+
|empno|ename|    job| mgr|  hiredate| sal|comm|deptno|
+-----+-----+-------+----+----------+----+----+------+
| 7369|SMITH|  CLERK|7902|1980-12-17| 800|null|    20|
| 7566|JONES|MANAGER|7839|1981-04-02|2975|null|    20|
| 7788|SCOTT|ANALYST|7566|1987-04-19|3000|null|    20|
| 7876|ADAMS|  CLERK|7788|1987-05-23|1100|null|    20|
| 7902| FORD|ANALYST|7566|1981-12-03|3000|null|    20|
+-----+-----+-------+----+----------+----+----+------+



In [45]:
emp_df.selectExpr('count(*)').show()

+--------+
|count(1)|
+--------+
|      15|
+--------+



In [50]:
from pyspark.sql.functions import count

emp_df.select(count('comm')).show()

+-----------+
|count(comm)|
+-----------+
|          4|
+-----------+



In [51]:
emp_df.select('job').distinct().show()

+---------+
|      job|
+---------+
|  ANALYST|
| SALESMAN|
|    CLERK|
|  MANAGER|
|PRESIDENT|
+---------+



In [53]:
from pyspark.sql.functions import min, max

emp_df.select(min('sal'), max('sal')).show()

+--------+--------+
|min(sal)|max(sal)|
+--------+--------+
|    1100|     950|
+--------+--------+



In [54]:
from pyspark.sql.functions import sumDistinct

emp_df.select(sumDistinct('sal')).show()



+-----------------+
|sum(DISTINCT sal)|
+-----------------+
|          27975.0|
+-----------------+



In [55]:
from pyspark.sql.functions import avg

emp_df.select(avg('sal')).show()

+------------------+
|          avg(sal)|
+------------------+
|2148.3333333333335|
+------------------+



In [56]:
emp_df.groupBy('job').count().show()

+---------+-----+
|      job|count|
+---------+-----+
|  ANALYST|    2|
| SALESMAN|    4|
|    CLERK|    5|
|  MANAGER|    3|
|PRESIDENT|    1|
+---------+-----+

