In [1]:
import requests
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
# Download the sample employees CSV to the working directory and load it into
# a Spark DataFrame named `df`.
url = "https://gist.githubusercontent.com/kevin336/acbb2271e66c10a5b73aacf82ca82784/raw/e38afe62e088394d61ed30884dd50a6826eee0a8/employees.csv"
local_file = "employees.csv"

# Bound the wait and fail loudly on an HTTP error; otherwise an error page
# would be written to disk and parsed by Spark as if it were the CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()
with open(local_file, "wb") as f:
    f.write(response.content)

spark = SparkSession.builder.appName("EmployeeData").getOrCreate()

# header=True takes column names from the first line; inferSchema=True asks
# Spark to infer each column's type from the data.
# NOTE(review): the displayed output shows "-" placeholders in COMMISSION_PCT
# and MANAGER_ID; presumably those columns are read as strings rather than
# nulls -- confirm whether a `nullValue` option should be set.
df = spark.read.csv(local_file, header=True, inferSchema=True)
df.show(10)

+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|  SH_CLERK|  2600|            - |       124|           50|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-JAN-08|  SH_CLERK|  2600|            - |       124|           50|
|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-SEP-03|   AD_ASST|  4400|            - |       101|           10|
|        201|   Michael|Hartstein|MHARTSTE|515.123.5555|17-FEB-04|    MK_MAN| 13000|            - |       100|           20|
|        202|       Pat|      Fay|    PFAY|603.123.6666|17-AUG-05|    MK_REP|  6000|            - |       201|           20|


In [2]:
# Project a single column and display its values.
df.select(col("FIRST_NAME")).show()

+----------+
|FIRST_NAME|
+----------+
|    Donald|
|   Douglas|
|  Jennifer|
|   Michael|
|       Pat|
|     Susan|
|   Hermann|
|   Shelley|
|   William|
|    Steven|
|     Neena|
|       Lex|
| Alexander|
|     Bruce|
|     David|
|     Valli|
|     Diana|
|     Nancy|
|    Daniel|
|      John|
+----------+
only showing top 20 rows



In [4]:
# Display only the rows whose SALARY is greater than 15000.
df.where(df["SALARY"] > 15000).show()

+-----------+----------+---------+--------+------------+---------+-------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE| JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+-------+------+--------------+----------+-------------+
|        100|    Steven|     King|   SKING|515.123.4567|17-JUN-03|AD_PRES| 24000|            - |        - |           90|
|        101|     Neena|  Kochhar|NKOCHHAR|515.123.4568|21-SEP-05|  AD_VP| 17000|            - |       100|           90|
|        102|       Lex|  De Haan| LDEHAAN|515.123.4569|13-JAN-01|  AD_VP| 17000|            - |       100|           90|
+-----------+----------+---------+--------+------------+---------+-------+------+--------------+----------+-------------+



In [7]:
# Keep the filtered DataFrame in `df_where`. show() only prints and returns
# None, so it must be called separately from the assignment.
df_where = df.where(df.DEPARTMENT_ID == 50)
df_where.show()

+-----------+----------+-----------+--------+------------+---------+--------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|  LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|  JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+-----------+--------+------------+---------+--------+------+--------------+----------+-------------+
|        198|    Donald|   OConnell|DOCONNEL|650.507.9833|21-JUN-07|SH_CLERK|  2600|            - |       124|           50|
|        199|   Douglas|      Grant|  DGRANT|650.507.9844|13-JAN-08|SH_CLERK|  2600|            - |       124|           50|
|        120|   Matthew|      Weiss|  MWEISS|650.123.1234|18-JUL-04|  ST_MAN|  8000|            - |       100|           50|
|        121|      Adam|      Fripp|  AFRIPP|650.123.2234|10-APR-05|  ST_MAN|  8200|            - |       100|           50|
|        122|     Payam|   Kaufling|PKAUFLIN|650.123.3234|01-MAY-03|  ST_MAN|  7900|            - |       100|           50|


In [8]:
# Display the DataFrame using show()'s default settings, written out explicitly.
df.show(n=20, truncate=True)

+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|  SH_CLERK|  2600|            - |       124|           50|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-JAN-08|  SH_CLERK|  2600|            - |       124|           50|
|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-SEP-03|   AD_ASST|  4400|            - |       101|           10|
|        201|   Michael|Hartstein|MHARTSTE|515.123.5555|17-FEB-04|    MK_MAN| 13000|            - |       100|           20|
|        202|       Pat|      Fay|    PFAY|603.123.6666|17-AUG-05|    MK_REP|  6000|            - |       201|           20|


In [9]:
# Bring every row of `df` back to the driver as a Python list.
data_list = df.collect()

In [11]:
# Gather every EMAIL value into a plain Python list on the driver, reading the
# field from each collected Row instead of round-tripping through the RDD API.
column_list = [row["EMAIL"] for row in df.select("EMAIL").collect()]

In [17]:
# Print the collected list of EMAIL values.
print(column_list)

['DOCONNEL', 'DGRANT', 'JWHALEN', 'MHARTSTE', 'PFAY', 'SMAVRIS', 'HBAER', 'SHIGGINS', 'WGIETZ', 'SKING', 'NKOCHHAR', 'LDEHAAN', 'AHUNOLD', 'BERNST', 'DAUSTIN', 'VPATABAL', 'DLORENTZ', 'NGREENBE', 'DFAVIET', 'JCHEN', 'ISCIARRA', 'JMURMAN', 'LPOPP', 'DRAPHEAL', 'AKHOO', 'SBAIDA', 'STOBIAS', 'GHIMURO', 'KCOLMENA', 'MWEISS', 'AFRIPP', 'PKAUFLIN', 'SVOLLMAN', 'KMOURGOS', 'JNAYER', 'IMIKKILI', 'JLANDRY', 'SMARKLE', 'LBISSOT', 'MATKINSO', 'JAMRLOW', 'TJOLSON', 'JMALLIN', 'MROGERS', 'KGEE', 'HPHILTAN', 'RLADWIG', 'SSTILES', 'JSEO', 'JPATEL']


In [18]:
# Count rows through the DataFrame API directly; converting to an RDD first
# is unnecessary work for the same number.
row_count = df.count()

In [19]:
# Report the number of rows held in `row_count`.
print(f'The DataFrame has {row_count} rows.')

The DataFrame has 50 rows.


In [22]:
# Preview a derived "NewSalary" column equal to ten times SALARY. The result
# is only displayed; `df` itself is not reassigned.
df.withColumn("NewSalary", df["SALARY"] * 10).show()

+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+---------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|NewSalary|
+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+---------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|  SH_CLERK|  2600|            - |       124|           50|    26000|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-JAN-08|  SH_CLERK|  2600|            - |       124|           50|    26000|
|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-SEP-03|   AD_ASST|  4400|            - |       101|           10|    44000|
|        201|   Michael|Hartstein|MHARTSTE|515.123.5555|17-FEB-04|    MK_MAN| 13000|            - |       100|           20|   130000|
|        202|       Pat|      Fay|    PFAY|603.123.6666

In [23]:
# Demonstrate drop(). `df` has no "NewSalary" column (the earlier withColumn
# result was displayed but never assigned), so dropping it from `df` directly
# would silently do nothing. Add the column here first so the drop actually
# removes something.
(
    df.withColumn("NewSalary", col("SALARY") * 10)
    .drop("NewSalary")
    .show()
)

+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|  SH_CLERK|  2600|            - |       124|           50|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-JAN-08|  SH_CLERK|  2600|            - |       124|           50|
|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-SEP-03|   AD_ASST|  4400|            - |       101|           10|
|        201|   Michael|Hartstein|MHARTSTE|515.123.5555|17-FEB-04|    MK_MAN| 13000|            - |       100|           20|
|        202|       Pat|      Fay|    PFAY|603.123.6666|17-AUG-05|    MK_REP|  6000|            - |       201|           20|
