## Reading CSV files into Spark DataFrames and processing them

#### Example of using spark.read.csv()

In [406]:
# Read the CSV with default reader options: no header row is assumed, so the
# columns come back with generated names (_c0, _c1, ...), as the output below shows.
df = spark.read.csv("/Users/binggangliu/Downloads/WaterSites_info.csv")

In [407]:
df.show(6)

+--------------------+-----------------+----+---------------+-------+------+----------+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|                 _c0|              _c1| _c2|            _c3|    _c4|   _c5|       _c6|        _c7|                 _c8|                 _c9|                _c10|                _c11|                _c12|                _c13|                _c14|                _c15|                _c16|                _c17|                _c18|                _c19|                _c20|                _c21|                _c22|                _c23|                _c24|                _c25| _c26|
+--------------------+----------

In [408]:
# Re-read the same file, this time asking Spark to take the column names from
# the first line (header) and to infer each column's data type (inferSchema).
df = (
    spark.read
    .options(inferSchema=True, header=True)
    .csv("/Users/binggangliu/Downloads/WaterSites_info.csv")
)
df.show(6)

+--------------------+-----------------+-----+---------------+-------------+-------------+----------+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|         Data Export|              _c1|  _c2|            _c3|          _c4|          _c5|       _c6|        _c7|                 _c8|                 _c9|                _c10|                _c11|                _c12|                _c13|                _c14|                _c15|                _c16|                _c17|                _c18|                _c19|                _c20|                _c21|                _c22|                _c23|                _c24|                _c25| _c26|
+---

##### `inferSchema = True` lets Spark automatically infer the data type of each column when reading the csv file; `header = True` makes Spark use the first line of the file as the column names instead of treating it as data. (In this example the two options make little practical difference: the file does not start with a real header row, so only the first column gets a name, 'Data Export', and every column is still read as a string, as the schema below shows.)

In [409]:
df.printSchema()

root
 |-- Data Export: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)
 |-- _c12: string (nullable = true)
 |-- _c13: string (nullable = true)
 |-- _c14: string (nullable = true)
 |-- _c15: string (nullable = true)
 |-- _c16: string (nullable = true)
 |-- _c17: string (nullable = true)
 |-- _c18: string (nullable = true)
 |-- _c19: string (nullable = true)
 |-- _c20: string (nullable = true)
 |-- _c21: string (nullable = true)
 |-- _c22: string (nullable = true)
 |-- _c23: string (nullable = true)
 |-- _c24: string (nullable = true)
 |-- _c25: string (nullable = true)
 |-- _c26: string (nullable = true)



#####  There are a few rows having null values in all columns that need to be skipped. Below, first try to generate a column with unique IDs, then find the rows that need to be filtered out and their corresponding IDs.

In [410]:
from pyspark.sql.functions import monotonically_increasing_id
# Preview the generated IDs next to the '_c4' values to see which leading rows
# are empty (null) before deciding on an ID threshold to filter by.
df.select(monotonically_increasing_id().alias('id')).show(10)
df.select('_c4').show(10)

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
+---+
only showing top 10 rows

+-------------+
|          _c4|
+-------------+
|         null|
|         null|
|         null|
|         null|
|      Country|
|United States|
|United States|
|United States|
|        China|
|      Germany|
+-------------+
only showing top 10 rows



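#####  It looks like there are 4 empty rows that need to be filtered out. Below, add a new ID column using withColumn(), then filter out the empty rows using the ID column. (Note: this relies on the IDs being consecutive from 0, which monotonically_increasing_id() only guarantees within a single partition; it works here because the file is small.)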

In [433]:
# Attach a unique, increasing 64-bit ID to every row. withColumn() already
# names the new column, so no extra alias is required.
# NOTE(review): the IDs are unique and increasing but are only consecutive
# within one partition; the positional filtering done later assumes the file
# was read as a single partition -- confirm before reusing on larger inputs.
df_id = df.withColumn('ID', monotonically_increasing_id())

In [434]:
# Keep only rows whose ID is above 3, i.e. discard the four leading rows
# (IDs 0-3) that were found to be empty in the preview above.
df_id_filter = df_id.where(df_id['ID'] > 3)

In [435]:
df_id_filter.show(4)

+--------------------+-----------------+-------+---------------+-------------+-------------+----------+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+---+
|         Data Export|              _c1|    _c2|            _c3|          _c4|          _c5|       _c6|        _c7|                 _c8|                 _c9|                _c10|                _c11|                _c12|                _c13|                _c14|                _c15|                _c16|                _c17|                _c18|                _c19|                _c20|                _c21|                _c22|                _c23|                _c24|                _c25| _c

#####  Now it is time to remove the columns that are not useful

In [436]:
df_id_filter.columns

['Data Export',
 '_c1',
 '_c2',
 '_c3',
 '_c4',
 '_c5',
 '_c6',
 '_c7',
 '_c8',
 '_c9',
 '_c10',
 '_c11',
 '_c12',
 '_c13',
 '_c14',
 '_c15',
 '_c16',
 '_c17',
 '_c18',
 '_c19',
 '_c20',
 '_c21',
 '_c22',
 '_c23',
 '_c24',
 '_c25',
 '_c26',
 'ID']

In [437]:
len(df_id_filter.columns)

28

##### Drop the unnecessary columns.

In [438]:
# Columns that are not needed downstream: the first two columns plus the
# trailing block '_c12' .. '_c26'. What remains is '_c2' .. '_c11' and 'ID'.
unused_columns = ['Data Export', '_c1'] + ['_c%d' % idx for idx in range(12, 27)]
dfd = df_id_filter.drop(*unused_columns)

In [440]:
dfd.show(6)

+--------+---------------+-------------+-------------+----------+-----------+--------------------+--------------------+--------------------+--------------------+---+
|     _c2|            _c3|          _c4|          _c5|       _c6|        _c7|                 _c8|                 _c9|                _c10|                _c11| ID|
+--------+---------------+-------------+-------------+----------+-----------+--------------------+--------------------+--------------------+--------------------+---+
|    City|State/ Province|      Country|       Region|Latitude 1|Longitude 1|8A Delivered Sour...|8B Delivered Sour...|8B Surface and Ra...|8B Ground Water S...|  4|
|   Blair|             NE|United States|NORTH AMERICA|        42|        -96|                   0|             688,905|                   0|                   0|  5|
|   Blair|             NE|United States|NORTH AMERICA|        42|        -96|                   0|           9,488,173|                   0|                   0|  6|
| Me

##### Rename the automatically generated header columns

In [441]:
# Map the generated column names to readable ones, taken from the original
# header row that is still present as the first data row.
readable_names = [
    ('_c2', 'City'),
    ('_c3', 'State'),
    ('_c4', 'Country'),
    ('_c5', 'Region'),
    ('_c6', 'Latitude'),
    ('_c7', 'Longitude'),
]
dff = dfd
for generated_name, readable_name in readable_names:
    dff = dff.withColumnRenamed(generated_name, readable_name)

In [442]:
dff.show(5)

+--------+---------------+-------------+-------------+----------+-----------+--------------------+--------------------+--------------------+--------------------+---+
|    City|          State|      Country|       Region|  Latitude|  Longitude|                 _c8|                 _c9|                _c10|                _c11| ID|
+--------+---------------+-------------+-------------+----------+-----------+--------------------+--------------------+--------------------+--------------------+---+
|    City|State/ Province|      Country|       Region|Latitude 1|Longitude 1|8A Delivered Sour...|8B Delivered Sour...|8B Surface and Ra...|8B Ground Water S...|  4|
|   Blair|             NE|United States|NORTH AMERICA|        42|        -96|                   0|             688,905|                   0|                   0|  5|
|   Blair|             NE|United States|NORTH AMERICA|        42|        -96|                   0|           9,488,173|                   0|                   0|  6|
| Me

In [443]:
dff.printSchema()

root
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)
 |-- ID: long (nullable = false)



#### Remove the commas in the number strings for columns '_c8' to '_c11'

In [444]:
from pyspark.sql.functions import regexp_replace
from pyspark.sql.types import IntegerType, FloatType

# Strip the thousands separators (e.g. '9,488,173' -> '9488173') from the four
# quantity columns so that they can be cast to numbers afterwards.
for quantity_column in ('_c8', '_c9', '_c10', '_c11'):
    dff = dff.withColumn(quantity_column, regexp_replace(quantity_column, ',', ''))

In [445]:
dff.select('_c8', '_c9', '_c10', '_c11').show(8)

+--------------------+--------------------+--------------------+--------------------+
|                 _c8|                 _c9|                _c10|                _c11|
+--------------------+--------------------+--------------------+--------------------+
|8A Delivered Sour...|8B Delivered Sour...|8B Surface and Ra...|8B Ground Water S...|
|                   0|              688905|                   0|                   0|
|                   0|             9488173|                   0|                   0|
|                   0|             7037363|                   0|                   0|
|                   0|             3315709|                   0|                   0|
|                   0|               13295|                   0|             2901482|
|                   0|               22685|                   0|             2653627|
|                   0|             1575256|                   0|              788616|
+--------------------+--------------------+-----------

#### Convert the number strings to float data type

In [446]:
# Cast the quantity columns and the coordinates from string to float.
# Values that are not numeric (such as the leftover header row) become null.
# NOTE(review): FloatType is single precision, so whole numbers above
# 16,777,216 are no longer exact -- fine for the values shown here, but
# consider DoubleType if larger quantities are expected.
for numeric_column in ('_c8', '_c9', '_c10', '_c11', 'Latitude', 'Longitude'):
    dff = dff.withColumn(numeric_column, dff[numeric_column].cast(FloatType()))

In [447]:
dff.printSchema()

root
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Latitude: float (nullable = true)
 |-- Longitude: float (nullable = true)
 |-- _c8: float (nullable = true)
 |-- _c9: float (nullable = true)
 |-- _c10: float (nullable = true)
 |-- _c11: float (nullable = true)
 |-- ID: long (nullable = false)



#### Columns '_c8' and '_c9' need to be added together and the sum given a readable name; the same applies to columns '_c10' and '_c11'

In [448]:
# Row-wise totals: plain column arithmetic is enough here, no aggregate
# function is needed.
delivered_total = dff['_c8'] + dff['_c9']
surface_total = dff['_c10'] + dff['_c11']
dff_sum = (
    dff
    .withColumn('Delivered', delivered_total)
    .withColumn('Surface', surface_total)
)

In [449]:
dff_sum.printSchema()

root
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Latitude: float (nullable = true)
 |-- Longitude: float (nullable = true)
 |-- _c8: float (nullable = true)
 |-- _c9: float (nullable = true)
 |-- _c10: float (nullable = true)
 |-- _c11: float (nullable = true)
 |-- ID: long (nullable = false)
 |-- Delivered: float (nullable = true)
 |-- Surface: float (nullable = true)



In [450]:
dff_sum.show(5)

+--------+---------------+-------------+-------------+--------+---------+----+---------+----+----+---+---------+-------+
|    City|          State|      Country|       Region|Latitude|Longitude| _c8|      _c9|_c10|_c11| ID|Delivered|Surface|
+--------+---------------+-------------+-------------+--------+---------+----+---------+----+----+---+---------+-------+
|    City|State/ Province|      Country|       Region|    null|     null|null|     null|null|null|  4|     null|   null|
|   Blair|             NE|United States|NORTH AMERICA|    42.0|    -96.0| 0.0| 688905.0| 0.0| 0.0|  5| 688905.0|    0.0|
|   Blair|             NE|United States|NORTH AMERICA|    42.0|    -96.0| 0.0|9488173.0| 0.0| 0.0|  6|9488173.0|    0.0|
| Memphis|             TN|United States|NORTH AMERICA|    35.0|    -90.0| 0.0|7037363.0| 0.0| 0.0|  7|7037363.0|    0.0|
|Songyuan|          Jilin|        China| ASIA/PACIFIC|    45.0|    125.0| 0.0|3315709.0| 0.0| 0.0|  8|3315709.0|    0.0|
+--------+---------------+------

#### Now columns '_c8', '_c9', '_c10', '_c11' need to be removed

In [451]:
# The four source columns have been folded into 'Delivered' and 'Surface',
# so they can be discarded.
summed_inputs = ('_c8', '_c9', '_c10', '_c11')
dff2 = dff_sum.drop(*summed_inputs)

In [452]:
dff2.show(7)

+-----------+---------------+-------------+-------------+--------+---------+---+---------+---------+
|       City|          State|      Country|       Region|Latitude|Longitude| ID|Delivered|  Surface|
+-----------+---------------+-------------+-------------+--------+---------+---+---------+---------+
|       City|State/ Province|      Country|       Region|    null|     null|  4|     null|     null|
|      Blair|             NE|United States|NORTH AMERICA|    42.0|    -96.0|  5| 688905.0|      0.0|
|      Blair|             NE|United States|NORTH AMERICA|    42.0|    -96.0|  6|9488173.0|      0.0|
|    Memphis|             TN|United States|NORTH AMERICA|    35.0|    -90.0|  7|7037363.0|      0.0|
|   Songyuan|          Jilin|        China| ASIA/PACIFIC|    45.0|    125.0|  8|3315709.0|      0.0|
|    Krefeld|        GERMANY|      Germany|       EUROPE|    51.0|      7.0|  9|  13295.0|2901482.0|
|Castelmassa|          ITALY|        Italy|       EUROPE|    45.0|     11.0| 10|  22685.0|2

####  Remove the original csv header

In [453]:
# The row with ID 4 holds the original CSV header text ('City',
# 'State/ Province', ...) rather than data, so exclude it.
dff3 = dff2.where(dff2['ID'] != 4)

In [454]:
dff3.show(10)

+------------+-----------+--------------+-------------+--------+---------+---+---------+---------+
|        City|      State|       Country|       Region|Latitude|Longitude| ID|Delivered|  Surface|
+------------+-----------+--------------+-------------+--------+---------+---+---------+---------+
|       Blair|         NE| United States|NORTH AMERICA|    42.0|    -96.0|  5| 688905.0|      0.0|
|       Blair|         NE| United States|NORTH AMERICA|    42.0|    -96.0|  6|9488173.0|      0.0|
|     Memphis|         TN| United States|NORTH AMERICA|    35.0|    -90.0|  7|7037363.0|      0.0|
|    Songyuan|      Jilin|         China| ASIA/PACIFIC|    45.0|    125.0|  8|3315709.0|      0.0|
|     Krefeld|    GERMANY|       Germany|       EUROPE|    51.0|      7.0|  9|  13295.0|2901482.0|
| Castelmassa|      ITALY|         Italy|       EUROPE|    45.0|     11.0| 10|  22685.0|2653627.0|
|  Manchester|    ENGLAND|United Kingdom|       EUROPE|    53.0|     -2.0| 11|1575256.0| 788616.0|
|Sas van G

In [455]:
# The helper ID column was only needed to filter rows by position; remove it
# now that the empty rows and the header row are gone.
dff3 = dff3.drop('ID')

In [456]:
dff3.show(10)

+------------+-----------+--------------+-------------+--------+---------+---------+---------+
|        City|      State|       Country|       Region|Latitude|Longitude|Delivered|  Surface|
+------------+-----------+--------------+-------------+--------+---------+---------+---------+
|       Blair|         NE| United States|NORTH AMERICA|    42.0|    -96.0| 688905.0|      0.0|
|       Blair|         NE| United States|NORTH AMERICA|    42.0|    -96.0|9488173.0|      0.0|
|     Memphis|         TN| United States|NORTH AMERICA|    35.0|    -90.0|7037363.0|      0.0|
|    Songyuan|      Jilin|         China| ASIA/PACIFIC|    45.0|    125.0|3315709.0|      0.0|
|     Krefeld|    GERMANY|       Germany|       EUROPE|    51.0|      7.0|  13295.0|2901482.0|
| Castelmassa|      ITALY|         Italy|       EUROPE|    45.0|     11.0|  22685.0|2653627.0|
|  Manchester|    ENGLAND|United Kingdom|       EUROPE|    53.0|     -2.0|1575256.0| 788616.0|
|Sas van Gent|NETHERLANDS|   Netherlands|       EU

In [457]:
dff3.describe('Delivered').show()

+-------+------------------+
|summary|         Delivered|
+-------+------------------+
|  count|               141|
|   mean| 463472.3475177305|
| stddev|1235078.1644725103|
|    min|               0.0|
|    max|         9488173.0|
+-------+------------------+



#### List the cities with top 'Delivered' values and top 'Surface' values

In [458]:
from pyspark.sql.functions import asc, desc
# Rank the sites from largest to smallest value, once per measure.
location_columns = ['City', 'State', 'Country']
Delivered_desc = (
    dff3
    .select(*(location_columns + ['Delivered']))
    .orderBy('Delivered', ascending=False)
)
Surface_desc = (
    dff3
    .select(*(location_columns + ['Surface']))
    .orderBy('Surface', ascending=False)
)

In [459]:
Delivered_desc.show()

+------------+-----------+--------------+---------+
|        City|      State|       Country|Delivered|
+------------+-----------+--------------+---------+
|       Blair|         NE| United States|9488173.0|
|     Memphis|         TN| United States|7037363.0|
|      Dayton|         OH| United States|6188027.0|
|    Songyuan|      Jilin|         China|3315709.0|
|Cedar Rapids|         IA| United States|3056426.0|
|  Wapello Co|         IA| United States|2381374.0|
|Sas van Gent|NETHERLANDS|   Netherlands|2358100.0|
|  High River|         AB|        Canada|2281672.0|
|  Fort Morga|         CO| United States|2142050.0|
|      Dayton|         VA| United States|1661939.0|
|  Springdale|         AR| United States|1577397.0|
|  Manchester|    ENGLAND|United Kingdom|1575256.0|
|       Barby|    GERMANY|       Germany|1543247.0|
|   Plainview|         TX| United States|1505837.0|
|    Bielany |     POLAND|        Poland|1308366.0|
|      Bergen|NETHERLANDS|   Netherlands|1246625.0|
|  Californi

In [460]:
Surface_desc.show()


+--------------------+------------+-------------+---------+
|                City|       State|      Country|  Surface|
+--------------------+------------+-------------+---------+
|           Eddyville|          IA|United States|7047098.0|
|          Spiritwood|          ND|United States|3262601.0|
|             Krefeld|     GERMANY|      Germany|2901482.0|
|              Franca|      BRAZIL|       Brazil|2717358.0|
|         Castelmassa|       ITALY|        Italy|2653627.0|
|            Wahpeton|          ND|United States|2538523.0|
|          Dodge City|          KS|United States|2517785.0|
|            Schuyler|          NE|United States|2505549.0|
|          Beardstown|          IL|United States|2199964.0|
|              Friona|          TX|United States|2033228.0|
|             Hammond|          IN|United States|1751402.0|
|Martorell/Sante D...|       SPAIN|        Spain|1710833.0|
|             Efremov|      RUSSIA|       Russia|1495168.0|
|           Eddyville|          IA|Unite

### Example of using sqlContext.read.load()

#### Read and load csv file with sqlContext.read.load()

In [383]:
# SQLContext has to be imported by name: it lives in pyspark.sql and is not
# provided by the wildcard import from pyspark.sql.types. Without this import
# the cell raises NameError unless the environment already has SQLContext in scope.
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import *

# `sc` is the SparkContext, assumed to be provided by the notebook environment.
sqlContext = SQLContext(sc)

In [384]:
# Load the calendar-events CSV through the generic reader: the first line is
# used as the header, column types are inferred, and the literal string 'NA'
# is read as null.
df2 = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferSchema='true', nullValue='NA')
    .load("/Users/binggangliu/Downloads/CalEvent.csv")
)

In [385]:
df2.printSchema()

root
 |-- Start Date : string (nullable = true)
 |-- Start Time: string (nullable = true)
 |-- End Date: string (nullable = true)
 |-- End Time: string (nullable = true)
 |-- Event Title : string (nullable = true)
 |-- All Day Event: string (nullable = true)
 |-- No End Time: string (nullable = true)
 |-- Event Description: string (nullable = true)
 |-- Contact : string (nullable = true)
 |-- Contact Email: string (nullable = true)
 |-- Contact Phone: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Category: integer (nullable = true)
 |-- Mandatory: string (nullable = true)
 |-- Registration: string (nullable = true)
 |-- Maximum: integer (nullable = true)
 |-- Last Date To Register: string (nullable = true)



In [386]:
# Give the registration-deadline column a short name without spaces so it is
# easier to reference in later selects.
df3 = df2.withColumnRenamed('Last Date To Register', 'DeadEnd')

In [387]:
df3.printSchema()

root
 |-- Start Date : string (nullable = true)
 |-- Start Time: string (nullable = true)
 |-- End Date: string (nullable = true)
 |-- End Time: string (nullable = true)
 |-- Event Title : string (nullable = true)
 |-- All Day Event: string (nullable = true)
 |-- No End Time: string (nullable = true)
 |-- Event Description: string (nullable = true)
 |-- Contact : string (nullable = true)
 |-- Contact Email: string (nullable = true)
 |-- Contact Phone: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Category: integer (nullable = true)
 |-- Mandatory: string (nullable = true)
 |-- Registration: string (nullable = true)
 |-- Maximum: integer (nullable = true)
 |-- DeadEnd: string (nullable = true)



In [388]:
# show() only prints the rows and returns None, so keep the DataFrame of
# distinct registration deadlines in df4 first and display it separately.
df4 = df3.select("DeadEnd").distinct()
df4.show()

+-------+
|DeadEnd|
+-------+
| 9/2/11|
+-------+



In [73]:
# Distinct (start date, event title) pairs. The column names really do end
# with a trailing space in this file, so the space must be kept.
# show() returns None, so assign the DataFrame first and display it afterwards.
df4 = df3.select('Start Date ', 'Event Title ').distinct()
df4.show()

+-----------+--------------------+
|Start Date |        Event Title |
+-----------+--------------------+
|     9/5/11|Social Studies De...|
|     9/5/11|  Curriculum Meeting|
+-----------+--------------------+

