Here, we will solve the problem in two ways:
1. First, using PySpark DataFrame functions
2. Second, using Spark SQL

In [1]:
# Import the required libraries (the Spark session is started in the next cell)
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) the Spark session for this notebook.
spark = SparkSession.builder.appName("problem5").getOrCreate()
# A SparkSession already exposes .sql(), so reuse it under the legacy
# `sqlContext` name. Calling SparkSession(spark) is incorrect: that
# constructor expects a SparkContext, not another SparkSession.
sqlContext = spark
# Only log errors; suppress Spark's WARN/INFO messages.
spark.sparkContext.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/02/09 11:10:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read station.csv into a DataFrame: the first line is treated as the header
# and column types are inferred from the data.
stationdf = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("station.csv")
)

                                                                                

In [4]:
# Print the DataFrame schema (column names, inferred types, nullability)
stationdf.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Lattitude: double (nullable = true)
 |-- Longitude: double (nullable = true)



In [5]:
# Display a sample of rows to sanity-check the loaded data
stationdf.show()

+---+-----------+-----+-----------+-----------+
| ID|       City|State|  Lattitude|  Longitude|
+---+-----------+-----+-----------+-----------+
|478|     Tipton|   IN|33.54792701|97.94286036|
|619|  Arlington|   CO|75.17993079|92.94615894|
|711|     Turner|   AR|50.24380534|101.4580163|
|839|    Slidell|   LA|85.32270304|151.8743276|
|411|    Negreet|   LA| 98.9707194|105.3376115|
|588|    Glencoe|   KY|46.38739244|136.0427027|
|665|    Chelsea|   IA|98.72210937|59.68913002|
|733|Pelahatchie|   MS|38.58161595|28.11950703|
|811|   Dorrance|   KS|102.0888316|121.5614372|
|698|     Albany|   CA|49.75112765|80.21211317|
|325|   Monument|   KS|70.52300953|141.7680413|
|414| Manchester|   MD|73.51580724|37.14602869|
|113|   Prescott|   IA|39.93234421|65.79327823|
|971|Graettinger|   IA|94.66283665|150.3826243|
|266|     Cahone|   CO|116.2321963| 127.009554|
|617|    Sturgis|   MS|36.45673517|126.1690696|
|495|    Upperco|   MD|114.2157413|29.63104758|
|473|   Highwood|   IL|27.25445814|150.9

In [17]:
# Solving the problem using PySpark DataFrame functions.
# List the distinct City names that start with a vowel (A, E, I, O, U).
# distinct() removes duplicate city names so this result agrees with the
# Spark SQL version of the query, which uses SELECT DISTINCT.
(
    stationdf
    .select("City")
    .where("Left(City,1) IN ('A','E','I','O','U')")
    .distinct()
    .show(n=100)
)

+-------------+
|         City|
+-------------+
|    Arlington|
|       Albany|
|      Upperco|
|      Aguanga|
|         Odin|
|      Algonac|
|       Onaway|
|    Irvington|
|   Arrowsmith|
|        Udall|
|     Oakfield|
|       Elkton|
|          Amo|
|      Alanson|
|       Eleele|
|       Auburn|
|       Oconee|
|     Amazonia|
|Andersonville|
|         Eros|
|  Arkadelphia|
|      Eriline|
|    Edgewater|
|     Eastlake|
|      Addison|
|      Everton|
|       Eustis|
|       Arispe|
|    Ottertail|
|       Ermine|
|       Albion|
|       Athens|
|      Eufaula|
|      Andover|
|      Osborne|
|      Oshtemo|
+-------------+



In [6]:
# Now solve the same problem using Spark SQL.
# Register the DataFrame as a temporary view named "tmpStation" so it can be queried with SQL.
stationdf.createOrReplaceTempView("tmpStation")

In [7]:
# With the temp view registered, we can run SQL queries against it.
# For example, select every column from the view and show a sample of rows.
sqlContext.sql("SELECT * FROM tmpStation").show()

+---+-----------+-----+-----------+-----------+
| ID|       City|State|  Lattitude|  Longitude|
+---+-----------+-----+-----------+-----------+
|478|     Tipton|   IN|33.54792701|97.94286036|
|619|  Arlington|   CO|75.17993079|92.94615894|
|711|     Turner|   AR|50.24380534|101.4580163|
|839|    Slidell|   LA|85.32270304|151.8743276|
|411|    Negreet|   LA| 98.9707194|105.3376115|
|588|    Glencoe|   KY|46.38739244|136.0427027|
|665|    Chelsea|   IA|98.72210937|59.68913002|
|733|Pelahatchie|   MS|38.58161595|28.11950703|
|811|   Dorrance|   KS|102.0888316|121.5614372|
|698|     Albany|   CA|49.75112765|80.21211317|
|325|   Monument|   KS|70.52300953|141.7680413|
|414| Manchester|   MD|73.51580724|37.14602869|
|113|   Prescott|   IA|39.93234421|65.79327823|
|971|Graettinger|   IA|94.66283665|150.3826243|
|266|     Cahone|   CO|116.2321963| 127.009554|
|617|    Sturgis|   MS|36.45673517|126.1690696|
|495|    Upperco|   MD|114.2157413|29.63104758|
|473|   Highwood|   IL|27.25445814|150.9

In [9]:
# Query the distinct CITY names that start with a vowel (A, E, I, O, U).
# LEFT(CITY,1) takes the first character of the name; DISTINCT removes duplicate names.
sqlContext.sql("SELECT DISTINCT(CITY) FROM tmpStation WHERE LEFT(CITY,1) IN ('A','E','I','O','U')").show(n=100)

+-------------+
|         CITY|
+-------------+
|       Auburn|
|Andersonville|
|     Eastlake|
|       Albany|
|      Aguanga|
|       Onaway|
|      Andover|
|      Algonac|
|     Amazonia|
|  Arkadelphia|
|       Arispe|
|       Eustis|
|        Udall|
|       Athens|
|    Ottertail|
|      Upperco|
|       Ermine|
|      Eufaula|
|      Alanson|
|    Arlington|
|   Arrowsmith|
|      Oshtemo|
|    Irvington|
|       Elkton|
|       Eleele|
|       Oconee|
|     Oakfield|
|          Amo|
|      Addison|
|       Albion|
|      Everton|
|      Osborne|
|      Eriline|
|    Edgewater|
|         Eros|
|         Odin|
+-------------+

