In [None]:
# Bootstrap PySpark for this kernel.
# findspark.init() has to run BEFORE `import pyspark`: it makes the Spark
# installation at the given path importable from this Python process.
import findspark

findspark.init("/opt/cloudera/parcels/CDH-6.2.1-1.cdh6.2.1.p0.1425774/lib/spark")

import pyspark
from pyspark.sql import SparkSession

# getOrCreate() reuses a context that is already running, so re-running this
# cell no longer fails with "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()

# The session is created on top of (or reuses) the active context.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Load the car-sales CSV: the first line is treated as the header and the
# column types are inferred by Spark.
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/user/glbigdata12/Cars_Sale.csv")
)


In [None]:
# Print the column names and the types Spark inferred for them.
df.printSchema()

In [None]:
# print() on a DataFrame emits only its repr (by default a one-line summary of
# column names and types), not the rows; show() is what displays data.
print(df)

In [None]:
# Display the first rows of the inferred-schema DataFrame.
df.show()

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, FloatType, LongType, IntegerType, DateType

# Explicit schema for Cars_Sale.csv: four text columns followed by six numeric
# ones. Latest_Launch is kept as a plain string; every field is nullable.
_text_fields = ["Manufacturer", "Model", "Vehicle_type", "Latest_Launch"]
_numeric_fields = ["Units_Sold", "Units_Price", "Cost_incurred", "Revenue", "Cost", "Profit"]

schema = StructType(
    [StructField(field, StringType(), True) for field in _text_fields]
    + [StructField(field, DoubleType(), True) for field in _numeric_fields]
)

# Re-read the same file, this time applying the schema instead of inferring it.
df1 = (
    spark.read
    .format("csv")
    .option("header", "true")
    .schema(schema)
    .load("/user/glbigdata12/Cars_Sale.csv")
)

# Show the resulting column types.
df1.printSchema()


In [None]:
# Display the first rows of the explicitly-typed DataFrame.
df1.show()

In [None]:
from pyspark.sql.functions import col, column
import pyspark.sql.functions as F

# Project a handful of columns. Plain names and Column objects can be mixed in
# one select(); F.lit() appends a constant-valued column to every row.
df1.select(
    "Manufacturer",
    "Vehicle_type",
    F.col("Model"),
    "Latest_Launch",
    "Units_Sold",
    "Revenue",
    F.lit('DefaultValue'),
).show(n=4, truncate=False)

In [None]:
# "*" selects every column; show four rows without truncating long values.
df1.select("*").show(n=4, truncate=False)

In [None]:
# List the column names of df1 (displayed as the cell's output).
df1.columns

In [None]:
# Rename four columns. Each withColumnRenamed() returns a new DataFrame, so
# df1 itself keeps its original column names.
df2 = (
    df1
    .withColumnRenamed("Vehicle_type", "VehicleType")
    .withColumnRenamed('Units_Sold', 'units_sold')
    .withColumnRenamed('Units_Price', 'units_price')
    .withColumnRenamed('Cost_incurred', 'units_cost')
)

In [None]:
# Renamed columns, first three rows, values not truncated.
df2.show(n=3, truncate=False)

In [None]:
# The source DataFrame still carries the original column names.
df1.show(n=4, truncate=False)

In [None]:
# Adding a column to a DataFrame.
import pyspark.sql.functions as F

# withColumn() + F.lit(): every row gets the same constant value in the new
# "Register_Site" column.
dataDF = df1.withColumn(
    "Register_Site",
    F.lit("www.google.com"),
)

# Preview only the identifying columns plus the new one.
dataDF.select(
    "Manufacturer",
    "Vehicle_type",
    "Model",
    "Register_Site",
).show(n=3, truncate=False)

In [None]:
# Removing columns from a DataFrame.

# State before the drop: column count and column names.
print("Number of columns : ", len(dataDF.columns))
print(dataDF.columns)

# drop() returns a new DataFrame without "Vehicle_type" and "Model".
datanewDF = dataDF.drop("Vehicle_type", "Model")

# State after the drop: column count and column names.
print("Number of columns : ", len(datanewDF.columns))
print(datanewDF.columns)

In [None]:
# Column arithmetic.

# Column count before the derived column is added.
print("Number of columns : ", len(df1.columns))

# TotalSale = Units_Sold * Units_Price, computed row by row.
newDF = df1.withColumn(
    "TotalSale",
    df1["Units_Sold"] * df1["Units_Price"],
)

# Column count after the derived column is added.
print("Number of columns : ", len(newDF.columns))

# Preview the result.
newDF.show(n=3)

In [None]:
# Row filter on a single column: keep only Cadillac rows.
df1.filter(df1["Manufacturer"] == "Cadillac").show(n=5)

In [None]:
# Row filter on two columns. Column conditions are combined with `&`, and each
# comparison needs its own parentheses because `&` binds tighter than `==`.
df1.filter(
    (df1["Manufacturer"] == "Cadillac")
    & (df1["Vehicle_type"] == "Passenger")
).show(n=5)

In [None]:
# Sample data for the duplicate-handling examples: (ID, Month) pairs that
# deliberately contain repeated rows.
testDF = [
    [1, "January"], [2, "February"], [1, "January"],
    [3, "March"], [3, "March"], [3, "March"],
    [4, "April"], [4, "April"],
    [5, "May"], [5, "May"],
    [4, "April"], [6, "June"], [5, "April"],
]

# Type classes used for the schema below.
from pyspark.sql.types import *

# Two columns: integer ID and string Month.
schema = StructType([
    StructField("ID", IntegerType()),
    StructField("Month", StringType()),
])

# Build the DataFrame from the Python list using that schema, then display it.
df_new = spark.createDataFrame(testDF, schema=schema)
df_new.show()

In [None]:
# Display distinct rows: distinct() compares entire rows, so a row is removed
# only when every column matches another row.
df_new.distinct().show()

In [None]:
# De-duplicate on "Month" only: one row is kept per distinct month.
df_new.dropDuplicates(subset=['Month']).show()

# De-duplicate on the (Month, ID) combination.
df_new.dropDuplicates(subset=['Month', 'ID']).show()

In [None]:
# Rename existing columns: "Units_Price" -> "UnitPrice", "Profit" -> "Total_Profit".
newDF1 = df1.withColumnRenamed("Units_Price", "UnitPrice").withColumnRenamed("Profit", "Total_Profit")

# Display the *renamed* DataFrame. Showing df1 here would print the old column
# names, because withColumnRenamed() never modifies its source.
newDF1.show(3)

from pyspark.sql.functions import expr  # SQL-expression helper

# expr() parses a SQL snippet into a column: label each row 'Good' when
# Total_Profit exceeds 104, otherwise 'Average'.
newDF1.select("Manufacturer", "Model",expr("CASE WHEN Total_Profit > 104 THEN  'Good' ELSE 'Average' END AS value_desc")).show(3)

In [None]:
from pyspark.sql.types import *   # import the libraries

# Sample rows for the null-handling examples: two of the three entries have no
# Experience value (None becomes null in the DataFrame).
list_data = [
    ["Bill Gates", 23],
    ["Henry Ford", None],
    ["Tim Cook", None],
]

# Two columns: string Name and integer Experience.
schema = StructType([
    StructField("Name", StringType()),
    StructField("Experience", IntegerType()),
])

# Build and display the DataFrame.
df_new = spark.createDataFrame(list_data, schema=schema)
df_new.show()

In [None]:
# Remove every row that contains a null in any column.
df_new.dropna().show()

In [None]:
# Replace nulls with the constant 34. An integer fill value is applied to
# numeric columns only, so here it affects "Experience".
df_new.na.fill(34).show()

In [None]:
# Swap one value for another wherever it occurs.
df_new.replace('Bill Gates', 'Satya Nadella').show()

In [None]:
# Replace several values at once, restricted to the "Name" column (the two
# lists are matched position by position), then fill remaining nulls with 40.
df_new.replace(
    to_replace=['Bill Gates', 'Tim Cook'],
    value=['Satya N', 'Time'],
    subset='Name',
).na.fill(40).show()

In [None]:
import pyspark.sql.functions as F

# Rename the existing column "Profit" to "Total_Profit".
newDF1 = df1.withColumnRenamed("Profit", "Total_Profit")

# Highest Total_Profit per manufacturer, reported in a column named "Maximum".
# The alias must be applied to the aggregate *column*: DataFrame.alias() only
# names the DataFrame itself, which would leave the output column called
# "max(Total_Profit)".
newDF1.groupBy("Manufacturer").agg(F.max("Total_Profit").alias("Maximum")).show(10, False)

In [None]:
# Number of models per manufacturer. The dict form of agg() maps a column name
# to the name of the aggregate function to apply to it.
(
    newDF1
    .groupBy("Manufacturer")
    .agg({'Model': 'count'})
    .show(n=10, truncate=False)
)

In [None]:
from pyspark.sql.functions import avg # include the library

# Average of "Total_Profit" over the whole DataFrame (no grouping), reported
# in a column named "Average Profit".
newDF1.agg(avg("Total_Profit").alias("Average Profit")).show()

In [None]:
# include the library
from pyspark.sql.functions import col

# Sort by manufacturer in ascending order, then preview four columns.
(
    df1
    .orderBy(col("Manufacturer").asc())
    .select("Manufacturer", "Model", "Vehicle_type", "Profit")
    .show(n=3)
)

In [None]:
# include the library
from pyspark.sql.functions import col

# Sort by manufacturer in descending order, then preview four columns.
(
    df1
    .orderBy(col("Manufacturer").desc())
    .select("Manufacturer", "Model", "Vehicle_type", "Profit")
    .show(n=3)
)

In [None]:
# cache and persist
from pyspark import StorageLevel

# Mark the DataFrame for caching at the default storage level. Caching is lazy:
# nothing is stored until an action runs. cache() returns the same DataFrame.
cacheDF = df1.cache()

# This action triggers evaluation; subsequent reads can be served from the cache.
cacheDF.select(
    "Manufacturer",
    "Model",
    "Vehicle_type",
    "Latest_Launch",
).show(n=4, truncate=False)

In [None]:
# cache and persist
from pyspark import StorageLevel

# df1 may already be cached (for example by an earlier cache() call). persist()
# does not change the storage level of a DataFrame that is already cached -- it
# only logs a warning -- so release any existing cache first.
df1.unpersist()

# Persist the DataFrame in memory, spilling to disk when it does not fit.
persistDF = df1.persist(StorageLevel.MEMORY_AND_DISK)

# Read records from the persisted DataFrame (this action materialises it).
persistDF.select("Manufacturer", "Model", "Vehicle_type",  \
               "Latest_Launch").show(4, truncate=False)

In [None]:
# coalesce vs repartition
# Partition count of the DataFrame as loaded.
print("Number of partitions : ", df1.rdd.getNumPartitions())

# repartition(): redistribute the rows into exactly two partitions.
cDF = df1.repartition(numPartitions=2)

# Partition count after repartitioning.
print("Number of partitions : ", cDF.rdd.getNumPartitions())

# coalesce(): merge the existing partitions down to one.
cDF = cDF.coalesce(numPartitions=1)

# Partition count after coalescing.
print("Number of partitions : ", cDF.rdd.getNumPartitions())

In [None]:
# Count the models per manufacturer and collapse the result into a single
# partition so the output lands in one part file.
writeDF = (
    newDF1
    .groupBy("Manufacturer")
    .agg({'Model': 'count'})
    .coalesce(1)
)

# Save as CSV with a header row and "," as separator. mode "overwrite" replaces
# whatever already exists at the target path; "append" would add to it instead.
writeDF.write.csv(
    "/user/glbigdata12/Aggregate/",
    mode="overwrite",
    sep=",",
    header="true",
)

In [None]:
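# List the files written above. `%fs` is a Databricks notebook magic and is not
# available in a plain Jupyter kernel; assuming the path is on HDFS, use instead:
# !hdfs dfs -ls "/user/glbigdata12/Aggregate"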

In [None]:
# Read the aggregated CSV back. Load the output *directory* rather than one
# specific part file: the part-file name embeds a UUID that changes on every
# write, so a hard-coded file name stops existing as soon as the data is
# rewritten with mode("overwrite").
newDF1 = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/user/glbigdata12/Aggregate/")
)

# display the records
newDF1.show(10, False)

In [None]:
# Spark SQL
# Build a small lookup DataFrame pairing a manufacturer with a sales person.
from pyspark.sql.types import *   # import the library

leader_data = [
    ["Dodge", "Mohammed Saif"],
    ["Cadillac", "George Carlin"],
    ["BMW", "Stuart Broad"],
    ["Ford", "Abdalla"],
    ["Hyundai", "Chris Gayle"],
    ["Lexus", "George Bush"],
    ["Mercury", "Tatyaso Martin"],
]

# Both columns are strings.
schema = StructType([
    StructField("Manufacturer", StringType()),
    StructField("SalesPerson", StringType()),
])

# Create the DataFrame and display its rows untruncated.
df_new = spark.createDataFrame(leader_data, schema=schema)
df_new.show(n=10, truncate=False)

In [None]:
# Register the DataFrame as the temporary view "sales_table" so it can be
# referenced by name from SQL.
df_new.createOrReplaceTempView("sales_table")

# Query the view through spark.sql(); the result is itself a DataFrame.
spark.sql("select * from sales_table").show(n=10, truncate=False)

In [None]:
# Equality filter in SQL: only the Cadillac row(s).
spark.sql(
    "select * from sales_table where Manufacturer = 'Cadillac'"
).show(n=10, truncate=False)

In [None]:
# Pattern filter in SQL: sales persons whose name contains "George".
spark.sql(
    "select * from sales_table where SalesPerson like '%George%'"
).show(n=10, truncate=False)

In [None]:
# Count the rows of the sales_table view.
spark.sql("select count(*) from sales_table").show()

In [None]:
# Register the car-sales DataFrame as the temporary view "vehicle".
df1.createOrReplaceTempView("vehicle")

# Peek at a single row, values untruncated.
spark.sql("select * from vehicle").show(n=1, truncate=False)

In [None]:
# Rename "Revenue" to "TotalRevenue" with the DataFrame API.
newDF1 = df1.withColumnRenamed("Revenue", "TotalRevenue")

# Register the renamed DataFrame as the view "vehicle"; createOrReplaceTempView
# replaces any view already registered under that name.
newDF1.createOrReplaceTempView("vehicle")

# Aggregate in SQL: highest TotalRevenue per manufacturer.
spark.sql(
    "select Manufacturer, max(TotalRevenue) from vehicle "
    "group by Manufacturer"
).show(truncate=False)

In [None]:
# Same aggregation, sorted by manufacturer (ascending is the SQL default).
spark.sql(
    "select Manufacturer, max(TotalRevenue) from vehicle "
    "group by Manufacturer order by Manufacturer"
).show(truncate=False)

In [None]:
# Same aggregation, sorted by manufacturer in descending order.
spark.sql(
    "select Manufacturer, max(TotalRevenue) from vehicle "
    "group by Manufacturer order by Manufacturer desc"
).show(truncate=False)

In [None]:
# Inner join of the "vehicle" and "sales_table" views on the manufacturer
# name; trim() is applied to both sides so surrounding spaces do not prevent a
# match. Only the first five result rows are displayed.
spark.sql("""select a.Manufacturer, a.Model, b.SalesPerson
       from vehicle a
       join sales_table b
       on trim(a.Manufacturer) = trim(b.Manufacturer)""").show(n=5, truncate=False)

In [None]:
# Inner join of "vehicle" and "sales_table", restricted to Cadillac rows.
# Keep the query result and the display as separate steps: show() returns
# None, so chaining it onto the assignment would bind df_new to None instead
# of the joined DataFrame.
df_new = spark.sql("""select a.Manufacturer, a.Model, b.SalesPerson
       from vehicle a
       join sales_table b
       on trim(a.Manufacturer) = trim(b.Manufacturer)
       where trim(a.Manufacturer) = "Cadillac"
       """)

df_new.show(5, False)

In [None]:
# Run the Cadillac join again and keep the result as a DataFrame.
df_new = spark.sql("""select a.Manufacturer, a.Model, b.SalesPerson
       from vehicle a
       join sales_table b
       on trim(a.Manufacturer) = trim(b.Manufacturer)
       where trim(a.Manufacturer) = "Cadillac"
       """)

# Write it as CSV with a header row from a single partition, replacing any
# existing output at the target path.
df_new.coalesce(1).write.csv(
    "/user/glbigdata12/spark/",
    mode="overwrite",
    header="true",
)

In [None]:
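# List the files written above. `%fs` is a Databricks notebook magic and is not
# available in a plain Jupyter kernel; assuming the path is on HDFS, use instead:
# !hdfs dfs -ls "/user/glbigdata12/spark/"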