In [2]:
#spark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Structured APIs').master("local").getOrCreate()

In [3]:
# Creating dataframe from the csv file and infering the schema
df = spark.read.load("users.csv", format="csv", sep=",", inferSchema="true", header="true")


In [4]:
# Printing the schema 
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)



In [5]:
# Showing the elements of the dataframe
df.show()

+-------+---+----------+
|   name|age|       job|
+-------+---+----------+
| Vishwa| 61|  Engineer|
|  Mohan| 79|    Doctor|
|Rishavv| 21|   Student|
|Shivani| 69|Consultant|
| Sachin| 35| Cricketer|
|  Rohit| 31|   Captain|
|  Virat| 32|   Blogger|
| Akshay| 45|     Actor|
|Amitabh| 70| Superstar|
+-------+---+----------+



In [6]:
#Specifying the schema instead of inferring it 
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType, DoubleType, LongType

fileSchema = StructType([StructField('name', StringType(),True),
                        StructField('age', LongType(),True),
                        StructField('job_title', StringType(),True)])

df2 = spark.read.load("users.csv", format="csv", sep=",", schema = fileSchema, header="true")


In [7]:
df2.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- job_title: string (nullable = true)



In [8]:
df2.show()

+-------+---+----------+
|   name|age| job_title|
+-------+---+----------+
| Vishwa| 61|  Engineer|
|  Mohan| 79|    Doctor|
|Rishavv| 21|   Student|
|Shivani| 69|Consultant|
| Sachin| 35| Cricketer|
|  Rohit| 31|   Captain|
|  Virat| 32|   Blogger|
| Akshay| 45|     Actor|
|Amitabh| 70| Superstar|
+-------+---+----------+



In [9]:
#Storing this dataframe in parquet
df.write.parquet("users_2.parquet")

In [10]:
#Storing this dataframe in json format
df.write.json("users_2.json")

In [11]:
#Storing this dataframe in orc format
df.write.orc("users_2.orc")

In [12]:
# Creating dataframe from the JSON file and infering the schema
df=spark.read.json("users_2.json")

In [13]:
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- job: string (nullable = true)
 |-- name: string (nullable = true)



In [14]:
df.show()

+---+----------+-------+
|age|       job|   name|
+---+----------+-------+
| 61|  Engineer| Vishwa|
| 79|    Doctor|  Mohan|
| 21|   Student|Rishavv|
| 69|Consultant|Shivani|
| 35| Cricketer| Sachin|
| 31|   Captain|  Rohit|
| 32|   Blogger|  Virat|
| 45|     Actor| Akshay|
| 70| Superstar|Amitabh|
+---+----------+-------+



In [15]:
#Specifying the schema in the case of json
fileSchema = StructType([StructField('name', StringType(),True),
                        StructField('age', IntegerType(),True),
                        StructField('job', StringType(),True)])

df2 = spark.read.json("users_2.json", schema = fileSchema) 


In [16]:
df2.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)



In [17]:
df2.show()

+-------+---+----------+
|   name|age|       job|
+-------+---+----------+
| Vishwa| 61|  Engineer|
|  Mohan| 79|    Doctor|
|Rishavv| 21|   Student|
|Shivani| 69|Consultant|
| Sachin| 35| Cricketer|
|  Rohit| 31|   Captain|
|  Virat| 32|   Blogger|
| Akshay| 45|     Actor|
|Amitabh| 70| Superstar|
+-------+---+----------+



In [18]:
# Reading the data from the parquet file
df = spark.read.parquet("users_2.parquet")

In [19]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)



In [20]:
df.show()

+-------+---+----------+
|   name|age|       job|
+-------+---+----------+
| Vishwa| 61|  Engineer|
|  Mohan| 79|    Doctor|
|Rishavv| 21|   Student|
|Shivani| 69|Consultant|
| Sachin| 35| Cricketer|
|  Rohit| 31|   Captain|
|  Virat| 32|   Blogger|
| Akshay| 45|     Actor|
|Amitabh| 70| Superstar|
+-------+---+----------+



In [21]:
# Reading the data from the orc file
df = spark.read.orc("users_2.orc")

In [22]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)



In [23]:
df.show()

+-------+---+----------+
|   name|age|       job|
+-------+---+----------+
| Vishwa| 61|  Engineer|
|  Mohan| 79|    Doctor|
|Rishavv| 21|   Student|
|Shivani| 69|Consultant|
| Sachin| 35| Cricketer|
|  Rohit| 31|   Captain|
|  Virat| 32|   Blogger|
| Akshay| 45|     Actor|
|Amitabh| 70| Superstar|
+-------+---+----------+



In [9]:
sc.install_pypi_package("pandas==0.25.1")
sc.list_packages()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Package already installed for current Spark context!
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/context.py", line 1110, in install_pypi_package
    raise ValueError("Package already installed for current Spark context!")
ValueError: Package already installed for current Spark context!



In [24]:
# setting up the Pandas
import numpy as np
import pandas as pd

# Enable Arrow-based columnar data transfers
#spark.conf.set("spark.sql.execution.arrow.enabled", "true")

In [25]:
# Generating the pandas dataframe
pdf = pd.DataFrame(np.random.rand(10, 3))

In [26]:
print(pdf)

          0         1         2
0  0.137790  0.922133  0.123957
1  0.333376  0.580820  0.537393
2  0.904449  0.810433  0.479234
3  0.953199  0.540097  0.837223
4  0.828297  0.403286  0.961653
5  0.158290  0.019993  0.851900
6  0.530120  0.729136  0.445259
7  0.390467  0.405339  0.525516
8  0.247529  0.782043  0.519066
9  0.896800  0.852220  0.395960


In [27]:
# Generating the Data frames from the Pandas df
#!pip install pyarrow
df = spark.createDataFrame(pdf)

In [28]:
df.printSchema()
df.show()

root
 |-- 0: double (nullable = true)
 |-- 1: double (nullable = true)
 |-- 2: double (nullable = true)

+-------------------+-------------------+-------------------+
|                  0|                  1|                  2|
+-------------------+-------------------+-------------------+
| 0.1377895978491671| 0.9221329979374493|0.12395744510215878|
| 0.3333761883592037|   0.58081961105366| 0.5373933805243502|
| 0.9044492722416171| 0.8104334656033485| 0.4792336229672418|
| 0.9531987807211716| 0.5400969836419792| 0.8372231053420505|
| 0.8282967967547646|0.40328575729156513| 0.9616529099359863|
|0.15829049487080626|0.01999262568693161| 0.8518999625814911|
| 0.5301195926072625| 0.7291359184343568|0.44525864523163383|
|0.39046747599154874| 0.4053385070658696| 0.5255157570818327|
|0.24752913927709286| 0.7820433310400476|  0.519066410203265|
| 0.8968000042943823|   0.85222036664126| 0.3959601229881846|
+-------------------+-------------------+-------------------+



In [29]:
result_pdf = df.select("*").toPandas()
print(result_pdf)

          0         1         2
0  0.137790  0.922133  0.123957
1  0.333376  0.580820  0.537393
2  0.904449  0.810433  0.479234
3  0.953199  0.540097  0.837223
4  0.828297  0.403286  0.961653
5  0.158290  0.019993  0.851900
6  0.530120  0.729136  0.445259
7  0.390467  0.405339  0.525516
8  0.247529  0.782043  0.519066
9  0.896800  0.852220  0.395960


In [30]:
######. operations on data frame 

In [31]:
### Select all columns in dataframe
df = spark.read.orc("users_2.orc")
df.select("*").show()

+-------+---+----------+
|   name|age|       job|
+-------+---+----------+
| Vishwa| 61|  Engineer|
|  Mohan| 79|    Doctor|
|Rishavv| 21|   Student|
|Shivani| 69|Consultant|
| Sachin| 35| Cricketer|
|  Rohit| 31|   Captain|
|  Virat| 32|   Blogger|
| Akshay| 45|     Actor|
|Amitabh| 70| Superstar|
+-------+---+----------+



In [32]:
### Selecting specific column in dataframe
df.select("name").show()

+-------+
|   name|
+-------+
| Vishwa|
|  Mohan|
|Rishavv|
|Shivani|
| Sachin|
|  Rohit|
|  Virat|
| Akshay|
|Amitabh|
+-------+



In [33]:
# Filter operations
df.filter(df['age']>50).collect()

[Row(name='Vishwa', age=61, job='Engineer'),
 Row(name='Mohan', age=79, job='Doctor'),
 Row(name='Shivani', age=69, job='Consultant'),
 Row(name='Amitabh', age=70, job='Superstar')]

In [34]:
# Group by
df.groupBy("age").count().show()

+---+-----+
|age|count|
+---+-----+
| 31|    1|
| 61|    1|
| 35|    1|
| 69|    1|
| 45|    1|
| 70|    1|
| 21|    1|
| 32|    1|
| 79|    1|
+---+-----+



In [35]:
#Order by
df.orderBy(df.age.desc()).show()

+-------+---+----------+
|   name|age|       job|
+-------+---+----------+
|  Mohan| 79|    Doctor|
|Amitabh| 70| Superstar|
|Shivani| 69|Consultant|
| Vishwa| 61|  Engineer|
| Akshay| 45|     Actor|
| Sachin| 35| Cricketer|
|  Virat| 32|   Blogger|
|  Rohit| 31|   Captain|
|Rishavv| 21|   Student|
+-------+---+----------+



In [None]:
################################  Spark SQL   ##########################