In [17]:
from pyspark.sql import SparkSession

In [3]:
from pyspark.sql import Row

In [18]:
# Obtain the notebook-wide SparkSession (named 'Spark'); getOrCreate() is used
# so re-running this cell does not fail when a session already exists.
spark = (
    SparkSession.builder
    .appName('Spark')
    .getOrCreate()
)

# Create DataFrame.

In [18]:
# Variant 1: rows as plain tuples, schema as a bare list of column names
# (no explicit column types are supplied in this form).
data_df = spark.createDataFrame(
    [("David", 20), ("Ram", 31), ("Jules", 30), ("Rahul", 35), ("David", 25)],
    ["name", "age"],
)

In [23]:
# Variant 2: rows as plain tuples, schema as a DDL-style string that names
# each column together with its type.
data_df = spark.createDataFrame(
    [("Ravid", 20), ("Ram", 31), ("Jules", 30), ("Rahul", 35), ("Ravid", 25)],
    "name string, age int",
)

In [33]:
# Variant 3: rows as Row objects; the column names come from the Row keyword
# arguments, so no separate schema argument is passed.
data_df = spark.createDataFrame(
    [
        Row(name=person, age=years)
        for person, years in (
            ("Ravid", 20),
            ("Ram", 31),
            ("Jules", 30),
            ("Rahul", 35),
            ("Ravid", 25),
        )
    ]
)

# Reading File

In [6]:
# Load the fruits CSV, treating its first line as the column header.
# NOTE(review): no schema or inferSchema is passed, so the numeric-looking
# columns (v1, v2) are presumably loaded as strings — confirm if they are
# used in arithmetic later.
data_df = spark.read.csv(
    path='fruits_data.csv',
    header=True,
)

# Viewing Data

In [7]:
# Print the DataFrame read from fruits_data.csv as a text table.
data_df.show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|  red|banana|  1| 10|
| blue|banana|  2| 20|
|  red|carrot|  3| 30|
| blue| grape|  4| 40|
|  red|carrot|  5| 50|
|black|carrot|  6| 60|
|  red|banana|  7| 70|
|  red| grape|  8| 80|
+-----+------+---+---+



In [8]:
# Limit the printed table to the first two rows.
data_df.show(2)

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|  red|banana|  1| 10|
| blue|banana|  2| 20|
+-----+------+---+---+
only showing top 2 rows



In [9]:
# Display the DataFrame's column names as a Python list of strings.
data_df.columns

['color', 'fruit', 'v1', 'v2']

In [12]:
# Project only the 'color' and 'fruit' columns and print the result.
(
    data_df
    .select('color', 'fruit')
    .show()
)

+-----+------+
|color| fruit|
+-----+------+
|  red|banana|
| blue|banana|
|  red|carrot|
| blue| grape|
|  red|carrot|
|black|carrot|
|  red|banana|
|  red| grape|
+-----+------+



# Schemas

A schema in Spark defines the column names and their associated data types for a DataFrame.

In [19]:
# Import only the type classes this notebook actually uses rather than a
# wildcard, so it is clear where each name comes from and no unrelated names
# are pulled into (or shadowed in) the notebook namespace.
from pyspark.sql.types import (
    ArrayType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

In [20]:
# Two ways of writing a schema for (author, title, page).
#
# 1) Programmatic form: a StructType holding one StructField per column,
#    each declared non-nullable.
schema = StructType(
    [
        StructField(name="author", dataType=StringType(), nullable=False),
        StructField(name="title", dataType=StringType(), nullable=False),
        StructField(name="page", dataType=IntegerType(), nullable=False),
    ]
)

# 2) DDL form: the same columns and types as a single string. This second
#    assignment replaces the StructType above, so `schema` ends up holding
#    the DDL string after the cell runs.
schema = "author STRING, title STRING, page INT"


In [27]:
# Sample blog records. Field order matches the DDL schema below:
# Id, First, Last, Url, Published, Hits, Campaigns.
# Each record is materialised as a list so `data` remains a list of lists.
data = [
    list(record)
    for record in (
        (1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535,
         ["twitter", "LinkedIn"]),
        (2, "Brooke", "Wenig", "https://tinyurl.2", "5/5/2018", 8908,
         ["twitter", "LinkedIn"]),
        (3, "Denny", "Lee", "https://tinyurl.3", "6/7/2019", 7659,
         ["web", "twitter", "FB", "LinkedIn"]),
        (4, "Tathagata", "Das", "https://tinyurl.4", "5/12/2018", 10568,
         ["twitter", "FB"]),
        (5, "Matei", "Zaharia", "https://tinyurl.5", "5/14/2014", 40578,
         ["web", "twitter", "FB", "LinkedIn"]),
        (6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568,
         ["twitter", "LinkedIn"]),
    )
]

# DDL schema for the records above; column names are back-quoted and
# `Campaigns` is an array of strings. `Published` is kept as a plain string.
schema = """
`Id` INT, 
`First` STRING, 
`Last` STRING, 
`Url` STRING, 
`Published` STRING, 
`Hits` INT, 
`Campaigns` ARRAY<STRING>
"""

In [26]:
# Build the blogs DataFrame from the in-memory `data` records using the
# current `schema` value.
data_df = spark.createDataFrame(data, schema)

In [29]:
# Programmatic schema for the blog records: one StructField per column, each
# requested as non-nullable.
# NOTE(review): the printSchema() output captured later in this notebook
# reports nullable = true for every column, so the False flags here do not
# appear to survive into the resulting DataFrame — confirm whether
# non-nullability is actually required.
schema = StructType(
    [
        StructField(column, dtype, False)
        for column, dtype in (
            ("Id", IntegerType()),
            ("First", StringType()),
            ("Last", StringType()),
            ("Url", StringType()),
            ("Published", StringType()),
            ("Hits", IntegerType()),
            ("Campaigns", ArrayType(StringType())),
        )
    ]
)

In [30]:
# Read blogs.json with the explicit `schema` defined above instead of letting
# the reader work the columns out for itself.
data_df = spark.read.json(
    path="blogs.json",
    schema=schema,
)

In [31]:
# Print the blogs DataFrame as a text table; long cell values (such as the
# Campaigns arrays) are shown truncated with "...".
data_df.show()

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+



In [23]:
# Print the DataFrame's schema as a tree: each column's name, type and
# nullability, including the element type of array columns.
data_df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- First: string (nullable = true)
 |-- Last: string (nullable = true)
 |-- Url: string (nullable = true)
 |-- Published: string (nullable = true)
 |-- Hits: integer (nullable = true)
 |-- Campaigns: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [24]:
# Display the schema as a StructType object (the programmatic counterpart of
# the tree printed by printSchema()).
data_df.schema

StructType([StructField('Id', IntegerType(), True), StructField('First', StringType(), True), StructField('Last', StringType(), True), StructField('Url', StringType(), True), StructField('Published', StringType(), True), StructField('Hits', IntegerType(), True), StructField('Campaigns', ArrayType(StringType(), True), True)])

# Stop SparkSession

In [32]:
# Shut down the SparkSession now that the notebook is finished; cells that
# use `spark` cannot be re-run after this without recreating the session.
spark.stop()