### Create Single Column DF from iterables

In [0]:
# Sample ages reused by the single-column examples below.
ages_list = [12, 21, 1]

In [0]:
# Echo the active SparkSession object. `spark` is not created in any visible cell,
# so it is presumably injected by the notebook runtime - TODO confirm for your environment.
spark

In [0]:
# Print the built-in documentation of createDataFrame to see which arguments it accepts.
help(spark.createDataFrame)

In [0]:
# Not to do it - 1: a bare list of ints has no row structure and no data type is
# given, so Spark cannot infer a schema. The error is caught and printed so the
# notebook still runs top to bottom while showing the failure.
try:
    spark.createDataFrame(ages_list)
except TypeError as err:
    print(type(err).__name__, err, sep=': ')

In [0]:
# Not to do it - 2: supplying only column names does not help, because each element
# is still a scalar int rather than a row, so schema inference fails.
# The error is caught and printed so the notebook still runs top to bottom.
try:
    spark.createDataFrame(ages_list,['Age'])
except TypeError as err:
    print(type(err).__name__, err, sep=': ')

In [0]:
# When creating a DataFrame from an iterable of scalars we have to provide the data type.
# The third positional parameter of createDataFrame is samplingRatio, not column names,
# so a list of names passed there is ignored and the column keeps the default name `value`.
# Name the column explicitly with toDF instead.
spark.createDataFrame(ages_list, 'int').toDF('Age')

Out[3]: DataFrame[Age: int]

In [0]:
# Another way to do it: pass a Spark type object instead of the type name string.
# Column names cannot go in the third positional slot (that is samplingRatio),
# so the column is renamed with toDF.

from pyspark.sql.types import IntegerType
spark.createDataFrame(ages_list, IntegerType()).toDF('Age')

Out[4]: DataFrame[Age: int]

In [0]:
# Same pattern for strings: give the data type, then name the column with toDF
# (a list in the third positional slot would be taken as samplingRatio and ignored).
spark.createDataFrame(['A','B','C'], 'string').toDF('Chars')

Out[5]: DataFrame[Chars: string]

In [0]:
# Not to do it - 3: a tuple of plain strings plus column names still gives Spark
# scalar elements instead of rows, so schema inference fails.
# The error is caught and printed so the notebook still runs top to bottom.
try:
    spark.createDataFrame(('A','B','C'),['Chars'])
except TypeError as err:
    print(type(err).__name__, err, sep=': ')

### Create Multi-Column Spark DF

> It works the same way as converting a list of tuples or a list of dictionaries into a Pandas DF. This way we don't have to specify the schema.
**The only difference is the ADDITIONAL ',' at the end, which turns each item into a one-element tuple.**

In [0]:
import pandas as pd

# (21) is only a parenthesised int, so x is a plain list of scalars, not of tuples.
x = [(21),(21),(31)]
print(pd.DataFrame(data=x))



    0
0  21
1  21
2  31


In [0]:
# The trailing comma makes every item a one-element tuple, i.e. a row with one field.
x = [(age,) for age in (21, 21, 31)]
spark.createDataFrame(data=x)

Out[7]: DataFrame[_1: bigint]

In [0]:
# Each dict is one row; the key becomes the column name (see the output below).
x = [{'A': value} for value in (21, 22, 23)]
spark.createDataFrame(data=x)

Out[8]: DataFrame[A: bigint]

In [0]:
# Two-field tuples give a two-column DataFrame with default names _1 and _2.
x = list(zip([21, 21, 31], ['A', 'B', 'C']))
spark.createDataFrame(data=x)

Out[9]: DataFrame[_1: bigint, _2: string]

In [0]:
# Number the software names from 1 and give column names and types through a DDL string.
x = list(enumerate(['Android', 'Apple', 'BlackBerry'], start=1))
df = spark.createDataFrame(data=x, schema='Id int,Software string')

### Row-Column manipulation

In [0]:
# Print the DataFrame contents as a text table.
df.show()

+---+----------+
| Id|  Software|
+---+----------+
|  1|   Android|
|  2|     Apple|
|  3|BlackBerry|
+---+----------+



In [0]:
# collect() brings the data back as a Python list of Row objects (see the output below).
df.collect() #to convert into list

Out[12]: [Row(Id=1, Software='Android'),
 Row(Id=2, Software='Apple'),
 Row(Id=3, Software='BlackBerry')]

In [0]:
# Pull the rows into a Python list, add one more record and rebuild the DataFrame.
# NOTE: `df` is reassigned here, so re-running this cell adds the extra row again.
df_ = df.collect() + [(4,'iOS')]
df = spark.createDataFrame(data=df_, schema='Id int,Software string')

In [0]:
# Show the rebuilt DataFrame, which now includes the appended row.
df.show()

+---+----------+
| Id|  Software|
+---+----------+
|  1|   Android|
|  2|     Apple|
|  3|BlackBerry|
|  4|       iOS|
+---+----------+



> Or, say we need to create a DF from a list of lists. Let's use pyspark's **Row**

In [0]:
from pyspark.sql import Row

# Turn every inner list into a positional Row so Spark treats it as one record.
x = [[1,'Android'],[2,'Apple'],[3,'BlackBerry']]
user_rows = [Row(user_id, software) for user_id, software in x]
df = spark.createDataFrame(data=user_rows)
df.show()

+---+----------+
| _1|        _2|
+---+----------+
|  1|   Android|
|  2|     Apple|
|  3|BlackBerry|
+---+----------+



In [0]:
# Print the column names and types; the Rows had no field names, hence _1 and _2.
df.printSchema()

root
 |-- _1: long (nullable = true)
 |-- _2: string (nullable = true)



> **specifying schema/columns** as we do in Pandas

In [0]:
import datetime

# Two sample user records, one tuple per user. Field order: id, first name, last name,
# email, customer flag, amount paid, customer-since date, last-updated timestamp.
users = [
    (1, 'Harry', 'Potter', 'cv_harry@hogwarts.com', True, 1000.55,
     datetime.date(year=2021, month=1, day=15),
     datetime.datetime(2021, 2, 10, hour=1, minute=15)),
    (2, 'Walter', 'White', 'ww_cook@heisenberg.co.uk', True, 900.0,
     datetime.date(year=2021, month=2, day=14),
     datetime.datetime(2021, 2, 18, hour=3, minute=33)),
]

In [0]:
# Column name / SQL type pairs, in the same order as the fields of each user tuple.
user_columns = [
    ('id', 'INT'),
    ('first_name', 'STRING'),
    ('last_name', 'STRING'),
    ('email', 'STRING'),
    ('is_customer', 'BOOLEAN'),
    ('amount_paid', 'FLOAT'),
    ('customer_from', 'DATE'),
    ('last_updated_ts', 'TIMESTAMP'),
]

# Assemble the DDL schema string: one indented "name TYPE" entry per line, comma separated.
users_schema = '\n' + ',\n'.join(
    '    {} {}'.format(name, sql_type) for name, sql_type in user_columns
) + '\n'

In [0]:
# Build the DataFrame from the user tuples with the explicit schema, then display it.
usersdf_ = spark.createDataFrame(data=users, schema=users_schema)
usersdf_.show()

+---+----------+---------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+--------------------+-----------+-----------+-------------+-------------------+
|  1|     Harry|   Potter|cv_harry@hogwarts...|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|    Walter|    White|ww_cook@heisenber...|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
+---+----------+---------+--------------------+-----------+-----------+-------------+-------------------+



> The schema can be passed as a simple list of column names, as a string (as above), or built from Spark types.

In [0]:
from pyspark.sql.types import *

# The same columns as the schema string, expressed with Spark type objects.
# One StructField is created per (column name, Spark type) pair, in column order.
users_schema = StructType([
    StructField(column_name, spark_type)
    for column_name, spark_type in [
        ('id', IntegerType()),
        ('first_name', StringType()),
        ('last_name', StringType()),
        ('email', StringType()),
        ('is_customer', BooleanType()),
        ('amount_paid', FloatType()),
        ('customer_from', DateType()),
        ('last_updated_ts', TimestampType()),
    ]
])

> **Create a DF using a pandas DF.**

In [0]:
import pandas as pd

# Build a pandas DataFrame from the same records, reusing the Spark DataFrame's column names.
pd.DataFrame(data=users, columns=usersdf_.columns).head()

Unnamed: 0,id,first_name,last_name,email,is_customer,amount_paid,customer_from,last_updated_ts
0,1,Harry,Potter,cv_harry@hogwarts.com,True,1000.55,2021-01-15,2021-02-10 01:15:00
1,2,Walter,White,ww_cook@heisenberg.co.uk,True,900.0,2021-02-14,2021-02-18 03:33:00


In [0]:
# A pandas DataFrame can be handed to createDataFrame directly; no schema is passed here.
users_pdf = pd.DataFrame(users, columns=usersdf_.columns)
spark.createDataFrame(users_pdf).show()

+---+----------+---------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+--------------------+-----------+-----------+-------------+-------------------+
|  1|     Harry|   Potter|cv_harry@hogwarts...|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|    Walter|    White|ww_cook@heisenber...|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
+---+----------+---------+--------------------+-----------+-----------+-------------+-------------------+



#### Spark special DataTypes 

* Here are the special types that are supported by Spark.
  * ARRAY
  * STRUCT
  * MAP
* Python structures such as list and dict can be implicitly converted to Spark ARRAY and MAP respectively.
* We need to use a few Spark-related APIs to convert Python data structures to the STRUCT type.

###  Array

In [0]:
# The same two users as dictionaries; phone_numbers holds a Python list per user.
users = [
    dict(
        id=1,
        first_name="Harry",
        last_name="Potter",
        email="cv_harry@hogwarts.com",
        phone_numbers=["+1 234 567 8901", "+1 234 567 8911"],
        is_customer=True,
        amount_paid=1000.55,
        customer_from=datetime.date(2021, 1, 15),
        last_updated_ts=datetime.datetime(2021, 2, 10, 1, 15, 0),
    ),
    dict(
        id=2,
        first_name="Walter",
        last_name="white",
        email="ww_cook@heisenberg.co.uk",
        phone_numbers=["+1 234 567 8923", "+1 234 567 8934"],
        is_customer=True,
        amount_paid=900.0,
        customer_from=datetime.date(2021, 2, 14),
        last_updated_ts=datetime.datetime(2021, 2, 18, 3, 33, 0),
    ),
]

In [0]:
# No schema is given here, so Spark infers it from the dictionaries;
# the phone_numbers lists show up as an array column in the output.
usersdf_ = spark.createDataFrame(data=users)
usersdf_.show()

+-----------+-------------+--------------------+----------+---+-----------+---------+-------------------+--------------------+
|amount_paid|customer_from|               email|first_name| id|is_customer|last_name|    last_updated_ts|       phone_numbers|
+-----------+-------------+--------------------+----------+---+-----------+---------+-------------------+--------------------+
|    1000.55|   2021-01-15|cv_harry@hogwarts...|     Harry|  1|       true|   Potter|2021-02-10 01:15:00|[+1 234 567 8901,...|
|      900.0|   2021-02-14|ww_cook@heisenber...|    Walter|  2|       true|    white|2021-02-18 03:33:00|[+1 234 567 8923,...|
+-----------+-------------+--------------------+----------+---+-----------+---------+-------------------+--------------------+



In [0]:
# Keep only the id and the array column to look at the phone numbers on their own.
id_and_phones = usersdf_.select('id','phone_numbers')
id_and_phones.show()

+---+--------------------+
| id|       phone_numbers|
+---+--------------------+
|  1|[+1 234 567 8901,...|
|  2|[+1 234 567 8923,...|
+---+--------------------+



In [0]:
#this expolde kinds of de-normalise the dataFrame

from pyspark.sql.functions import explode

usersdf_.withColumn('phone_numbers',explode('phone_numbers')).\
    drop('phone_numbers').\
    show()

+-----------+-------------+--------------------+----------+---+-----------+---------+-------------------+
|amount_paid|customer_from|               email|first_name| id|is_customer|last_name|    last_updated_ts|
+-----------+-------------+--------------------+----------+---+-----------+---------+-------------------+
|    1000.55|   2021-01-15|cv_harry@hogwarts...|     Harry|  1|       true|   Potter|2021-02-10 01:15:00|
|    1000.55|   2021-01-15|cv_harry@hogwarts...|     Harry|  1|       true|   Potter|2021-02-10 01:15:00|
|      900.0|   2021-02-14|ww_cook@heisenber...|    Walter|  2|       true|    white|2021-02-18 03:33:00|
|      900.0|   2021-02-14|ww_cook@heisenber...|    Walter|  2|       true|    white|2021-02-18 03:33:00|
+-----------+-------------+--------------------+----------+---+-----------+---------+-------------------+



> ***Q: What's the difference between explode and explode_outer?***

In [0]:
# Array elements can be addressed by position: index 0 is shown as `mobile`, index 1 as `home`.

from pyspark.sql.functions import col

phones = col('phone_numbers')
usersdf_.select('id', phones[0].alias('mobile'), phones[1].alias('home')).show()

+---+---------------+---------------+
| id|         mobile|           home|
+---+---------------+---------------+
|  1|+1 234 567 8901|+1 234 567 8911|
|  2|+1 234 567 8923|+1 234 567 8934|
+---+---------------+---------------+



###  Map 
> --  this will have a dictionary instead of a list and can be handled the same way as arrays
> For example: "phone_numbers": {"mobile": "+1 234 567 8923", "home": "+1 234 567 8934"},

###  Struct 
> --  this will have Rows instead of a dict/list and can be handled the same way as arrays/maps
> For example: "phone_numbers":  Row(mobile="+1 234 567 8923", home="+1 234 567 8934"),