In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the session (or reuse an already-running one) that every cell below uses.
spark = (
    SparkSession.builder
    .appName("My-App")
    .getOrCreate()
)

**SparkSession.createDataFrame**: Creates a DataFrame from an RDD, a list or a pandas.DataFrame.

**Parameters**:
  - data: rdd, list or pandas.DataFrame
  - schema:  
    
    - pyspark.sql.types.DataType
    - datatype string
    - list of column names  
    
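  - samplingRatio: the sample ratio of rows used for inferring the schema
  - verifySchema: verify data types of every row against the schema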

In [3]:
# Sample (name, number) tuples reused as input data throughout the notebook.
people_id_list = [
    ('Alice', 1),
    ('Bob', 2),
    ('Cindy', 3),
]

### List

In [4]:
# No schema is supplied, so the columns get the default names _1 and _2.
df_people_noschema = spark.createDataFrame(data=people_id_list)
df_people_noschema.collect()

[Row(_1='Alice', _2=1), Row(_1='Bob', _2=2), Row(_1='Cindy', _2=3)]

In [5]:
# A list of column names as the second (schema) argument names the columns.
df_people = spark.createDataFrame(people_id_list, ['name', 'age'])
df_people.collect()

[Row(name='Alice', age=1), Row(name='Bob', age=2), Row(name='Cindy', age=3)]

In [6]:
# Same list and column names, this time passed as keyword arguments.
df_people_2 = spark.createDataFrame(data=people_id_list, schema=['name', 'age'])
df_people_2.collect()

[Row(name='Alice', age=1), Row(name='Bob', age=2), Row(name='Cindy', age=3)]

### Dictionary

In [7]:
# A list holding one dict: the dict keys become the column names.
d = [
    {'name': 'Alice', 'age': 1},
]
spark.createDataFrame(data=d).collect()



[Row(age=1, name='Alice')]

In [8]:
# One dict per row.
d = [
    {'name': 'Alice', 'age': 1},
    {'name': 'Bob', 'age': 2},
]
spark.createDataFrame(data=d).collect()

[Row(age=1, name='Alice'), Row(age=2, name='Bob')]

#### This is NOT GOOD: a dict of lists becomes a single row with list-valued columns, not one row per element

In [9]:
# Each dict is treated as one row, so a dict of lists yields a single row
# whose columns hold whole lists instead of one row per person.
d = [{'name': ['Alice','Bob'], 'age': [1,2]}]
spark.createDataFrame(d).collect()

[Row(age=[1, 2], name=['Alice', 'Bob'])]

### RDD

In [10]:
# Grab the SparkContext behind the session; it is needed to build RDDs.
sc = spark.sparkContext
# Distribute the local list first so that createDataFrame really receives an
# RDD. Passing people_id_list directly would only repeat the list example.
spark.createDataFrame(
    sc.parallelize(people_id_list),
    ['name', 'age']
).collect()

[Row(name='Alice', age=1), Row(name='Bob', age=2), Row(name='Cindy', age=3)]

In [11]:
# Pair every integer 0..9 with its string form, producing 2-tuples that fit
# the ['name', 'age'] column list.
rdd = sc.parallelize(range(10))
rdd2 = rdd.map(lambda n: (str(n), n))
spark.createDataFrame(rdd2, ['name', 'age']).collect()

[Row(name='0', age=0),
 Row(name='1', age=1),
 Row(name='2', age=2),
 Row(name='3', age=3),
 Row(name='4', age=4),
 Row(name='5', age=5),
 Row(name='6', age=6),
 Row(name='7', age=7),
 Row(name='8', age=8),
 Row(name='9', age=9)]

### ROW (pyspark.sql.types.Row)

In [12]:
from pyspark.sql import Row

In [13]:
# Calling Row with field names only gives a reusable row template;
# show the template and its type, one per line.
Person = Row('name', 'age')
print(Person, type(Person), sep='\n')

<Row(name, age)>
<class 'pyspark.sql.types.Row'>


In [14]:
# Distribute the sample tuples as an RDD; collect() brings them back to the
# driver as a plain Python list so the contents can be inspected.
people_id_rdd = sc.parallelize(people_id_list)
people_id_rdd.collect()

[('Alice', 1), ('Bob', 2), ('Cindy', 3)]

In [15]:
# Unpack each tuple into the Person template to get an RDD of named Rows.
people_rdd = people_id_rdd.map(lambda pair: Person(*pair))
print(type(people_rdd))

<class 'pyspark.rdd.PipelinedRDD'>


In [16]:
# Materialise the mapped RDD: every element is now a Row with name/age fields.
people_rdd.collect()

[Row(name='Alice', age=1), Row(name='Bob', age=2), Row(name='Cindy', age=3)]

In [17]:
# NOTE(review): this passes people_id_rdd (plain tuples), so the columns come
# out as _1/_2. The Row-based people_rdd built in the previous cells was
# presumably intended here — confirm before changing.
people = spark.createDataFrame(people_id_rdd)
people.collect()

[Row(_1='Alice', _2=1), Row(_1='Bob', _2=2), Row(_1='Cindy', _2=3)]

### StructType & Schema

In [18]:
# Import only the schema types this notebook uses rather than a wildcard,
# so it is clear where each name comes from.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

In [19]:
# Explicit schema: a string "name" column and an integer "age" column,
# both allowed to contain nulls.
person_schema = StructType([
    StructField("name", StringType(), nullable=True),
    StructField("age", IntegerType(), nullable=True),
])

In [20]:
# Apply the explicit StructType schema to the RDD of tuples.
df3 = spark.createDataFrame(data=people_id_rdd, schema=person_schema)

In [21]:
# Rows now carry the column names declared in person_schema.
df3.collect()

[Row(name='Alice', age=1), Row(name='Bob', age=2), Row(name='Cindy', age=3)]

### Pandas DataFrame

In [22]:
# Round trip: convert the Spark DataFrame to pandas and back; the column
# names are carried through.
spark.createDataFrame(df_people.toPandas()).collect()

[Row(name='Alice', age=1), Row(name='Bob', age=2), Row(name='Cindy', age=3)]

In [23]:
import pandas
# A pandas frame built without column names has the integer labels 0 and 1,
# and those labels become the Spark column names.
spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect()

[Row(0=1, 1=2)]

### Schema by string

In [25]:
# The schema can be written as a "name: type" string, one entry per column.
spark.createDataFrame(data=people_id_rdd, schema="a: string, b: int").collect()

[Row(a='Alice', b=1), Row(a='Bob', b=2), Row(a='Cindy', b=3)]

In [30]:
# Keep only the second element of each tuple, giving an RDD of bare integers.
id_rdd = people_id_rdd.map(lambda pair: pair[1])

In [31]:
# A bare type string on an RDD of scalars produces a single column named "value".
spark.createDataFrame(id_rdd, "int").collect()

[Row(value=1), Row(value=2), Row(value=3)]

In [32]:
# An RDD of bare booleans, used to try different atomic schema strings.
true_false_rdd = sc.parallelize([True, False, False, True])

In [38]:
# With a "boolean" schema the values stay booleans in the "value" column.
spark.createDataFrame(data=true_false_rdd, schema="boolean").collect()

[Row(value=True), Row(value=False), Row(value=False), Row(value=True)]

In [39]:
# With a "string" schema the same booleans come back as the lowercase
# strings 'true' / 'false'.
spark.createDataFrame(true_false_rdd, "string").collect()

[Row(value='true'), Row(value='false'), Row(value='false'), Row(value='true')]

## References

  - <https://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html#pyspark.sql.SparkSession>