In [1]:
# Locate the local Spark installation first: findspark.init() puts PySpark on
# sys.path, so it has to run before the first `pyspark` import to be of any use
# on machines where PySpark is not already importable.
import findspark
findspark.init()

from pyspark.sql import SparkSession

In [2]:
# Build the notebook's SparkSession (or reuse one that already exists), with
# off-heap memory enabled and its size set to "10g".
spark = (
    SparkSession.builder
    .appName('SparkExample')
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "10g")
    .getOrCreate()
)

### Create an RDD with `parallelize`

In [3]:
# Turn a local list of (name, number) tuples into an RDD, then pull the
# elements back to the driver and print them.
dataList = [
    ("Java", 20000),
    ("Python", 100000),
    ("Scala", 3000),
]
rdd = spark.sparkContext.parallelize(dataList)
print(rdd.collect())

[('Java', 20000), ('Python', 100000), ('Scala', 3000)]


### Create a DataFrame with an explicit schema

In [4]:
from pyspark.sql import Row
# NOTE(review): wildcard import kept as-is because other cells may rely on the
# names it exposes; this cell itself only needs StructType, StructField,
# StringType and IntegerType.
from pyspark.sql.types import *

# Three Row records distributed as an RDD (rebinds the notebook-level `rdd`).
rdd = spark.sparkContext.parallelize([
    Row(name='Allie', age=2),
    Row(name='Sara', age=33),
    Row(name='Grace', age=31),
])

# Explicit schema: `name` is a nullable string, `age` is a non-nullable integer.
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), False),
])

df = spark.createDataFrame(rdd, schema)

# truncate=False prints full cell values instead of shortening long strings.
df.show(truncate=False)

+-----+---+
|name |age|
+-----+---+
|Allie|2  |
|Sara |33 |
|Grace|31 |
+-----+---+



### Create a DataFrame from data and column names

In [5]:
# Sample records, one tuple per person, in the order given by `columns` below.
data2 = [
    ('James', '', 'Smith', '1991-04-01', 'M', 3000),
    ('Michael', 'Rose', '', '2000-05-19', 'M', 4000),
    ('Robert', '', 'Williams', '1978-09-05', 'M', 4000),
    ('Maria', 'Anne', 'Jones', '1967-12-01', 'F', 4000),
    ('Jen', 'Mary', 'Brown', '1980-02-17', 'F', -1),
]

# Column names, matched positionally to the fields of each tuple.
columns = [
    "firstname",
    "middlename",
    "lastname",
    "dob",
    "gender",
    "salary",
]

# Only names are supplied as the schema, so column types come from the data.
d2 = spark.createDataFrame(data2, columns)
d2.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



In [6]:
# Group d2 by the `gender` column, count the rows in each group, and print
# the resulting table.
(
    d2.groupBy('gender')
    .count()
    .show()
)

+------+-----+
|gender|count|
+------+-----+
|     M|    3|
|     F|    2|
+------+-----+

