# SparkContext and RDD DataFrame 
SparkSession (Method 3) is the approach used in the rest of this notebook to create the SparkContext.

In [1]:
# findspark locates the local Spark installation so that the pyspark imports
# in the next cell resolve from this kernel.
import findspark 
findspark.init() 

In [2]:
from pyspark.sql import SparkSession 
from pyspark.conf import SparkConf
from pyspark import SparkContext

### Method 1: Creating SparkContext (SparkSession and SparkConf)

In [3]:
# Build the configuration with SparkConf.set(): "spark.executor.memory" and
# "spark.driver.memory" are Spark properties. setExecutorEnv() only exports
# environment variables to the executors, so it would leave both unset.
conf = (
    SparkConf()
    .setMaster("local[4]")          # run locally with 4 worker threads
    .setAppName("Creating-RDD")
    .set("spark.executor.memory", "4g")
    .set("spark.driver.memory", "4g")
)

# Create (or reuse) a SparkSession configured from `conf`.
pyspark = SparkSession.builder.config(conf=conf).getOrCreate()

#### SparkContext is created

In [4]:
# Take the SparkContext owned by the session, then stop it so the next
# method can create a context of its own.
sc = pyspark.sparkContext
sc.stop()

### Method 2: Creating SparkContext (SparkContext and SparkConf)

In [5]:
# Memory limits are Spark properties and must be set with SparkConf.set();
# setExecutorEnv() would only define environment variables on the executors.
sparkConf = (
    SparkConf()
    .setMaster("local[4]")          # run locally with 4 worker threads
    .setAppName("Creating-RDD")
    .set("spark.executor.memory", "4g")
    .set("spark.driver.memory", "4g")
)

In [6]:
# Create the SparkContext directly from the SparkConf, then stop it again
# before the next method creates its own.
sc = SparkContext(conf=sparkConf)
sc.stop()

### Method 3: Creating SparkContext (SparkSession)

#### A SparkSession is created here and the SparkContext is obtained from it
<b>spark.executor.memory</b> is the memory available to each executor for running tasks (4 GB allocated) <br>
<b>spark.driver.memory</b> is the memory available to the driver, which sends out the work and receives the results (4 GB allocated)

In [7]:
# Build the session used for the rest of the notebook.
# The property name is "spark.executor.memory"; the misspelled key
# "spark.executer.memory" does not configure executor memory.
pyspark = (
    SparkSession.builder
    .master("local[4]")             # run locally with 4 worker threads
    .appName("Creating-RDD")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

In [8]:
# SparkContext belonging to the session; used below to create RDDs.
sc = pyspark.sparkContext

### RDD Creating (Python List and Tuple)
#### An RDD is created from a list of tuples to test the SparkContext

In [9]:
# Distribute a small in-memory list of (product, price) tuples as an RDD.
rdd1 = sc.parallelize(
    [
        ("Milk", 25),
        ("Bread", 50),
        ("Egg", 15),
    ]
)

#### Here we take the first 2 elements (`take` is a Spark action, so it triggers execution)

In [10]:
# take(2) brings the first two elements back to the driver as a Python list;
# count() returns the total number of elements in the RDD.
print(rdd1.take(2))
print("RDD-1 Count: ",rdd1.count()) 

[('Milk', 25), ('Bread', 50)]
RDD-1 Count:  3


#### An RDD is created from a list of lists

In [11]:
# Same data as rdd1, but each record is a [product, price] list instead of a tuple.
rdd2 = sc.parallelize(
    [
        ["Milk", 25],
        ["Bread", 50],
        ["Egg", 15],
    ]
)

In [12]:
# take(2) brings the first two records back to the driver;
# count() returns the total number of records in the RDD.
print(rdd2.take(2))
print("RDD-2 Count: ",rdd2.count()) 

[['Milk', 25], ['Bread', 50]]
RDD-2 Count:  3


### RDD Creating (Dictionary)
#### Creating dictionary 

In [13]:
# Column-oriented data: each key is a column name, each value the column's entries.
myDict = dict(
    Products=["Milk", "Bread", "Egg"],
    Prices=[25, 50, 15],
)

#### Importing dictionary to Python Pandas DataFrame 

In [14]:
import pandas as pd

# Each dictionary key becomes a column of the pandas DataFrame.
df = pd.DataFrame.from_dict(myDict)
df.head()

Unnamed: 0,Products,Prices
0,Milk,25
1,Bread,50
2,Egg,15


#### Create Pyspark dataframe from Pandas Dataframe

In [15]:
# Convert the pandas DataFrame into a Spark DataFrame. Despite the rdd_ prefix
# this object is a DataFrame; its RDD is obtained from it through .rdd.
rdd_from_pandasDF = pyspark.createDataFrame(df)

#### Show pyspark dataframe

In [16]:
# Print the Spark DataFrame as a text table.
rdd_from_pandasDF.show()

+--------+------+
|Products|Prices|
+--------+------+
|    Milk|    25|
|   Bread|    50|
|     Egg|    15|
+--------+------+



#### Creating RDD from dataframe

In [17]:
# .rdd exposes the DataFrame's contents as an RDD of Row objects.
rdd_from_pandas = rdd_from_pandasDF.rdd

#### Show the data of the created RDD (`take` is an action)

In [18]:
# Action: fetch the first three Row objects to the driver.
rdd_from_pandas.take(3)

[Row(Products='Milk', Prices=25),
 Row(Products='Bread', Prices=50),
 Row(Products='Egg', Prices=15)]

In [19]:
import pandas as pd 
# Load the product/price table from a CSV file; the path is relative to the
# working directory. Note: this rebinds `df`, replacing the frame built from myDict.
df = pd.read_csv("data/RDD-dataset.csv")
df.head()

Unnamed: 0,Products,Prices
0,Milk,25
1,Bread,50
2,Egg,15


#### Creating PySpark dataframe from loaded csv file 

In [20]:
# Convert the CSV-backed pandas DataFrame into a Spark DataFrame and print it.
# Despite its name, rdd_dataframe is a DataFrame, not an RDD.
rdd_dataframe = pyspark.createDataFrame(df)
rdd_dataframe.show()

+--------+------+
|Products|Prices|
+--------+------+
|    Milk|    25|
|   Bread|    50|
|     Egg|    15|
+--------+------+



In [21]:
# take(3) on the DataFrame returns its first three rows as a list of Row objects.
rdd_dataframe.take(3)

[Row(Products='Milk', Prices=25),
 Row(Products='Bread', Prices=50),
 Row(Products='Egg', Prices=15)]