### Spark Init

In [1]:
import findspark
findspark.init()

# Section 1: Create Spark Contexts with Different Methods

## Create a Spark Context Method - 1: Using Only Session


In [2]:
# All Libraries we need
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark import SparkContext


### Step 1: Create an Spark Session Directly

In [4]:
pyspark = SparkSession.builder \
.master("local[4]") \
.appName("Create a RDD") \
.config("spark.executor.memory", "4g") \
.config("spark.driver.memory", "2g") \
.getOrCreate()

### Step 2: Create spark context object for create RDDs (for connection to computing cluster)

In [5]:
sc = pyspark.sparkContext

#### Create an RDD for using example

In [6]:
rdd1 = sc.parallelize([("Ahmet", 25),("Berk", 18),("Mehmet", 28),("Batuhan", 20)])

#### Check RDD with action

In [7]:
rdd1.take(2)

                                                                                

[('Ahmet', 25), ('Berk', 18)]

#### We can stop spark session with stop method

In [8]:
sc.stop()

## Create a Spark Context Method - 2: Using Session and Conf

### Step 1: Create an Spark Configuration

In [9]:
conf = SparkConf() \
.setMaster("local[4]") \
.setAppName("Create a RDD") \
.setExecutorEnv("spark.executor.memory", "4g") \
.setExecutorEnv("spark.driver.memory", "2g")

### Step 2: Create an Spark Session

In [10]:
pyspark = SparkSession.builder \
.config(conf=conf) \
.getOrCreate()

### Step 3: Create spark context object for create RDDs (for connection to computing cluster)

In [11]:
sc = pyspark.sparkContext

#### Create an RDD for using example

In [12]:
rdd1 = sc.parallelize([("Ahmet", 25),("Berk", 18),("Mehmet", 28),("Batuhan", 20)])

#### Check RDD with action

In [13]:
rdd1.take(2)

[('Ahmet', 25), ('Berk', 18)]

#### We can stop spark session with stop method

In [14]:
sc.stop()

## Create a Spark Context Method - 3: Using Context and Conf

### Step 1: Create an Spark Configuration

In [15]:
conf = SparkConf() \
.setMaster("local[4]") \
.setAppName("Create a RDD") \
.setExecutorEnv("spark.executor.memory", "4g") \
.setExecutorEnv("spark.driver.memory", "2g")

### Step 2: Create spark context object for create RDDs (for connection to computing cluster)

In [16]:
sc = SparkContext(conf=conf)

#### Create an RDD as tuple for using example

In [17]:
rdd1 = sc.parallelize([("Ahmet", 25),("Berk", 18),("Mehmet", 28),("Batuhan", 20)])

#### Check RDD with action

In [18]:
rdd1.take(2)

[('Ahmet', 25), ('Berk', 18)]

#### Create an RDD as list for using example

In [19]:
rdd2 = sc.parallelize([["Ahmet", 25],["Berk", 18],["Mehmet", 28],["Batuhan", 20]])

#### Check RDD with action

In [20]:
rdd2.take(3)

[['Ahmet', 25], ['Berk', 18], ['Mehmet', 28]]

#### Create an RDD as dict for using example

In [23]:
import pandas as pd
my_dict = {"Sayilar": [1,2,3,4,5], "Harfler": ["a","b","c","d","e"]}

In [24]:
pd_df = pd.DataFrame(my_dict)
pd_df.head()

Unnamed: 0,Sayilar,Harfler
0,1,a
1,2,b
2,3,c
3,4,d
4,5,e


In [26]:
rdd_from_pd_df = pyspark.createDataFrame(pd_df)
rdd_from_pd_df.show()

Py4JJavaError: An error occurred while calling o113.sessionState.
: java.lang.IllegalStateException: LiveListenerBus is stopped.
	at org.apache.spark.scheduler.LiveListenerBus.addToQueue(LiveListenerBus.scala:98)
	at org.apache.spark.scheduler.LiveListenerBus.addToStatusQueue(LiveListenerBus.scala:81)
	at org.apache.spark.sql.internal.SharedState.<init>(SharedState.scala:115)
	at org.apache.spark.sql.SparkSession.$anonfun$sharedState$1(SparkSession.scala:143)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.SparkSession.sharedState$lzycompute(SparkSession.scala:143)
	at org.apache.spark.sql.SparkSession.sharedState(SparkSession.scala:142)
	at org.apache.spark.sql.SparkSession.$anonfun$sessionState$2(SparkSession.scala:162)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:160)
	at org.apache.spark.sql.SparkSession.sessionState(SparkSession.scala:157)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)


#### Create RDD From Text Files

In [29]:
file_name = "OnlineRetail.csv"
rdd_text_file = sc.textFile(file_name)

In [30]:
rdd_text_file.take(10)

['InvoiceNo;StockCode;Description;Quantity;InvoiceDate;UnitPrice;CustomerID;Country',
 '536365;85123A;WHITE HANGING HEART T-LIGHT HOLDER;6;1.12.2010 08:26;2,55;17850;United Kingdom',
 '536365;71053;WHITE METAL LANTERN;6;1.12.2010 08:26;3,39;17850;United Kingdom',
 '536365;84406B;CREAM CUPID HEARTS COAT HANGER;8;1.12.2010 08:26;2,75;17850;United Kingdom',
 '536365;84029G;KNITTED UNION FLAG HOT WATER BOTTLE;6;1.12.2010 08:26;3,39;17850;United Kingdom',
 '536365;84029E;RED WOOLLY HOTTIE WHITE HEART.;6;1.12.2010 08:26;3,39;17850;United Kingdom',
 '536365;22752;SET 7 BABUSHKA NESTING BOXES;2;1.12.2010 08:26;7,65;17850;United Kingdom',
 '536365;21730;GLASS STAR FROSTED T-LIGHT HOLDER;6;1.12.2010 08:26;4,25;17850;United Kingdom',
 '536366;22633;HAND WARMER UNION JACK;6;1.12.2010 08:28;1,85;17850;United Kingdom',
 '536366;22632;HAND WARMER RED POLKA DOT;6;1.12.2010 08:28;1,85;17850;United Kingdom']