# Spark RDDs
## Parallelization

In [1]:
import os

import findspark

# Locate the Spark installation without baking in one machine's layout:
# use SPARK_HOME when it is set, otherwise fall back to the original
# local install path so existing setups keep working.
findspark.init(os.environ.get('SPARK_HOME', 'C:/spark'))

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Get the active session or start a new one registered under the app name 'prl'
spark = (
    SparkSession.builder
    .appName('prl')
    .getOrCreate()
)
# SparkContext handle used by the RDD examples
sc = spark.sparkContext

### Parallelize list

In [4]:
# Small local list (the integers 1 through 5) used as parallelize() input
my_list = list(range(1, 6))

In [5]:
# Distribute the local Python list through the SparkContext
parallelized = sc.parallelize(my_list)

In [6]:
# collect() pulls every element back to the driver as a Python list
from_parallelized = parallelized.collect()

In [7]:
# Display the collected values; they should match my_list
from_parallelized

[1, 2, 3, 4, 5]

### Parallelize .txt file

In [18]:
# Load the text file through the SparkContext (one element per line).
# NOTE(review): the path is relative, so it presumably resolves against the
# notebook's working directory — confirm before running elsewhere.
parallelized_txt = sc.textFile('../data/sample.txt')

In [19]:
# Bring all lines of the file back to the driver as a list of strings
from_ptxt = parallelized_txt.collect()

In [20]:
# Display the collected lines
from_ptxt

['sample', 'text', 'lines']

#### Parallelize .txt with a minimum number of partitions

In [27]:
# Partition count of the RDD that was loaded without a minPartitions argument
parallelized_txt.getNumPartitions()

3

In [38]:
# Load the same file again, this time requesting a minimum of 5 partitions
ptxt_parted = sc.textFile('../data/sample.txt', minPartitions=5)

In [30]:
# Collect the lines of the re-partitioned RDD back to the driver
parted_collected = ptxt_parted.collect()

### Map and Collect

In [31]:
# Distribute the integers 1 through 10
numRDD = sc.parallelize(list(range(1, 11)))

In [33]:
# Map every element to its cube
cubedRDD = numRDD.map(lambda value: value ** 3)

In [34]:
# Collect the cubed values back to the driver as a Python list
numbers_all = cubedRDD.collect()

In [35]:
# Display the cubes of 1..10
numbers_all

[1, 8, 27, 64, 125, 216, 343, 512, 729, 1000]

### Filter and Count

In [49]:
# Keep only the lines in which "suscipit" occurs as a whitespace-separated
# token (exact, case-sensitive match against the result of str.split()).
fileRDD = ptxt_parted.filter(lambda text_line: 'suscipit' in text_line.split())

In [50]:
# Number of lines that passed the "suscipit" filter
fileRDD.count()

26

In [51]:
# Collect the matching lines back to the driver
filtered_collected = fileRDD.collect()

### Pairs

#### Reduce by key

In [52]:
# (key, value) tuples; key 3 appears twice
pairs = [(1, 2), (3, 4), (3, 6), (4, 5)]
rdd = sc.parallelize(pairs)

In [53]:
# Sum the values that share a key
rdd_reduced = rdd.reduceByKey(lambda total, value: total + value)

In [54]:
# Print one line per (key, aggregated value) pair
for key, total in rdd_reduced.collect():
    print("Key {} has {} Counts".format(key, total))

Key 1 has 2 Counts
Key 3 has 10 Counts
Key 4 has 5 Counts


#### Sort by key

In [56]:
# Order the reduced pairs by key, largest key first (ascending=False)
rdd_reduced_sort = rdd_reduced.sortByKey(ascending=False)

In [57]:
# Print one line per (key, aggregated value) pair, in descending key order.
# Uses the standard 4-space indent and tuple unpacking instead of indexing.
for key, total in rdd_reduced_sort.collect():
    print("Key {} has {} Counts".format(key, total))

Key 4 has 5 Counts
Key 3 has 10 Counts
Key 1 has 2 Counts
