### Installing Library

In [4]:
#!pip install pyspark 

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
sc = SparkContext()
spark = SparkSession(sparkContext = sc)

### RDD Object
The class pyspark.SpartContext creates a client which connect to a Spark Cluster. This client can be used to create an RDD object. We can implement 2 methods from this class for directly creating RDD objects: parallelize() and textFile()


#### parallelize()

In [2]:
rdd = sc.parallelize([1, 2, 3])
rdd.collect()

[1, 2, 3]

In [3]:
list_tuple = [('megalodon', 'hammerhead', 'bullshark'), ('panther', 'leopard', 'tiger')]
rdd = sc.parallelize(list_tuple)
rdd.collect()

[('megalodon', 'hammerhead', 'bullshark'), ('panther', 'leopard', 'tiger')]

In [4]:
diction = {
            'A': 100, 
            'B': 150, 
            'C': 200 
}
rdd = sc.parallelize(diction)
rdd.collect()

['A', 'B', 'C']

#### textFile()

In [5]:
rdd = sc.textFile('sample_data.csv')
rdd.take(5)

['Toothpaste,Colgate,Oral Care,2.99,150,2025-06-30',
 'Paper Towels,Bounty,Household Essentials,6.49,100,2024-12-31',
 'Shampoo,Pantene,Personal Care,4.79,200,2025-09-15',
 'Laundry Detergent,Tide,Household Essentials,9.99,120,2025-03-31',
 'Potato Chips,Lays,Snacks,3.49,80,2024-10-15']

### Dataframe Objects

#### Reading a csv using spark.read.csv()

In [6]:
bankchurner_data = spark.read.csv(path = 'BankChurners.csv', 
                                  sep = ',', 
                                  encoding = 'UTF-8', 
                                  comment = None, 
                                  header = True, 
                                  inferSchema = True)

In [7]:
bankchurner_data.show(n = 5, truncate = False)

+---------+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------------+--------------+-------------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+
|CLIENTNUM|Attrition_Flag   |Customer_Age|Gender|Dependent_count|Education_Level|Marital_Status|Income_Category|Card_Category|Months_on_book|Total_Relationship_Count|Months_Inactive_12_mon|Contacts_Count_12_mon|Credit_Limit|Total_Revolving_Bal|Avg_Open_To_Buy|Total_Amt_Chng_Q4_Q1|Total_Trans_Amt|Total_Trans_Ct|Total_Ct_Chng_Q4_Q1|Avg_Utilization_Ratio|Naive_Bayes_Classifier_Attrit

#### Creating a Spark dataframe with createDataFrame()

#### from an RDD

In [8]:
from pyspark.sql import Row
rdd = sc.parallelize([
    Row(x = [1, 2, 3], y = ['P', 'Q', 'R']), 
    Row(x = [4, 5, 6], y = ['U', 'V', 'W']), 
    Row(x = [7, 8, 9], y = ['X', 'Y', 'Z']), 
])
rdd.collect()

[Row(x=[1, 2, 3], y=['P', 'Q', 'R']),
 Row(x=[4, 5, 6], y=['U', 'V', 'W']),
 Row(x=[7, 8, 9], y=['X', 'Y', 'Z'])]

In [9]:
df = spark.createDataFrame(rdd)
df.show()

+---------+---------+
|        x|        y|
+---------+---------+
|[1, 2, 3]|[P, Q, R]|
|[4, 5, 6]|[U, V, W]|
|[7, 8, 9]|[X, Y, Z]|
+---------+---------+



In [10]:
import pandas as pd 
sam_df = pd.DataFrame({
    'x': [[1, 2, 3], [4, 5, 6], [7, 8, 9]], 
    'y': [['P', 'Q', 'R'], ['U', 'V', 'W'], ['X', 'Y', 'Z']]
})
sam_df

Unnamed: 0,x,y
0,"[1, 2, 3]","[P, Q, R]"
1,"[4, 5, 6]","[U, V, W]"
2,"[7, 8, 9]","[X, Y, Z]"


In [11]:
df = spark.createDataFrame(sam_df)
df.show()

+---------+---------+
|        x|        y|
+---------+---------+
|[1, 2, 3]|[P, Q, R]|
|[4, 5, 6]|[U, V, W]|
|[7, 8, 9]|[X, Y, Z]|
+---------+---------+



In [12]:
df.dtypes

[('x', 'array<bigint>'), ('y', 'array<string>')]

#### Spark dataframe to RDD 

#### pyspark.sql.Dataframe.rdd() is used to convert Spark dataframe to RDD

In [13]:
bankchurner_data.show(5)

+---------+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------------+--------------+-------------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+
|CLIENTNUM|   Attrition_Flag|Customer_Age|Gender|Dependent_count|Education_Level|Marital_Status|Income_Category|Card_Category|Months_on_book|Total_Relationship_Count|Months_Inactive_12_mon|Contacts_Count_12_mon|Credit_Limit|Total_Revolving_Bal|Avg_Open_To_Buy|Total_Amt_Chng_Q4_Q1|Total_Trans_Amt|Total_Trans_Ct|Total_Ct_Chng_Q4_Q1|Avg_Utilization_Ratio|Naive_Bayes_Classifier_Attrit

In [14]:
bankchurner_data.rdd.take(5)

[Row(CLIENTNUM=768805383, Attrition_Flag='Existing Customer', Customer_Age=45, Gender='M', Dependent_count=3, Education_Level='High School', Marital_Status='Married', Income_Category='$60K - $80K', Card_Category='Blue', Months_on_book=39, Total_Relationship_Count=5, Months_Inactive_12_mon=1, Contacts_Count_12_mon=3, Credit_Limit=12691.0, Total_Revolving_Bal=777, Avg_Open_To_Buy=11914.0, Total_Amt_Chng_Q4_Q1=1.335, Total_Trans_Amt=1144, Total_Trans_Ct=42, Total_Ct_Chng_Q4_Q1=1.625, Avg_Utilization_Ratio=0.061, Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1=9.3448e-05, Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2=0.99991),
 Row(CLIENTNUM=818770008, Attrition_Flag='Existing Customer', Customer_Age=49, Gender='F', Dependent_count=5, Education_Level='Graduate', Marital_Status='Single', Income_Category='Less than $40K', Card

#### With an RDD object, we can apply mapping functions - map, mapValues, flatMap, flatMapValues 

In [15]:
bankchurner_data_map = bankchurner_data.rdd.map(lambda x: (x['Gender'], x['Marital_Status'], x['Credit_Limit']))
bankchurner_data_map.take(5)

[('M', 'Married', 12691.0),
 ('F', 'Single', 8256.0),
 ('M', 'Married', 3418.0),
 ('F', 'Unknown', 3313.0),
 ('M', 'Married', 4716.0)]

In [23]:
map_bankchurner_data = bankchurner_data.rdd.map(lambda x: (x[0], x[13]))
map_bankchurner_data.take(5)

[(768805383, 12691.0),
 (818770008, 8256.0),
 (713982108, 3418.0),
 (769911858, 3313.0),
 (709106358, 4716.0)]

In [24]:
bankchurner_data2 = sc.textFile('BankChurners.csv')
bankchurner_data2.take(5)

['"CLIENTNUM","Attrition_Flag","Customer_Age","Gender","Dependent_count","Education_Level","Marital_Status","Income_Category","Card_Category","Months_on_book","Total_Relationship_Count","Months_Inactive_12_mon","Contacts_Count_12_mon","Credit_Limit","Total_Revolving_Bal","Avg_Open_To_Buy","Total_Amt_Chng_Q4_Q1","Total_Trans_Amt","Total_Trans_Ct","Total_Ct_Chng_Q4_Q1","Avg_Utilization_Ratio","Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1","Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2"',
 '768805383,"Existing Customer",45,"M",3,"High School","Married","$60K - $80K","Blue",39,5,1,3,12691,777,11914,1.335,1144,42,1.625,0.061,9.3448e-05,0.99991',
 '818770008,"Existing Customer",49,"F",5,"Graduate","Single","Less than $40K","Blue",44,6,1,2,8256,864,7392,1.541,1291,33,3.714,0.105,5.6861e-05,0.99994',
 '713982108,"Existing Cus

In [26]:
map_bankchurner_data2_rdd_1 = bankchurner_data2.map(lambda x: x.split(',')).map(lambda x: (x[0], x[1:]))
map_bankchurner_data2_rdd_1.take(5)

[('"CLIENTNUM"',
  ['"Attrition_Flag"',
   '"Customer_Age"',
   '"Gender"',
   '"Dependent_count"',
   '"Education_Level"',
   '"Marital_Status"',
   '"Income_Category"',
   '"Card_Category"',
   '"Months_on_book"',
   '"Total_Relationship_Count"',
   '"Months_Inactive_12_mon"',
   '"Contacts_Count_12_mon"',
   '"Credit_Limit"',
   '"Total_Revolving_Bal"',
   '"Avg_Open_To_Buy"',
   '"Total_Amt_Chng_Q4_Q1"',
   '"Total_Trans_Amt"',
   '"Total_Trans_Ct"',
   '"Total_Ct_Chng_Q4_Q1"',
   '"Avg_Utilization_Ratio"',
   '"Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1"',
   '"Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2"']),
 ('768805383',
  ['"Existing Customer"',
   '45',
   '"M"',
   '3',
   '"High School"',
   '"Married"',
   '"$60K - $80K"',
   '"Blue"',
   '39',
   '5',
   '1',
   '3',
   '12691',
   '777',
   '11914',

In [28]:
# removing the header row
header = map_bankchurner_data2_rdd_1.first()
# filtering rows not equal to header values
map_bankchurner_data2_rdd_2 = map_bankchurner_data2_rdd_1.filter(lambda x: x != header)
map_bankchurner_data2_rdd_2.take(5)

[('768805383',
  ['"Existing Customer"',
   '45',
   '"M"',
   '3',
   '"High School"',
   '"Married"',
   '"$60K - $80K"',
   '"Blue"',
   '39',
   '5',
   '1',
   '3',
   '12691',
   '777',
   '11914',
   '1.335',
   '1144',
   '42',
   '1.625',
   '0.061',
   '9.3448e-05',
   '0.99991']),
 ('818770008',
  ['"Existing Customer"',
   '49',
   '"F"',
   '5',
   '"Graduate"',
   '"Single"',
   '"Less than $40K"',
   '"Blue"',
   '44',
   '6',
   '1',
   '2',
   '8256',
   '864',
   '7392',
   '1.541',
   '1291',
   '33',
   '3.714',
   '0.105',
   '5.6861e-05',
   '0.99994']),
 ('713982108',
  ['"Existing Customer"',
   '51',
   '"M"',
   '3',
   '"Graduate"',
   '"Married"',
   '"$80K - $120K"',
   '"Blue"',
   '36',
   '4',
   '1',
   '0',
   '3418',
   '0',
   '3418',
   '2.594',
   '1887',
   '20',
   '2.333',
   '0',
   '2.1081e-05',
   '0.99998']),
 ('769911858',
  ['"Existing Customer"',
   '40',
   '"F"',
   '4',
   '"High School"',
   '"Unknown"',
   '"Less than $40K"',
   '"Bl

In [42]:
map_bankchurner_data2_rdd_3 = map_bankchurner_data2_rdd_2.filter(lambda x: len(x))
map_bankchurner_data2_rdd_3.take(5)

[('768805383',
  ['"Existing Customer"',
   '45',
   '"M"',
   '3',
   '"High School"',
   '"Married"',
   '"$60K - $80K"',
   '"Blue"',
   '39',
   '5',
   '1',
   '3',
   '12691',
   '777',
   '11914',
   '1.335',
   '1144',
   '42',
   '1.625',
   '0.061',
   '9.3448e-05',
   '0.99991']),
 ('818770008',
  ['"Existing Customer"',
   '49',
   '"F"',
   '5',
   '"Graduate"',
   '"Single"',
   '"Less than $40K"',
   '"Blue"',
   '44',
   '6',
   '1',
   '2',
   '8256',
   '864',
   '7392',
   '1.541',
   '1291',
   '33',
   '3.714',
   '0.105',
   '5.6861e-05',
   '0.99994']),
 ('713982108',
  ['"Existing Customer"',
   '51',
   '"M"',
   '3',
   '"Graduate"',
   '"Married"',
   '"$80K - $120K"',
   '"Blue"',
   '36',
   '4',
   '1',
   '0',
   '3418',
   '0',
   '3418',
   '2.594',
   '1887',
   '20',
   '2.333',
   '0',
   '2.1081e-05',
   '0.99998']),
 ('769911858',
  ['"Existing Customer"',
   '40',
   '"F"',
   '4',
   '"High School"',
   '"Unknown"',
   '"Less than $40K"',
   '"Bl

#### flatMap function

In [38]:
x = [('a', 'b', 'c'), ('a', 'a'), ('c', 'c', 'c', 'd', 'a')]
flatmapt_exp_rdd = sc.parallelize(x)
flatmapt_exp_rdd.collect()

[('a', 'b', 'c'), ('a', 'a'), ('c', 'c', 'c', 'd', 'a')]

In [39]:
flatmapt_exp_rdd_1 = flatmapt_exp_rdd.flatMap(lambda x: x)
flatmapt_exp_rdd_1.collect()

['a', 'b', 'c', 'a', 'a', 'c', 'c', 'c', 'd', 'a']

#### flatMapValues function

In [44]:
data = [[1, (105, 107, 109)], 
        [2, (205, 207, 209)], 
        [3, (305, 307, 309)]]

fmv_rdd_1 = sc.parallelize(data)
fmv_rdd_1.collect()

[[1, (105, 107, 109)], [2, (205, 207, 209)], [3, (305, 307, 309)]]

In [45]:
fmv_rdd_2 = fmv_rdd_1.flatMapValues(lambda x: list(zip(list('ABC'), x)))
fmv_rdd_2.collect()

[(1, ('A', 105)),
 (1, ('B', 107)),
 (1, ('C', 109)),
 (2, ('A', 205)),
 (2, ('B', 207)),
 (2, ('C', 209)),
 (3, ('A', 305)),
 (3, ('B', 307)),
 (3, ('C', 309))]

In [46]:
# unpacking the element values
fmv_rdd_3 = fmv_rdd_2.map(lambda x: [x[0]] + list(x[1]))
fmv_rdd_3.collect()

[[1, 'A', 105],
 [1, 'B', 107],
 [1, 'C', 109],
 [2, 'A', 205],
 [2, 'B', 207],
 [2, 'C', 209],
 [3, 'A', 305],
 [3, 'B', 307],
 [3, 'C', 309]]