In [5]:
from pyspark.sql import SparkSession
from pyspark import SparkContext

# if __name__ == "__main__":
    # spark = SparkSesion.builder.appName("MyApp").getOrCreate()
SpSession = SparkSession \
    .builder \
    .appName("Demo Spark") \
    .getOrCreate()

sparkContext = SpSession.sparkContext # clusters


# Upload data

In [8]:
data = SpSession.read.csv("cars.csv", header=True, sep=";")
data.show(6)

+--------------------+------+---------+------------+----------+------+------------+-----+------+
|                 Car|   MPG|Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|
+--------------------+------+---------+------------+----------+------+------------+-----+------+
|              STRING|DOUBLE|      INT|      DOUBLE|    DOUBLE|DOUBLE|      DOUBLE|  INT|   CAT|
|Chevrolet Chevell...|  18.0|        8|       307.0|     130.0| 3504.|        12.0|   70|    US|
|   Buick Skylark 320|  15.0|        8|       350.0|     165.0| 3693.|        11.5|   70|    US|
|  Plymouth Satellite|  18.0|        8|       318.0|     150.0| 3436.|        11.0|   70|    US|
|       AMC Rebel SST|  16.0|        8|       304.0|     150.0| 3433.|        12.0|   70|    US|
|         Ford Torino|  17.0|        8|       302.0|     140.0| 3449.|        10.5|   70|    US|
+--------------------+------+---------+------------+----------+------+------------+-----+------+
only showing top 6 rows



# Knowing data

In [9]:
data.printSchema() # desc table

root
 |-- Car: string (nullable = true)
 |-- MPG: string (nullable = true)
 |-- Cylinders: string (nullable = true)
 |-- Displacement: string (nullable = true)
 |-- Horsepower: string (nullable = true)
 |-- Weight: string (nullable = true)
 |-- Acceleration: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Origin: string (nullable = true)



In [10]:
data.columns # columns

['Car',
 'MPG',
 'Cylinders',
 'Displacement',
 'Horsepower',
 'Weight',
 'Acceleration',
 'Model',
 'Origin']

In [12]:
data.dtypes # data type

[('Car', 'string'),
 ('MPG', 'string'),
 ('Cylinders', 'string'),
 ('Displacement', 'string'),
 ('Horsepower', 'string'),
 ('Weight', 'string'),
 ('Acceleration', 'string'),
 ('Model', 'string'),
 ('Origin', 'string')]

# Selecting columns

In [13]:
# method 1
data.select(data.Car).show()
# data.select(data.Car).show() truncate=False - view all info per row

+--------------------+
|                 Car|
+--------------------+
|              STRING|
|Chevrolet Chevell...|
|   Buick Skylark 320|
|  Plymouth Satellite|
|       AMC Rebel SST|
|         Ford Torino|
|    Ford Galaxie 500|
|    Chevrolet Impala|
|   Plymouth Fury iii|
|    Pontiac Catalina|
|  AMC Ambassador DPL|
|Citroen DS-21 Pallas|
|Chevrolet Chevell...|
|    Ford Torino (sw)|
|Plymouth Satellit...|
|  AMC Rebel SST (sw)|
| Dodge Challenger SE|
|  Plymouth 'Cuda 340|
|Ford Mustang Boss...|
|Chevrolet Monte C...|
+--------------------+
only showing top 20 rows



In [15]:
# method 2
from os import truncate
data.select(data["Car"]).show(truncate=False)

+--------------------------------+
|Car                             |
+--------------------------------+
|STRING                          |
|Chevrolet Chevelle Malibu       |
|Buick Skylark 320               |
|Plymouth Satellite              |
|AMC Rebel SST                   |
|Ford Torino                     |
|Ford Galaxie 500                |
|Chevrolet Impala                |
|Plymouth Fury iii               |
|Pontiac Catalina                |
|AMC Ambassador DPL              |
|Citroen DS-21 Pallas            |
|Chevrolet Chevelle Concours (sw)|
|Ford Torino (sw)                |
|Plymouth Satellite (sw)         |
|AMC Rebel SST (sw)              |
|Dodge Challenger SE             |
|Plymouth 'Cuda 340              |
|Ford Mustang Boss 302           |
|Chevrolet Monte Carlo           |
+--------------------------------+
only showing top 20 rows



In [16]:
# method 3
from pyspark.sql.functions import col
data.select(col("Car")).show(truncate=False)

+--------------------------------+
|Car                             |
+--------------------------------+
|STRING                          |
|Chevrolet Chevelle Malibu       |
|Buick Skylark 320               |
|Plymouth Satellite              |
|AMC Rebel SST                   |
|Ford Torino                     |
|Ford Galaxie 500                |
|Chevrolet Impala                |
|Plymouth Fury iii               |
|Pontiac Catalina                |
|AMC Ambassador DPL              |
|Citroen DS-21 Pallas            |
|Chevrolet Chevelle Concours (sw)|
|Ford Torino (sw)                |
|Plymouth Satellite (sw)         |
|AMC Rebel SST (sw)              |
|Dodge Challenger SE             |
|Plymouth 'Cuda 340              |
|Ford Mustang Boss 302           |
|Chevrolet Monte Carlo           |
+--------------------------------+
only showing top 20 rows



### Multiple Selection

In [17]:
# 1
data.select(data.Car, data.Cylinders).show(truncate=False)

+--------------------------------+---------+
|Car                             |Cylinders|
+--------------------------------+---------+
|STRING                          |INT      |
|Chevrolet Chevelle Malibu       |8        |
|Buick Skylark 320               |8        |
|Plymouth Satellite              |8        |
|AMC Rebel SST                   |8        |
|Ford Torino                     |8        |
|Ford Galaxie 500                |8        |
|Chevrolet Impala                |8        |
|Plymouth Fury iii               |8        |
|Pontiac Catalina                |8        |
|AMC Ambassador DPL              |8        |
|Citroen DS-21 Pallas            |4        |
|Chevrolet Chevelle Concours (sw)|8        |
|Ford Torino (sw)                |8        |
|Plymouth Satellite (sw)         |8        |
|AMC Rebel SST (sw)              |8        |
|Dodge Challenger SE             |8        |
|Plymouth 'Cuda 340              |8        |
|Ford Mustang Boss 302           |8        |
|Chevrolet

In [21]:
# 2
data.select(data["Car"], data["Cylinders"]).show(truncate=False) # No upper case sensitive

+--------------------------------+---------+
|Car                             |Cylinders|
+--------------------------------+---------+
|STRING                          |INT      |
|Chevrolet Chevelle Malibu       |8        |
|Buick Skylark 320               |8        |
|Plymouth Satellite              |8        |
|AMC Rebel SST                   |8        |
|Ford Torino                     |8        |
|Ford Galaxie 500                |8        |
|Chevrolet Impala                |8        |
|Plymouth Fury iii               |8        |
|Pontiac Catalina                |8        |
|AMC Ambassador DPL              |8        |
|Citroen DS-21 Pallas            |4        |
|Chevrolet Chevelle Concours (sw)|8        |
|Ford Torino (sw)                |8        |
|Plymouth Satellite (sw)         |8        |
|AMC Rebel SST (sw)              |8        |
|Dodge Challenger SE             |8        |
|Plymouth 'Cuda 340              |8        |
|Ford Mustang Boss 302           |8        |
|Chevrolet

In [23]:
data.select(col("Car"),col("Cylinders")).show(truncate=False)

+--------------------------------+---------+
|Car                             |Cylinders|
+--------------------------------+---------+
|STRING                          |INT      |
|Chevrolet Chevelle Malibu       |8        |
|Buick Skylark 320               |8        |
|Plymouth Satellite              |8        |
|AMC Rebel SST                   |8        |
|Ford Torino                     |8        |
|Ford Galaxie 500                |8        |
|Chevrolet Impala                |8        |
|Plymouth Fury iii               |8        |
|Pontiac Catalina                |8        |
|AMC Ambassador DPL              |8        |
|Citroen DS-21 Pallas            |4        |
|Chevrolet Chevelle Concours (sw)|8        |
|Ford Torino (sw)                |8        |
|Plymouth Satellite (sw)         |8        |
|AMC Rebel SST (sw)              |8        |
|Dodge Challenger SE             |8        |
|Plymouth 'Cuda 340              |8        |
|Ford Mustang Boss 302           |8        |
|Chevrolet

# Add columns

In [26]:
# 1
from pyspark.sql.functions import lit

df = data.withColumn("First_Column", lit(1))
df.show(6, truncate=False)

+-------------------------+------+---------+------------+----------+------+------------+-----+------+------------+
|Car                      |MPG   |Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|First_Column|
+-------------------------+------+---------+------------+----------+------+------------+-----+------+------------+
|STRING                   |DOUBLE|INT      |DOUBLE      |DOUBLE    |DOUBLE|DOUBLE      |INT  |CAT   |1           |
|Chevrolet Chevelle Malibu|18.0  |8        |307.0       |130.0     |3504. |12.0        |70   |US    |1           |
|Buick Skylark 320        |15.0  |8        |350.0       |165.0     |3693. |11.5        |70   |US    |1           |
|Plymouth Satellite       |18.0  |8        |318.0       |150.0     |3436. |11.0        |70   |US    |1           |
|AMC Rebel SST            |16.0  |8        |304.0       |150.0     |3433. |12.0        |70   |US    |1           |
|Ford Torino              |17.0  |8        |302.0       |140.0     |3449. |10.5 

In [27]:
# 2
df = data.withColumn("Second_Column", lit(2)) \
    .withColumn("Third_Column", lit("si venga hombre"))
df.show(6, truncate=False)

+-------------------------+------+---------+------------+----------+------+------------+-----+------+-------------+---------------+
|Car                      |MPG   |Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|Second_Column|Third_Column   |
+-------------------------+------+---------+------------+----------+------+------------+-----+------+-------------+---------------+
|STRING                   |DOUBLE|INT      |DOUBLE      |DOUBLE    |DOUBLE|DOUBLE      |INT  |CAT   |2            |si venga hombre|
|Chevrolet Chevelle Malibu|18.0  |8        |307.0       |130.0     |3504. |12.0        |70   |US    |2            |si venga hombre|
|Buick Skylark 320        |15.0  |8        |350.0       |165.0     |3693. |11.5        |70   |US    |2            |si venga hombre|
|Plymouth Satellite       |18.0  |8        |318.0       |150.0     |3436. |11.0        |70   |US    |2            |si venga hombre|
|AMC Rebel SST            |16.0  |8        |304.0       |150.0     |3433. |1

# Grouping

In [28]:
df.groupBy("Origin").count().show()

+------+-----+
|Origin|count|
+------+-----+
|Europe|   73|
|    US|  254|
|   CAT|    1|
| Japan|   79|
+------+-----+



In [29]:
df.groupBy("Horsepower").count().show()

+----------+-----+
|Horsepower|count|
+----------+-----+
|     102.0|    1|
|     68.00|    6|
|     116.0|    1|
|     145.0|    7|
|     90.00|   20|
|     87.00|    2|
|     170.0|    5|
|     75.00|   14|
|     132.0|    1|
|     84.00|    6|
|     88.00|   19|
|     200.0|    1|
|     152.0|    1|
|         0|    6|
|     120.0|    4|
|     80.00|    7|
|     108.0|    1|
|     58.00|    2|
|     210.0|    1|
|     96.00|    3|
+----------+-----+
only showing top 20 rows



In [30]:
df.groupBy("Origin","Horsepower").count().show(6)

+------+----------+-----+
|Origin|Horsepower|count|
+------+----------+-----+
|Europe|     46.00|    2|
| Japan|     67.00|    8|
|    US|     198.0|    2|
|    US|     75.00|    3|
|    US|     60.00|    1|
| Japan|     97.00|    7|
+------+----------+-----+
only showing top 6 rows



# Deleting Columns

In [31]:
df.show()

+--------------------+------+---------+------------+----------+------+------------+-----+------+-------------+---------------+
|                 Car|   MPG|Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|Second_Column|   Third_Column|
+--------------------+------+---------+------------+----------+------+------------+-----+------+-------------+---------------+
|              STRING|DOUBLE|      INT|      DOUBLE|    DOUBLE|DOUBLE|      DOUBLE|  INT|   CAT|            2|si venga hombre|
|Chevrolet Chevell...|  18.0|        8|       307.0|     130.0| 3504.|        12.0|   70|    US|            2|si venga hombre|
|   Buick Skylark 320|  15.0|        8|       350.0|     165.0| 3693.|        11.5|   70|    US|            2|si venga hombre|
|  Plymouth Satellite|  18.0|        8|       318.0|     150.0| 3436.|        11.0|   70|    US|            2|si venga hombre|
|       AMC Rebel SST|  16.0|        8|       304.0|     150.0| 3433.|        12.0|   70|    US|            2|s

In [32]:
df = df.drop("Second_Column") #df = df.drop("Second_Column", "Third_Column")
df.show()

+--------------------+------+---------+------------+----------+------+------------+-----+------+---------------+
|                 Car|   MPG|Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|   Third_Column|
+--------------------+------+---------+------------+----------+------+------------+-----+------+---------------+
|              STRING|DOUBLE|      INT|      DOUBLE|    DOUBLE|DOUBLE|      DOUBLE|  INT|   CAT|si venga hombre|
|Chevrolet Chevell...|  18.0|        8|       307.0|     130.0| 3504.|        12.0|   70|    US|si venga hombre|
|   Buick Skylark 320|  15.0|        8|       350.0|     165.0| 3693.|        11.5|   70|    US|si venga hombre|
|  Plymouth Satellite|  18.0|        8|       318.0|     150.0| 3436.|        11.0|   70|    US|si venga hombre|
|       AMC Rebel SST|  16.0|        8|       304.0|     150.0| 3433.|        12.0|   70|    US|si venga hombre|
|         Ford Torino|  17.0|        8|       302.0|     140.0| 3449.|        10.5|   70|    US|

# Order

In [37]:
df.orderBy("Cylinders", ascending=False).show(6, truncate=False)

+-------------------------+------+---------+------------+----------+------+------------+-----+------+---------------+
|Car                      |MPG   |Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|Third_Column   |
+-------------------------+------+---------+------------+----------+------+------------+-----+------+---------------+
|STRING                   |DOUBLE|INT      |DOUBLE      |DOUBLE    |DOUBLE|DOUBLE      |INT  |CAT   |si venga hombre|
|Chevrolet Chevelle Malibu|18.0  |8        |307.0       |130.0     |3504. |12.0        |70   |US    |si venga hombre|
|Buick Skylark 320        |15.0  |8        |350.0       |165.0     |3693. |11.5        |70   |US    |si venga hombre|
|Plymouth Satellite       |18.0  |8        |318.0       |150.0     |3436. |11.0        |70   |US    |si venga hombre|
|AMC Rebel SST            |16.0  |8        |304.0       |150.0     |3433. |12.0        |70   |US    |si venga hombre|
|Ford Torino              |17.0  |8        |302.0       

In [39]:
df.groupBy("Origin").count().orderBy("count", ascending=False).show(9)

+------+-----+
|Origin|count|
+------+-----+
|    US|  254|
| Japan|   79|
|Europe|   73|
|   CAT|    1|
+------+-----+



# Filtering

In [40]:
df.show()

+--------------------+------+---------+------------+----------+------+------------+-----+------+---------------+
|                 Car|   MPG|Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|   Third_Column|
+--------------------+------+---------+------------+----------+------+------------+-----+------+---------------+
|              STRING|DOUBLE|      INT|      DOUBLE|    DOUBLE|DOUBLE|      DOUBLE|  INT|   CAT|si venga hombre|
|Chevrolet Chevell...|  18.0|        8|       307.0|     130.0| 3504.|        12.0|   70|    US|si venga hombre|
|   Buick Skylark 320|  15.0|        8|       350.0|     165.0| 3693.|        11.5|   70|    US|si venga hombre|
|  Plymouth Satellite|  18.0|        8|       318.0|     150.0| 3436.|        11.0|   70|    US|si venga hombre|
|       AMC Rebel SST|  16.0|        8|       304.0|     150.0| 3433.|        12.0|   70|    US|si venga hombre|
|         Ford Torino|  17.0|        8|       302.0|     140.0| 3449.|        10.5|   70|    US|

In [41]:
total_count = df.count()
total_count

407

In [45]:
df.filter(df["Origin"] == "Europe").count()
df.filter(df["Origin"] == "Europe").show()

+--------------------+----+---------+------------+----------+------+------------+-----+------+---------------+
|                 Car| MPG|Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|   Third_Column|
+--------------------+----+---------+------------+----------+------+------------+-----+------+---------------+
|Citroen DS-21 Pallas|   0|        4|       133.0|     115.0| 3090.|        17.5|   70|Europe|si venga hombre|
|Volkswagen 1131 D...|26.0|        4|       97.00|     46.00| 1835.|        20.5|   70|Europe|si venga hombre|
|         Peugeot 504|25.0|        4|       110.0|     87.00| 2672.|        17.5|   70|Europe|si venga hombre|
|         Audi 100 LS|24.0|        4|       107.0|     90.00| 2430.|        14.5|   70|Europe|si venga hombre|
|            Saab 99e|25.0|        4|       104.0|     95.00| 2375.|        17.5|   70|Europe|si venga hombre|
|            BMW 2002|26.0|        4|       121.0|     113.0| 2234.|        12.5|   70|Europe|si venga hombre|
|

In [47]:
df.filter(col("Origin")=="US").show()

+--------------------+----+---------+------------+----------+------+------------+-----+------+---------------+
|                 Car| MPG|Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|   Third_Column|
+--------------------+----+---------+------------+----------+------+------------+-----+------+---------------+
|Chevrolet Chevell...|18.0|        8|       307.0|     130.0| 3504.|        12.0|   70|    US|si venga hombre|
|   Buick Skylark 320|15.0|        8|       350.0|     165.0| 3693.|        11.5|   70|    US|si venga hombre|
|  Plymouth Satellite|18.0|        8|       318.0|     150.0| 3436.|        11.0|   70|    US|si venga hombre|
|       AMC Rebel SST|16.0|        8|       304.0|     150.0| 3433.|        12.0|   70|    US|si venga hombre|
|         Ford Torino|17.0|        8|       302.0|     140.0| 3449.|        10.5|   70|    US|si venga hombre|
|    Ford Galaxie 500|15.0|        8|       429.0|     198.0| 4341.|        10.0|   70|    US|si venga hombre|
|

In [54]:
df.filter((df["Horsepower"]==150.0) &(df["Origin"]=="US")).show()

+--------------------+----+---------+------------+----------+------+------------+-----+------+---------------+
|                 Car| MPG|Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|   Third_Column|
+--------------------+----+---------+------------+----------+------+------------+-----+------+---------------+
|  Plymouth Satellite|18.0|        8|       318.0|     150.0| 3436.|        11.0|   70|    US|si venga hombre|
|       AMC Rebel SST|16.0|        8|       304.0|     150.0| 3433.|        12.0|   70|    US|si venga hombre|
|Chevrolet Monte C...|15.0|        8|       400.0|     150.0| 3761.|         9.5|   70|    US|si venga hombre|
|   Plymouth Fury iii|14.0|        8|       318.0|     150.0| 4096.|        13.0|   71|    US|si venga hombre|
|   Plymouth Fury III|15.0|        8|       318.0|     150.0| 4135.|        13.5|   72|    US|si venga hombre|
|  AMC Ambassador SST|17.0|        8|       304.0|     150.0| 3672.|        11.5|   72|    US|si venga hombre|
|