In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType

In [2]:
# Start (or reuse) a local Spark session that runs on a single worker thread.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("app")
    .config("option", "value")  # NOTE(review): looks like a placeholder setting - confirm
    .getOrCreate()
)

25/03/16 20:53:09 WARN Utils: Your hostname, ChristoorossAir resolves to a loopback address: 127.0.0.1; using 192.168.1.18 instead (on interface en0)
25/03/16 20:53:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/16 20:53:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 57786)
Traceback (most recent call last):
  File "/Users/chkapsalis/.pyenv/versions/3.10.15/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/Users/chkapsalis/.pyenv/versions/3.10.15/lib/python3.10/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "

In [6]:
# Creating a Spark DataFrame from a collection parallelized by the SparkContext

# 1st way: wrap every element in a 1-tuple so each one becomes a row,
# then let toDF() build the DataFrame (the column gets the default name _1)
odd_numbers = [1, 3, 5, 7, 9]
rdd = spark.sparkContext.parallelize(odd_numbers)
single_column_rows = rdd.map(lambda value: (value,))
df = single_column_rows.toDF()
df.printSchema()
df.show()

                                                                                

root
 |-- _1: long (nullable = true)

+---+
| _1|
+---+
|  1|
|  3|
|  5|
|  7|
|  9|
+---+



In [7]:
# 2nd way: build the DataFrame directly from a Python list, passing the
# element type as the schema so the single column is typed as integer

even_numbers = [0, 2, 4, 6, 8]
df1 = spark.createDataFrame(even_numbers, schema=IntegerType())
df1.printSchema()
df1.show()

root
 |-- value: integer (nullable = true)

+-----+
|value|
+-----+
|    0|
|    2|
|    4|
|    6|
|    8|
+-----+



In [8]:
# 1st way, improved: hand toDF() the column names instead of accepting
# the auto-generated ones

people = [
    ('John', 'Smith'),
    ('Maria', 'Jones'),
    ('Peter', 'Gabriel'),
]
column_names = ['First Name', 'Last Name']

rdd2 = spark.sparkContext.parallelize(people)
df2 = rdd2.toDF(column_names)
df2.printSchema()
df2.show()

root
 |-- First Name: string (nullable = true)
 |-- Last Name: string (nullable = true)

+----------+---------+
|First Name|Last Name|
+----------+---------+
|      John|    Smith|
|     Maria|    Jones|
|     Peter|  Gabriel|
+----------+---------+



In [12]:
# 3rd way - reading from a file
import os

# CSV without a header line: every column is read as a string and gets an
# auto-generated name (_c0, _c1, ...)
salaries_path = f"file:///{os.getcwd()}/salaries.csv"
df = spark.read.csv(salaries_path)
df.printSchema()
df.show()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)

+-----+----+----+
|  _c0| _c1| _c2|
+-----+----+----+
|Peter|3000| 200|
|Helen|3100| 180|
|Maria|2900| 250|
| John|3600| 300|
+-----+----+----+



In [15]:
# Assign column names by hand, since the headerless file provided none
salary_columns = ("Name", "Monthly", "Bonus")
df = df.toDF(*salary_columns)
df.show()

+-----+-------+-----+
| Name|Monthly|Bonus|
+-----+-------+-----+
|Peter|   3000|  200|
|Helen|   3100|  180|
|Maria|   2900|  250|
| John|   3600|  300|
+-----+-------+-----+



In [14]:
# CSV with a header line: the first row supplies the column names
salaries_with_header_path = f"file:///{os.getcwd()}/salariesH.csv"
df1 = spark.read.csv(salaries_with_header_path, header=True)
df1.printSchema()
df1.show()

root
 |-- Name: string (nullable = true)
 |-- Monthly: string (nullable = true)
 |-- Bonus: string (nullable = true)

+-----+-------+-----+
| Name|Monthly|Bonus|
+-----+-------+-----+
|Peter|   3000|  200|
|Helen|   3100|  180|
|Maria|   2900|  250|
| John|   3600|  300|
+-----+-------+-----+



In [16]:
# 4th way - reading from a JSON file

tweets_path = f"file:///{os.getcwd()}/tweets.json"
# The "multiline" option lets the reader parse records that span several lines
json_reader = spark.read.option("multiline", "true")
df2 = json_reader.json(tweets_path)
df2.printSchema()
df2.show()

root
 |-- message: string (nullable = true)
 |-- ts: string (nullable = true)
 |-- user: string (nullable = true)

+--------------------+-------------------+-----------+
|             message|                 ts|       user|
+--------------------+-------------------+-----------+
|        Today's news|2020-09-02T10:32:41|    anon-gr|
|Coronavirus sad u...|2020-09-01T22:42:04|generationx|
|   It is a sunny day|2020-08-31T03:22:11|    anon-gr|
|Back from summer ...|2020-08-29T20:57:44|generationx|
+--------------------+-------------------+-----------+



In [32]:
# 5th way - interacting with a database 
import mysql.connector  # module mysql-connector-python

In [33]:
import os

# Open a connection to the local MySQL server.
# Credentials are read from the MYSQL_USER / MYSQL_PASSWORD environment
# variables so they do not have to be edited into the notebook; the fallbacks
# are the local development account this notebook was written against.
conn = mysql.connector.connect(
    user=os.environ.get("MYSQL_USER", "ckapsalis"),
    password=os.environ.get("MYSQL_PASSWORD", "ckapsalis"),
    host='127.0.0.1',
)

# Cursor used by the following cells to run SQL statements
cursor = conn.cursor()

In [34]:
# Creating the database and its three tables from scratch

# WARNING: only one SQL command per 'cursor.execute()' call, hence the loop
# over individual statements below

CREATE_SUPPLIER_TABLE = """
    CREATE TABLE supplier (
        id CHAR(20) NOT NULL primary key,
        name CHAR(20),
        status INT,
        city CHAR(20)
    );
"""

CREATE_PRODUCT_TABLE = """
    CREATE TABLE product (
        id CHAR(20) NOT NULL primary key,
        name CHAR(20),
        color CHAR(20),
        weight FLOAT,
        city CHAR(20)
    );
"""

# Link table: references both supplier and product, keyed on the pair
CREATE_SUPPLIER_PRODUCT_TABLE = """
    CREATE TABLE supplierProduct (
        sid CHAR(20),
        FOREIGN KEY(sid) REFERENCES supplier(id),
        pid CHAR(20),
        FOREIGN KEY (pid) REFERENCES product(id),
        qty INT,
        PRIMARY KEY(sid,pid)
    );
"""

# Order matters: the database must exist and be selected before the tables
# are created, and the link table comes after the tables it references
schema_statements = (
    "DROP DATABASE IF EXISTS suppliersProducts;",
    "CREATE DATABASE suppliersProducts;",
    "USE suppliersProducts;",
    CREATE_SUPPLIER_TABLE,
    CREATE_PRODUCT_TABLE,
    CREATE_SUPPLIER_PRODUCT_TABLE,
)

for statement in schema_statements:
    cursor.execute(statement)


In [35]:
# Adding data into the supplier, product and supplierProduct tables.
# Rows are kept as plain tuples and inserted with parameterised statements,
# so the values are passed separately from the SQL text.

suppliers = [
    ('S1', 'Smith', 20, 'London'),
    ('S2', 'Jones', 10, 'Paris'),
    ('S3', 'Blake', 30, 'Paris'),
    ('S4', 'Clark', 20, 'London'),
    ('S5', 'Adams', 30, 'Athens'),
]

products = [
    ('P1', 'Nut', 'Red', 12.0, 'London'),
    ('P2', 'Bolt', 'Green', 17.0, 'Paris'),
    ('P3', 'Screw', 'Blue', 17.0, 'Oslo'),
    ('P4', 'Screw', 'Red', 14.0, 'London'),
    ('P5', 'Cam', 'Blue', 12.0, 'Paris'),
    ('P6', 'Cog', 'Red', 19.0, 'London'),
]

# (supplier id, product id, quantity)
supplier_products = [
    ('S1', 'P1', 200),
    ('S2', 'P3', 400),
    ('S2', 'P5', 100),
    ('S3', 'P3', 200),
    ('S3', 'P4', 500),
    ('S4', 'P6', 300),
    ('S5', 'P2', 200),
    ('S5', 'P5', 500),
    ('S5', 'P6', 200),
    ('S5', 'P1', 100),
    ('S5', 'P3', 200),
    ('S5', 'P4', 800),
]

# Parent tables first: supplierProduct has foreign keys into both of them
cursor.executemany(
    "INSERT INTO supplier (id, name, status, city) VALUES (%s, %s, %s, %s)",
    suppliers,
)
cursor.executemany(
    "INSERT INTO product (id, name, color, weight, city) VALUES (%s, %s, %s, %s, %s)",
    products,
)
cursor.executemany(
    "INSERT INTO supplierProduct (sid, pid, qty) VALUES (%s, %s, %s)",
    supplier_products,
)

# Persist the inserts
conn.commit()

In [36]:
# Release the cursor first, then the connection it was created from
for resource in (cursor, conn):
    resource.close()

In [1]:
# Creating a DataFrame from the MySQL database
# Spark talks to MySQL through JDBC (Java Database Connectivity), so the
# MySQL Connector/J jar has to be handed to the session when it is built.
# The platform-agnostic Connector/J download was unzipped to
# /Users/chkapsalis/Documents/mysql-connector-j-9.2.0

# Prerequisites before running this cell:
#   - restart the kernel, so a new SparkSession can be created with the jar
#   - start the MySQL server in the background (terminal -> "mysql.server start")
from pyspark.sql import SparkSession

MYSQL_CONNECTOR_JAR = "/Users/chkapsalis/Documents/mysql-connector-j-9.2.0/mysql-connector-j-9.2.0.jar"

spark = (
    SparkSession.builder
    .master("local[1]")
    .config("spark.jars", MYSQL_CONNECTOR_JAR)
    .appName("app")
    .getOrCreate()
)


25/03/16 21:21:55 WARN Utils: Your hostname, ChristoorossAir resolves to a loopback address: 127.0.0.1; using 192.168.1.18 instead (on interface en0)
25/03/16 21:21:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
25/03/16 21:21:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
import os

# Load the `product` table from MySQL into a Spark DataFrame over JDBC.
#   - The database name is spelled exactly as in CREATE DATABASE
#     (suppliersProducts): MySQL database names are case-sensitive on
#     case-sensitive file systems, so the all-lowercase spelling only
#     resolves on some hosts.
#   - com.mysql.cj.jdbc.Driver is the Connector/J 8+ driver class; the old
#     com.mysql.jdbc.Driver name is deprecated.
#   - Credentials are read from MYSQL_USER / MYSQL_PASSWORD, falling back to
#     the local development account this notebook was written against.
df = (
    spark.read.format("jdbc")
    .option("url", "jdbc:mysql://localhost:3306/suppliersProducts")
    .option("driver", "com.mysql.cj.jdbc.Driver")
    .option("dbtable", "product")
    .option("user", os.environ.get("MYSQL_USER", "ckapsalis"))
    .option("password", os.environ.get("MYSQL_PASSWORD", "ckapsalis"))
    .load()
)

df.printSchema()
df.show()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- color: string (nullable = true)
 |-- weight: double (nullable = true)
 |-- city: string (nullable = true)

+--------------------+--------------------+--------------------+------+--------------------+
|                  id|                name|               color|weight|                city|
+--------------------+--------------------+--------------------+------+--------------------+
|P1                  |Nut                 |Red                 |  12.0|London              |
|P2                  |Bolt                |Green               |  17.0|Paris               |
|P3                  |Screw               |Blue                |  17.0|Oslo                |
|P4                  |Screw               |Red                 |  14.0|London              |
|P5                  |Cam                 |Blue                |  12.0|Paris               |
|P6                  |Cog                 |Red                 |  19.

In [1]:
# 6th way - from a pandas DataFrame
import pandas as pd 
from pyspark.sql import SparkSession


# Sample records, one [name, age] pair per person
names = ['Nick', 'Helen', 'Mary', 'John']
ages = [26, 28, 30, 29]
data = [[name, age] for name, age in zip(names, ages)]

# Build the pandas DataFrame and print it as plain text
columns = ["Name", "Age"]
df = pd.DataFrame(data, columns=columns)
print(df)

    Name  Age
0   Nick   26
1  Helen   28
2   Mary   30
3   John   29


In [3]:
# Start (or reuse) a single-threaded local Spark session
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("app")
    .getOrCreate()
)

# Convert the pandas DataFrame `df` into a Spark DataFrame and inspect it
sdf = spark.createDataFrame(df)
sdf.printSchema()
sdf.show()

25/03/16 21:26:35 WARN Utils: Your hostname, ChristoorossAir resolves to a loopback address: 127.0.0.1; using 192.168.1.18 instead (on interface en0)
25/03/16 21:26:35 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/16 21:26:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


root
 |-- Name: string (nullable = true)
 |-- Age: long (nullable = true)



                                                                                

+-----+---+
| Name|Age|
+-----+---+
| Nick| 26|
|Helen| 28|
| Mary| 30|
| John| 29|
+-----+---+



```



```

## Spark DataFrame Methods

In [5]:
# Print the schema in a tree format: one line per column showing its
# name, type and nullability
sdf.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: long (nullable = true)



In [6]:
# Project the DataFrame down to just the "Name" column and display it
name_only = sdf.select("Name")
name_only.show()

+-----+
| Name|
+-----+
| Nick|
|Helen|
| Mary|
| John|
+-----+



In [7]:
# Selecting columns: "Name" as-is, plus "Age" incremented by one
# (the computed column is shown under the expression text "(Age + 1)")
sdf.select(sdf.Name, sdf.Age + 1).show()

+-----+---------+
| Name|(Age + 1)|
+-----+---------+
| Nick|       27|
|Helen|       29|
| Mary|       31|
| John|       30|
+-----+---------+



In [8]:
# Filtering rows: keep only the people whose Age is at least 28
sdf.where(sdf["Age"] >= 28).show()

+-----+---+
| Name|Age|
+-----+---+
|Helen| 28|
| Mary| 30|
| John| 29|
+-----+---+



In [9]:
# Grouping: count how many rows share each distinct Age value
age_counts = sdf.groupBy("Age").count()
age_counts.show()

+---+-----+
|Age|count|
+---+-----+
| 26|    1|
| 29|    1|
| 28|    1|
| 30|    1|
+---+-----+

