<h1>SQL Server 2019 Big Data Cluster</h1>


## Try to connect to the SQL Server master instance

In [1]:
select @@version, @@servername

(No column name),(No column name).1
Microsoft SQL Server 2019 (RTM-CU10) (KB5001090) - 15.0.4123.1 (X64) Mar 22 2021 18:10:24 Copyright (C) 2019 Microsoft Corporation 	Developer Edition (64-bit) on Linux (Ubuntu 20.04.2 LTS) <X64>,master-0


## Create a demo database

In [2]:
CREATE DATABASE DemoDB;
GO

## and an external datasource

In [3]:
USE DemoDB;
GO
CREATE EXTERNAL DATA SOURCE [SqlStoragePool]
    WITH (LOCATION = N'sqlhdfs://controller-svc:8080/default');

## Viewing the file : 6 header rows to exclude
## So, creating a file format accordingly

In [4]:
USE DemoDB;
GO
CREATE EXTERNAL FILE FORMAT [FileFormat_WxLog]
    WITH (FORMAT_TYPE = DELIMITEDTEXT, 
          FORMAT_OPTIONS (FIELD_TERMINATOR = N';', 
                          STRING_DELIMITER = N'\"', 
                          FIRST_ROW = 7));

## and then create the external table with a structure reflecting CSV structure file

In [5]:
USE DemoDB;
GO
CREATE EXTERNAL TABLE [dbo].[WxLog]
(
    [Date] [varchar](50) NOT NULL,
	[Time] [varchar](50) NOT NULL,
	[Baro] [varchar](50) NULL,
	[QNH] [varchar](50) NULL,
	[Gust Speed] [varchar](50) NULL,
	[Gust Dir] [varchar](50) NULL,
	[Avg Speed] [varchar](50) NULL,
	[Avg Dir] [varchar](50) NULL,
	[Rain Rate] [varchar](50) NULL,
	[Rain] [varchar](50) NULL,
	[UV] [varchar](50) NULL,
	[Temp 0] [varchar](50) NULL,
	[DewPt 0] [varchar](50) NULL,
	[RH 0] [varchar](50) NULL,
	[Temp 1] [varchar](50) NULL,
	[DewPt 1] [varchar](50) NULL,
	[RH 1] [varchar](50) NULL,
	[Temp 2] [varchar](50) NULL,
	[DewPt 2] [varchar](50) NULL,
	[RH 2] [varchar](50) NULL
)
WITH (LOCATION = N'/meteo/WxLog.csv', 
      DATA_SOURCE = [SqlStoragePool], 
	  FILE_FORMAT = [FileFormat_WxLog]);

## Now we can run some testing

In [6]:
USE DemoDB;
GO
SELECT TOP 10 * FROM [dbo].[WxLog]

Date,Time,Baro,QNH,Gust Speed,Gust Dir,Avg Speed,Avg Dir,Rain Rate,Rain,UV,Temp 0,DewPt 0,RH 0,Temp 1,DewPt 1,RH 1,Temp 2,DewPt 2,RH 2
11/05/2013,13:25,1024.0,1024.0,16.92,270,13.32,284,0.0,366.5,,21.4,8.66,44,16.9,-1.72,28,19.1,19.1,100.0
11/05/2013,13:26,1024.0,1024.0,16.92,248,14.11,266,0.0,366.5,,21.4,8.66,44,16.9,-1.72,28,19.2,19.2,100.0
11/05/2013,13:27,1024.0,1024.0,16.2,248,13.29,262,0.0,366.5,,21.4,8.66,44,16.85,-1.76,28,19.2,19.2,100.0
11/05/2013,13:28,1024.0,1024.0,16.92,293,12.5,293,0.0,366.5,,21.4,8.66,44,17.0,-1.63,28,19.3,19.3,100.0
11/05/2013,13:29,1024.0,1024.0,9.36,248,12.08,286,0.0,366.5,,21.4,8.66,44,17.2,-1.46,28,19.3,19.3,100.0
11/05/2013,13:30,1024.0,1024.0,14.04,293,8.71,268,0.0,366.5,,21.4,8.66,44,17.3,-1.37,28,,,
11/05/2013,13:31,1024.0,1024.0,9.72,293,9.0,293,0.0,366.5,,21.4,8.66,44,17.4,-1.29,28,19.5,19.5,100.0
11/05/2013,13:32,1024.0,1024.0,12.6,293,9.79,288,0.0,366.5,,21.4,8.66,44,17.6,-0.63,29,19.6,19.6,100.0
11/05/2013,13:33,1024.0,1024.0,12.24,293,9.43,266,0.0,366.5,,21.4,8.66,44,17.5,-1.2,28,19.6,19.6,100.0
11/05/2013,13:34,1024.0,1024.0,12.6,270,9.62,266,0.0,366.5,,21.4,8.66,44,17.85,-0.42,29,19.5,19.5,100.0


## Playing with multiple files is easy too

In [10]:
USE DemoDB;
GO
CREATE EXTERNAL FILE FORMAT [FileFormat_IoT]
    WITH (FORMAT_TYPE = DELIMITEDTEXT, 
          FORMAT_OPTIONS (FIELD_TERMINATOR = N',', 
                          STRING_DELIMITER = N'\"', 
                          FIRST_ROW = 2));

In [11]:
USE DemoDB;
GO
CREATE EXTERNAL TABLE [dbo].[IoTsensors]
(
    [DateTime] [varchar](50) NULL,
    [Humidity] [varchar](50) NULL,
    [Temperature] [varchar](50) NULL,
    [Temperature_range (low)] [varchar](50) NULL,
    [Temperature_range (high)] [varchar](50) NULL
)
WITH (LOCATION = N'/csvfiles', 
      DATA_SOURCE = [SqlStoragePool], 
	  FILE_FORMAT = [FileFormat_IoT]);

In [13]:
USE DemoDB;
GO
SELECT count(*) FROM [dbo].[IoTsensors]
SELECT TOP 10 * FROM [dbo].[IoTsensors]

(No column name)
2190


DateTime,Humidity,Temperature,Temperature_range (low),Temperature_range (high)
2018-05-15 00:00:00,88,11.83,10.5,13.6
2018-05-16 00:00:00,83,13.47,11.7,16.6
2018-05-17 00:00:00,84,14.69,12.9,18.1
2018-05-18 00:00:00,82,15.91,11.1,20.8
2018-05-19 00:00:00,76,17.69,13.9,21.6
2018-05-20 00:00:00,67,19.07,12.4,24.5
2018-05-21 00:00:00,65,19.26,14.3,23.7
2018-05-22 00:00:00,69,19.31,16.0,22.7
2018-05-23 00:00:00,63,20.69,14.4,25.5
2018-05-24 00:00:00,67,21.14,16.5,26.6


# We can switch to Spark engine to query the HDFS storage
## with the PySpark kernel



In [3]:
allfiles = spark.read \
    .option("inferSchema", "true") \
    .csv('/csvfiles/*.csv') \
    .toDF("DateTime","Humidity","Temperature","Temperature_range (low)","Temperature_range (high)")
allfiles.count()

The code failed because of a fatal error:
	Error sending http request and maximum retry encountered..

Some things to try:
a) Make sure Spark has enough available resources for Jupyter to create a Spark context.
b) Contact your Jupyter administrator to make sure the Spark magics library is configured correctly.
c) Restart the kernel.


In [11]:
allfiles.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+--------+-----------+-----------------------+------------------------+
|           DateTime|Humidity|Temperature|Temperature_range (low)|Temperature_range (high)|
+-------------------+--------+-----------+-----------------------+------------------------+
|2018-05-14 00:00:00|    55.0|      19.35|                   19.2|                    19.5|
|2018-05-15 00:00:00|    59.0|      19.22|                   19.1|                    19.4|
|2018-05-16 00:00:00|    61.0|      19.38|                   19.0|                    20.0|
|2018-05-17 00:00:00|    64.0|      19.77|                   19.2|                    20.5|
|2018-05-18 00:00:00|    65.0|      20.26|                   19.4|                    21.1|
+-------------------+--------+-----------+-----------------------+------------------------+
only showing top 5 rows

In [9]:
allfiles.filter("Humidity < 70").show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+--------+-----------+-----------------------+------------------------+
|           DateTime|Humidity|Temperature|Temperature_range (low)|Temperature_range (high)|
+-------------------+--------+-----------+-----------------------+------------------------+
|2018-05-14 00:00:00|    55.0|      19.35|                   19.2|                    19.5|
|2018-05-15 00:00:00|    59.0|      19.22|                   19.1|                    19.4|
|2018-05-16 00:00:00|    61.0|      19.38|                   19.0|                    20.0|
|2018-05-17 00:00:00|    64.0|      19.77|                   19.2|                    20.5|
|2018-05-18 00:00:00|    65.0|      20.26|                   19.4|                    21.1|
|2018-05-19 00:00:00|    66.0|      20.86|                   20.2|                    21.5|
|2018-05-20 00:00:00|    64.0|       21.3|                   20.0|                    22.2|
|2018-05-21 00:00:00|    62.0|      21.82|                   20.7|              

In [10]:
allfiles.filter("Humidity < 70").filter("Temperature < 10").show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+--------+-----------+-----------------------+------------------------+
|           DateTime|Humidity|Temperature|Temperature_range (low)|Temperature_range (high)|
+-------------------+--------+-----------+-----------------------+------------------------+
|2018-10-30 00:00:00|    68.0|       9.16|                    4.5|                    23.6|
|2018-11-19 00:00:00|    66.0|       7.92|                    4.1|                    11.5|
|2019-01-03 00:00:00|    68.0|       1.67|                   -2.2|                     8.0|
|2019-01-04 00:00:00|    67.0|      -0.09|                   -3.6|                     3.0|
|2019-01-05 00:00:00|    69.0|       0.98|                   -2.2|                     3.9|
|2019-01-10 00:00:00|    68.0|       3.13|                    0.6|                    11.2|
|2019-01-11 00:00:00|    64.0|       2.89|                   -0.1|                    11.9|
|2019-01-24 00:00:00|    68.0|        2.3|                    0.9|              

## We can switch a TSQL like syntax to query the dataframe

In [12]:
allfiles.select("Temperature","Humidity").show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------+--------+
|Temperature|Humidity|
+-----------+--------+
|      19.35|    55.0|
|      19.22|    59.0|
|      19.38|    61.0|
|      19.77|    64.0|
|      20.26|    65.0|
+-----------+--------+
only showing top 5 rows

In [7]:
allfiles.select("Temperature","Humidity").summary().show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+------------------+------------------+
|summary|       Temperature|          Humidity|
+-------+------------------+------------------+
|  count|              2196|              2196|
|   mean| 20.37647325436153| 56.98155216931124|
| stddev|5.7351560117159535|12.564311175829408|
|    min|             -0.09|              33.0|
|    25%|             20.25|              47.0|
|    50%|             22.08|              55.0|
|    75%|             23.47|              65.0|
|    max|             30.28|              94.0|
+-------+------------------+------------------+

## And even some T-SQL select statements

In [13]:
allfiles.createOrReplaceTempView("poolhouse")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [15]:
spark.sql("SELECT * from poolhouse LIMIT 10").show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+--------+-----------+-----------------------+------------------------+
|           DateTime|Humidity|Temperature|Temperature_range (low)|Temperature_range (high)|
+-------------------+--------+-----------+-----------------------+------------------------+
|2018-05-14 00:00:00|    80.0|      10.06|                    8.8|                    11.2|
|2018-05-15 00:00:00|    88.0|      11.83|                   10.5|                    13.6|
|2018-05-16 00:00:00|    83.0|      13.47|                   11.7|                    16.6|
|2018-05-17 00:00:00|    84.0|      14.69|                   12.9|                    18.1|
|2018-05-18 00:00:00|    82.0|      15.91|                   11.1|                    20.8|
|2018-05-19 00:00:00|    76.0|      17.69|                   13.9|                    21.6|
|2018-05-20 00:00:00|    67.0|      19.07|                   12.4|                    24.5|
|2018-05-21 00:00:00|    65.0|      19.26|                   14.3|              

In [16]:
spark.sql("SELECT MIN(Temperature),MAX(Temperature),AVG(Temperature) from poolhouse").show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+----------------+-----------------+
|min(Temperature)|max(Temperature)| avg(Temperature)|
+----------------+----------------+-----------------+
|           -0.09|           30.28|20.37647325436153|
+----------------+----------------+-----------------+

In [17]:
spark.sql("SELECT DateTime,Temperature,LEAD(Temperature) OVER (order by DateTime) as NextValue,avg(Temperature) OVER () as avgTemp from poolhouse").show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+-----------+---------+-----------------+
|           DateTime|Temperature|NextValue|          avgTemp|
+-------------------+-----------+---------+-----------------+
|2018-05-14 00:00:00|      21.32|    21.78|20.37647325436154|
|2018-05-14 00:00:00|      21.78|    10.06|20.37647325436154|
|2018-05-14 00:00:00|      10.06|    10.06|20.37647325436154|
|2018-05-14 00:00:00|      10.06|    19.35|20.37647325436154|
|2018-05-14 00:00:00|      19.35|    19.75|20.37647325436154|
|2018-05-14 00:00:00|      19.75|    21.27|20.37647325436154|
|2018-05-15 00:00:00|      21.27|    21.61|20.37647325436154|
|2018-05-15 00:00:00|      21.61|    11.83|20.37647325436154|
|2018-05-15 00:00:00|      11.83|    11.83|20.37647325436154|
|2018-05-15 00:00:00|      11.83|    19.22|20.37647325436154|
|2018-05-15 00:00:00|      19.22|    19.67|20.37647325436154|
|2018-05-15 00:00:00|      19.67|    21.15|20.37647325436154|
|2018-05-16 00:00:00|      21.15|    21.73|20.37647325436154|
|2018-05

## It is also possible to use the JOIN operator between dataframes

In [19]:
bathroom = spark.read \
    .option("inferSchema", "true") \
    .csv('/csvfiles/temperature-last-year_salledebain.csv') \
    .toDF("DateTime","Humidity","Temperature","Temperature_range (low)","Temperature_range (high)")

livingroom = spark.read \
    .option("inferSchema", "true") \
    .csv('/csvfiles/temperature-last-year_sejour.csv') \
    .toDF("DateTime","Humidity","Temperature","Temperature_range (low)","Temperature_range (high)")    

bathroom.select("DateTime","temperature","Humidity").join(livingroom.select("DateTime","temperature","Humidity"),"DateTime").show(10)  


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+-----------+--------+-----------+--------+
|           DateTime|temperature|Humidity|temperature|Humidity|
+-------------------+-----------+--------+-----------+--------+
|2018-05-14 00:00:00|      21.32|    57.0|      21.78|    48.0|
|2018-05-15 00:00:00|      21.27|    58.0|      21.61|    50.0|
|2018-05-16 00:00:00|      21.15|    61.0|      21.73|    52.0|
|2018-05-17 00:00:00|      21.14|    64.0|      22.14|    55.0|
|2018-05-18 00:00:00|      21.63|    67.0|      22.53|    56.0|
|2018-05-19 00:00:00|      21.83|    68.0|      22.39|    59.0|
|2018-05-20 00:00:00|      21.78|    64.0|      22.84|    57.0|
|2018-05-21 00:00:00|       22.1|    63.0|      22.94|    57.0|
|2018-05-22 00:00:00|      22.55|    68.0|      23.25|    58.0|
|2018-05-23 00:00:00|      22.74|    68.0|      23.58|    57.0|
+-------------------+-----------+--------+-----------+--------+
only showing top 10 rows