# Analyse the Data Quality from December 2019

**Note: The analysis were not continued because the focus was on the historical data of the CouchDB.**

To use the notebook, the following Python modules must be installed from the command line.


`pip install pandas`

`pip install tqdm`

In [1]:
import findspark
findspark.init() #necessary to find the local spark
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from pyspark.sql import Row, SparkSession
from pyspark import SparkContext
from pyspark.sql.types import IntegerType, StringType, DoubleType, BooleanType, TimestampType, StructField, StructType
from pyspark.sql.functions import col, sum

In [2]:
sc = SparkContext()
spark = SparkSession.builder\
        .getOrCreate()

**Note:** Before the files can be read, they must be transferred to the HDFS. (For procedure, see documentation / installation instructions)

## asr32.json - Hochregallager/Palettenlager

In [3]:
df_asr = spark.read.json('hdfs://141.56.180.140:9000/user/hadoop/IotTestbed_Dateien/ASR32.json')

In [4]:
# rename columns
df_asr = df_asr.withColumnRenamed("Mes.diONo","Mes_diONo")\
        .withColumnRenamed("Mes.diPNo","Mes_diPNo")\
        .withColumnRenamed("Mes.iOPos","Mes_iOPos")\
        .withColumnRenamed("Mes.iOpNo","Mes_iOpNo")\
        .withColumnRenamed("Mes.iResourceId","Mes_iResourceId")

In [5]:
# Creates a temporary view using the DataFrame
df_asr.createOrReplaceTempView("asr32")

In [6]:
# df.first()
df_asr.printSchema()

root
 |-- Mes_diONo: string (nullable = true)
 |-- Mes_diPNo: string (nullable = true)
 |-- Mes_iOPos: string (nullable = true)
 |-- Mes_iOpNo: string (nullable = true)
 |-- Mes_iResourceId: string (nullable = true)
 |-- broker_timestamp: long (nullable = true)
 |-- iCarrierID: string (nullable = true)
 |-- iCode: string (nullable = true)
 |-- iPar1: string (nullable = true)
 |-- iPar2: string (nullable = true)
 |-- iPar3: string (nullable = true)
 |-- iPar4: string (nullable = true)
 |-- modul: string (nullable = true)
 |-- sensor: string (nullable = true)
 |-- subpart: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- topic: string (nullable = true)
 |-- value: boolean (nullable = true)



In [7]:
# no. rows, no. columns
print((df_asr.count(), len(df_asr.columns)))

(7820, 18)


In [9]:
spark.sql("SELECT Mes_diONo, Mes_diPNo, Mes_iOPos, Mes_iOpNo, Mes_iResourceId, iCarrierID, iCode, \
            iPar1, iPar2, iPar3, iPar4, sensor, subpart, value FROM asr32").show(30, False)

+---------+---------+---------+---------+---------------+----------+-----+-----+-----+-----+-----+--------+-------+-----+
|Mes_diONo|Mes_diPNo|Mes_iOPos|Mes_iOpNo|Mes_iResourceId|iCarrierID|iCode|iPar1|iPar2|iPar3|iPar4|sensor  |subpart|value|
+---------+---------+---------+---------+---------------+----------+-----+-----+-----+-----+-----+--------+-------+-----+
|null     |null     |null     |null     |null           |null      |null |null |null |null |null |xG1_BG21|Band1  |true |
|null     |null     |null     |null     |null           |null      |null |null |null |null |null |xG1_BG36|Band2  |true |
|null     |null     |null     |null     |null           |null      |null |null |null |null |null |xK1_MB20|Band1  |true |
|null     |null     |null     |null     |null           |null      |null |null |null |null |null |xG1_BG20|Band1  |true |
|null     |null     |null     |null     |null           |null      |null |null |null |null |null |xG1_BG21|Band1  |false|
|null     |null     |nul

In [10]:
# frequency distribution for iCarrierID and value
df_asr.crosstab('iCarrierID', 'value').show(10)

+----------------+-----+----+----+
|iCarrierID_value|false|null|true|
+----------------+-----+----+----+
|            null| 3654|   0|3658|
|              12|    0|  26|   0|
|               8|    0|  28|   0|
|              23|    0|  35|   0|
|               4|    0|  26|   0|
|              15|    0|  29|   0|
|              11|    0|  28|   0|
|               9|    0|  22|   0|
|              22|    0|   3|   0|
|              13|    0|  33|   0|
+----------------+-----+----+----+
only showing top 10 rows



#### Number of Null-Value per Column

In [115]:
 for col in df_asr.columns:
        sql_string = "SELECT COUNT(*) - COUNT({}) AS NULL_VALUES, COUNT({}) FROM asr32".format(col, col)
        spark.sql(sql_string).show(20, False)

+-----------+----------------+
|NULL_VALUES|count(Mes_diONo)|
+-----------+----------------+
|7312       |508             |
+-----------+----------------+

+-----------+----------------+
|NULL_VALUES|count(Mes_diPNo)|
+-----------+----------------+
|7312       |508             |
+-----------+----------------+

+-----------+----------------+
|NULL_VALUES|count(Mes_iOPos)|
+-----------+----------------+
|7312       |508             |
+-----------+----------------+

+-----------+----------------+
|NULL_VALUES|count(Mes_iOpNo)|
+-----------+----------------+
|7312       |508             |
+-----------+----------------+

+-----------+----------------------+
|NULL_VALUES|count(Mes_iResourceId)|
+-----------+----------------------+
|7312       |508                   |
+-----------+----------------------+

+-----------+-----------------------+
|NULL_VALUES|count(broker_timestamp)|
+-----------+-----------------------+
|0          |7820                   |
+-----------+-----------------------+


#### sensor-values alternierend

In [16]:
df_asr.crosstab('sensor', 'value').show(100)

+------------+-----+----+----+
|sensor_value|false|null|true|
+------------+-----+----+----+
|    xLA_BG51|   16|   0|  17|
|    xG1_BG23|   40|   0|  40|
|     xQA1_A1|   29|   0|  29|
|    xK1_MB20|  254|   0| 254|
|    xG1_BG34|   21|   0|  20|
|    xLA_MB52|   17|   0|  16|
|    xG1_BG27|  254|   0| 254|
|    xG1_BG24|   27|   0|  27|
|    xLA_BG57|   33|   0|  33|
|    xLA_BG50|   17|   0|  16|
|    xLA_MB55|   62|   0|  62|
|    xU1_BG51|   58|   0|  58|
|     xQA2_A1|   50|   0|  50|
|    xLA_MB51|   58|   0|  58|
|    xLA_BG54|  110|   0| 110|
|           1|    0| 255|   0|
|    xLA_BG55|   31|   0|  31|
|    xG1_BG21|  255|   0| 255|
|    xG1_BG31|  253|   0| 254|
|    xG1_BG20|  254|   0| 255|
|    xLA_MB50|   58|   0|  58|
|    xK1_MB30|  253|   0| 253|
|    xG1_BG32|   40|   0|  40|
|    xLA_MB54|   62|   0|  62|
|           2|    0| 253|   0|
|    xLA_BG53|  109|   0| 109|
|    xG1_BG36|  254|   0| 256|
|    xLA_BG52|   62|   0|  62|
|   connected|    1|   0|   1|
|    xU1

## Heat.json - Ofen


In [32]:
df_Heat = spark.read.json('hdfs://141.56.180.140:9000/user/hadoop/IotTestbed_Dateien/Heat.json')
# rename columns
df_Heat = df_Heat.withColumnRenamed("Mes.diONo","Mes_diONo")\
        .withColumnRenamed("Mes.diPNo","Mes_diPNo")\
        .withColumnRenamed("Mes.iOPos","Mes_iOPos")\
        .withColumnRenamed("Mes.iOpNo","Mes_iOpNo")\
        .withColumnRenamed("Mes.iResourceId","Mes_iResourceId")
df_Heat.createOrReplaceTempView("Heat")
df_Heat.printSchema()

root
 |-- Mes_diONo: string (nullable = true)
 |-- Mes_diPNo: string (nullable = true)
 |-- Mes_iOPos: string (nullable = true)
 |-- Mes_iOpNo: string (nullable = true)
 |-- Mes_iResourceId: string (nullable = true)
 |-- broker_timestamp: long (nullable = true)
 |-- iCarrierID: string (nullable = true)
 |-- iCode: string (nullable = true)
 |-- iPar1: string (nullable = true)
 |-- iPar2: string (nullable = true)
 |-- iPar3: string (nullable = true)
 |-- iPar4: string (nullable = true)
 |-- modul: string (nullable = true)
 |-- sensor: string (nullable = true)
 |-- subpart: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- topic: string (nullable = true)
 |-- value: string (nullable = true)



In [24]:
spark.sql("SELECT Mes_diONo, Mes_diPNo, Mes_iOPos, Mes_iOpNo, Mes_iResourceId, iCarrierID, iCode, iPar1, iPar2, iPar3, iPar4, sensor, subpart, value FROM asr32").show(30, False)

+---------+---------+---------+---------+---------------+----------+-----+-----+-----+-----+-----+--------+-------+-----+
|Mes_diONo|Mes_diPNo|Mes_iOPos|Mes_iOpNo|Mes_iResourceId|iCarrierID|iCode|iPar1|iPar2|iPar3|iPar4|sensor  |subpart|value|
+---------+---------+---------+---------+---------------+----------+-----+-----+-----+-----+-----+--------+-------+-----+
|null     |null     |null     |null     |null           |null      |null |null |null |null |null |xG1_BG21|Band1  |true |
|null     |null     |null     |null     |null           |null      |null |null |null |null |null |xG1_BG36|Band2  |true |
|null     |null     |null     |null     |null           |null      |null |null |null |null |null |xK1_MB20|Band1  |true |
|null     |null     |null     |null     |null           |null      |null |null |null |null |null |xG1_BG20|Band1  |true |
|null     |null     |null     |null     |null           |null      |null |null |null |null |null |xG1_BG21|Band1  |false|
|null     |null     |nul

In [29]:
spark.sql("SELECT sensor, value FROM Heat WHERE value != 'false' AND value != 'true'").show(4, False)

+-------+---------+
|sensor |value    |
+-------+---------+
|rActVal|22.518808|
|rActVal|22.526043|
|rActVal|22.518808|
|rActVal|22.526043|
+-------+---------+
only showing top 4 rows



## iMag.json - Magazine Deckel oder Unterschale

In [18]:
df_imag = spark.read.json('hdfs://141.56.180.140:9000/user/hadoop/IotTestbed_Dateien/iMag.json')
# rename columns
df_imag = df_imag.withColumnRenamed("Mes.diONo","Mes_diONo")\
        .withColumnRenamed("Mes.diPNo","Mes_diPNo")\
        .withColumnRenamed("Mes.iOPos","Mes_iOPos")\
        .withColumnRenamed("Mes.iOpNo","Mes_iOpNo")\
        .withColumnRenamed("Mes.iResourceId","Mes_iResourceId")                     
df_imag.createOrReplaceTempView("imag")
df_imag.printSchema()

root
 |-- Mes_diONo: string (nullable = true)
 |-- Mes_diPNo: string (nullable = true)
 |-- Mes_iOPos: string (nullable = true)
 |-- Mes_iOpNo: string (nullable = true)
 |-- Mes_iResourceId: string (nullable = true)
 |-- broker_timestamp: long (nullable = true)
 |-- iCarrierID: string (nullable = true)
 |-- iCode: string (nullable = true)
 |-- iPar1: string (nullable = true)
 |-- iPar2: string (nullable = true)
 |-- iPar3: string (nullable = true)
 |-- iPar4: string (nullable = true)
 |-- modul: string (nullable = true)
 |-- sensor: string (nullable = true)
 |-- subpart: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- topic: string (nullable = true)
 |-- value: boolean (nullable = true)



In [19]:
spark.sql("SELECT Mes_diONo, Mes_diPNo, Mes_iOPos, Mes_iOpNo, Mes_iResourceId, iCarrierID, iCode, iPar1, iPar2, iPar3, iPar4, sensor, subpart, value FROM asr32").show(10, False)

+---------+---------+---------+---------+---------------+----------+-----+-----+-----+-----+-----+--------+-------+-----+
|Mes_diONo|Mes_diPNo|Mes_iOPos|Mes_iOpNo|Mes_iResourceId|iCarrierID|iCode|iPar1|iPar2|iPar3|iPar4|sensor  |subpart|value|
+---------+---------+---------+---------+---------------+----------+-----+-----+-----+-----+-----+--------+-------+-----+
|null     |null     |null     |null     |null           |null      |null |null |null |null |null |xG1_BG21|Band1  |true |
|null     |null     |null     |null     |null           |null      |null |null |null |null |null |xG1_BG36|Band2  |true |
|null     |null     |null     |null     |null           |null      |null |null |null |null |null |xK1_MB20|Band1  |true |
|null     |null     |null     |null     |null           |null      |null |null |null |null |null |xG1_BG20|Band1  |true |
|null     |null     |null     |null     |null           |null      |null |null |null |null |null |xG1_BG21|Band1  |false|
|null     |null     |nul

#### Number of Null-Value per Column

In [21]:
for col in df_asr.columns:
    sql_string = "SELECT COUNT(*) - COUNT({}) AS NULL_VALUES, COUNT({}) FROM asr32".format(col, col)
    spark.sql(sql_string).show(20, False)

+-----------+----------------+
|NULL_VALUES|count(Mes_diONo)|
+-----------+----------------+
|7312       |508             |
+-----------+----------------+

+-----------+----------------+
|NULL_VALUES|count(Mes_diPNo)|
+-----------+----------------+
|7312       |508             |
+-----------+----------------+

+-----------+----------------+
|NULL_VALUES|count(Mes_iOPos)|
+-----------+----------------+
|7312       |508             |
+-----------+----------------+

+-----------+----------------+
|NULL_VALUES|count(Mes_iOpNo)|
+-----------+----------------+
|7312       |508             |
+-----------+----------------+

+-----------+----------------------+
|NULL_VALUES|count(Mes_iResourceId)|
+-----------+----------------------+
|7312       |508                   |
+-----------+----------------------+

+-----------+-----------------------+
|NULL_VALUES|count(broker_timestamp)|
+-----------+-----------------------+
|0          |7820                   |
+-----------+-----------------------+


## Mpress.json - Industrie 4.0 Presse
has not generated any data

In [None]:
df_mpress = spark.read.json('hdfs://141.56.180.140:9000/user/hadoop/IotTestbed_Dateien/Mpress.json')
df_mpress.createOrReplaceTempView("mpress")

In [None]:
df_mpress.printSchema()

In [None]:
spark.sql('SELECT * FROM mpress').show(10, False)

## RobotAssembly.json - Robotmontagezelle

In [None]:
df_robotassembly = spark.read.json('hdfs://141.56.180.140:9000/user/hadoop/IotTestbed_Dateien/RobotAssembly.json')
df_robotassembly.createOrReplaceTempView("robotassembly")
df_robotassembly.printSchema()

In [None]:
spark.sql('SELECT sensor, value FROM robotassembly').show(2, False)

## ShuntPickByLight.json - Handarbeitsplatz

In [None]:
df_ShuntPickByLight = spark.read.json('hdfs://141.56.180.140:9000/user/hadoop/IotTestbed_Dateien/ShuntPickByLight.json')
df_ShuntPickByLight.createOrReplaceTempView("ShuntPickByLight")
df_ShuntPickByLight.printSchema()

In [None]:
spark.sql('SELECT sensor, value FROM ShuntPickByLight').show(2, False)

In [None]:
spark.sql('SELECT sensor, value, count(value) FROM ShuntPickByLight GROUP BY sensor, value').show(2, False)

## Branch2.json

In [None]:
df_Branch2 = spark.read.json('hdfs://141.56.180.140:9000/user/hadoop/IotTestbed_Dateien/Branch2.json')
df_Branch2.createOrReplaceTempView("Branch2")
df_Branch2.printSchema()

In [None]:
spark.sql('SELECT sensor, value FROM Branch2').show(2, False)

## Camera.json - optische Qualitätkontrolle

In [None]:
df_camera = spark.read.json('hdfs://141.56.180.140:9000/user/hadoop/IotTestbed_Dateien/Camera.json')
df_camera.createOrReplaceTempView("camera")
df_camera.printSchema()

In [None]:
spark.sql('SELECT sensor, value FROM camera').show(2, False)

## Labeling.json - Etikettiermaschine
Daten generieren im Moment nicht möglich

In [None]:
df_Labeling = spark.read.json('hdfs://141.56.180.140:9000/user/hadoop/IotTestbed_Dateien/Labeling.json')
df_Labeling.createOrReplaceTempView("Labeling")
df_Labeling.printSchema()

In [None]:
spark.sql('SELECT sensor, value FROM Labeling').show(2, False)

## PressPneu.json - Presse vor Robotor Zelle

In [None]:
df_PressPneu = spark.read.json('hdfs://141.56.180.140:9000/user/hadoop/IotTestbed_Dateien/PressPneu.json')
df_PressPneu.createOrReplaceTempView("PressPneu")
df_PressPneu.printSchema()

In [None]:
spark.sql('SELECT sensor, value FROM PressPneu').show(2, False)

## RobotDocking.json - Weiche zu Robotor

In [None]:
df_RobotDocking = spark.read.json('hdfs://141.56.180.140:9000/user/hadoop/IotTestbed_Dateien/RobotDocking.json')
df_RobotDocking.createOrReplaceTempView("RobotDocking")
df_RobotDocking.printSchema()

In [None]:
spark.sql('SELECT sensor, value FROM RobotDocking').show(2, False)

## ShuntMagazineBack.json - Magazine Deckel oder Unterschale

In [None]:
df_ShuntMagazineBack = spark.read.json('hdfs://141.56.180.140:9000/user/hadoop/IotTestbed_Dateien/ShuntMagazineBack.json')
df_ShuntMagazineBack.createOrReplaceTempView("ShuntMagazineBack")
df_ShuntMagazineBack.printSchema()

In [None]:
spark.sql('SELECT sensor, value FROM ShuntMagazineBack').show(2, False)

 
## Turn.json - Wenden - Greifarm


In [None]:
df_Turn = spark.read.json('hdfs://141.56.180.140:9000/user/hadoop/IotTestbed_Dateien/Turn.json')
df_Turn.createOrReplaceTempView("Turn")
df_Turn.printSchema()

In [None]:
spark.sql('SELECT sensor, value FROM Turn').show(2, False)

## Counter.json - Anzahl an Carrier die passieren


In [None]:
df_counter = spark.read.json('hdfs://141.56.180.140:9000/user/hadoop/IotTestbed_Dateien/Counter.json')
df_counter.createOrReplaceTempView("counter")
df_counter.printSchema()

In [None]:
spark.sql('SELECT * FROM counter').show(10, False)

## RobotMill.json - CNC Maschine - nicht aktiv

In [None]:
df_robotmill = spark.read.json('hdfs://141.56.180.140:9000/user/hadoop/IotTestbed_Dateien/RobotMill.json')
df_robotmill.createOrReplaceTempView("robotmill")
df_robotmill.printSchema()

In [None]:
spark.sql('SELECT sensor, value, count(value) FROM robotmill GROUP BY sensor, value').show(2, False)

## BypassRD.json -  Materialbelieferung/entnahme - nicht aktiv

In [None]:
df_bypassrd = spark.read.json('hdfs://141.56.180.140:9000/user/hadoop/IotTestbed_Dateien/BypassRD.json')
df_bypassrd.createOrReplaceTempView("bypassrd")

In [None]:
df_bypassrd.printSchema()

In [None]:
spark.sql('SELECT sensor, value FROM bypassrd').show(10, False)