In [1]:
import os

from pyspark.sql import SparkSession
import pyspark.sql.functions as s_f

from pathlib import Path

In [2]:
spark = SparkSession.builder.appName('sparkDataManagement').getOrCreate()

In [8]:
DATA_DIR = Path("../data/faults")


In [9]:
faults_df = spark.read.parquet(os.path.join(DATA_DIR, 'cleaned.parquet'))

In [10]:
faults_df = faults_df.withColumn('Other_Faults', faults_df['Other_Faults'].cast('int'))

In [11]:
faults_df.select('named_faults', 'Other_Faults').distinct().show()

+------------+------------+
|named_faults|Other_Faults|
+------------+------------+
|         1.0|           0|
|        null|        null|
|         0.0|           1|
+------------+------------+



In [12]:
## Quantiles
faults_df.select("x_min").dropna().approxQuantile("x_min", [0.5], 0)
faults_df.select("x_min").dropna().approxQuantile("x_min", [0.25, 0.5, 0.75], 0)
faults_df.select("x_min").describe().show()

+-------+-----------------+
|summary|            x_min|
+-------+-----------------+
|  count|             1941|
|   mean|571.1360123647604|
| stddev| 520.690671421655|
|    min|                0|
|    max|             1705|
+-------+-----------------+



In [13]:
#### Bivariate Summaries
faults_df.corr("x_min", "X_Maximum") # Pearson correlation coef

faults_df.corr("x_min","Y_Minimum")
faults_df.corr("x_min","Other_Faults")


0.1648036244443092

In [14]:
faults_df.cov("named_faults", "Other_Faults") # Covariance

-0.22394371059662085

In [15]:
## Cross tabs
faults_df.crosstab("named_faults", "Other_Faults").show()

+-------------------------+----+---+----+
|named_faults_Other_Faults|   0|  1|null|
+-------------------------+----+---+----+
|                      1.0|1253|  0|   0|
|                     null|   0|  0|  15|
|                      0.0|   0|673|   0|
+-------------------------+----+---+----+



In [16]:
## Groupby Summaries
faults_df.groupBy("named_faults").count().show()

faults_df.groupBy('named_faults').avg('x_min').show()
faults_df.groupBy('Pastry').avg('x_min').show()

faults_df.groupBy('named_faults', 'Pastry').avg('x_min').show()

+------------+-----+
|named_faults|count|
+------------+-----+
|         0.0|  673|
|        null|   15|
|         1.0| 1253|
+------------+-----+

+------------+-----------------+
|named_faults|       avg(x_min)|
+------------+-----------------+
|         0.0|688.8930163447251|
|        null|            513.0|
|         1.0|508.5833998403831|
+------------+-----------------+

+------+-----------------+
|Pastry|       avg(x_min)|
+------+-----------------+
|   0.0|552.5633882888004|
|  null|378.7083333333333|
|   1.0| 807.132911392405|
+------+-----------------+

+------------+------+------------------+
|named_faults|Pastry|        avg(x_min)|
+------------+------+------------------+
|        null|   0.0| 579.1538461538462|
|         0.0|  null|             341.6|
|        null|  null|              41.0|
|         1.0|   1.0| 811.4777070063694|
|        null|   1.0|             125.0|
|         1.0|  null|407.77777777777777|
|         1.0|   0.0| 466.1530612244898|
|         0.0|   0.0

In [17]:
## Pivot results

faults_df\
    .dropna(subset='Pastry')\
    .groupBy('named_faults')\
    .pivot('Pastry')\
    .avg('x_min')\
    .show()

+------------+-----------------+-----------------+
|named_faults|              0.0|              1.0|
+------------+-----------------+-----------------+
|         0.0|691.4925149700599|             null|
|        null|579.1538461538462|            125.0|
|         1.0|466.1530612244898|811.4777070063694|
+------------+-----------------+-----------------+



In [18]:
## Split Data Into Random Subsets
faults_df_train, faults_df_test = \
    faults_df.randomSplit([0.6, 0.4], 20)

In [19]:
## Get 10% sample
faults_df.sample(0.2).select("x_min", "Pastry", "named_faults").show()

+-----+------+------------+
|x_min|Pastry|named_faults|
+-----+------+------------+
|  645|   1.0|         1.0|
|  853|   1.0|         1.0|
|  106|   1.0|         1.0|
|  581|   1.0|         1.0|
|  451|   1.0|         1.0|
|   49|   1.0|         1.0|
|   91|   1.0|         1.0|
|  161|   1.0|         1.0|
| 1625|   1.0|         1.0|
| 1536|   1.0|         1.0|
|  765|   1.0|         1.0|
| 1268|   1.0|         1.0|
| 1287|   1.0|         1.0|
| 1088|   1.0|         1.0|
|  626|   1.0|         1.0|
|   34|   1.0|         1.0|
| 1321|   1.0|         1.0|
|   57|   1.0|         1.0|
|  320|   1.0|         1.0|
| 1343|   1.0|         1.0|
+-----+------+------------+
only showing top 20 rows



In [15]:
### Exporting to other formats
## Convert to Pandas DataFrame
## This brings full data into memory on driver, only do
## with smaller datasets (like results you want to plot...)
faults_pd = faults_df.toPandas()


#### Persistent Objects
## Save to parquet file
output_parquet_file = os.path.join(DATA_DIR, "faults_data_management.parquet")
faults_df.write.parquet(output_parquet_file, mode='overwrite')