In [19]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as f
import numpy as np

In [2]:
! pyspark --version

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.1.2
      /_/
                        
Using Scala version 2.12.10, OpenJDK 64-Bit Server VM, 1.8.0_312
Branch HEAD
Compiled by user centos on 2021-05-24T04:27:48Z
Revision de351e30a90dd988b133b3d00fa6218bfcaba8b8
Url https://github.com/apache/spark
Type --help for more information.


In [3]:
data_path = '/media/daniel/Seagate Basic/spark_data/sensores/' # en esta carpeta deben encontrarse los 4 ficheros

In [4]:
spark = SparkSession.builder.master("local[*]")\
          .appName("practica 2 Ecosistema Spark parte 1")\
          .getOrCreate()

22/01/09 11:22:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# 1. Creación de dataframes proporcionando el schema

Los cuatro conjuntos de datos comparten la misma estructura, asique se usa el mismo esquema para todos.

In [5]:
schema = StructType([StructField('index', IntegerType(), True),
                     StructField('arrival_time', LongType(), True),
                     StructField('creation_time', LongType(), True),
                     StructField('x', FloatType(), True),
                     StructField('y', FloatType(), True),
                     StructField('z', FloatType(), True),
                     StructField('user', StringType(), True),
                     StructField('model', StringType(), True),
                     StructField('device', StringType(), True),
                     StructField('gt', StringType(), True)])

In [6]:
df_phones_acc   = spark.read.csv(data_path+'Phones_accelerometer.csv', schema=schema)
df_phones_gyros = spark.read.csv(data_path+'Phones_gyroscope.csv', schema=schema)
df_watch_acc    = spark.read.csv(data_path+'Watch_accelerometer.csv', schema=schema)
df_watch_gyros  = spark.read.csv(data_path+'Watch_gyroscope.csv', schema=schema)

# 2. Resumen de cada dataframe

Se calcula la media, desviación típica, máximo y mínimo de las variables x, y, z agrupadas por usuario, modelo y acción. Nótese que se calculan todos los estadísticos en una sola iteración.

In [59]:
def summarize_dataframe(spark_df, groupby=['user', 'model', 'gt'], aggregate=['x', 'y', 'z'], functions = [f.mean, f.stddev, f.min, f.max]):
    
    # descripcion
    #----------------
    
    # Agrega las columnas de un dataframe agrupando por las columnas seleccionadas. Puede haber multiples funciones de agregado.
    
    # argumentos:
    #----------------
    
    #<spark_df> | tipo: pyspark.sql.DataFrame
    #<groupby> | tipo: list | contenido: str
    #<aggregate> | tipo: list | contenido: str
    #<functions> | tipo: list | contenido: pyspark.sql.functions
    
    # resultado: 
    #----------------
    
    #<result> | tipo: pyspark.sql.DataFrame | descriptivo: dataframe de spark con los estadísticos calculados y las columnas de agregado
    
    
    exprs = [fun(f.col(c)) for fun in functions for c in aggregate]
    result = df_phones_acc.groupby(*groupby).agg(*exprs)
    
    for c in result.columns[len(groupby):]: # se almacenan solo dos decimales por sencillez en la lectura.
        
        result = result.withColumn(c, f.round(c, 2))
        
        
    return result

In [60]:
summary_phones_acc = summarize_dataframe(df_phones_acc)
summary_phones_gyros = summarize_dataframe(df_phones_gyros)
summary_watch_acc = summarize_dataframe(df_watch_acc)
summary_watch_gyros = summarize_dataframe(df_watch_gyros)

In [61]:
summary_watch_gyros.show(2)



+----+----------+----------+------+------+------+--------------+--------------+--------------+------+------+------+------+------+------+
|user|     model|        gt|avg(x)|avg(y)|avg(z)|stddev_samp(x)|stddev_samp(y)|stddev_samp(z)|min(x)|min(y)|min(z)|max(x)|max(y)|max(z)|
+----+----------+----------+------+------+------+--------------+--------------+--------------+------+------+------+------+------+------+
|   d|    s3mini|stairsdown| -4.46|  0.24|  8.47|          2.24|          1.94|          2.92|-15.55| -7.76|  0.42|  3.94|  8.83| 19.61|
|   h|samsungold|stairsdown| -5.64|  0.24|  8.06|          2.58|          2.02|          3.54|-18.39| -9.04| -0.15|  1.99| 10.42| 19.46|
+----+----------+----------+------+------+------+--------------+--------------+--------------+------+------+------+------+------+------+
only showing top 2 rows



                                                                                

# 3. Join de cada dataframe del mismo dispositivo

In [62]:
summary_phones = summary_phones_acc.join(summary_phones_gyros, on=['user', 'model', 'gt'], how='left')
summary_watch = summary_watch_acc.join(summary_watch_gyros, on=['user', 'model', 'gt'], how='left')

In [65]:
summary_phones.show(2)

                                                                                

+----+----------+----------+------+------+------+--------------+--------------+--------------+------+------+------+------+------+------+------+------+------+--------------+--------------+--------------+------+------+------+------+------+------+
|user|     model|        gt|avg(x)|avg(y)|avg(z)|stddev_samp(x)|stddev_samp(y)|stddev_samp(z)|min(x)|min(y)|min(z)|max(x)|max(y)|max(z)|avg(x)|avg(y)|avg(z)|stddev_samp(x)|stddev_samp(y)|stddev_samp(z)|min(x)|min(y)|min(z)|max(x)|max(y)|max(z)|
+----+----------+----------+------+------+------+--------------+--------------+--------------+------+------+------+------+------+------+------+------+------+--------------+--------------+--------------+------+------+------+------+------+------+
|   d|    s3mini|stairsdown| -4.46|  0.24|  8.47|          2.24|          1.94|          2.92|-15.55| -7.76|  0.42|  3.94|  8.83| 19.61| -4.46|  0.24|  8.47|          2.24|          1.94|          2.92|-15.55| -7.76|  0.42|  3.94|  8.83| 19.61|
|   h|samsungold|sta

In [66]:
summary_phones.count(), summary_watch.count() 

                                                                                

(252, 252)

# 4. Unión de los dataframe de ambos dispositivos

In [67]:
summary = summary_phones.union(summary_watch)

In [68]:
summary.show(2)



+----+----------+----------+------+------+------+--------------+--------------+--------------+------+------+------+------+------+------+------+------+------+--------------+--------------+--------------+------+------+------+------+------+------+
|user|     model|        gt|avg(x)|avg(y)|avg(z)|stddev_samp(x)|stddev_samp(y)|stddev_samp(z)|min(x)|min(y)|min(z)|max(x)|max(y)|max(z)|avg(x)|avg(y)|avg(z)|stddev_samp(x)|stddev_samp(y)|stddev_samp(z)|min(x)|min(y)|min(z)|max(x)|max(y)|max(z)|
+----+----------+----------+------+------+------+--------------+--------------+--------------+------+------+------+------+------+------+------+------+------+--------------+--------------+--------------+------+------+------+------+------+------+
|   d|    s3mini|stairsdown| -4.46|  0.24|  8.47|          2.24|          1.94|          2.92|-15.55| -7.76|  0.42|  3.94|  8.83| 19.61| -4.46|  0.24|  8.47|          2.24|          1.94|          2.92|-15.55| -7.76|  0.42|  3.94|  8.83| 19.61|
|   h|samsungold|sta

                                                                                

In [69]:
summary.count() # la dimension del dataframe final debe coincidir con las dimensiones de cada uno por separado

                                                                                

504