In [23]:
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.functions import avg, sum, lag, rank
import time
import os
os.environ["PYSPARK_PYTHON"]="/usr/bin/python3.5"

# 初始化SparkSession
spark = SparkSession.builder \
    .master("spark://master:7077") \
    .appName("SAM") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

sc = spark.sparkContext
gravity_data = spark.read.text("hdfs://master:9000/dataset/GravityData/1/高台_X212MPET0028_2015_1_9_原始观测数据.tsf")
gravity_data.persist()

DataFrame[value: string]

In [18]:
gravity_data.take(20)

[Row(value='[TSF-file] v01.0'),
 Row(value=''),
 Row(value='[UNDETVAL] 999999.000'),
 Row(value=''),
 Row(value='[TIMEFORMAT] DATETIME'),
 Row(value=''),
 Row(value='[INCREMENT]   1'),
 Row(value=''),
 Row(value='[CHANNELS]'),
 Row(value='������ϫ�۲�ֵ(2121)'),
 Row(value='�����ⲿ��ѹ(2128)'),
 Row(value=''),
 Row(value='[UNITS]'),
 Row(value='10-8��m/s2'),
 Row(value='Pa'),
 Row(value=''),
 Row(value=''),
 Row(value='[DATA]'),
 Row(value='2015  1   9   0   0   0   13121.61    882.81      '),
 Row(value='2015  1   9   0   0   1   13124.63    882.7       ')]

### 对数据进行预处理,去除前面的无用行
包含的操作有RDD与DataFrame的互换  
设置DataFrame的列名,还可以使用...toDF("year", "month", ...)  
cast()/astype()都能改变DataFrame的数据类型

In [19]:
gravity_fields = gravity_data.rdd.map(lambda row: row.value.split())
gravity_fields.take(5)
gravity_fields_preprocess = gravity_fields.filter(lambda row: len(row) > 3)\
                            .map(lambda row: Row(year=row[0], month=row[1], day=row[2], hour=row[3], \
                                 minute=row[4], second=row[5], gravity=row[6], air_pressure=row[7]))\
                            .toDF()

gravity_fields_preprocess.show(5)

+------------+---+--------+----+------+-----+------+----+
|air_pressure|day| gravity|hour|minute|month|second|year|
+------------+---+--------+----+------+-----+------+----+
|      882.81|  9|13121.61|   0|     0|    1|     0|2015|
|       882.7|  9|13124.63|   0|     0|    1|     1|2015|
|      882.81|  9| 13117.5|   0|     0|    1|     2|2015|
|      882.81|  9|13129.17|   0|     0|    1|     3|2015|
|      882.59|  9|13133.98|   0|     0|    1|     4|2015|
+------------+---+--------+----+------+-----+------+----+
only showing top 5 rows



## 移动平均

In [20]:
wSpec = Window.partitionBy("hour")\
                .orderBy("second")\
                .rowsBetween(-1, 1)
gravity_fields_preprocess.withColumn("movingAvg", avg(gravity_fields_preprocess["gravity"]).over(wSpec)).show(5)

+------------+---+--------+----+------+-----+------+----+------------------+
|air_pressure|day| gravity|hour|minute|month|second|year|         movingAvg|
+------------+---+--------+----+------+-----+------+----+------------------+
|      883.14|  9| 13146.3|   7|     0|    1|     0|2015|         13156.035|
|      883.03|  9|13165.77|   7|     1|    1|     0|2015|13154.013333333334|
|      883.03|  9|13149.97|   7|     2|    1|     0|2015|13160.463333333333|
|      883.14|  9|13165.65|   7|     3|    1|     0|2015|13160.233333333332|
|      883.25|  9|13165.08|   7|     4|    1|     0|2015|13163.230000000001|
+------------+---+--------+----+------+-----+------+----+------------------+
only showing top 5 rows



### 窗口函数主要包括两部分:
* 指定窗口特征(wSpec)  
partitionBy定义数据如何分组  
orderBy定义分组中的排序  
rowsBetween定义窗口大小，(-1, 1)表示从前一行到后一行
* 指定窗口函数的操作  
可以使用ｐｙｓｐａｒｋ.sql.functions的＂聚合函数(Aggregate Function)＂和＂窗口函数(Window Function)＂类别下的函数

## 累计求和

In [24]:
wSpec = Window.partitionBy("hour")\
                .orderBy("second")\
                .rowsBetween(0, 0)
gravity_fields_preprocess.withColumn("calSum", sum(gravity_fields_preprocess["gravity"]).over(wSpec)).show(7)

+------------+---+--------+----+------+-----+------+----+--------+
|air_pressure|day| gravity|hour|minute|month|second|year|  calSum|
+------------+---+--------+----+------+-----+------+----+--------+
|      883.14|  9| 13146.3|   7|     0|    1|     0|2015| 13146.3|
|      883.03|  9|13165.77|   7|     1|    1|     0|2015|13165.77|
|      883.03|  9|13149.97|   7|     2|    1|     0|2015|13149.97|
|      883.14|  9|13165.65|   7|     3|    1|     0|2015|13165.65|
|      883.25|  9|13165.08|   7|     4|    1|     0|2015|13165.08|
|      883.14|  9|13158.96|   7|     5|    1|     0|2015|13158.96|
|      883.37|  9|13154.79|   7|     6|    1|     0|2015|13154.79|
+------------+---+--------+----+------+-----+------+----+--------+
only showing top 7 rows



In [25]:
spark.stop()