In [310]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import avg, max, min, col, round, sum, row_number, split, stddev
from pyspark.sql.types import IntegerType, DoubleType, StructType, StructField
from pyspark.sql.window import Window

In [311]:
# Obtain the Spark session used by every cell below; getOrCreate() reuses an
# already-running session instead of starting a second one.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting --
# confirm whether it is actually needed.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [312]:
# Load the file as one string column and expose that column under the name
# 'val' so the parsed numeric column can later be called 'value'.
df = (
    spark.read
    .text('sunspots.txt')
    .select(col('value').alias('val'))
)

In [313]:
# Split every raw line on the tab character once, then keep the first field
# as an integer 'index' and the second field as a double 'value'.
fields = split(col('val'), '\t')
df = df.select(
    fields[0].cast(IntegerType()).alias('index'),
    fields[1].cast(DoubleType()).alias('value'),
)
df.show()

+-----+-----+
|index|value|
+-----+-----+
|    0| 58.0|
|    1| 62.6|
|    2| 70.0|
|    3| 55.7|
|    4| 85.0|
|    5| 83.5|
|    6| 94.8|
|    7| 66.3|
|    8| 75.9|
|    9| 75.5|
|   10|158.6|
|   11| 85.2|
|   12| 73.3|
|   13| 75.9|
|   14| 89.2|
|   15| 88.3|
|   16| 90.0|
|   17|100.0|
|   18| 85.4|
|   19|103.0|
+-----+-----+
only showing top 20 rows



In [314]:
# Centered moving average: for each row, average 'value' over all rows whose
# 'index' lies within 2 of the current row's index (fewer rows at the edges).
# NOTE: the window has no partitionBy, so Spark evaluates it on a single
# partition -- acceptable for a small series, a bottleneck for a large one.
w = Window.orderBy('index').rangeBetween(-2, 2)

# Call the aggregate through the `F` namespace: this cell binds the name `avg`
# to a DataFrame, which shadows the imported `avg` function, so calling bare
# `avg('value')` would fail the second time the cell is executed.
# The DataFrame keeps the name `avg` because the next cell reads it.
avg = df.withColumn('avg', F.avg('value').over(w))
avg.show()

+-----+-----+-----------------+
|index|value|              avg|
+-----+-----+-----------------+
|    0| 58.0|63.53333333333333|
|    1| 62.6|           61.575|
|    2| 70.0|            66.26|
|    3| 55.7|            71.36|
|    4| 85.0|             77.8|
|    5| 83.5|            77.06|
|    6| 94.8|             81.1|
|    7| 66.3|             79.2|
|    8| 75.9|            94.22|
|    9| 75.5|92.29999999999998|
|   10|158.6|             93.7|
|   11| 85.2|             93.7|
|   12| 73.3|            96.44|
|   13| 75.9|82.38000000000001|
|   14| 89.2|            83.34|
|   15| 88.3|            88.68|
|   16| 90.0|            90.58|
|   17|100.0|            93.34|
|   18| 85.4|93.91999999999999|
|   19|103.0|89.05999999999999|
+-----+-----+-----------------+
only showing top 20 rows



In [315]:
# Residual = observed value minus its moving average, rounded to 2 decimals.
# The column is appended after 'index', 'value' and 'avg'.
deviation = col('value') - col('avg')
residual = avg.withColumn('residual', round(deviation, 2))
residual.show()

+-----+-----+-----------------+--------+
|index|value|              avg|residual|
+-----+-----+-----------------+--------+
|    0| 58.0|63.53333333333333|   -5.53|
|    1| 62.6|           61.575|    1.02|
|    2| 70.0|            66.26|    3.74|
|    3| 55.7|            71.36|  -15.66|
|    4| 85.0|             77.8|     7.2|
|    5| 83.5|            77.06|    6.44|
|    6| 94.8|             81.1|    13.7|
|    7| 66.3|             79.2|   -12.9|
|    8| 75.9|            94.22|  -18.32|
|    9| 75.5|92.29999999999998|   -16.8|
|   10|158.6|             93.7|    64.9|
|   11| 85.2|             93.7|    -8.5|
|   12| 73.3|            96.44|  -23.14|
|   13| 75.9|82.38000000000001|   -6.48|
|   14| 89.2|            83.34|    5.86|
|   15| 88.3|            88.68|   -0.38|
|   16| 90.0|            90.58|   -0.58|
|   17|100.0|            93.34|    6.66|
|   18| 85.4|93.91999999999999|   -8.52|
|   19|103.0|89.05999999999999|   13.94|
+-----+-----+-----------------+--------+
only showing top 20 rows

In [316]:
# Aggregate the (rounded) residuals down to a single row holding their
# standard deviation, then pull that scalar back to the driver.
df_stats = (
    residual
    .agg(stddev('residual').alias('std'))
    .collect()
)
std = df_stats[0]['std']
print(std)

11.417362025292233


In [317]:
# A point is flagged when its value lies more than three standard deviations
# (of the residuals) above or below its moving average.
band = std * 3
is_outlier = (col('value') > (col('avg') + band)) | (col('value') < (col('avg') - band))

# Filter while 'avg' is still present, then keep only the columns to report.
anamolies = residual.where(is_outlier).select('index', 'value')
anamolies.count()

46

In [318]:
# Print every flagged row. The row count is computed rather than hard-coded
# so the listing stays complete if the input data or the threshold changes.
anamolies.show(anamolies.count())

+-----+-----+
|index|value|
+-----+-----+
|   10|158.6|
|  253|142.5|
|  255| 51.0|
|  262|132.2|
|  264| 36.0|
|  268|152.7|
|  278| 31.1|
|  349|109.3|
|  352|238.9|
|  361|165.7|
|  818| 96.2|
|  986| 93.4|
| 1055|206.3|
| 1069| 84.8|
| 1172|107.1|
| 1182| 52.2|
| 1185|180.4|
| 1464| 88.3|
| 1735|129.2|
| 1882|107.2|
| 1890|103.6|
| 1897|108.2|
| 2023|154.5|
| 2025| 72.2|
| 2027|129.3|
| 2171|108.0|
| 2274|165.3|
| 2380|201.3|
| 2390| 94.8|
| 2391|189.7|
| 2503|158.0|
| 2518|152.3|
| 2520|217.4|
| 2521|143.1|
| 2527|199.6|
| 2755| 58.1|
| 2756|138.2|
| 2789| 90.9|
| 2807|127.0|
| 2885|196.2|
| 2886|126.9|
| 2897|105.4|
| 2899|200.3|
| 3005|137.7|
| 3018|170.1|
| 3032|150.7|
+-----+-----+

