# LIGO - Part Optional

Here we check the accuracy with which the predictions were made.

In [49]:
# Set up the Spark session and the helper objects the cells below use to read the stored rows.
# findspark.init() is called before any pyspark import.
import findspark

findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

# Build (or reuse) the session; the parenthesised chain avoids backslash continuations.
spark = (
    SparkSession.builder
    .master('local')
    .appName('myAppName')
    .config('spark.executor.memory', '12gb')
    .config("spark.cores.max", "2")
    .getOrCreate()
)

sc = spark.sparkContext

# SQLContext wrapper around the same SparkContext; a later cell reads a parquet dataset through it.
sqlContext = SQLContext(sc)

In [50]:
# Load the stored predictions from parquet into a DataFrame.
predict_df = spark.read.parquet('/dataset/gw_gravity_spy_dataframe_prediction')


In [51]:
# Load the full dataset and keep only the rows whose 'sample_type' is 'test'.
# Read through the SparkSession (like the prediction load) rather than the
# legacy SQLContext wrapper, so both datasets are loaded the same way.
parquet_df = spark.read.parquet('/dataset/gw_gravity_spy_dataframe')
test_set = parquet_df.where(parquet_df['sample_type'] == 'test')

In [8]:
# Fetch the 'prediction' value of every row predicted as class 1 (an empty list means there are none).
predict_df.where(predict_df['prediction'] == 1).select('prediction').collect()

[]

In [9]:
# Number of rows in the prediction DataFrame.
predict_df.select('prediction').count()

1179

In [28]:
# Number of rows in the test split.
test_set.count()

1179

In [33]:
# Column names available in the prediction DataFrame.
predict_df.columns

['features', 'rawPrediction', 'prediction']

In [29]:
# Column names available in the test split.
test_set.columns

['event_time',
 'ifo',
 'peak_time',
 'peak_time_ns',
 'start_time',
 'start_time_ns',
 'duration',
 'search',
 'process_id',
 'event_id',
 'peak_frequency',
 'central_freq',
 'bandwidth',
 'channel',
 'amplitude',
 'snr',
 'confidence',
 'chisq',
 'chisq_dof',
 'param_one_name',
 'param_one_value',
 'url1',
 'url2',
 'url3',
 'url4',
 'png',
 'gravityspy_id',
 'label',
 'sample_type']

In [52]:
import pyspark.sql.functions as F
import pyspark.sql.functions as sf  # second alias of the same module; not used in this cell

# Only the class label and the image column are needed from the test split.
test = test_set.select("label", "png")

print(test.columns)

# Reshape the labels: "Chirp" becomes True and every other class False.
# Chirps are Gravitational Waves.
result = test.where(test.label == "Chirp")
print("Gravitational Waves: {0}".format(result.count()))

# Rename 'png' to 'features_original' and replace 'label' with the boolean
# "is a Chirp" flag in a single projection (same columns, same order as the
# step-by-step add/drop sequence would produce).
test = test.select(
    F.col("png").alias("features_original"),
    (F.col("label") == "Chirp").alias("label"),
)

# Integer (0/1) version of the label; this is the frame joined to the predictions later.
test_df = test.withColumn('label', F.col('label').cast('integer'))

# Verify the frame that is actually used downstream (test_df), not the
# intermediate boolean-labelled one.
print(test_df.columns)

result = test_df.where(test_df.label == 1)
print("Gravitational Waves (after): {0}".format(result.count()))

['label', 'png']
Gravitational Waves: 10
['features_original', 'label']
Gravitational Waves (after): 10


In [53]:
# Attach the ground-truth 'label' column to each prediction.
from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql import Window

# The two DataFrames share no key column, so rows are paired by position via a
# synthetic 'row_index'.
# NOTE(review): this assumes both frames list the test samples in the same
# order — confirm against how the predictions were written.

# The join below is an inner join, so a size mismatch would silently drop rows
# instead of failing; check the sizes up front.
assert predict_df.count() == test_df.count(), \
    "predict_df and test_df must have the same number of rows to be joined by position"

# Same ordering expression for both frames: number the rows 1..N in their current order.
row_order = Window.orderBy(monotonically_increasing_id())
predict_df = predict_df.withColumn('row_index', row_number().over(row_order))
test_df = test_df.withColumn('row_index', row_number().over(row_order))

# Pair the rows, then discard the helper column.
predict_df = predict_df.join(test_df, on=["row_index"]).drop("row_index")

predict_df.columns

['features', 'rawPrediction', 'prediction', 'features_original', 'label']

In [54]:
# DataFrame.drop returns a new DataFrame rather than modifying in place, so the
# result must be assigned back for the column to actually be removed.
predict_df = predict_df.drop('features_original')
predict_df

DataFrame[features: vector, rawPrediction: vector, prediction: double, label: int]

In [55]:
# Load the saved linear SVC model into the session so it is at hand if needed.
from pyspark.ml.classification import LinearSVCModel

lsvcModel = LinearSVCModel.load("/app/gw/lsvc.model")

In [57]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the labelled predictions. The arguments below are the evaluator's
# defaults, written out so the columns and metric in use are visible.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)
evaluation = evaluator.evaluate(predict_df)

print("area under ROC curve: %f" % evaluation)

area under ROC curve: 0.772797


In [48]:
# Shut down the Spark session now that the evaluation is finished.
spark.stop()