# LIGO - Gravitational Waves simple classification - Part 2
Here we take the parquet dataframe stored, we load up in memory and we try to classify it is describen in *"LIGO - Loading of the training dataset.ipynb"* 

In [1]:
#Initialize the spark context and tools for processing the stored rows.
import findspark

findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

#Going into spark context to load it all from a parquet format.
spark = SparkSession.builder.appName("pyspark-gw").getOrCreate()

In [2]:
# This tests if we can re load it the first few rows... 
parquet_df = spark.read.parquet('/dataset/gw_gravity_spy_dataframe_test')
parquet_df.head()

Row(event_time=1127316933.86035, ifo='H1', peak_time=1127316933, peak_time_ns=860351085, start_time=1127316933, start_time_ns=836914062, duration=0.6152300238609311, search='Omicron', process_id=0, event_id=214, peak_frequency=1181.22387695312, central_freq=1242.05822753906, bandwidth=2405.51831054688, channel='GDS-CALIB_STRAIN', amplitude=1.8502000469038799e-22, snr=11.189049720764197, confidence=0, chisq=0, chisq_dof=0, param_one_name='phase', param_one_value=-0.22053000000000003, url1='https://panoptes-uploads.zooniverse.org/production/subject_location/5b5bc81b-feba-4da6-94b0-97f24fe2d167.png', url2='https://panoptes-uploads.zooniverse.org/production/subject_location/285db5f3-5a63-4f47-a958-1c453760e358.png', url3='https://panoptes-uploads.zooniverse.org/production/subject_location/e657905e-b86a-4c28-bad3-73d219c379f9.png', url4='https://panoptes-uploads.zooniverse.org/production/subject_location/f2f9ad17-803b-446d-9a9c-d913c90cf8de.png', png=[0.09503280371427536, 0.1211226060986518

In [3]:
# initialise sparkContext
spark = SparkSession.builder \
    .master('local') \
    .appName('myAppName') \
    .config('spark.executor.memory', '5gb') \
    .config("spark.cores.max", "6") \
    .getOrCreate()

sc = spark.sparkContext

# using SQLContext to read parquet file
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [4]:
# to read parquet file (full) # TODO - Research a way to read ONLY a given partition.
parquet_df = spark.read.parquet('/dataset/gw_gravity_spy_dataframe')


In [6]:
#filter the training set
train_set= parquet_df.where(parquet_df['sample_type']=='train')

In [7]:
train_set.head()

Row(event_time=1127090580.98828, ifo='H1', peak_time=1127090580, peak_time_ns=988281011, start_time=1127090579, start_time_ns=964843034, duration=2.61718988418579, search='Omicron', process_id=0, event_id=21, peak_frequency=72.04364013671879, central_freq=768.674072265625, bandwidth=1473.34814453125, channel='GDS-CALIB_STRAIN', amplitude=6.971719934821831e-22, snr=14.190320014953599, confidence=0, chisq=0, chisq_dof=0, param_one_name='phase', param_one_value=-1.45046, url1='https://panoptes-uploads.zooniverse.org/production/subject_location/a9743238-4d2d-481b-99b6-6eae60bff351.png', url2='https://panoptes-uploads.zooniverse.org/production/subject_location/faa65ed6-7ab0-4dba-b188-3344c877ebed.png', url3='https://panoptes-uploads.zooniverse.org/production/subject_location/38c6d926-e778-4dc2-944b-2690c4be41ab.png', url4='https://panoptes-uploads.zooniverse.org/production/subject_location/f6f526f0-69bb-46ac-a96a-debeb485ce66.png', png=[0.21848233044147491, 0.22763684391975403, 0.2653347551

In [10]:
train_set.schema

StructType(List(StructField(event_time,DoubleType,true),StructField(ifo,StringType,true),StructField(peak_time,LongType,true),StructField(peak_time_ns,LongType,true),StructField(start_time,LongType,true),StructField(start_time_ns,LongType,true),StructField(duration,DoubleType,true),StructField(search,StringType,true),StructField(process_id,LongType,true),StructField(event_id,LongType,true),StructField(peak_frequency,DoubleType,true),StructField(central_freq,DoubleType,true),StructField(bandwidth,DoubleType,true),StructField(channel,StringType,true),StructField(amplitude,DoubleType,true),StructField(snr,DoubleType,true),StructField(confidence,LongType,true),StructField(chisq,LongType,true),StructField(chisq_dof,LongType,true),StructField(param_one_name,StringType,true),StructField(param_one_value,DoubleType,true),StructField(url1,StringType,true),StructField(url2,StringType,true),StructField(url3,StringType,true),StructField(url4,StringType,true),StructField(png,ArrayType(DoubleType,tru

In [15]:
#Verify that the sample type is train (just the head...)
train_set.select("sample_type").show()

+-----------+
|sample_type|
+-----------+
|      train|
|      train|
|      train|
|      train|
|      train|
|      train|
|      train|
|      train|
|      train|
|      train|
|      train|
|      train|
|      train|
|      train|
|      train|
|      train|
|      train|
|      train|
|      train|
|      train|
+-----------+
only showing top 20 rows



In [16]:
train_set.count()

5587

In [37]:
# Extract and prepare the df to be processed with just the due columns. In this case, we want just the image.
# (and a way to associate it with the rest of the columns after classification)
train = train_set.select("label","png")

print(train.columns)


#Reshaping the labels as "Chrip" = "True" and all the others as "False"; Chrips are Gravitational Waves.
result = train.where(train.label == "Chirp")
print("Gravitational Waves: {0}".format(result.count()))

train = train.withColumn('gw', (train.label == "Chirp"))
train = train.drop("label")

train = train.withColumn('features', train.png)
train = train.withColumn('label', train.gw)

train = train.drop("png")
train = train.drop("gw")

print(train.columns)

result = train.where(train.label == True)
print("Gravitational Waves (after): {0}".format(result.count()))


['label', 'png']
Gravitational Waves: 41
['features', 'label']
Gravitational Waves (after): 41


# Training a classification model


In [33]:
# Linear Support Vector Machine

from pyspark.ml.classification import LinearSVC

lsvc = LinearSVC(maxIter=10, regParam=0.1)

# Fit the model
lsvcModel = lsvc.fit(train)

# Print the coefficients and intercept for linearsSVC
print("Coefficients: " + str(lsvcModel.coefficients))
print("Intercept: " + str(lsvcModel.intercept))

IllegalArgumentException: 'Field "features" does not exist.\nAvailable fields: png, gw'

In [None]:
#Non-linear support vector machine (But not Spark ML, either).
from sklearn.svm import SVC
svc = SVC(kernel="rbf", random_state=0, gamma=1, C=1)
model = svc.fit(features, target)

In [None]:
spark.stop()