In [1]:
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824025 sha256=b0db8e8b76b8321bce750dc3eda10fa2800049c9b942f1a3c8355875c872d736
  Stored in directory: /root/.cache/pip/wheels/b1/59/a0/a1a0624b5e865fd389919c1a10f53aec9b12195d6747710baf
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

headers = ['class', 'largestSpot', 'spotDistribution', 'activity', 'evolution', 'previousActivity', 'complex',
           'complexOnPath', 'area', 'largestSpotArea', 'c-class', 'm-class', 'x-class']

df = pd.read_csv('SolarFlare_Clean.csv', header=None, names=headers)

df.head()

Unnamed: 0,class,largestSpot,spotDistribution,activity,evolution,previousActivity,complex,complexOnPath,area,largestSpotArea,c-class,m-class,x-class
0,H,A,X,1,3,1,1,1,1,1,0,0,0
1,D,R,O,1,3,1,1,2,1,1,0,0,0
2,C,S,O,1,3,1,1,2,1,1,0,0,0
3,H,R,X,1,2,1,1,1,1,1,0,0,0
4,H,S,X,1,1,1,1,2,1,1,0,0,0


In [3]:
# Class: (A,B,C,D,E,F,H) (0,1,2,3,4,5,6)
class_map = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'H': 6, }
df['class'] = df['class'].map(class_map)

# Largest Spot: (X,R,S,A,H,K) (0,1,2,3,4,5)
largest_spot_map = {'X': 0, 'R': 1, 'S': 2, 'A': 3, 'H': 4, 'K': 5}
df['largestSpot'] = df['largestSpot'].map(largest_spot_map)

# Spot Distribution: (X,O,I,C) (0,1,2,3)
spot_distribution_map = {'X': 0, 'O': 1, 'I': 2, 'C': 3}
df['spotDistribution'] = df['spotDistribution'].map(spot_distribution_map)

df.head()

Unnamed: 0,class,largestSpot,spotDistribution,activity,evolution,previousActivity,complex,complexOnPath,area,largestSpotArea,c-class,m-class,x-class
0,6,3,0,1,3,1,1,1,1,1,0,0,0
1,3,1,1,1,3,1,1,2,1,1,0,0,0
2,2,2,1,1,3,1,1,2,1,1,0,0,0
3,6,1,0,1,2,1,1,1,1,1,0,0,0
4,6,2,0,1,1,1,1,2,1,1,0,0,0


In [4]:

# All class correlation analysis.
df.corr()

Unnamed: 0,class,largestSpot,spotDistribution,activity,evolution,previousActivity,complex,complexOnPath,area,largestSpotArea,c-class,m-class,x-class
class,1.0,0.38162,-0.533355,-0.01681,-0.172714,-0.008798,0.108333,-0.485727,0.064932,,-0.008419,-0.014083,0.013149
largestSpot,0.38162,1.0,0.270772,0.324559,0.025465,0.215233,0.428767,-0.065179,0.366864,,0.274016,0.167544,0.150191
spotDistribution,-0.533355,0.270772,1.0,0.38979,0.19185,0.204442,0.302026,0.451694,0.33303,,0.309831,0.195259,0.150964
activity,-0.01681,0.324559,0.38979,1.0,0.000645,0.400409,0.27916,0.145256,0.245672,,0.260717,0.114317,0.122619
evolution,-0.172714,0.025465,0.19185,0.000645,1.0,-0.015562,-0.160388,0.029288,0.068912,,0.057061,0.074575,0.027833
previousActivity,-0.008798,0.215233,0.204442,0.400409,-0.015562,1.0,0.164349,0.069692,0.268607,,0.158136,0.13596,0.123554
complex,0.108333,0.428767,0.302026,0.27916,-0.160388,0.164349,1.0,0.073891,0.195669,,0.168451,0.099685,0.07903
complexOnPath,-0.485727,-0.065179,0.451694,0.145256,0.029288,0.069692,0.073891,1.0,0.060864,,0.108486,0.049131,0.024583
area,0.064932,0.366864,0.33303,0.245672,0.068912,0.268607,0.195669,0.060864,1.0,,0.149335,0.251184,0.334833
largestSpotArea,,,,,,,,,,,,,


In [5]:
conf = SparkConf().set("spark.ui.port", "4050")

sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()


In [6]:
from pyspark.ml.feature import VectorAssembler
py_df = spark.createDataFrame(df)

In [7]:
df.columns

Index(['class', 'largestSpot', 'spotDistribution', 'activity', 'evolution',
       'previousActivity', 'complex', 'complexOnPath', 'area',
       'largestSpotArea', 'c-class', 'm-class', 'x-class'],
      dtype='object')

In [9]:
df_assembler = VectorAssembler(inputCols=['class', 'largestSpot', 'spotDistribution', 'activity', 'evolution',
       'previousActivity', 'complex', 'complexOnPath', 'area',
       'largestSpotArea'], outputCol="features")
df_output = df_assembler.transform(py_df)

df_output.show()

+-----+-----------+----------------+--------+---------+----------------+-------+-------------+----+---------------+-------+-------+-------+--------------------+
|class|largestSpot|spotDistribution|activity|evolution|previousActivity|complex|complexOnPath|area|largestSpotArea|c-class|m-class|x-class|            features|
+-----+-----------+----------------+--------+---------+----------------+-------+-------------+----+---------------+-------+-------+-------+--------------------+
|    6|          3|               0|       1|        3|               1|      1|            1|   1|              1|      0|      0|      0|[6.0,3.0,0.0,1.0,...|
|    3|          1|               1|       1|        3|               1|      1|            2|   1|              1|      0|      0|      0|[3.0,1.0,1.0,1.0,...|
|    2|          2|               1|       1|        3|               1|      1|            2|   1|              1|      0|      0|      0|[2.0,2.0,1.0,1.0,...|
|    6|          1|               

In [10]:
df_output.select("features", "m-class").show()

+--------------------+-------+
|            features|m-class|
+--------------------+-------+
|[6.0,3.0,0.0,1.0,...|      0|
|[3.0,1.0,1.0,1.0,...|      0|
|[2.0,2.0,1.0,1.0,...|      0|
|[6.0,1.0,0.0,1.0,...|      0|
|[6.0,2.0,0.0,1.0,...|      0|
|[2.0,3.0,1.0,1.0,...|      0|
|[1.0,0.0,1.0,1.0,...|      0|
|[2.0,3.0,1.0,1.0,...|      0|
|[2.0,3.0,1.0,1.0,...|      0|
|[1.0,0.0,1.0,1.0,...|      0|
|[2.0,3.0,1.0,1.0,...|      0|
|[1.0,0.0,1.0,1.0,...|      0|
|[1.0,0.0,2.0,1.0,...|      0|
|[2.0,2.0,1.0,2.0,...|      0|
|[1.0,0.0,1.0,1.0,...|      0|
|[1.0,0.0,1.0,1.0,...|      0|
|[3.0,1.0,2.0,1.0,...|      0|
|[6.0,2.0,0.0,1.0,...|      0|
|[1.0,0.0,1.0,1.0,...|      0|
|[1.0,0.0,1.0,1.0,...|      0|
+--------------------+-------+
only showing top 20 rows



In [11]:
mClass_model = df_output.select("features", "m-class")
M_trainingData, M_testData = mClass_model.randomSplit([0.7, 0.3])

xClass_model = df_output.select("features", "x-class")
X_trainingData, X_testData = xClass_model.randomSplit([0.7, 0.3])

cClass_model = df_output.select("features", "c-class")
C_trainingData, C_testData = cClass_model.randomSplit([0.7, 0.3])

In [12]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [13]:
M_classifier = DecisionTreeClassifier(labelCol="m-class").fit(M_trainingData)
M_prediction = M_classifier.transform(M_testData)

X_classifier = DecisionTreeClassifier(labelCol="x-class").fit(X_trainingData)
X_prediction = X_classifier.transform(X_testData)

C_classifier = DecisionTreeClassifier(labelCol="c-class").fit(C_trainingData)
C_prediction = C_classifier.transform(C_testData)

M_prediction.show()

+--------------------+-------+--------------------+--------------------+----------+
|            features|m-class|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|[1.0,0.0,1.0,1.0,...|      0|[695.0,19.0,0.0,0.0]|[0.97338935574229...|       0.0|
|[1.0,0.0,1.0,1.0,...|      0|[695.0,19.0,0.0,0.0]|[0.97338935574229...|       0.0|
|[1.0,0.0,1.0,1.0,...|      0|[695.0,19.0,0.0,0.0]|[0.97338935574229...|       0.0|
|[1.0,0.0,1.0,1.0,...|      0|[695.0,19.0,0.0,0.0]|[0.97338935574229...|       0.0|
|[1.0,0.0,1.0,1.0,...|      0|[695.0,19.0,0.0,0.0]|[0.97338935574229...|       0.0|
|[1.0,0.0,1.0,1.0,...|      0|[695.0,19.0,0.0,0.0]|[0.97338935574229...|       0.0|
|[1.0,0.0,1.0,1.0,...|      0|[695.0,19.0,0.0,0.0]|[0.97338935574229...|       0.0|
|[1.0,0.0,1.0,1.0,...|      0|[695.0,19.0,0.0,0.0]|[0.97338935574229...|       0.0|
|[1.0,0.0,1.0,1.0,...|      0|[695.0,19.0,0.0,0.0]|[0.97338935574229...|    

In [14]:
M_accuracy = MulticlassClassificationEvaluator(labelCol="m-class", metricName="accuracy").evaluate(M_prediction)
M_accuracy

0.963963963963964

In [15]:
X_accuracy = MulticlassClassificationEvaluator(labelCol="x-class", metricName="accuracy").evaluate(X_prediction)
X_accuracy

1.0

In [16]:
C_accuracy = MulticlassClassificationEvaluator(labelCol="c-class", metricName="accuracy").evaluate(C_prediction)
C_accuracy

0.7942122186495176