First we need to import SolarFlare_Clean.csv and assign column names to the dataset, since the file is read without a header row.

In [2]:
import pyspark
import pandas as pd
import matplotlib.pyplot as plt

# Column names applied to the CSV, which is read without a header row.
headers = [
    'class',
    'largestSpot',
    'spotDistribution',
    'activity',
    'evolution',
    'previousActivity',
    'complex',
    'complexOnPath',
    'area',
    'largestSpotArea',
    'c-class',
    'm-class',
    'x-class',
]

# header=None makes pandas treat the first line of the file as data;
# names=headers supplies the column labels instead.
df = pd.read_csv(
    'SolarFlare_Clean.csv',
    names=headers,
    header=None,
)

# Preview the first rows to check that the labels line up with the values.
df.head()

Unnamed: 0,class,largestSpot,spotDistribution,activity,evolution,previousActivity,complex,complexOnPath,area,largestSpotArea,c-class,m-class,x-class
0,H,A,X,1,3,1,1,1,1,1,0,0,0
1,D,R,O,1,3,1,1,2,1,1,0,0,0
2,C,S,O,1,3,1,1,2,1,1,0,0,0
3,H,R,X,1,2,1,1,1,1,1,0,0,0
4,H,S,X,1,1,1,1,2,1,1,0,0,0


Before running any analysis on the data, we want to remove any duplicated/noisy data. We start by counting the fully duplicated rows.

In [4]:
# Count rows that exactly repeat an earlier row (with pandas' default
# keep='first', the first occurrence of each group is not counted).
n_duplicates_before = df.duplicated().sum()
print("Number of duplicated rows: {}.".format(n_duplicates_before))

Number of duplicated rows: 701.


In [5]:
# Keep every row that belongs to a duplicate group (keep=False flags all
# members, including the first occurrence) before the duplicates are removed.
duplicate_mask = df.duplicated(keep=False)
dp = df[duplicate_mask]

# Remove the repeated rows from df itself, keeping the first occurrence of each.
df.drop_duplicates(inplace=True)

# Re-count to confirm that no duplicated rows remain.
n_duplicates_after = df.duplicated().sum()
print("Number of duplicated rows: {}.".format(n_duplicates_after))

Number of duplicated rows: 0.


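We want to create a five-number summary (min, 25%, 50%, 75%, max) of the numeric columns in the dataset; `describe()` additionally reports the count, mean, and standard deviation of each column.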

In [6]:
# Summary statistics, transposed so that each column of df becomes one row
# of the displayed table.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
activity,365.0,1.361644,0.481136,1.0,1.0,1.0,2.0,2.0
evolution,365.0,2.378082,0.650073,1.0,2.0,2.0,3.0,3.0
previousActivity,365.0,1.164384,0.519237,1.0,1.0,1.0,1.0,3.0
complex,365.0,1.616438,0.486921,1.0,1.0,2.0,2.0,2.0
complexOnPath,365.0,1.934247,0.248191,1.0,2.0,2.0,2.0,2.0
area,365.0,1.068493,0.252937,1.0,1.0,1.0,1.0,2.0
largestSpotArea,365.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
c-class,365.0,0.775342,1.257481,0.0,0.0,0.0,1.0,8.0
m-class,365.0,0.131507,0.501886,0.0,0.0,0.0,0.0,5.0
x-class,365.0,0.016438,0.147333,0.0,0.0,0.0,0.0,2.0


The standard deviation of `largestSpotArea` is 0 (its min and max are both 1, so every row has the same value). A constant column carries no information for the analysis, so we drop it.

In [7]:
# Drop the constant 'largestSpotArea' column from df in place.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been removed is a no-op instead of raising KeyError.
df.drop(columns=['largestSpotArea'], inplace=True, errors='ignore')

In [8]:
# Show the first five rows to confirm the column is no longer present.
df.head(n=5)

Unnamed: 0,class,largestSpot,spotDistribution,activity,evolution,previousActivity,complex,complexOnPath,area,c-class,m-class,x-class
0,H,A,X,1,3,1,1,1,1,0,0,0
1,D,R,O,1,3,1,1,2,1,0,0,0
2,C,S,O,1,3,1,1,2,1,0,0,0
3,H,R,X,1,2,1,1,1,1,0,0,0
4,H,S,X,1,1,1,1,2,1,0,0,0


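Now we run a full pairwise correlation analysis on all numeric columns (the categorical columns `class`, `largestSpot`, and `spotDistribution` are excluded by `numeric_only=True`).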

In [10]:
# Pairwise Pearson correlation matrix; numeric_only=True leaves out the
# non-numeric (string) columns.
df.corr(method="pearson", numeric_only=True)

Unnamed: 0,activity,evolution,previousActivity,complex,complexOnPath,area,c-class,m-class,x-class
activity,1.0,0.053519,0.377204,0.230195,0.153669,0.179668,0.116492,0.041425,0.109683
evolution,0.053519,1.0,-0.005575,-0.130782,-0.015768,0.126111,0.100831,0.12506,0.049666
previousActivity,0.377204,-0.005575,1.0,0.152275,0.084104,0.206888,0.073546,0.096034,0.108226
complex,0.230195,-0.130782,0.152275,1.0,0.177192,0.213896,0.065274,0.060829,0.088131
complexOnPath,0.153669,-0.015768,0.084104,0.177192,1.0,0.071938,0.110985,0.047554,0.02964
area,0.179668,0.126111,0.206888,0.213896,0.071938,1.0,0.083061,0.231828,0.338304
c-class,0.116492,0.100831,0.073546,0.065274,0.110985,0.083061,1.0,0.077413,-0.009669
m-class,0.041425,0.12506,0.096034,0.060829,0.047554,0.231828,0.077413,1.0,0.41652
x-class,0.109683,0.049666,0.108226,0.088131,0.02964,0.338304,-0.009669,0.41652,1.0
