In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import classification_report 

In [2]:
df = pd.read_csv("working_csv.csv")

In [3]:
df

Unnamed: 0,P. Name,P. Zone Class,P. Mass Class,P. Composition Class,P. Atmosphere Class,P. Habitable Class,P. Density (EU),P. Gravity (EU),P. Esc Vel (EU),P. SFlux Min (EU),...,P. HZD,P. HZC,P. HZA,P. HZI,P. SPH,P. ESI,S. HabCat,P. Habitable,P. Hab Moon,P. Confirmed
0,GJ 163 c,Warm,Superterran,rocky-iron,metals-rich,psychroplanet,1.19,2.17,1.99,0.551557,...,-0.34,-0.14,0.81,0.53,0.08,0.73,0,1,0,1
1,GJ 667 C e,Warm,Terran,rocky-iron,metals-rich,psychroplanet,0.99,1.39,1.39,0.290249,...,0.51,-0.16,0.23,0.63,0.0,0.6,0,1,0,0
2,GJ 667 C f,Warm,Terran,rocky-iron,metals-rich,psychroplanet,0.99,1.39,1.39,0.530653,...,-0.22,-0.16,0.08,0.78,0.0,0.77,0,1,0,0
3,GJ 682 c,Warm,Superterran,rocky-iron,hydrogen-rich,psychroplanet,1.24,2.37,2.13,0.30206,...,0.23,-0.14,1.19,0.45,0.0,0.59,0,1,0,0
4,GJ 3293 d,Warm,Superterran,rocky-iron,metals-rich,psychroplanet,1.2,2.22,2.03,0.447687,...,-0.21,-0.14,0.91,0.52,0.0,0.7,0,1,0,1
5,HD 40307 g,Warm,Superterran,rocky-iron,metals-rich,psychroplanet,1.18,2.15,1.98,0.389528,...,-0.15,-0.14,0.77,0.56,0.04,0.74,1,1,0,0
6,Kapteyn b,Warm,Terran,rocky-iron,metals-rich,psychroplanet,1.09,1.79,1.71,0.290059,...,0.12,-0.15,0.57,0.63,0.0,0.67,0,1,0,0
7,Kepler-62 f,Warm,Terran,rocky-iron,metals-rich,psychroplanet,1.0,1.41,1.41,0.388223,...,0.45,-0.16,0.19,0.66,0.0,0.67,0,1,0,1
8,Kepler-150 f,Warm,Superterran,gas,metals-rich,psychroplanet,0.19,0.68,1.57,0.492721,...,0.28,2.88,0.35,0.26,0.0,0.53,0,0,0,1
9,Kepler-174 d,Warm,Superterran,rocky-iron,hydrogen-rich,psychroplanet,1.42,3.1,2.6,0.426752,...,0.32,-0.13,1.77,0.36,0.0,0.61,0,1,0,1


In [4]:
# We did univariate feature selection and picked the top 20 attributes/features that contribute towards the 
# classification of that particular datapoint. The below cell is to remove the other attributes and keep only the 
# top 20 we picked.

In [5]:
cols = df.columns
col_list = ["P. Habitable Class","P. Teq Max (K)","S. Mag from Planet","P. SFlux Mean (EU)","S. Teff (K)","P. Mag","P. HZD","P. ESI","P. SPH","P. HZC","P. Gravity (EU)","S. Size from Planet (deg)","S. [Fe/H]","S. DEC (deg)","P. SFlux Min (EU)","S. Hab Zone Max (AU)","P. HZI","P. Eccentricity","P. Appar Size (deg)"]
for words in cols:
    if words not in col_list:
        df.drop(columns = [words] , inplace = True)

In [6]:
print(len(df.columns))

19


In [7]:
df = df.sample(frac=1)
y = df["P. Habitable Class"]
df, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2)

print (df.shape, y_train.shape)
print (X_test.shape, y_test.shape)

#model.score(X_test, y_test)

(43, 19) (43,)
(11, 19) (11,)


In [8]:
# We are getting dummy variable and performing one hot encoding for the target column becuase we can't 
# process strings.
def create_one_hot(arr):
    return pd.get_dummies(arr)

c = df[["P. Habitable Class"]]
one_hot = create_one_hot(c)
df = df.drop("P. Habitable Class",axis = 1)
df = df.join(one_hot)
print(len(df.columns))
df

21


Unnamed: 0,P. Gravity (EU),P. SFlux Min (EU),P. SFlux Mean (EU),P. Teq Max (K),P. Mag,P. Appar Size (deg),P. Eccentricity,S. Teff (K),S. [Fe/H],S. DEC (deg),...,S. Size from Planet (deg),S. Hab Zone Max (AU),P. HZD,P. HZC,P. HZI,P. SPH,P. ESI,P. Habitable Class_mesoplanet,P. Habitable Class_non-habitable,P. Habitable Class_psychroplanet
14,3.23,0.234181,0.407107,238.9,-17.03,2.72,0.29,3131.0,-0.24,-15.2714,...,1.1837,0.118,0.18,-0.82,0.41,0.0,0.68,0,0,1
20,2.14,0.898436,0.898436,248.0,-18.36,3.45,0.0,4351.0,-0.26,47.839,...,0.8908,0.646,-0.58,-0.14,0.52,0.85,0.79,1,0,0
41,0.62,24.37481,24.37481,566.1,-23.17,6.03,0.0,5208.0,0.052,44.8581,...,3.2389,1.193,-2.06,2.6,0.23,0.0,0.25,0,1,0
2,1.39,0.530653,0.563207,224.1,-17.28,2.65,0.03,3350.0,-0.55,-34.9897,...,1.19,0.251,-0.22,-0.16,0.78,0.0,0.77,0,0,1
45,2.29,9.648586,9.648587,449.0,-21.03,3.58,0.0,5448.0,-0.060271,48.8318,...,1.862,1.392,-1.84,-0.14,0.35,0.0,0.39,0,1,0
35,2.04,0.658489,0.785473,251.2,-18.16,3.36,0.09,3371.0,-0.060271,-17.7734,...,1.3905,0.244,-0.53,-0.14,0.54,0.42,0.77,1,0,0
28,2.26,1.38738,1.38738,276.5,-18.9,3.56,0.0,5710.0,-0.060271,48.5244,...,0.6427,1.65,-0.81,-0.14,0.49,0.02,0.76,1,0,0
21,2.86,0.832599,0.832599,243.4,-18.6,4.0,0.0,3722.0,-0.060271,41.8121,...,1.1719,0.443,-0.56,-0.13,0.41,0.79,0.74,1,0,0
0,2.17,0.551557,0.663002,241.3,-18.05,3.48,0.09,3840.0,-0.060271,-53.3736,...,0.9881,0.212,-0.34,-0.14,0.53,0.08,0.73,0,0,1
27,2.47,1.146929,1.146929,263.7,-18.79,3.72,0.0,4655.0,0.15,26.15,...,0.8793,0.836,-0.76,-0.14,0.46,0.65,0.77,1,0,0


In [9]:
one_hot_labels = df[["P. Habitable Class_mesoplanet" , "P. Habitable Class_non-habitable" , "P. Habitable Class_psychroplanet"]]
df.drop(columns = ["P. Habitable Class_mesoplanet" , "P. Habitable Class_non-habitable" , "P. Habitable Class_psychroplanet"] , inplace = True)

# Dropping the categorical target column and replacing with a one hot encoded columns.

In [10]:
# This cell is to change the pandas dataframe into a matrix (numpy n-dimensional array) of the same dimension
# so that we can use this feature matrix to perform calculation/computation.

# The no of columns in the feature matrix = no of attributes we are considering.
# no of rows = no of datapoints in our balanced dataset.( 54 x 18)

# The original dataset had about 3.6K rows in which the class label was non-habitable , 31 data points belonging
# to the mesoplanet and 18 data points belonging to the psychroplanet class. So if we used the original data , 
# our model would be very biased becuase data is very skewed. So we had to balance it out and psychroplanet being
# the bottleneck class , we kept all the 18 samples of psychroplanet , random sampled 18 samples from mesoplanet
# and non-habitable class to come up with this balanced dataset.


def create_nparray(df):    
    Row_list =[]   
    # Iterate over each row 
    for index, rows in df.iterrows(): 
        # Create list for the current row 
        my_list =[rows["P. Teq Max (K)"],rows["S. Mag from Planet"],rows["P. SFlux Mean (EU)"],rows["S. Teff (K)"],rows["P. Mag"],rows["P. HZD"],rows["P. ESI"],rows["P. SPH"],rows["P. HZC"],rows["P. Gravity (EU)"],rows["S. Size from Planet (deg)"],rows["S. [Fe/H]"],rows["S. DEC (deg)"],rows["P. SFlux Min (EU)"],rows["S. Hab Zone Max (AU)"],rows["P. HZI"],rows["P. Eccentricity"],rows["P. Appar Size (deg)"]] 
          
        # append the list to the final list 
        Row_list.append(my_list) 
  
    #print(Row_list)
    Row_list = np.array(Row_list)
    return Row_list

In [11]:
feature_set = create_nparray(df)
print(len(df.columns))
print(len(one_hot_labels))

#Row_list = stats.zscore(Row_list, axis=1, ddof=1)
#applying min-max normalization
#min_max_scaler = preprocessing.MinMaxScaler()
#feature_set = min_max_scaler.fit_transform(feature_set)

scaler = MinMaxScaler()
print(scaler.fit(feature_set))
feature_set = scaler.transform(feature_set)

18
43
MinMaxScaler(copy=True, feature_range=(0, 1))


In [12]:
print(feature_set)

[[3.79918589e-02 9.74683544e-01 1.35577003e-04 1.64031621e-01
  9.35956790e-01 9.04255319e-01 7.38095238e-01 0.00000000e+00
  0.00000000e+00 5.53912872e-02 5.30310885e-02 4.73282443e-01
  4.14495450e-01 0.00000000e+00 2.70926001e-03 4.61538462e-01
  8.52941176e-01 6.83324834e-02]
 [4.68113976e-02 8.73417722e-01 1.12201974e-03 5.08469791e-01
  8.33333333e-01 6.34751773e-01 8.69047619e-01 8.76288660e-01
  4.27942102e-02 3.39049872e-02 3.13402550e-02 4.58015267e-01
  9.66279403e-01 1.45752397e-03 2.40598463e-02 6.02564103e-01
  0.00000000e+00 1.05558389e-01]
 [3.55107579e-01 4.17721519e-01 4.82556172e-02 7.50423490e-01
  4.62191358e-01 1.09929078e-01 2.26190476e-01 0.00000000e+00
  2.15229704e-01 3.94244037e-03 2.05229794e-01 6.96183206e-01
  9.40216935e-01 5.29698981e-02 4.61787303e-02 2.30769231e-01
  0.00000000e+00 2.37123916e-01]
 [2.36479938e-02 9.36708861e-01 4.48978913e-04 2.25861095e-01
  9.16666667e-01 7.62411348e-01 8.45238095e-01 0.00000000e+00
  4.15355570e-02 1.91208358e-02 5

In [13]:
one_hot_labels = one_hot_labels.values


# To summarize , feature_set variable consists of our dataset matrix and one_hot_labels variable consists of the 
# target matrix or one hot encoded class label matrix.

# feature_set dimensions = 54 x 18
# one_hot_labels dimensions = 54 x 3 (Becasue 3 target classes)

In [14]:
print(feature_set.shape[0])
print(feature_set.shape[1])

43
18


In [15]:
def sigmoid(x):
    return 1/(1+np.exp(-x))

def sigmoid_der(x):
    return sigmoid(x) *(1-sigmoid (x))

def softmax(A):
    expA = np.exp(A)
    return expA / expA.sum(axis=1, keepdims=True)

#np.random.seed(42)# This is so that everytime we get the same random numbers.

instances = feature_set.shape[0]
attributes = feature_set.shape[1]
hidden_nodes = 4
output_labels = 3

wh = np.random.rand(attributes,hidden_nodes) # weights of hidden layer.
bh = np.random.randn(hidden_nodes) # Bias of hidden layer.

wo = np.random.rand(hidden_nodes,output_labels)# Weights of output layer
bo = np.random.randn(output_labels)# Bias of output layer.
lr = 0.0001

error_cost = []

for epoch in range(100000):
# feedforward

    # Phase 1
    zh = np.dot(feature_set, wh) + bh
    ah = sigmoid(zh)

    # Phase 2
    zo = np.dot(ah, wo) + bo
    ao = softmax(zo)

# Back Propagation

# Phase 1

    dcost_dzo = ao - one_hot_labels
    dzo_dwo = ah

    dcost_wo = np.dot(dzo_dwo.T, dcost_dzo)

    dcost_bo = dcost_dzo

# Phases 2

    dzo_dah = wo
    dcost_dah = np.dot(dcost_dzo , dzo_dah.T)
    dah_dzh = sigmoid_der(zh)
    dzh_dwh = feature_set
    dcost_wh = np.dot(dzh_dwh.T, dah_dzh * dcost_dah)

    dcost_bh = dcost_dah * dah_dzh

# Update Weights

    wh -= lr * dcost_wh
    bh -= lr * dcost_bh.sum(axis=0)

    wo -= lr * dcost_wo
    bo -= lr * dcost_bo.sum(axis=0)

    if epoch % 200 == 0:
        loss = np.sum(-one_hot_labels * np.log(ao))
        print('Loss function value: ', loss)
        error_cost.append(loss)

Loss function value:  -16848.983262242426
Loss function value:  -12629.15486725395
Loss function value:  -12017.292645187383
Loss function value:  -11963.609248198747
Loss function value:  -11957.09600370442
Loss function value:  -11953.668749678207
Loss function value:  -11950.301430686563
Loss function value:  -11946.786505230422
Loss function value:  -11943.097886725234
Loss function value:  -11939.219709170477
Loss function value:  -11935.135190976722
Loss function value:  -11930.82570747814
Loss function value:  -11926.270475243575
Loss function value:  -11921.446229994133
Loss function value:  -11916.326850180885
Loss function value:  -11910.88291220952
Loss function value:  -11905.08116275051
Loss function value:  -11898.883890237521
Loss function value:  -11892.24817331522
Loss function value:  -11885.124978514144
Loss function value:  -11877.458072495787
Loss function value:  -11869.18270543288
Loss function value:  -11860.22401098362
Loss function value:  -11850.495054299083


Loss function value:  -1927.6450050006513
Loss function value:  -1913.6267750354455
Loss function value:  -1899.7718976247247
Loss function value:  -1886.0777685589455
Loss function value:  -1872.5418440413055
Loss function value:  -1859.1616388534298
Loss function value:  -1845.9347245722472
Loss function value:  -1832.858727837261
Loss function value:  -1819.9313286674426
Loss function value:  -1807.1502588269009
Loss function value:  -1794.5133002385585
Loss function value:  -1782.0182834449915
Loss function value:  -1769.6630861156386
Loss function value:  -1757.4456315995335
Loss function value:  -1745.3638875227361
Loss function value:  -1733.4158644296276
Loss function value:  -1721.5996144672204
Loss function value:  -1709.9132301116251
Loss function value:  -1698.3548429358314
Loss function value:  -1686.92262241791
Loss function value:  -1675.6147747888053
Loss function value:  -1664.4295419188009
Loss function value:  -1653.3652002417957
Loss function value:  -1642.420059716

Loss function value:  -708.6339892656667
Loss function value:  -706.1744605064579
Loss function value:  -703.7307925544983
Loss function value:  -701.3028393754475
Loss function value:  -698.8904566394017
Loss function value:  -696.4935016974002
Loss function value:  -694.1118335582778
Loss function value:  -691.7453128658814
Loss function value:  -689.393801876628
Loss function value:  -687.0571644374083
Loss function value:  -684.7352659638345
Loss function value:  -682.4279734188055
Loss function value:  -680.1351552914183
Loss function value:  -677.8566815761847
Loss function value:  -675.5924237525733
Loss function value:  -673.3422547648646
Loss function value:  -671.1060490023051
Loss function value:  -668.8836822795765
Loss function value:  -666.6750318175448
Loss function value:  -664.479976224318
Loss function value:  -662.2983954765834
Loss function value:  -660.1301709012266
Loss function value:  -657.9751851572352
Loss function value:  -655.8333222178746
Loss function valu

In [16]:
print(wh)
print(wh.shape)
print(bh)
print(wo)
print(bo)

[[ 1.76155186  0.71206721 -0.91535667  0.07261282]
 [-0.32933011  0.30663752  1.03765798  0.80486359]
 [ 0.26927544  0.44018136 -0.1464713   0.64942587]
 [ 1.12673831  0.43368377 -2.34433461  0.50239001]
 [-0.53037396  1.00799847  1.59078352  0.38715198]
 [-3.14527754  0.72517656  2.50486568  0.8384097 ]
 [ 1.4310335   0.63376355  2.53344461  0.73937363]
 [ 5.3281207   0.30045788  1.5753173   0.37651144]
 [ 0.72060326  1.00650921 -0.85209594 -0.03272116]
 [ 0.72451358  0.58443984  0.68663175  0.1111703 ]
 [ 1.27773307  0.39054325 -0.28103498 -0.08950636]
 [ 1.83433747  0.23696191 -1.26073458 -0.2185113 ]
 [ 0.60751592  0.41553033 -0.06852643  0.0936612 ]
 [ 0.96297195  0.4532627  -0.70032955  0.28169695]
 [ 0.56275394  0.17747522  0.32502305  0.61963524]
 [-2.32028911  0.84836967  2.30622287  0.37330472]
 [-0.42478214  0.88862002  0.04681413  0.52803398]
 [ 1.65758055  0.42474263 -1.96241379  0.28989904]]
(18, 4)
[ 0.54941279  1.34785041 -2.25716084 -0.19190815]
[[ 5.00208927  3.118619

In [17]:
def test(inp_arr , wh , bh , wo , bo):
    result = sigmoid(np.dot(inp_arr, wh) + bh)
    new_result = softmax(np.dot(result , wo) + bo)
    print(new_result)
    return new_result
    
X_test = X_test.drop(columns = ["P. Habitable Class"])
X_test = create_nparray(X_test)
X_test_results = test(X_test , wh, bh , wo , bo)
y_test = create_one_hot(y_test)
y_test = y_test.values
#print("Score : " , accuracy_score(y_test,X_test_results.round() , normalize= False))

[[7.90065710e-03 9.92073680e-01 2.56624383e-05]
 [7.90065710e-03 9.92073680e-01 2.56624383e-05]
 [7.90065710e-03 9.92073680e-01 2.56624383e-05]
 [7.90065710e-03 9.92073680e-01 2.56624383e-05]
 [7.90065710e-03 9.92073680e-01 2.56624383e-05]
 [7.90065710e-03 9.92073680e-01 2.56624383e-05]
 [7.90065710e-03 9.92073680e-01 2.56624383e-05]
 [7.90065710e-03 9.92073680e-01 2.56624383e-05]
 [7.90065710e-03 9.92073680e-01 2.56624383e-05]
 [7.90065710e-03 9.92073680e-01 2.56624383e-05]
 [7.90065710e-03 9.92073680e-01 2.56624383e-05]]


  


In [28]:
print(type(X_test_results))
print(type(y_test))
X_test_results = X_test_results.round()
X_test_results.astype(int)
print(X_test_results)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
[[0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]]


In [29]:
results = confusion_matrix(y_test, X_test_results)
#results = confusion_matrix(y_test, X_test_results.round())
print('Confusion Matrix :')
print(results) 
print('Accuracy Score :',accuracy_score(y_test, X_test_results.round())) 
print('Report : ')
print(classification_report(y_test, X_test_results.round())) 

ValueError: multilabel-indicator is not supported

In [18]:
print(y_test)
print(X_test_results.round())
print(X_test)

[[0 0 1]
 [0 1 0]
 [1 0 0]
 [0 0 1]
 [0 0 1]
 [0 1 0]
 [1 0 0]
 [0 1 0]
 [1 0 0]
 [1 0 0]
 [0 1 0]]
[[0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]]
[[ 2.37800000e+02 -2.63000000e+01  6.99982100e-01  4.40200000e+03
  -1.74200000e+01 -3.40000000e-01  8.40000000e-01  4.00000000e-02
  -1.60000000e-01  1.30000000e+00  7.68500000e-01 -3.70000000e-01
   3.92801000e+01  6.46679900e-01  6.81000000e-01  7.30000000e-01
   4.00000000e-02  2.55000000e+00]
 [ 3.54500000e+02 -2.79000000e+01  2.86096700e+00  5.14400000e+03
  -2.15600000e+01 -1.33000000e+00  5.40000000e-01  0.00000000e+00
   3.13000000e+00  8.70000000e-01  1.14220000e+00  8.00000000e-02
  -2.70331000e+01  2.22180000e+00  1.00800000e+00  2.30000000e-01
   1.30000000e-01  8.36000000e+00]
 [ 2.66900000e+02 -2.69000000e+01  1.20485000e+00  5.32100000e+03
  -1.91500000e+01 -7.40000000e-01  7.20000000e-01  2.30000000e-01
  -1.20000000e-01  3.32000000e+00  6.