In [2]:
### Important ###
### Run this cell if you are using Google Colab

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
print('[Done] openjdk8 has been installed')

!wget -q https://downloads.apache.org/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz -C /usr/lib/
!rm spark-2.4.5-bin-hadoop2.7.tgz
print('[Done] Spark has been installed')

!pip install -q findspark
print('[Done] findspark has been installed')

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/usr/lib/spark-2.4.5-bin-hadoop2.7"
print('[Done] Environment variables has been set')

import findspark
findspark.init()
print('[Done] You can use pyspark now.')

[Done] openjdk8 has been installed
[Done] Spark has been installed
[Done] findspark has been installed
[Done] Environment variables has been set
[Done] You can use pyspark now.


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
spark = SparkSession.builder.getOrCreate()

In [0]:
# LOAD DATA

In [0]:
df_train = spark.read.csv('Planet_Training.csv',inferSchema=True, header=True)

In [0]:
df_test = spark.read.csv('Planet_Testing.csv',inferSchema=True, header=True)

In [8]:
df_train.show()

+------------+-----------+------+------+-------+------+----------------+-----+---------+
|        Name|Temperature|Carbon| Water|Rhodium|  Iron|Atmosphere Color|Moons|Habitable|
+------------+-----------+------+------+-------+------+----------------+-----+---------+
|    HD 9827b|     318521|  High|   Low|   High|   Low|            null|    7|        1|
|Gliese 6797u|     323488|   Low|Medium|    Low|   Low|          Yellow|    8|        1|
|  WASP-5812f|     319279|   Low|   Low|    Low|   Low|          Yellow|    0|        1|
|    HD 1310e|     315375|   Low|   Low|    Low|   Low|          Yellow|    6|        1|
|    HR 3976s|     302312|   Low|Medium| Medium|  High|          Yellow|    7|        1|
|     K2-958x|     329687|   Low|   Low| Medium|   Low|          Yellow|    1|        1|
|    HR 2195b|     265746|   Low|  High|    Low|Medium|             Red|    3|        0|
|  Ross 5664m|     305214|   Low|  High|    Low|   Low|          Yellow|    5|        1|
|    HR 3316t|     29

In [0]:
# 2. SELECT FEATURES

In [0]:
df_train = df_train.select('Temperature','Water','Atmosphere Color','Habitable')
df_test = df_test.select('Temperature','Water','Atmosphere Color','Habitable')

In [0]:
# 3. DATA PREPORCESSING

In [0]:
df_train = df_train.dropna()
df_test = df_test.dropna()

In [0]:
# 4. TRANSFORM DATA

In [0]:
df_train = df_train.withColumn("Water", when(df_train["Water"] == "High", 2).
                                        when(df_train["Water"] == "Medium", 1).
                                        when(df_train["Water"] == "Low", 0))
df_train = df_train.withColumn("Atmosphere Color",  when(df_train["Atmosphere Color"] == "Yellow", 2).
                                                    when(df_train["Atmosphere Color"] == "Blue", 1).
                                                    when(df_train["Atmosphere Color"] == "Red", 0))

In [0]:
df_test = df_test.withColumn("Water", when(df_test["Water"] == "High", 2).
                                      when(df_test["Water"] == "Medium", 1).
                                      when(df_test["Water"] == "Low", 0))
df_test = df_test.withColumn("Atmosphere Color",  when(df_test["Atmosphere Color"] == "Yellow", 2).
                                                  when(df_test["Atmosphere Color"] == "Blue", 1).
                                                  when(df_test["Atmosphere Color"] == "Red", 0))

In [0]:
cols = df_train.columns
cols.remove("Habitable")
df_train = VectorAssembler(inputCols = cols, outputCol = "Features").transform(df_train)

In [0]:
cols = df_test.columns
cols.remove("Habitable")
df_test = VectorAssembler(inputCols = cols, outputCol = "Features").transform(df_test)

In [0]:
# 5. NORMALIZATION

In [0]:
scaler = StandardScaler(
    inputCol = 'Features',
    outputCol = 'Features_normalized'
)

In [0]:
df_train = scaler.fit(df_train).transform((df_train))
df_test = scaler.fit(df_test).transform((df_test))

In [0]:
# 6. GENERATE MODEL

In [0]:
model = LogisticRegression(featuresCol = "Features_normalized", 
                           labelCol = "Habitable", 
                           maxIter = 10).fit(df_train)

In [0]:
# 7. MODEL TESTING AND EVALUATION

In [19]:
prediction = model.transform(df_test)
prediction.show()

+-----------+-----+----------------+---------+------------------+--------------------+--------------------+--------------------+----------+
|Temperature|Water|Atmosphere Color|Habitable|          Features| Features_normalized|       rawPrediction|         probability|prediction|
+-----------+-----+----------------+---------+------------------+--------------------+--------------------+--------------------+----------+
|     325145|    2|               2|        1|[325145.0,2.0,2.0]|[16.6313339613367...|[-0.5960950727923...|[0.35523758686965...|       1.0|
|     269079|    1|               0|        0|[269079.0,1.0,0.0]|[13.7635292284443...|[2.99908514599944...|[0.95253277965808...|       0.0|
|     302996|    2|               2|        1|[302996.0,2.0,2.0]|[15.4984012208374...|[-0.2122758672718...|[0.44712941787447...|       1.0|
|     312604|    2|               2|        1|[312604.0,2.0,2.0]|[15.9898553619146...|[-0.3787725455255...|[0.40642297823869...|       1.0|
|     280875|    0| 

In [0]:
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol = 'rawPrediction',
    labelCol = 'Habitable'
)

In [21]:
accuracy = evaluator.evaluate(prediction)*100
accuracy = round(accuracy, 2)
print(f'Accuracy: {accuracy}%')

Accuracy: 91.71%
