In [1]:
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

import findspark
findspark.init()
findspark.find()
import pyspark

from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

conf = pyspark.SparkConf().setAppName('appName').setMaster('local')
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession(sc)
sqlcontext=SQLContext(sc)

In [2]:
zooDf=spark.read.csv("zoo.csv",header=True,inferSchema=True)

In [3]:
zooDf.printSchema()

root
 |-- AnimalName: string (nullable = true)
 |-- Hair: integer (nullable = true)
 |-- Feathers: integer (nullable = true)
 |-- Eggs: integer (nullable = true)
 |-- Milk: integer (nullable = true)
 |-- Airborne: integer (nullable = true)
 |-- Aquatic: integer (nullable = true)
 |-- Predator: integer (nullable = true)
 |-- Toothed: integer (nullable = true)
 |-- Backbone: integer (nullable = true)
 |-- Breathes: integer (nullable = true)
 |-- Venomous: integer (nullable = true)
 |-- Fins: integer (nullable = true)
 |-- Legs: integer (nullable = true)
 |-- Tail: integer (nullable = true)
 |-- Domestic: integer (nullable = true)
 |-- Catsize: integer (nullable = true)
 |-- Type: integer (nullable = true)



In [4]:
zooDf.select("Type","AnimalName").show(5)

+----+----------+
|Type|AnimalName|
+----+----------+
|   1|  aardvark|
|   1|  antelope|
|   4|      bass|
|   1|      bear|
|   1|      boar|
+----+----------+
only showing top 5 rows



In [5]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

def IsMammal(x):
    if x==1:
        return 1
    else:
        return 0
IsMammalUDF=udf(lambda x:IsMammal(x), IntegerType())

In [6]:
IsMammalUDF(zooDf["Type"])

Column<'<lambda>(Type)'>

In [7]:
zooDf=zooDf.withColumn("IsMammal",IsMammalUDF(zooDf["Type"]))

In [8]:
zooDf.printSchema()

root
 |-- AnimalName: string (nullable = true)
 |-- Hair: integer (nullable = true)
 |-- Feathers: integer (nullable = true)
 |-- Eggs: integer (nullable = true)
 |-- Milk: integer (nullable = true)
 |-- Airborne: integer (nullable = true)
 |-- Aquatic: integer (nullable = true)
 |-- Predator: integer (nullable = true)
 |-- Toothed: integer (nullable = true)
 |-- Backbone: integer (nullable = true)
 |-- Breathes: integer (nullable = true)
 |-- Venomous: integer (nullable = true)
 |-- Fins: integer (nullable = true)
 |-- Legs: integer (nullable = true)
 |-- Tail: integer (nullable = true)
 |-- Domestic: integer (nullable = true)
 |-- Catsize: integer (nullable = true)
 |-- Type: integer (nullable = true)
 |-- IsMammal: integer (nullable = true)



In [9]:
from pyspark.ml.feature import StringIndexer, VectorAssembler

In [10]:
indexer=StringIndexer().setInputCol("AnimalName").setOutputCol("Animal_index")

In [11]:
indexed_zooDF=indexer.fit(zooDf).transform(zooDf)

In [12]:
indexed_zooDF.select("AnimalName","Animal_index","Type","IsMammal").show()

+----------+------------+----+--------+
|AnimalName|Animal_index|Type|IsMammal|
+----------+------------+----+--------+
|  aardvark|         1.0|   1|       1|
|  antelope|         2.0|   1|       1|
|      bass|         3.0|   4|       0|
|      bear|         4.0|   1|       1|
|      boar|         5.0|   1|       1|
|   buffalo|         6.0|   1|       1|
|      calf|         7.0|   1|       1|
|      carp|         8.0|   4|       0|
|   catfish|         9.0|   4|       0|
|      cavy|        10.0|   1|       1|
|   cheetah|        11.0|   1|       1|
|   chicken|        12.0|   2|       0|
|      chub|        13.0|   4|       0|
|      clam|        14.0|   7|       0|
|      crab|        15.0|   7|       0|
|  crayfish|        16.0|   7|       0|
|      crow|        17.0|   2|       0|
|      deer|        18.0|   1|       1|
|   dogfish|        19.0|   4|       0|
|   dolphin|        20.0|   1|       1|
+----------+------------+----+--------+
only showing top 20 rows



In [13]:
from pyspark.ml.feature import VectorAssembler


In [14]:
feature_assembler=VectorAssembler().setInputCols(['Hair',\
                                                     'Feathers',
                                                     'Eggs',
                                                     'Milk',
                                                     'Airborne',
                                                     'Aquatic',
                                                     'Predator',
                                                     'Toothed',
                                                     'Backbone',
                                                     'Breathes',
                                                     'Venomous',
                                                     'Fins',
                                                     'Legs',
                                                     'Tail',
                                                     'Domestic',
                                                     'Catsize',
                                                     'Type',
                                                     'Animal_index']).setOutputCol("features")

In [15]:
finalized_zooDF=feature_assembler.transform(indexed_zooDF)

In [16]:
finalized_zooDF=finalized_zooDF.select("AnimalName","Type","Animal_index","IsMammal","features")

In [17]:
finalized_zooDF.show()

+----------+----+------------+--------+--------------------+
|AnimalName|Type|Animal_index|IsMammal|            features|
+----------+----+------------+--------+--------------------+
|  aardvark|   1|         1.0|       1|(18,[0,3,6,7,8,9,...|
|  antelope|   1|         2.0|       1|(18,[0,3,7,8,9,12...|
|      bass|   4|         3.0|       0|(18,[2,5,6,7,8,11...|
|      bear|   1|         4.0|       1|(18,[0,3,6,7,8,9,...|
|      boar|   1|         5.0|       1|[1.0,0.0,0.0,1.0,...|
|   buffalo|   1|         6.0|       1|(18,[0,3,7,8,9,12...|
|      calf|   1|         7.0|       1|[1.0,0.0,0.0,1.0,...|
|      carp|   4|         8.0|       0|(18,[2,5,7,8,11,1...|
|   catfish|   4|         9.0|       0|(18,[2,5,6,7,8,11...|
|      cavy|   1|        10.0|       1|(18,[0,3,7,8,9,12...|
|   cheetah|   1|        11.0|       1|[1.0,0.0,0.0,1.0,...|
|   chicken|   2|        12.0|       0|(18,[1,2,4,8,9,12...|
|      chub|   4|        13.0|       0|(18,[2,5,6,7,8,11...|
|      clam|   7|       

In [18]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import RegressionEvaluator


train_data,test_data=finalized_zooDF.randomSplit([0.7,0.3])

In [19]:
lr=LogisticRegression(featuresCol="features",labelCol="IsMammal")

In [20]:
lrModel=lr.fit(train_data)

In [21]:
lrModel.predict(test_data.head()["features"])

1.0

In [26]:
predictions_result=lrModel.evaluate(test_data)

In [27]:
predictions_result.predictions.show()

+----------+----+------------+--------+--------------------+--------------------+--------------------+----------+
|AnimalName|Type|Animal_index|IsMammal|            features|       rawPrediction|         probability|prediction|
+----------+----+------------+--------+--------------------+--------------------+--------------------+----------+
|  antelope|   1|         2.0|       1|(18,[0,3,7,8,9,12...|[-30.907757945613...|[3.77512447740076...|       1.0|
|   buffalo|   1|         6.0|       1|(18,[0,3,7,8,9,12...|[-30.739407061549...|[4.46730018353973...|       1.0|
|   catfish|   4|         9.0|       0|(18,[2,5,6,7,8,11...|[21.3176796124320...|[0.99999999944811...|       0.0|
|  crayfish|   7|        16.0|       0|(18,[2,5,6,12,16,...|[29.7299400312413...|[0.99999999999987...|       0.0|
|      deer|   1|        18.0|       1|(18,[0,3,7,8,9,12...|[-30.234354409357...|[7.40264147471770...|       1.0|
|      frog|   5|         0.0|       0|(18,[2,5,6,7,8,9,...|[17.6081720434943...|[0.9999

In [28]:
predictions_result.accuracy

1.0

In [34]:
lrModel.predict(test_data.head()["features"])

1.0