In [1]:
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

import findspark
findspark.init()
findspark.find()
import pyspark

from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

conf = pyspark.SparkConf().setAppName('appName').setMaster('local')
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession(sc)
sqlcontext=SQLContext(sc)

In [7]:
#reading the dataset
zooDf=spark.read.csv("zoo.csv",header=True,inferSchema=True)
zooDf.printSchema()

root
 |-- AnimalName: string (nullable = true)
 |-- Hair: integer (nullable = true)
 |-- Feathers: integer (nullable = true)
 |-- Eggs: integer (nullable = true)
 |-- Milk: integer (nullable = true)
 |-- Airborne: integer (nullable = true)
 |-- Aquatic: integer (nullable = true)
 |-- Predator: integer (nullable = true)
 |-- Toothed: integer (nullable = true)
 |-- Backbone: integer (nullable = true)
 |-- Breathes: integer (nullable = true)
 |-- Venomous: integer (nullable = true)
 |-- Fins: integer (nullable = true)
 |-- Legs: integer (nullable = true)
 |-- Tail: integer (nullable = true)
 |-- Domestic: integer (nullable = true)
 |-- Catsize: integer (nullable = true)
 |-- Type: integer (nullable = true)



In [10]:
#creating a new column IsMammal where type=1 IsMammal=1, otherwise 0
#creating an IsMammal Function
from pyspark.sql.types import * ## importing all the datatypes
from pyspark.sql.functions import *

def IsMammal(x):
    if x==1:
        return 1
    else:
        return 0
#creating a User Defined Function to read the values in the column from the dataframe
IsMammaludf=udf(lambda x: IsMammal(x),IntegerType())

    

In [12]:
# Adding a new column named "IsMammal" to the dataframe
zooDf=zooDf.withColumn("IsMammal",IsMammaludf(zooDf["Type"]))


In [13]:
zooDf.columns

['AnimalName',
 'Hair',
 'Feathers',
 'Eggs',
 'Milk',
 'Airborne',
 'Aquatic',
 'Predator',
 'Toothed',
 'Backbone',
 'Breathes',
 'Venomous',
 'Fins',
 'Legs',
 'Tail',
 'Domestic',
 'Catsize',
 'Type',
 'IsMammal']

In [21]:
# Importing RFormula and using it to convert the dataframes into feature and label for further processing

from pyspark.ml.feature import RFormula
rf=RFormula(formula=" IsMammal ~ AnimalName + Hair + \
Feathers + Eggs + Milk + Airborne + Aquatic + Predator + Toothed + Backbone + Breathes + Venomous \
+ Fins + Legs + Tail + Domestic + Catsize + Type",labelCol="IsMammal")


In [22]:
indexed_zooDf=rf.fit(zooDf).transform(zooDf)

In [23]:
indexed_zooDf.columns

['AnimalName',
 'Hair',
 'Feathers',
 'Eggs',
 'Milk',
 'Airborne',
 'Aquatic',
 'Predator',
 'Toothed',
 'Backbone',
 'Breathes',
 'Venomous',
 'Fins',
 'Legs',
 'Tail',
 'Domestic',
 'Catsize',
 'Type',
 'IsMammal',
 'features']

In [24]:
finalized_data=indexed_zooDf.select("AnimalName","Type","IsMammal","features")

In [25]:
finalized_data.show()

+----------+----+--------+--------------------+
|AnimalName|Type|IsMammal|            features|
+----------+----+--------+--------------------+
|  aardvark|   1|       1|(116,[1,99,102,10...|
|  antelope|   1|       1|(116,[2,99,102,10...|
|      bass|   4|       0|(116,[3,101,104,1...|
|      bear|   1|       1|(116,[4,99,102,10...|
|      boar|   1|       1|(116,[5,99,102,10...|
|   buffalo|   1|       1|(116,[6,99,102,10...|
|      calf|   1|       1|(116,[7,99,102,10...|
|      carp|   4|       0|(116,[8,101,104,1...|
|   catfish|   4|       0|(116,[9,101,104,1...|
|      cavy|   1|       1|(116,[10,99,102,1...|
|   cheetah|   1|       1|(116,[11,99,102,1...|
|   chicken|   2|       0|(116,[12,100,101,...|
|      chub|   4|       0|(116,[13,101,104,...|
|      clam|   7|       0|(116,[14,101,105,...|
|      crab|   7|       0|(116,[15,101,104,...|
|  crayfish|   7|       0|(116,[16,101,104,...|
|      crow|   2|       0|(116,[17,100,101,...|
|      deer|   1|       1|(116,[18,99,10

In [62]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

train_data,test_data=finalized_data.randomSplit([0.3,0.7],1234)

In [63]:
lr=LogisticRegression(featuresCol="features",labelCol="IsMammal")

In [64]:
lrModel=lr.fit(train_data)

In [65]:
prediction_result=lrModel.evaluate(test_data)

In [66]:
evaluation=BinaryClassificationEvaluator(labelCol="IsMammal")

In [67]:
prediction_result.predictions.show()

+----------+----+--------+--------------------+--------------------+--------------------+----------+
|AnimalName|Type|IsMammal|            features|       rawPrediction|         probability|prediction|
+----------+----+--------+--------------------+--------------------+--------------------+----------+
|  aardvark|   1|       1|(116,[1,99,102,10...|[-14.979290920586...|[3.12303229126101...|       1.0|
|  antelope|   1|       1|(116,[2,99,102,10...|[-13.989301051809...|[8.40472257079126...|       1.0|
|      boar|   1|       1|(116,[5,99,102,10...|[-14.419092460311...|[5.46849063796294...|       1.0|
|   buffalo|   1|       1|(116,[6,99,102,10...|[-13.989301051809...|[8.40472257079126...|       1.0|
|      calf|   1|       1|(116,[7,99,102,10...|[-15.243838401230...|[2.39709361562307...|       1.0|
|      carp|   4|       0|(116,[8,101,104,1...|[16.3123731012228...|[0.99999991765706...|       0.0|
|   catfish|   4|       0|(116,[9,101,104,1...|[17.1371190421423...|[0.99999996390527...|  

In [68]:
prediction_result.accuracy

1.0

In [69]:
lrModel.summary.accuracy

1.0

In [70]:
prediction_result.areaUnderROC

1.0

In [71]:
lrModel.coefficients

SparseVector(116, {3: -2.5323, 4: 4.1229, 15: -1.1014, 17: -2.0966, 21: -2.6673, 29: -1.4707, 34: 5.0798, 36: -2.0966, 37: -2.5323, 42: -1.8355, 45: -1.2973, 48: 5.7024, 49: 4.5447, 53: 5.7024, 58: -1.8355, 59: -3.6638, 61: -4.2376, 74: 7.5597, 76: -0.4373, 78: -1.0639, 80: -1.6767, 81: -2.2439, 86: -1.6106, 87: -3.3841, 91: -3.6638, 96: -3.1972, 98: -1.6767, 99: 5.4813, 100: -2.4525, 101: -9.5583, 102: 6.7501, 103: -2.8686, 104: -1.8594, 105: 0.4298, 106: 1.9989, 107: 0.1082, 108: 0.6608, 109: -2.8427, 110: -1.4602, 111: 0.2307, 112: -0.5602, 113: 1.2545, 114: 1.5041, 115: -1.1198})

In [72]:
lrModel.intercept

-1.7567973580125655

In [None]:
lrModel.predict([])