In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pyspark.sql import SparkSession

# DATA PREPERATION

In [3]:
#dataset = pd.read_csv('Admission_Predict.csv')
spark = SparkSession.builder.getOrCreate()
#dataset_ps = spark.createDataFrame(dataset)
df=spark.read.csv('heart.csv',inferSchema=True,header=True)
d1 = pd.DataFrame([[66,1,1,160,246,0,1,120,1,0,1,3,1]] , columns =['age',
 'sex',
 'cp',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalach',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal'] )
sp = spark.createDataFrame(d1)
d1
df.show(5,False)
df.columns

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex|cp |trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope|ca |thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|63 |1  |3  |145     |233 |1  |0      |150    |0    |2.3    |0    |0  |1   |1     |
|37 |1  |2  |130     |250 |0  |1      |187    |0    |3.5    |0    |0  |2   |1     |
|41 |0  |1  |130     |204 |0  |0      |172    |0    |1.4    |2    |0  |2   |1     |
|56 |1  |1  |120     |236 |0  |1      |178    |0    |0.8    |2    |0  |2   |1     |
|57 |0  |0  |120     |354 |0  |1      |163    |1    |0.6    |2    |0  |2   |1     |
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
only showing top 5 rows



['age',
 'sex',
 'cp',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalach',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal',
 'target']

# VECTORIZE 

In [4]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler


In [5]:
vc = VectorAssembler(inputCols = ['age',
 'sex',
 'cp',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalach',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal'] 
                     , outputCol = 'features')
features_df = vc.transform(df)
print("Feature------>")
print(features_df)
sp1=vc.transform(sp)
print("EXPERIMENTAL SP------->")
print(sp1)

Feature------>
DataFrame[age: int, sex: int, cp: int, trestbps: int, chol: int, fbs: int, restecg: int, thalach: int, exang: int, oldpeak: double, slope: int, ca: int, thal: int, target: int, features: vector]
EXPERIMENTAL SP------->
DataFrame[age: bigint, sex: bigint, cp: bigint, trestbps: bigint, chol: bigint, fbs: bigint, restecg: bigint, thalach: bigint, exang: bigint, oldpeak: bigint, slope: bigint, ca: bigint, thal: bigint, features: vector]


# Spliting model

In [6]:
model_df  = features_df.select('features','target') 
sp1_mod = features_df.select('features','target')
print(sp1_mod)
print(model_df)
train_df , test_df = model_df.randomSplit([0.7,0.3])
print("test:--")
print(test_df)
print("train:--")
print(train_df)
print("model:--")
print(model_df)


DataFrame[features: vector, target: int]
DataFrame[features: vector, target: int]
test:--
DataFrame[features: vector, target: int]
train:--
DataFrame[features: vector, target: int]
model:--
DataFrame[features: vector, target: int]


# CLASSIFICATION MODEL --- >  Logistic Regression

In [8]:
from pyspark.ml.classification import LogisticRegression
spark.version

'2.4.0'

In [9]:
nb = LogisticRegression(maxIter=10,regParam=0.3,elasticNetParam=0.8,featuresCol = 'features',labelCol='target')
nb_m = nb.fit(train_df)

In [23]:
dt_predictions = nb_m.transform(sp1_mod)
dt_predictions.select(['prediction']).show(1,False)

+----------+
|prediction|
+----------+
|1.0       |
+----------+
only showing top 1 row



In [24]:
dt_predictions = nb_m.transform(test_df)
dt_predictions.select(['prediction','target','probability']).show(20,False)

+----------+------+---------------------------------------+
|prediction|target|probability                            |
+----------+------+---------------------------------------+
|1.0       |1     |[0.4285714285714286,0.5714285714285714]|
|1.0       |1     |[0.4285714285714286,0.5714285714285714]|
|1.0       |1     |[0.4285714285714286,0.5714285714285714]|
|1.0       |0     |[0.4285714285714286,0.5714285714285714]|
|1.0       |1     |[0.4285714285714286,0.5714285714285714]|
|1.0       |1     |[0.4285714285714286,0.5714285714285714]|
|1.0       |1     |[0.4285714285714286,0.5714285714285714]|
|1.0       |0     |[0.4285714285714286,0.5714285714285714]|
|1.0       |1     |[0.4285714285714286,0.5714285714285714]|
|1.0       |1     |[0.4285714285714286,0.5714285714285714]|
|1.0       |1     |[0.4285714285714286,0.5714285714285714]|
|1.0       |0     |[0.4285714285714286,0.5714285714285714]|
|1.0       |1     |[0.4285714285714286,0.5714285714285714]|
|1.0       |1     |[0.4285714285714286,0

#  CONFUSION MATRIX 

In [25]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator()


In [28]:
#confusion matrix
true_postives = dt_predictions[(dt_predictions.target == 1) & (dt_predictions.prediction == 1)].count()
true_negatives = dt_predictions[(dt_predictions.target == 0) & (dt_predictions.prediction == 0)].count()
false_positives = dt_predictions[(dt_predictions.target == 0) & (dt_predictions.prediction == 1)].count()
false_negatives = dt_predictions[(dt_predictions.target == 1) & (dt_predictions.prediction == 0)].count()

In [30]:
print (true_postives)
print (true_negatives)
print (false_positives)
print (false_negatives)
print(true_postives+true_negatives+false_positives+false_negatives)
print (dt_predictions.count())

56
0
41
0
97
97


In [32]:
recall = float(true_postives)/(true_postives + false_negatives)
print('recall---->')
print(recall)
precision = float(true_postives) / (true_postives + false_positives)
print('precision---->')
print(precision)
accuracy=float((true_postives+true_negatives) /(dt_predictions.count()))
print('accuracy------>')
print(accuracy)

recall---->
1.0
precision---->
0.5773195876288659
accuracy------>
0.5773195876288659


# # Decision Tree Classifier

Decision trees are widely used since they are easy to interpret, handle categorical features, extend to the multi-class classification, do not require feature scaling, and are able to capture non-linearities and feature interactions.

In [33]:
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'target', maxDepth = 3)
dtModel = dt.fit(train_df)
predictions = dtModel.transform(test_df)
predictions.select('target', 'prediction', 'probability').show(10)

+------+----------+--------------------+
|target|prediction|         probability|
+------+----------+--------------------+
|     1|       1.0|[0.28571428571428...|
|     1|       1.0|[0.28571428571428...|
|     1|       1.0|[0.28571428571428...|
|     0|       0.0|[0.96078431372549...|
|     1|       1.0|[0.12345679012345...|
|     1|       1.0|[0.12345679012345...|
|     1|       1.0|[0.28571428571428...|
|     0|       0.0|[0.88888888888888...|
|     1|       1.0|[0.12345679012345...|
|     1|       1.0|[0.12345679012345...|
+------+----------+--------------------+
only showing top 10 rows



# Random Forest Classifier

In [36]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'target')
rfModel = rf.fit(train_df)
predictions = rfModel.transform(test_df)
predictions.select('target', 'prediction', 'probability').show(10)

+------+----------+--------------------+
|target|prediction|         probability|
+------+----------+--------------------+
|     1|       1.0|[0.18407024145744...|
|     1|       1.0|[0.18379644790603...|
|     1|       1.0|[0.30735436976875...|
|     0|       0.0|[0.68320485316552...|
|     1|       1.0|[0.03701892560111...|
|     1|       1.0|[0.08493559226777...|
|     1|       1.0|[0.24076775963013...|
|     0|       0.0|[0.67157151518707...|
|     1|       1.0|[0.04471123329341...|
|     1|       1.0|[0.19592436259937...|
+------+----------+--------------------+
only showing top 10 rows



# Gradient-Boosted Tree Classifier

In [38]:
from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier(maxIter=10,labelCol = 'target')
gbtModel = gbt.fit(train_df)
predictions = gbtModel.transform(test_df)
predictions.select('target', 'prediction', 'probability').show(10)

+------+----------+--------------------+
|target|prediction|         probability|
+------+----------+--------------------+
|     1|       1.0|[0.06012673056092...|
|     1|       1.0|[0.08384677258236...|
|     1|       1.0|[0.10590518741031...|
|     0|       0.0|[0.92032306215624...|
|     1|       1.0|[0.06800352292569...|
|     1|       1.0|[0.06800352292569...|
|     1|       1.0|[0.02871550721821...|
|     0|       0.0|[0.86236790596391...|
|     1|       1.0|[0.06695369239070...|
|     1|       1.0|[0.09464001356760...|
+------+----------+--------------------+
only showing top 10 rows



#  Linear Support Vector Machine


In [8]:
from pyspark.ml.classification import LinearSVC
lsvc = LinearSVC(featuresCol = 'features', labelCol = 'target')
lsvcModel = lsvc.fit(train_df)
predictions = lsvcModel.transform(test_df)
predictions.show(10)

+--------------------+------+--------------------+----------+
|            features|target|       rawPrediction|prediction|
+--------------------+------+--------------------+----------+
|(13,[0,1,3,4,7,10...|     0|[-0.4313503278604...|       1.0|
|(13,[0,1,3,4,7,10...|     1|[-1.5252154187336...|       1.0|
|(13,[0,1,3,4,7,10...|     1|[-1.3886456399779...|       1.0|
|(13,[0,3,4,7,9,10...|     1|[-0.5704845883426...|       1.0|
|(13,[0,3,4,7,9,10...|     1|[-0.7671796752333...|       1.0|
|(13,[0,3,4,7,9,11...|     0|[1.57293125998882...|       0.0|
|(13,[0,3,4,7,9,11...|     0|[4.28869630823163...|       0.0|
|[29.0,1.0,1.0,130...|     1|[-2.3184720405181...|       1.0|
|[34.0,1.0,3.0,118...|     1|[-2.9282014190577...|       1.0|
|[35.0,0.0,0.0,138...|     1|[-1.6062222734503...|       1.0|
+--------------------+------+--------------------+----------+
only showing top 10 rows



# Naive Bayes

In [10]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(featuresCol = 'features', labelCol = 'target')
nbModel = nb.fit(train_df)
predictions = nbModel.transform(test_df)
predictions.select('target', 'prediction').show(10)

+------+----------+
|target|prediction|
+------+----------+
|     0|       1.0|
|     1|       1.0|
|     1|       1.0|
|     1|       0.0|
|     1|       0.0|
|     0|       0.0|
|     0|       0.0|
|     1|       1.0|
|     1|       1.0|
|     1|       1.0|
+------+----------+
only showing top 10 rows

