In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

df=spark.read.csv('covid.csv',inferSchema=True,header=True)
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- sex: integer (nullable = true)
 |-- patient_type: integer (nullable = true)
 |-- entry_date: string (nullable = true)
 |-- date_symptoms: string (nullable = true)
 |-- date_died: string (nullable = true)
 |-- intubed: integer (nullable = true)
 |-- pneumonia: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- pregnancy: integer (nullable = true)
 |-- diabetes: integer (nullable = true)
 |-- copd: integer (nullable = true)
 |-- asthma: integer (nullable = true)
 |-- inmsupr: integer (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- other_disease: integer (nullable = true)
 |-- cardiovascular: integer (nullable = true)
 |-- obesity: integer (nullable = true)
 |-- renal_chronic: integer (nullable = true)
 |-- tobacco: integer (nullable = true)
 |-- contact_other_covid: integer (nullable = true)
 |-- covid_res: integer (nullable = true)
 |-- icu: integer (nullable = true)



In [2]:
#shape of dataset
print((df.count(),len(df.columns)))

(566602, 23)


In [3]:
from pyspark.sql.functions import col,isnan, when, count
#counting null values
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+---+---+------------+----------+-------------+---------+-------+---------+---+---------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+-------------------+---------+---+
| id|sex|patient_type|entry_date|date_symptoms|date_died|intubed|pneumonia|age|pregnancy|diabetes|copd|asthma|inmsupr|hypertension|other_disease|cardiovascular|obesity|renal_chronic|tobacco|contact_other_covid|covid_res|icu|
+---+---+------------+----------+-------------+---------+-------+---------+---+---------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+-------------------+---------+---+
|  0|  0|           0|         0|            0|        0|      0|        0|  0|        0|       0|   0|     0|      0|           0|            0|             0|      0|            0|      0|                  0|        0|  0|
+---+---+------------+----------+-------------+---------+-------+---------+---+---------+--------+--

In [4]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- sex: integer (nullable = true)
 |-- patient_type: integer (nullable = true)
 |-- entry_date: string (nullable = true)
 |-- date_symptoms: string (nullable = true)
 |-- date_died: string (nullable = true)
 |-- intubed: integer (nullable = true)
 |-- pneumonia: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- pregnancy: integer (nullable = true)
 |-- diabetes: integer (nullable = true)
 |-- copd: integer (nullable = true)
 |-- asthma: integer (nullable = true)
 |-- inmsupr: integer (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- other_disease: integer (nullable = true)
 |-- cardiovascular: integer (nullable = true)
 |-- obesity: integer (nullable = true)
 |-- renal_chronic: integer (nullable = true)
 |-- tobacco: integer (nullable = true)
 |-- contact_other_covid: integer (nullable = true)
 |-- covid_res: integer (nullable = true)
 |-- icu: integer (nullable = true)



In [5]:
numeric_features = [t[0] for t in df.dtypes if t[1] == 'int']
df.select(numeric_features).describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
sex,566602,1.50672606167998,0.4999551992349244,1,2
patient_type,566602,1.2151651423750711,0.4109372238196795,1,2
intubed,566602,76.56295247810633,39.05867551766631,1,99
pneumonia,566602,1.846262455833195,0.5609386442720459,1,99
age,566602,42.62248280097847,16.659972551727172,0,120
pregnancy,566602,50.40069219663891,47.50157914867205,1,98
diabetes,566602,2.2106328604558403,5.683522656741282,1,98
copd,566602,2.280221389970385,5.327832491046167,1,98
asthma,566602,2.2650290680230567,5.334658337099209,1,98


In [6]:
df.groupby("intubed").count().show()

+-------+------+
|intubed| count|
+-------+------+
|      1|  9965|
|     97|444689|
|      2|111824|
|     99|   124|
+-------+------+



In [7]:
df = df[df["intubed"].isin([1,2])]

In [8]:
df.groupby("intubed").count().show()

+-------+------+
|intubed| count|
+-------+------+
|      1|  9965|
|      2|111824|
+-------+------+



In [9]:
from pyspark.mllib.stat import Statistics
import pandas as pd
numeric_features = [t[0] for t in df.dtypes if t[1] != 'string']
numeric_features_df=df.select(numeric_features)
col_names =numeric_features_df.columns
features = numeric_features_df.rdd.map(lambda row: row[0:])
corr_mat=Statistics.corr(features, method="pearson")
corr_df = pd.DataFrame(corr_mat)
corr_df.index, corr_df.columns = col_names, col_names

corr_df

Unnamed: 0,sex,patient_type,intubed,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,hypertension,other_disease,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu
sex,1.0,,-0.033934,-0.052843,0.010195,0.997102,0.01313,0.013351,0.012019,0.009947,0.017193,0.011676,0.01143,0.012914,0.012469,0.005268,-0.010786,-0.048009,-0.021259
patient_type,,1.0,,,,,,,,,,,,,,,,,
intubed,-0.033934,,1.0,0.191114,-0.016501,-0.034933,-0.048555,-0.053973,-0.054472,-0.052479,-0.051198,-0.053135,-0.05325,-0.045525,-0.054441,-0.052354,0.175856,0.055252,0.320389
pneumonia,-0.052843,,0.191114,1.0,-0.130582,-0.053345,-0.004843,-0.00804,-0.007444,-0.011809,-0.003763,-0.023055,-0.008891,-0.01252,-0.006132,-0.008581,0.172095,0.128788,0.09587
age,0.010195,,-0.016501,-0.130582,1.0,0.009322,-0.010418,0.002186,0.005803,0.008124,-0.020552,-0.00403,0.001423,0.001734,0.004187,0.001365,0.094543,-0.08919,0.018917
pregnancy,0.997102,,-0.034933,-0.053345,0.009322,1.0,0.014309,0.01464,0.013546,0.011332,0.018695,0.012716,0.012637,0.014381,0.013725,0.006715,-0.012045,-0.047246,-0.021503
diabetes,0.01313,,-0.048555,-0.004843,-0.010418,0.014309,1.0,0.796668,0.811972,0.758986,0.835016,0.602933,0.775322,0.647799,0.769039,0.73313,-0.012721,-0.000448,-0.023897
copd,0.013351,,-0.053973,-0.00804,0.002186,0.01464,0.796668,1.0,0.891677,0.824024,0.828441,0.656269,0.838286,0.698399,0.833101,0.796659,-0.004146,-0.006965,-0.017158
asthma,0.012019,,-0.054472,-0.007444,0.005803,0.013546,0.811972,0.891677,1.0,0.861738,0.851225,0.67143,0.8606,0.720859,0.856113,0.817424,-0.004643,-0.006152,-0.020303
inmsupr,0.009947,,-0.052479,-0.011809,0.008124,0.011332,0.758986,0.824024,0.861738,1.0,0.809439,0.715669,0.825476,0.685236,0.816183,0.777784,-0.003188,-0.007522,0.009172


In [10]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler,StandardScaler
# encoder = OneHotEncoder().setInputCols(["intubed"]).setOutputCols(["intubed_encoded"])

# encoder_model=encoder.fit(Age_udfIndex_df)
# encoder_df=encoder_model.transform(Age_udfIndex_df)

# encoder_df.toPandas().head()

assembler = VectorAssembler().setInputCols(["intubed","pregnancy","diabetes", "asthma","obesity","tobacco","age"]).setOutputCol("vectorized_features")
        

assembler_df=assembler.transform(df)
assembler_df.head()

Row(id='167386', sex=1, patient_type=2, entry_date='06-04-2020', date_symptoms='01-04-2020', date_died='9999-99-99', intubed=2, pneumonia=2, age=54, pregnancy=2, diabetes=2, copd=2, asthma=2, inmsupr=2, hypertension=2, other_disease=2, cardiovascular=2, obesity=1, renal_chronic=2, tobacco=2, contact_other_covid=99, covid_res=1, icu=2, vectorized_features=DenseVector([2.0, 2.0, 2.0, 2.0, 1.0, 2.0, 54.0]))

In [11]:
label_indexer = StringIndexer()\
         .setInputCol ("intubed")\
         .setOutputCol ("label")

label_indexer_model=label_indexer.fit(assembler_df)
label_indexer_df=label_indexer_model.transform(assembler_df)

label_indexer_df.select("intubed","label").toPandas().head()

Unnamed: 0,intubed,label
0,2,0.0
1,2,0.0
2,2,0.0
3,2,0.0
4,2,0.0


In [12]:
scaler = StandardScaler()\
         .setInputCol ("vectorized_features")\
         .setOutputCol ("features")
        
scaler_model=scaler.fit(label_indexer_df)
scaler_df=scaler_model.transform(label_indexer_df)
pd.set_option('display.max_colwidth', 40)
scaler_df.select("vectorized_features","features").head(5)

[Row(vectorized_features=DenseVector([2.0, 2.0, 2.0, 2.0, 1.0, 2.0, 54.0]), features=DenseVector([7.2968, 0.043, 0.2804, 0.3012, 0.1442, 0.2879, 2.8075])),
 Row(vectorized_features=DenseVector([2.0, 97.0, 2.0, 2.0, 2.0, 2.0, 30.0]), features=DenseVector([7.2968, 2.0843, 0.2804, 0.3012, 0.2884, 0.2879, 1.5597])),
 Row(vectorized_features=DenseVector([2.0, 2.0, 1.0, 2.0, 2.0, 2.0, 60.0]), features=DenseVector([7.2968, 0.043, 0.1402, 0.3012, 0.2884, 0.2879, 3.1194])),
 Row(vectorized_features=DenseVector([2.0, 97.0, 1.0, 2.0, 2.0, 2.0, 47.0]), features=DenseVector([7.2968, 2.0843, 0.1402, 0.3012, 0.2884, 0.2879, 2.4435])),
 Row(vectorized_features=DenseVector([2.0, 97.0, 2.0, 2.0, 2.0, 2.0, 63.0]), features=DenseVector([7.2968, 2.0843, 0.2804, 0.3012, 0.2884, 0.2879, 3.2754]))]

In [13]:
train, test = scaler_df.randomSplit([0.8, 0.2], seed = 2018)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

Training Dataset Count: 97573
Test Dataset Count: 24216


In [14]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=5)
lrModel = lr.fit(train)
predictions = lrModel.transform(test)
#predictions_train = lrModel.transform(train)
predictions.select('label', 'features',  'rawPrediction', 'prediction', 'probability').toPandas().head(5)

Unnamed: 0,label,features,rawPrediction,prediction,probability
0,1.0,"[3.6483804380941285, 0.0429757814159...","[2.0712360033410384, -2.071236003341...",0.0,"[0.8880758755507225, 0.1119241244492..."
1,0.0,"[7.296760876188257, 0.04297578141594...","[5.123406972328286, -5.123406972328286]",0.0,"[0.9940795629373474, 0.0059204370626..."
2,0.0,"[7.296760876188257, 0.04297578141594...","[4.5051901357788, -4.5051901357788]",0.0,"[0.9890693116748779, 0.0109306883251..."
3,0.0,"[7.296760876188257, 2.08432539867332...","[2.201519188338225, -2.201519188338225]",0.0,"[0.9003858515641879, 0.0996141484358..."
4,0.0,"[7.296760876188257, 2.08432539867332...","[2.6332954637978205, -2.633295463797...",0.0,"[0.9329739208482177, 0.0670260791517..."


In [15]:
from pyspark.mllib.evaluation import MulticlassMetrics
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(predictions.count())
print("Accuracy : ", accuracy)
# Compute raw scores on the test set
from pyspark.mllib.evaluation import MulticlassMetrics

#generate model on splited dataset

results = predictions.select(['prediction', 'label'])
predictionAndLabels=results.rdd
metrics = MulticlassMetrics(predictionAndLabels)

cm=metrics.confusionMatrix().toArray()
accuracy=(cm[0][0]+cm[1][1])/cm.sum()
precision=(cm[0][0])/(cm[0][0]+cm[1][0])
recall=(cm[0][0])/(cm[0][0]+cm[0][1])
f = (2* precision * recall)/(precision+recall)
print("RandomForestClassifier: accuracy,precision,recall, f",accuracy,precision,recall, f)

Accuracy :  0.9240997687479352
RandomForestClassifier: accuracy,precision,recall, f 0.9240997687479352 0.9271934239221608 0.9954074741107609 0.9600903287445174


In [16]:
from pyspark.ml.classification import LinearSVC
lsvc = LinearSVC(maxIter=10, \
                 regParam=0.1, \
                 featuresCol="features", \
                 labelCol='label')
from pyspark.ml import Pipeline
pipeline_lsvc = Pipeline(stages=[lsvc])
pipelineModel_lsvc = pipeline_lsvc.fit(train)
predDF_lsvc = pipelineModel_lsvc.transform(test)
results = predDF_lsvc.select(['prediction', 'label'])
predictionAndLabels=results.rdd
metrics = MulticlassMetrics(predictionAndLabels)

cm=metrics.confusionMatrix().toArray()
accuracy=(cm[0][0]+cm[1][1])/cm.sum()
precision=(cm[0][0])/(cm[0][0]+cm[1][0])
recall=(cm[0][0])/(cm[0][0]+cm[0][1])
f = (2* precision * recall)/(precision+recall)
print("RandomForestClassifier: accuracy,precision,recall, f",accuracy,precision,recall, f)

RandomForestClassifier: accuracy,precision,recall, f 0.9171622068054179 0.9171622068054179 1.0 0.9567914530650928


In [17]:
import matplotlib.pyplot as plt
trainingSummary = lrModel.summary
roc = trainingSummary.roc.toPandas()
plt.plot(roc['FPR'],roc['TPR'])
plt.ylabel('False Positive Rate')
plt.xlabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
print('Training set areaUnderROC: ' + str(trainingSummary.areaUnderROC))

<Figure size 640x480 with 1 Axes>

Training set areaUnderROC: 0.9502589764497429


In [18]:
predictions.select("intubed","pregnancy","diabetes", "asthma","obesity","tobacco","age", 'label', 'rawPrediction', 'prediction', 'probability').show(10)

+-------+---------+--------+------+-------+-------+---+-----+--------------------+----------+--------------------+
|intubed|pregnancy|diabetes|asthma|obesity|tobacco|age|label|       rawPrediction|prediction|         probability|
+-------+---------+--------+------+-------+-------+---+-----+--------------------+----------+--------------------+
|      1|        2|       1|     2|      1|      2| 48|  1.0|[2.07123600334103...|       0.0|[0.88807587555072...|
|      2|        2|       2|     2|      2|      2| 34|  0.0|[5.12340697232828...|       0.0|[0.99407956293734...|
|      2|        2|       2|     2|      2|      2| 47|  0.0|[4.5051901357788,...|       0.0|[0.98906931167487...|
|      2|       97|       1|     2|      1|      2| 75|  0.0|[2.20151918833822...|       0.0|[0.90038585156418...|
|      2|       97|       1|     2|      2|      1| 66|  0.0|[2.63329546379782...|       0.0|[0.93297392084821...|
|      2|       97|       2|     2|      2|      2| 44|  0.0|[3.63480406182874..