In [1]:
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
import pandas as pd
from pyspark.sql import *
spark = SparkSession.builder.appName('Data_model').getOrCreate()
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.types import StructType, StructField, IntegerType

In [3]:
normalizer_df_pandas = pd.read_csv('/home/ubuntu/BDAS_yang124/Datasets/normalizer_df_pandas.csv')
normalizer_df = spark.createDataFrame(normalizer_df_pandas)

In [4]:
final_data = normalizer_df.selectExpr("`norm_features` as features","`PrivateIndex` as label")

final_data.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[7.41517510961329...|  0.0|
|[7.69028602067676...|  0.0|
|[7.26473017792986...|  0.0|
|[6.53669159759130...|  0.0|
|[6.53669159759130...|  0.0|
|[6.53669159759130...|  0.0|
|[6.53669159759130...|  0.0|
|[7.54960916515453...|  0.0|
|[6.94601399109922...|  0.0|
|[6.53669159759130...|  0.0|
|[7.45760928971560...|  0.0|
|[7.88344635413774...|  0.0|
|[7.07326971745971...|  0.0|
|[7.14519613499717...|  0.0|
|[6.53669159759130...|  0.0|
|[7.25911612809710...|  0.0|
|[8.3670677328386,...|  0.0|
|[7.10414409298752...|  0.0|
|[7.03085747611612...|  0.0|
|[8.17216445211190...|  1.0|
+--------------------+-----+
only showing top 20 rows



In [5]:
from pyspark.ml.classification import DecisionTreeClassifier,RandomForestClassifier,LogisticRegression,NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [6]:
train_data,test_data = final_data.randomSplit([0.7,0.3])

In [7]:
lr = LogisticRegression(maxIter=10)
rfc = RandomForestClassifier(numTrees=10)
dtc = DecisionTreeClassifier()
nb = NaiveBayes()
lr_original_model=lr.fit(train_data)
rfc_original_model=rfc.fit(train_data)
dtc_original_model=dtc.fit(train_data)
nb_original_model=nb.fit(train_data)

lr_original_predictions=lr_original_model.transform(test_data)
rfc_original_predictions=rfc_original_model.transform(test_data)
dtc_original_predictions=dtc_original_model.transform(test_data)
nb_original_predictions=nb_original_model.transform(test_data)

from pyspark.ml.evaluation import BinaryClassificationEvaluator

my_binary_eval = BinaryClassificationEvaluator()

print("DTC")
print(my_binary_eval.evaluate(dtc_original_predictions))


print("RFC")
print(my_binary_eval.evaluate(rfc_original_predictions))

print("LR")
print(my_binary_eval.evaluate(lr_original_predictions))

print("NB")
print(my_binary_eval.evaluate(nb_original_predictions))



IllegalArgumentException: 'requirement failed: Column features must be of type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 but was actually StringType.'

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
acc_evaluator = MulticlassClassificationEvaluator( predictionCol="prediction", metricName="accuracy")
dtc_acc = acc_evaluator.evaluate(dtc_original_predictions)
rfc_acc = acc_evaluator.evaluate(rfc_original_predictions)
lr_acc = acc_evaluator.evaluate(lr_original_predictions)
nb_acc = acc_evaluator.evaluate(nb_original_predictions)
print("Here are the results!")
print('-'*40)
print('A single decision tree has an accuracy of: {0:2.2f}%'.format(dtc_acc*100))
print('-'*40)
print('A random forest ensemble has an accuracy of: {0:2.2f}%'.format(rfc_acc*100))
print('-'*40)
print('An ensemble using LR has an accuracy of: {0:2.2f}%'.format(lr_acc*100))
print('-'*40)
print('An ensemble using NB has an accuracy of: {0:2.2f}%'.format(nb_acc*100))

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[lr])

paramGrid = ParamGridBuilder().addGrid(lr.elasticNetParam,[0.1,0.9]).addGrid(lr.regParam,[0.1, 0.01]).build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=10)  

lr_best_model = crossval.fit(train_data)
params = lr_best_model.getEstimatorParamMaps()
avgMetrics = lr_best_model.avgMetrics
all_params = list(zip(params, avgMetrics))
best_param = sorted(all_params, key=lambda x: x[1], reverse=True)[0]

print(best_param)

# prediction = lr_best_model.transform(test_data)
# prediction.show()


In [None]:
dtc = DecisionTreeClassifier()
pipeline = Pipeline(stages=[dtc])
paramGrid = ParamGridBuilder().addGrid(dtc.maxDepth,[5, 10]).addGrid(dtc.minInfoGain,[0, 1]).build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=10)  
#  maxDepth  minInfoGain
dtc_best_Model = crossval.fit(train_data)


params = dtc_best_Model.getEstimatorParamMaps()
avgMetrics = dtc_best_Model.avgMetrics
all_params = list(zip(params, avgMetrics))
best_param = sorted(all_params, key=lambda x: x[1], reverse=True)[0]
# best_param.show()
print(best_param)

# prediction = dtc_best_Model.transform(test_data)
# prediction.show()

In [None]:
rfc = RandomForestClassifier(numTrees=10)
pipeline = Pipeline(stages=[rfc])
paramGrid = ParamGridBuilder().addGrid(dtc.maxDepth,[5, 10]).addGrid(dtc.minInfoGain,[0, 1]).build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=10)  
# maxBins  maxDepth  minInfoGain
rfc_best_Model = crossval.fit(train_data)
# cvModel.extractParamMap

params = rfc_best_Model.getEstimatorParamMaps()
avgMetrics = rfc_best_Model.avgMetrics
all_params = list(zip(params, avgMetrics))
best_param = sorted(all_params, key=lambda x: x[1], reverse=True)[0]
# best_param.show()
print(best_param)

# prediction = rfc_best_Model.transform(test_data)
# prediction.show()

In [None]:
nb = NaiveBayes()
pipeline = Pipeline(stages=[nb])
paramGrid = ParamGridBuilder().addGrid(nb.smoothing,[1, 10]).build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=10)  
# maxBins  maxDepth  minInfoGain
nb_best_Model = crossval.fit(train_data)
# cvModel.extractParamMap

params = nb_best_Model.getEstimatorParamMaps()
avgMetrics = nb_best_Model.avgMetrics
all_params = list(zip(params, avgMetrics))
best_param = sorted(all_params, key=lambda x: x[1], reverse=True)[0]
# best_param.show()
print(best_param)

# prediction = nb_best_Model.transform(test_data)
# prediction.show()