In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf, col, lower, regexp_replace
file_review="/FileStore/tables/features_with_label02.txt"
sc = SparkContext.getOrCreate()
spark = SQLContext(sc)
df = spark.read.option("sep","\t").csv(file_review)
df=df.select(col("_c0").cast("int").alias("num_of_words"),col("_c1").cast("int").alias("num_of_verbs"),col("_c2").cast("float").alias("avg_word_length"),
            col("_c3").cast("float").alias("emotiveness_ratio"),col("_c4").cast("int").alias("num_of_posi"),col("_c5").cast("int").alias("num_of_nega"),
            col("_c6").cast("float").alias("sentiment"),col("_c7").cast("int").alias("label"))
from pyspark.sql.functions import when
df=df.withColumn("label",when(df.label==-1,0).otherwise(1))
df=df.toPandas()
print(len(df))
#print(df.head())

df = spark.createDataFrame(df)
#df.show(20)
#df=df[0:10000]
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from sklearn.metrics import confusion_matrix
from sklearn.datasets import load_breast_cancer
import numpy as np
import pandas as pd
features=np.array(["num_of_words","num_of_verbs","avg_word_length","emotiveness_ratio","num_of_posi","num_of_nega","sentiment"])
#print(type(features))
va=VectorAssembler(inputCols = features, outputCol='features')
va_df = va.transform(df)
va_df = va_df.select(["features","label"])
va_df.show(10)

(train, test) = va_df.randomSplit([0.7, 0.3],seed=2)
print(train.count())
print(test.count())
# train.show()
# test.show()
# test_to_pd=test.toPandas()
# print(test_to_pd.iloc[0:100,:])
lsvc = LinearSVC(labelCol="label", maxIter=100)
lsvc = lsvc.fit(train)

pred = lsvc.transform(test)
#pred.show(3)
evaluator=MulticlassClassificationEvaluator(metricName="accuracy")
accuracy = evaluator.evaluate(pred)
 
print("accuracy score: ", accuracy)

y_pred=pred.select("prediction").collect()
y_orig=pred.select("label").collect()

tp = pred[(pred.label == 1) & (pred.prediction == 1)].count()
tn = pred[(pred.label == 0) & (pred.prediction == 0)].count()
fp = pred[(pred.label == 0) & (pred.prediction == 1)].count()
fn = pred[(pred.label == 1) & (pred.prediction == 0)].count()
fake_review_count=pred[pred.label==0].count()
fake_review_02=test[test.label==0].count()
print(fake_review_count)
print(fake_review_02)
print("tp:",tp)
print("tn:",tn)
print("fp:",fp)
print("fn:",fn)

cm = confusion_matrix(y_orig, y_pred)
print("Confusion Matrix:")
print(cm) 

6682913
+--------------------+-----+
|            features|label|
+--------------------+-----+
|[17.0,2.0,4.88235...|    0|
|[118.0,21.0,5.533...|    0|
|[24.0,3.0,5.125,0...|    0|
|[24.0,3.0,5.125,0...|    0|
|[24.0,3.0,5.125,0...|    0|
|[24.0,3.0,5.125,0...|    0|
|[129.0,15.0,5.651...|    0|
|[79.0,11.0,5.3544...|    0|
|[40.0,5.0,5.69999...|    0|
|[21.0,1.0,6.23809...|    0|
+--------------------+-----+
only showing top 10 rows

4677339
2005574
accuracy score:  0.9700315221477741
60104
60104
tp: 1945470
tn: 0
fp: 60104
fn: 0
Confusion Matrix:
[[      0   60104]
 [      0 1945470]]
