In [1]:
%load_ext autoreload
%autoreload 2

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [3]:
spark = SparkSession.builder.appName("").getOrCreate()

In [6]:
fake_df = spark.read.csv("../data/face-and-real-news-dataset/Fake.csv", header=True, inferSchema=True)
fake_df = fake_df.withColumn('label', lit(0))

real_df = spark.read.csv("../data/face-and-real-news-dataset/True.csv", header=True, inferSchema=True)
real_df = real_df.withColumn('label', lit(1))

df = fake_df.union(real_df)
df.show(5)

+--------------------+--------------------+-------+-----------------+-----+
|               title|                text|subject|             date|label|
+--------------------+--------------------+-------+-----------------+-----+
| Donald Trump Sen...|Donald Trump just...|   News|December 31, 2017|    0|
| Drunk Bragging T...|House Intelligenc...|   News|December 31, 2017|    0|
| Sheriff David Cl...|On Friday, it was...|   News|December 30, 2017|    0|
| Trump Is So Obse...|On Christmas day,...|   News|December 29, 2017|    0|
| Pope Francis Jus...|Pope Francis used...|   News|December 25, 2017|    0|
+--------------------+--------------------+-------+-----------------+-----+
only showing top 5 rows



In [8]:
df.count()

44906

In [9]:
from modules.utils import *

In [10]:
# def dropEmptyCell(pDf: pyspark.sql.DataFrame, pFeatures: List[str]):
df = dropEmptyCell(df, ['title', 'text'])

In [11]:
df.count()

44898

In [14]:
df.show(5)

+--------------------+--------------------+-------+-----------------+-----+
|               title|                text|subject|             date|label|
+--------------------+--------------------+-------+-----------------+-----+
| Donald Trump Sen...|Donald Trump just...|   News|December 31, 2017|    0|
| Drunk Bragging T...|House Intelligenc...|   News|December 31, 2017|    0|
| Sheriff David Cl...|On Friday, it was...|   News|December 30, 2017|    0|
| Trump Is So Obse...|On Christmas day,...|   News|December 29, 2017|    0|
| Pope Francis Jus...|Pope Francis used...|   News|December 25, 2017|    0|
+--------------------+--------------------+-------+-----------------+-----+
only showing top 5 rows



In [16]:
df = df.withColumn('all_text', concat(col('title'), lit(' '), col('text')))

In [17]:
from pyspark.ml.feature import VectorAssembler, RegexTokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml.pipeline import Pipeline


tokenizer = RegexTokenizer(inputCol='all_text', outputCol='all_text_tok', pattern="\\W")
stopwordremover = StopWordsRemover(inputCol='all_text_tok', outputCol='all_text_stp')
countvectorizer = CountVectorizer(inputCol='all_text_stp', outputCol='all_text_cnt')
idf = IDF(inputCol='all_text_cnt', outputCol='all_text_idf')
assembler = VectorAssembler(inputCols=['all_text_idf'], outputCol='features')


pipeline = Pipeline(stages=[tokenizer, stopwordremover, countvectorizer, idf, assembler])

pipeline_model = pipeline.fit(df)
final_df = pipeline_model.transform(df)

train_df, test_df = final_df.randomSplit([0.8, 0.2])
final_df.show(5)

+--------------------+--------------------+-------+-----------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|               title|                text|subject|             date|label|            all_text|        all_text_tok|        all_text_stp|        all_text_cnt|        all_text_idf|            features|
+--------------------+--------------------+-------+-----------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| Donald Trump Sen...|Donald Trump just...|   News|December 31, 2017|    0| Donald Trump Sen...|[donald, trump, s...|[donald, trump, s...|(1000,[0,3,4,5,7,...|(1000,[0,3,4,5,7,...|(1000,[0,3,4,5,7,...|
| Drunk Bragging T...|House Intelligenc...|   News|December 31, 2017|    0| Drunk Bragging T...|[drunk, bragging,...|[drunk, bragging,...|(1000,[0,1,3,7,10...|(1000,[0,1,3,7,10...|(1000,[0,1,3

In [19]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, LogisticRegression, LinearSVC
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.types import *

list_model = [('Logistic Regression', LogisticRegression()),
              ('Decision Tree', DecisionTreeClassifier()), 
              ('Random Forest', RandomForestClassifier()), 
              ('Gradient Boosting', GBTClassifier()),
              ('Linear SVC', LinearSVC())
]

for model_name, model in list_model:
    trained_model = model.fit(train_df)
    predictions = trained_model.transform(test_df)

    predictions = predictions.withColumn('label', predictions.label.cast(DoubleType()))
    prediction_and_label = predictions.select(['prediction', 'label'])


    metrics = MulticlassMetrics(prediction_and_label.rdd)
    evaluator = MulticlassClassificationEvaluator()


    print("\n\n")
    print('  Accuracy \t\t: {:.4f}'.format(metrics.accuracy))
    print('  Precisions (label=1)\t: {:.4f}'.format(metrics.precision(label=1)))
    print('  Recall (label=1)\t: {:.4f}'.format(metrics.recall(label=1)))
    print('  f1_score (label=1)\t: {:.4f}'.format(metrics.fMeasure(label=1.0)))
    print('  AUC \t\t\t: {:.4f}'.format(evaluator.evaluate(predictions)))
    result_confusion_matrix = pd.DataFrame(metrics.confusionMatrix().toArray(), columns=['Predict Neg', 'Predict Pos'], index=['Actual Neg', 'Actual Pos'])
    display(result_confusion_matrix)




  Accuracy 		: 0.9879
  Precisions (label=1)	: 0.9909
  Recall (label=1)	: 0.9838
  f1_score (label=1)	: 0.9874
  AUC 			: 0.9879


Unnamed: 0,Predict Neg,Predict Pos
Actual Neg,4657.0,39.0
Actual Pos,70.0,4258.0





  Accuracy 		: 0.9947
  Precisions (label=1)	: 0.9913
  Recall (label=1)	: 0.9977
  f1_score (label=1)	: 0.9945
  AUC 			: 0.9947


Unnamed: 0,Predict Neg,Predict Pos
Actual Neg,4658.0,38.0
Actual Pos,10.0,4318.0





  Accuracy 		: 0.9928
  Precisions (label=1)	: 0.9903
  Recall (label=1)	: 0.9947
  f1_score (label=1)	: 0.9925
  AUC 			: 0.9928


Unnamed: 0,Predict Neg,Predict Pos
Actual Neg,4654.0,42.0
Actual Pos,23.0,4305.0





  Accuracy 		: 0.9962
  Precisions (label=1)	: 0.9942
  Recall (label=1)	: 0.9979
  f1_score (label=1)	: 0.9961
  AUC 			: 0.9962


Unnamed: 0,Predict Neg,Predict Pos
Actual Neg,4671.0,25.0
Actual Pos,9.0,4319.0





  Accuracy 		: 0.9942
  Precisions (label=1)	: 0.9949
  Recall (label=1)	: 0.9931
  f1_score (label=1)	: 0.9940
  AUC 			: 0.9942


Unnamed: 0,Predict Neg,Predict Pos
Actual Neg,4674.0,22.0
Actual Pos,30.0,4298.0


> **Nhận xét**:
> * Các model dự đoán đều tốt nhưng với chi phí của Grdient Boosting và Random forest là quá lớn, nên decision tree là phù hợp rồi