In [None]:
!apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
!tar -xvf spark-3.1.1-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"

In [2]:
from google.colab import drive
drive.mount("/content/gdrive")
%cd '/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/LDS9_K265_TranHoangBach_Cuoi_ky'

Mounted at /content/gdrive
/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/LDS9_K265_TranHoangBach_Cuoi_ky


In [3]:
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *

import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
import pandas as pd

%matplotlib inline

In [4]:
spark = SparkSession \
            .builder \
            .master("local[*]")\
            .appName("New-Spark") \
            .config("spark.memory.fraction", 0.8) \
            .config("spark.executor.memory", "10g") \
            .config("spark.driver.memory", "10g")\
            .config("spark.sql.shuffle.partitions" , "800") \
            .config("spark.memory.offHeap.enabled",'true')\
            .config("spark.memory.offHeap.size","10g")\
            .getOrCreate()
spark

Đọc data, thêm cột label cho từng tệp data:
- Fake: 0
- Real: 1

In [5]:
folder = "data/fake-and-real-news-dataset/"
fake_df = spark.read.csv(folder+'Fake.csv', header=True, inferSchema=True)
fake_df = fake_df.withColumn('label', lit(0))

real_df = spark.read.csv(folder+'True.csv', header=True, inferSchema=True)
real_df = real_df.withColumn('label', lit(1))

df = fake_df.union(real_df)
df.show(5)

+--------------------+--------------------+-------+-----------------+-----+
|               title|                text|subject|             date|label|
+--------------------+--------------------+-------+-----------------+-----+
| Donald Trump Sen...|Donald Trump just...|   News|December 31, 2017|    0|
| Drunk Bragging T...|House Intelligenc...|   News|December 31, 2017|    0|
| Sheriff David Cl...|On Friday, it was...|   News|December 30, 2017|    0|
| Trump Is So Obse...|On Christmas day,...|   News|December 29, 2017|    0|
| Pope Francis Jus...|Pope Francis used...|   News|December 25, 2017|    0|
+--------------------+--------------------+-------+-----------------+-----+
only showing top 5 rows



Convert sang datetime format, để lấy month, year

In [6]:
def convert_datetime(x):
    from datetime import datetime
    try:
        return datetime.strptime(x, '%B %d, %Y')
    except Exception:
        return None
func = udf(lambda x: convert_datetime(x), DateType())
df = df.withColumn('date_converted', func(df.date))
df.show(5)

+--------------------+--------------------+-------+-----------------+-----+--------------+
|               title|                text|subject|             date|label|date_converted|
+--------------------+--------------------+-------+-----------------+-----+--------------+
| Donald Trump Sen...|Donald Trump just...|   News|December 31, 2017|    0|    2017-12-31|
| Drunk Bragging T...|House Intelligenc...|   News|December 31, 2017|    0|    2017-12-31|
| Sheriff David Cl...|On Friday, it was...|   News|December 30, 2017|    0|    2017-12-30|
| Trump Is So Obse...|On Christmas day,...|   News|December 29, 2017|    0|    2017-12-29|
| Pope Francis Jus...|Pope Francis used...|   News|December 25, 2017|    0|    2017-12-25|
+--------------------+--------------------+-------+-----------------+-----+--------------+
only showing top 5 rows



Kiểm tra Null values

In [7]:
print('Number of Null values')
for col_name in df.columns:
    print('\t -', col_name, ": ", df.where(df[col_name].isNull()).count())

Number of Null values
	 - title :  0
	 - text :  8
	 - subject :  8
	 - date :  8
	 - label :  0
	 - date_converted :  33666


In [8]:
df.where(col('date_converted').isNull()).show(5)

+--------------------+--------------------+--------------------+--------------------+-----+--------------+
|               title|                text|             subject|                date|label|date_converted|
+--------------------+--------------------+--------------------+--------------------+-----+--------------+
| John McCain Want...|"All Senator John...| 2017I am so disg...|              2017So|    0|          null|
| Trump Wakes Up T...|"Donald Trump thi...| 2017Disrespect i...| 2017According to...|    0|          null|
| While Honoring N...|"Former reality s...| 2017We re not su...|    but the did that|    0|          null|
| Breitbart Editor...|"A Breitbart News...| 2017Pollack prev...| both of which wo...|    0|          null|
| Trumpsters Launc...|"Senator John McC...| 2017Did @SenJohn...| 2017  @senJohnMc...|    0|          null|
+--------------------+--------------------+--------------------+--------------------+-----+--------------+
only showing top 5 rows



Có vấn đề với format của cột date, theo như df show các date không được format, có thể lọc ra để lấy feature year.

In [9]:
def extract_year(x):
    import re
    pattern = r'(\.*)(\d\d\d\d)(\.*)'
    try:
        return int(re.search(pattern, x)[0])
    except Exception:
        return None
extract_func = udf(lambda x: extract_year(x), IntegerType())
df = df.withColumn('year', extract_func(col('date')))
df.show(5)

+--------------------+--------------------+-------+-----------------+-----+--------------+----+
|               title|                text|subject|             date|label|date_converted|year|
+--------------------+--------------------+-------+-----------------+-----+--------------+----+
| Donald Trump Sen...|Donald Trump just...|   News|December 31, 2017|    0|    2017-12-31|2017|
| Drunk Bragging T...|House Intelligenc...|   News|December 31, 2017|    0|    2017-12-31|2017|
| Sheriff David Cl...|On Friday, it was...|   News|December 30, 2017|    0|    2017-12-30|2017|
| Trump Is So Obse...|On Christmas day,...|   News|December 29, 2017|    0|    2017-12-29|2017|
| Pope Francis Jus...|Pope Francis used...|   News|December 25, 2017|    0|    2017-12-25|2017|
+--------------------+--------------------+-------+-----------------+-----+--------------+----+
only showing top 5 rows



In [10]:
print('Number of Null values')
for col_name in df.columns:
    print('\t -', col_name, ": ", df.where(df[col_name].isNull()).count())

Number of Null values
	 - title :  0
	 - text :  8
	 - subject :  8
	 - date :  8
	 - label :  0
	 - date_converted :  33666
	 - year :  937


Kiểm tra số lượng không filter ra được year chỉ có 937, khá ít so với tổng số lượng mẫu, nên có thể dropna

In [11]:
df = df.dropna(subset=['year'])

In [12]:
df.groupby('year').count().show()

+----+-----+
|year|count|
+----+-----+
|2018|    1|
|2013|    3|
|1000|    1|
|2011|    1|
|8746|    1|
|2016|16124|
|7612|    1|
|2012|    1|
|2017|25370|
|2015| 2458|
|2014|    3|
|2019|    1|
|2020|    4|
+----+-----+



Tiếp tục bỏ đi các mẫu không phải là số năm

In [13]:
df = df.filter(~df.year.isin([1000, 8746, 7612]))
df.groupby('year').count().show()

+----+-----+
|year|count|
+----+-----+
|2018|    1|
|2013|    3|
|2011|    1|
|2016|16124|
|2012|    1|
|2017|25370|
|2015| 2458|
|2014|    3|
|2019|    1|
|2020|    4|
+----+-----+



In [14]:
df.groupby('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|    0|22640|
|    1|21326|
+-----+-----+



In [16]:
df.select('subject').distinct().count()

281

In [17]:
df.count()

43966

Nhận xét:
- Bài toán đặt ra thuộc nhóm Classification - Fake - Real News Prediction.
- Không có imbalanced trong biến label.
- Bài toán tập trung vào xử lý NLP
- Các biến numeric: Year, len_title, len_text, len_subject
- Các biến NLP: title, text, subject
- Có thể gộp các biến NLP thành 1 features text và từ đó xử lý

Thực hiện pre-processing data

In [18]:
from pyspark.ml.feature import VectorAssembler, OneHotEncoder, \
                                Tokenizer, StopWordsRemover, CountVectorizer, IDF, \
                                StandardScaler
from pyspark.ml.pipeline import Pipeline

df = df.select('title', 'text', 'subject', 'year', 'label')
df = df.withColumn('len_title', length(col('title')))
df = df.withColumn('len_text', length(col('text')))
df = df.withColumn('len_subject', length(col('subject')))
df = df.withColumn('all_text', concat(col('text'), lit(' '), col('title'), lit(' '), col('subject')))

onehot = OneHotEncoder(inputCols=['year'],\
                       outputCols=['year_dummy'])

tokenizer_all_text = Tokenizer(inputCol='all_text', outputCol='all_text_words')
remover_all_text = StopWordsRemover(inputCol='all_text_words', outputCol='all_text_filtered')
count_vec_all_text = CountVectorizer(inputCol='all_text_filtered', outputCol='all_text_count', vocabSize=1000)
idf_all_text = IDF(inputCol='all_text_count', outputCol='all_text_idf')

vector_assembler = VectorAssembler(inputCols=['year_dummy',
                                              'len_title', 'len_text', 'len_subject', \
                                            #   'title_idf', 'text_idf', 'subject_idf', \
                                              'all_text_idf'], outputCol='non_scale_features')

scaler = StandardScaler(inputCol="non_scale_features", outputCol="features")

pre_process_pipeline = Pipeline(stages=[onehot, \
                                        tokenizer_all_text, remover_all_text, count_vec_all_text, idf_all_text, \
                                        vector_assembler, scaler])

pre_process_pipeline_fit = pre_process_pipeline.fit(df)
final_df = pre_process_pipeline_fit.transform(df)

train_df, test_df = final_df.randomSplit([0.8, 0.2], seed=42)
final_df.show(5)

+--------------------+--------------------+-------+----+-----+---------+--------+-----------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|               title|                text|subject|year|label|len_title|len_text|len_subject|            all_text|         year_dummy|      all_text_words|   all_text_filtered|      all_text_count|        all_text_idf|  non_scale_features|            features|
+--------------------+--------------------+-------+----+-----+---------+--------+-----------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| Donald Trump Sen...|Donald Trump just...|   News|2017|    0|       79|    2893|          4|Donald Trump just...|(2020,[2017],[1.0])|[donald, trump, j...|[donald, trump, c...|(1000,[0,1,4,5,6,...|(1000,[0,1,4,5,6,...

Thực hiện build models

In [19]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, LogisticRegression, LinearSVC, MultilayerPerceptronClassifier
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

list_model = [('Logistic Regression', LogisticRegression()),
              ('Decision Tree', DecisionTreeClassifier(seed=1)), 
              ('Random Forest', RandomForestClassifier(seed=1)), 
              ('Gradient Boosting', GBTClassifier(seed=1)),
              ('Linear SVC', LinearSVC()),
            #   ('Multilayer Perception', MultilayerPerceptronClassifier(maxIter=100, layers=[5, 4, 3], blockSize=128, seed=1)),
]

for model_name, model in list_model:
    trained_model = model.fit(train_df)
    predictions = trained_model.transform(test_df)
    predictions = predictions.withColumn('label', predictions.label.cast(DoubleType()))

    prediction_and_label = predictions.select(['prediction', 'label'])
    metrics = MulticlassMetrics(prediction_and_label.rdd)
    evaluator = MulticlassClassificationEvaluator()
    print('-'*30)
    print("\033[1m" + model_name + "\033[0m")
    print('')
    print('  Accuracy \t\t: {:.4f}'.format(metrics.accuracy))
    print('  Precisions (label=1)\t: {:.4f}'.format(metrics.precision(label=1)))
    print('  Recall (label=1)\t: {:.4f}'.format(metrics.recall(label=1)))
    print('  f1_score (label=1)\t: {:.4f}'.format(metrics.fMeasure(label=1.0)))
    print('  AUC \t\t\t: {:.4f}'.format(evaluator.evaluate(predictions)))
    result_confusion_matrix = pd.DataFrame(metrics.confusionMatrix().toArray(), columns=['Predict Neg', 'Predict Pos'], index=['Actual Neg', 'Actual Pos'])
    display(result_confusion_matrix)

------------------------------
[1mLogistic Regression[0m

  Accuracy 		: 0.9997
  Precisions (label=1)	: 1.0000
  Recall (label=1)	: 0.9993
  f1_score (label=1)	: 0.9996
  AUC 			: 0.9997


Unnamed: 0,Predict Neg,Predict Pos
Actual Neg,4427.0,0.0
Actual Pos,3.0,4194.0


------------------------------
[1mDecision Tree[0m

  Accuracy 		: 0.9998
  Precisions (label=1)	: 0.9998
  Recall (label=1)	: 0.9998
  f1_score (label=1)	: 0.9998
  AUC 			: 0.9998


Unnamed: 0,Predict Neg,Predict Pos
Actual Neg,4426.0,1.0
Actual Pos,1.0,4196.0


------------------------------
[1mRandom Forest[0m

  Accuracy 		: 0.9999
  Precisions (label=1)	: 1.0000
  Recall (label=1)	: 0.9998
  f1_score (label=1)	: 0.9999
  AUC 			: 0.9999


Unnamed: 0,Predict Neg,Predict Pos
Actual Neg,4427.0,0.0
Actual Pos,1.0,4196.0


------------------------------
[1mGradient Boosting[0m

  Accuracy 		: 0.9998
  Precisions (label=1)	: 0.9998
  Recall (label=1)	: 0.9998
  f1_score (label=1)	: 0.9998
  AUC 			: 0.9998


Unnamed: 0,Predict Neg,Predict Pos
Actual Neg,4426.0,1.0
Actual Pos,1.0,4196.0


------------------------------
[1mLinear SVC[0m

  Accuracy 		: 1.0000
  Precisions (label=1)	: 1.0000
  Recall (label=1)	: 1.0000
  f1_score (label=1)	: 1.0000
  AUC 			: 1.0000


Unnamed: 0,Predict Neg,Predict Pos
Actual Neg,4427.0,0.0
Actual Pos,0.0,4197.0


- Model Linear SVC hoạt động hoàn hảo đối với data này, predict chính xác 100%
- Các model còn lại cũng có độ chính xác rất cao, trên 99.9%