![](../images/FE_01.png)

# 1. Đọc dữ liệu

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName("Fake_and_Real_News").getOrCreate()

In [4]:
fake_news = spark.read.csv("../data/face-and-real-news-dataset/Fake.csv", inferSchema=True, header=True)
real_news = spark.read.csv("../data/face-and-real-news-dataset/True.csv", inferSchema=True, header=True)

* Tạo feature mới là `label` với 1 là **fake-news** và 0 là **real-news** 

In [5]:
from pyspark.sql.functions import lit

In [6]:
fake_news = fake_news.withColumn('label', lit(1))
real_news = real_news.withColumn('label', lit(0))

In [7]:
fake_news.show(5)

+--------------------+--------------------+-------+-----------------+-----+
|               title|                text|subject|             date|label|
+--------------------+--------------------+-------+-----------------+-----+
| Donald Trump Sen...|Donald Trump just...|   News|December 31, 2017|    1|
| Drunk Bragging T...|House Intelligenc...|   News|December 31, 2017|    1|
| Sheriff David Cl...|On Friday, it was...|   News|December 30, 2017|    1|
| Trump Is So Obse...|On Christmas day,...|   News|December 29, 2017|    1|
| Pope Francis Jus...|Pope Francis used...|   News|December 25, 2017|    1|
+--------------------+--------------------+-------+-----------------+-----+
only showing top 5 rows



In [8]:
real_news.show(5)

+--------------------+--------------------+------------+------------------+-----+
|               title|                text|     subject|              date|label|
+--------------------+--------------------+------------+------------------+-----+
|As U.S. budget fi...|WASHINGTON (Reute...|politicsNews|December 31, 2017 |    0|
|U.S. military to ...|WASHINGTON (Reute...|politicsNews|December 29, 2017 |    0|
|Senior U.S. Repub...|WASHINGTON (Reute...|politicsNews|December 31, 2017 |    0|
|FBI Russia probe ...|WASHINGTON (Reute...|politicsNews|December 30, 2017 |    0|
|Trump wants Posta...|SEATTLE/WASHINGTO...|politicsNews|December 29, 2017 |    0|
+--------------------+--------------------+------------+------------------+-----+
only showing top 5 rows



# 2. Tiền xử lí dữ liệu
## 2.1. Tìm hiểu sơ bộ về dữ liệu

In [9]:
fake_news.printSchema()

root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)
 |-- label: integer (nullable = false)



In [10]:
real_news.printSchema()

root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)
 |-- label: integer (nullable = false)



In [11]:
fake_news.count(), real_news.count()

(23489, 21417)

> **Nhận xét:**
> * Dữ liệu khá cân bằng

## 2.2. Lựa chọn thuộc tính

> * Nhìn chung, ta thấy:
>   * Target variable: `label`
>   * Predictor variables: `title`, `text`, `subject`

In [12]:
features = ['title', 'text', 'subject', 'label']

In [13]:
fake_news_sub = fake_news.select(features)
real_news_sub = real_news.select(features)

In [14]:
fake_news_sub.show(5)

+--------------------+--------------------+-------+-----+
|               title|                text|subject|label|
+--------------------+--------------------+-------+-----+
| Donald Trump Sen...|Donald Trump just...|   News|    1|
| Drunk Bragging T...|House Intelligenc...|   News|    1|
| Sheriff David Cl...|On Friday, it was...|   News|    1|
| Trump Is So Obse...|On Christmas day,...|   News|    1|
| Pope Francis Jus...|Pope Francis used...|   News|    1|
+--------------------+--------------------+-------+-----+
only showing top 5 rows



In [15]:
real_news_sub.show(5)

+--------------------+--------------------+------------+-----+
|               title|                text|     subject|label|
+--------------------+--------------------+------------+-----+
|As U.S. budget fi...|WASHINGTON (Reute...|politicsNews|    0|
|U.S. military to ...|WASHINGTON (Reute...|politicsNews|    0|
|Senior U.S. Repub...|WASHINGTON (Reute...|politicsNews|    0|
|FBI Russia probe ...|WASHINGTON (Reute...|politicsNews|    0|
|Trump wants Posta...|SEATTLE/WASHINGTO...|politicsNews|    0|
+--------------------+--------------------+------------+-----+
only showing top 5 rows



## 2.3. Xem xét missing observation

In [16]:
from pyspark.sql.functions import *

In [17]:
tmp = fake_news_sub.select([count(when((col(c) == '') | col(c).isNull() | isnan(c), c)).alias(c + " (% missing)") for c in fake_news_sub.columns]).toPandas()
tmp = tmp / fake_news_sub.count()

In [18]:
tmp

Unnamed: 0,title (% missing),text (% missing),subject (% missing),label (% missing)
0,0.0,0.000341,0.000341,0.0


In [19]:
tmp2 = real_news_sub.select([count(when((col(c) == '') | col(c).isNull() | isnan(c), c)).alias(c + " (% missing)") for c in real_news_sub.columns]).toPandas()
tmp2 = tmp2 / real_news_sub.count()

In [20]:
tmp2

Unnamed: 0,title (% missing),text (% missing),subject (% missing),label (% missing)
0,0.0,0.0,0.0,0.0


> **Nhận xét:**
> * Có missing observation $\Rightarrow$ xóa

In [21]:
fake_news_sub = fake_news_sub.filter(~col('text').isNull() & ~(col('text') == "") & ~(isnan('text')))
fake_news_sub = fake_news_sub.filter(~col('subject').isNull() & ~(col('subject') == "") & ~(isnan('subject')))

In [22]:
tmp = fake_news_sub.select([count(when((col(c) == '') | col(c).isNull() | isnan(c), c)).alias(c + " (% missing)") for c in fake_news_sub.columns]).toPandas()
tmp = tmp / fake_news_sub.count()

In [23]:
tmp

Unnamed: 0,title (% missing),text (% missing),subject (% missing),label (% missing)
0,0.0,0.0,0.0,0.0


In [24]:
fake_news_sub.count(), real_news_sub.count()

(23481, 21417)

## 2.4. Xóa các duplicated observation

In [25]:
1 - (fake_news_sub.distinct().count() / fake_news_sub.count())

0.011285720369660579

> **Nhận xét**:
> * Có các duplicated observation $\Rightarrow$ xóa

In [26]:
fake_news_sub = fake_news_sub.dropDuplicates()

In [27]:
1 - (fake_news_sub.distinct().count() / fake_news_sub.count())

0.0

In [28]:
1 - (real_news_sub.distinct().count() / real_news_sub.count())

0.009758602978941933

In [29]:
real_news_sub = real_news_sub.dropDuplicates()

In [30]:
1 - (real_news_sub.distinct().count() / real_news_sub.count())

0.0

> **Nhận xét:**
> * OKLA

# 3. Chuyển dữ liệu

In [31]:
from pyspark.sql.functions import col, concat

In [32]:
# fake_news_sub = fake_news_sub.withColumn('title_text', concat(fake_news_sub['title'], concat(lit(' '), fake_news_sub['text']))).select('title_text', 'subject', 'label')

fake_news_sub = fake_news_sub.select('text', 'subject', 'label')

In [33]:
fake_news_sub.head(1)

[Row(text='When Sen. Al Franken (D-MN) announced his plans to resign Thursday, he specifically called out Donald Trump over the Access Hollywood video and Roy Moore, an alleged pedophile who is running for the Senate seat in Alabama with the GOP s blessing. Donald Trump Jr., not known for being a brainiac, decided to mock Franken on Twitter as if he didn t listen to the Democrat s amazing speech. Junior invoked one of the former comedian s  Saturday Night Live  most well-known characters, writing,  because I m good enough, I m smart enough, and God-darnit people like me  and included the hashtag  #Franken. because I m good enough, I m smart enough, and God-darnit people like me. #Franken  Donald Trump Jr. (@DonaldJTrumpJr) December 7, 2017Twitter gave Junior a wake-up call:pic.twitter.com/NmuRm5MgMz  liberalgranny50 (@peppersandeggs) December 7, 2017I am sure daddy @realDonaldTrump can sympathize as he is a fellow sexual predator  pic.twitter.com/Qesftp1u28  Matt Slavin (@tHemAttsLavin

In [34]:
# real_news_sub = real_news_sub.withColumn('title_text', concat(real_news_sub['title'], concat(lit(' '), real_news_sub['text']))).select('title_text', 'subject', 'label')
real_news_sub = real_news_sub.select('text', 'subject', 'label')

In [35]:
real_news_sub.columns

['text', 'subject', 'label']

In [36]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer,VectorAssembler

In [37]:
indexer = StringIndexer(inputCol='subject', outputCol='subject_idx')

In [38]:
from pyspark.ml import Pipeline

In [39]:
# for c in ['title_text']:
for c in ['text']:
    reg_tokenizer = RegexTokenizer(inputCol=c, outputCol=(c + '_tok'), pattern="\\W")
    stopremove = StopWordsRemover(inputCol=(c + '_tok'), outputCol=(c + '_stp'))
    count_vec = CountVectorizer(inputCol=(c + '_stp'), outputCol=(c + '_cvt'))
    idf = IDF(inputCol=(c + '_cvt'), outputCol=(c + '_idf'))
    pipeline = Pipeline(stages=[reg_tokenizer, stopremove, count_vec, idf])
    fake_news_sub = pipeline.fit(fake_news_sub).transform(fake_news_sub)

In [40]:
fake_news_sub = indexer.fit(fake_news_sub).transform(fake_news_sub)

In [41]:
# for c in ['title_text']:
for c in ['text']:
    reg_tokenizer = RegexTokenizer(inputCol=c, outputCol=(c + '_tok'), pattern="\\W")
    stopremove = StopWordsRemover(inputCol=(c + '_tok'), outputCol=(c + '_stp'))
    count_vec = CountVectorizer(inputCol=(c + '_stp'), outputCol=(c + '_cvt'))
    idf = IDF(inputCol=(c + '_cvt'), outputCol=(c + '_idf'))
    pipeline = Pipeline(stages=[reg_tokenizer, stopremove, count_vec, idf])
    real_news_sub = pipeline.fit(real_news_sub).transform(real_news_sub)

In [42]:
real_news_sub = indexer.fit(real_news_sub).transform(real_news_sub)

In [43]:
fake_news_sub.show(5)

+--------------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+-----------+
|                text|             subject|label|            text_tok|            text_stp|            text_cvt|            text_idf|subject_idx|
+--------------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+-----------+
|When Sen. Al Fran...|                News|    1|[when, sen, al, f...|[sen, al, franken...|(90715,[0,1,3,4,7...|(90715,[0,1,3,4,7...|        0.0|
|"On Tuesday after...| 2017I have no id...|    1|[on, tuesday, aft...|[tuesday, afterno...|(90715,[0,4,8,11,...|(90715,[0,4,8,11,...|      392.0|
|Before dawn on We...|                News|    1|[before, dawn, on...|[dawn, wednesday,...|(90715,[0,1,2,3,4...|(90715,[0,1,2,3,4...|        0.0|
|Donald Trump s so...|                News|    1|[donald, trump, s...|[donald, trump, s...|(90715,[0,1,5,7,8...|(90715,[0,1,

In [44]:
real_news_sub.show(5)

+--------------------+------------+-----+--------------------+--------------------+--------------------+--------------------+-----------+
|                text|     subject|label|            text_tok|            text_stp|            text_cvt|            text_idf|subject_idx|
+--------------------+------------+-----+--------------------+--------------------+--------------------+--------------------+-----------+
|WASHINGTON (Reute...|politicsNews|    0|[washington, reut...|[washington, reut...|(66256,[0,1,2,3,4...|(66256,[0,1,2,3,4...|        0.0|
|WASHINGTON (Reute...|politicsNews|    0|[washington, reut...|[washington, reut...|(66256,[0,1,3,4,8...|(66256,[0,1,3,4,8...|        0.0|
| WASHINGTON (Reut...|politicsNews|    0|[washington, reut...|[washington, reut...|(66256,[0,1,2,3,4...|(66256,[0,1,2,3,4...|        0.0|
|(Reuters) - The c...|politicsNews|    0|[reuters, the, co...|[reuters, communi...|(66256,[0,1,2,3,4...|(66256,[0,1,2,3,4...|        0.0|
|WASHINGTON (Reute...|politicsNews

# 4. Ghi ra file parquet

In [45]:
fake_news_sub.write.parquet("../data/face-and-real-news-dataset/fake_news_clean_data.parquet")

In [46]:
real_news_sub.write.parquet("../data/face-and-real-news-dataset/real_news_clean_data.parquet")