In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf

sc = SparkContext(master = 'local', appName = 'fake and real news')

spark = SparkSession(sc)

In [2]:
fake = spark.read.csv('fake-and-real-news-dataset/Fake.csv', inferSchema = True, header = True)
fake.printSchema()

root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)



In [3]:
fake.show(5)

+--------------------+--------------------+-------+-----------------+
|               title|                text|subject|             date|
+--------------------+--------------------+-------+-----------------+
| Donald Trump Sen...|Donald Trump just...|   News|December 31, 2017|
| Drunk Bragging T...|House Intelligenc...|   News|December 31, 2017|
| Sheriff David Cl...|On Friday, it was...|   News|December 30, 2017|
| Trump Is So Obse...|On Christmas day,...|   News|December 29, 2017|
| Pope Francis Jus...|Pope Francis used...|   News|December 25, 2017|
+--------------------+--------------------+-------+-----------------+
only showing top 5 rows



In [4]:
real = spark.read.csv('fake-and-real-news-dataset/True.csv', inferSchema = True, header = True)
real.printSchema()

root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)



In [5]:
real.show(5)

+--------------------+--------------------+------------+------------------+
|               title|                text|     subject|              date|
+--------------------+--------------------+------------+------------------+
|As U.S. budget fi...|WASHINGTON (Reute...|politicsNews|December 31, 2017 |
|U.S. military to ...|WASHINGTON (Reute...|politicsNews|December 29, 2017 |
|Senior U.S. Repub...|WASHINGTON (Reute...|politicsNews|December 31, 2017 |
|FBI Russia probe ...|WASHINGTON (Reute...|politicsNews|December 30, 2017 |
|Trump wants Posta...|SEATTLE/WASHINGTO...|politicsNews|December 29, 2017 |
+--------------------+--------------------+------------+------------------+
only showing top 5 rows



In [6]:
fake.count()

23489

In [7]:
real.count()

21417

* Check whether 2 datasets have null values

In [8]:
from pyspark.sql.functions import *
real.select([count(when(isnan(col), col)).alias(col) for col in real.columns]).show()

+-----+----+-------+----+
|title|text|subject|date|
+-----+----+-------+----+
|    0|   0|      0|   0|
+-----+----+-------+----+



In [9]:
fake.select([count(when(isnan(col), col)).alias(col) for col in fake.columns]).show()

+-----+----+-------+----+
|title|text|subject|date|
+-----+----+-------+----+
|    0|   0|      0|   0|
+-----+----+-------+----+



In [10]:
real.select([count(when(isnull(col), col)).alias(col) for col in real.columns]).show()

+-----+----+-------+----+
|title|text|subject|date|
+-----+----+-------+----+
|    0|   0|      0|   0|
+-----+----+-------+----+



In [11]:
fake.select([count(when(isnull(col), col)).alias(col) for col in fake.columns]).show()

+-----+----+-------+----+
|title|text|subject|date|
+-----+----+-------+----+
|    0|   8|      8|   8|
+-----+----+-------+----+



In [12]:
fake.count() - fake.dropna(how = 'any').count()

8

==> The result demonstrate that 8 rows have null values on 3 columns simultaneously 

*Clear all 8 null rows

In [13]:
fake = fake.dropna(how = 'any')

*Creating 2 new label columns for fake and real before combining them into 1 dataset

In [14]:
real = real.withColumn('label', lit('real'))
real.printSchema()

root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)
 |-- label: string (nullable = false)



In [15]:
real.show(5)

+--------------------+--------------------+------------+------------------+-----+
|               title|                text|     subject|              date|label|
+--------------------+--------------------+------------+------------------+-----+
|As U.S. budget fi...|WASHINGTON (Reute...|politicsNews|December 31, 2017 | real|
|U.S. military to ...|WASHINGTON (Reute...|politicsNews|December 29, 2017 | real|
|Senior U.S. Repub...|WASHINGTON (Reute...|politicsNews|December 31, 2017 | real|
|FBI Russia probe ...|WASHINGTON (Reute...|politicsNews|December 30, 2017 | real|
|Trump wants Posta...|SEATTLE/WASHINGTO...|politicsNews|December 29, 2017 | real|
+--------------------+--------------------+------------+------------------+-----+
only showing top 5 rows



In [16]:
fake = fake.withColumn('label', lit('fake'))
fake.printSchema()

root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)
 |-- label: string (nullable = false)



In [17]:
fake.show(5)

+--------------------+--------------------+-------+-----------------+-----+
|               title|                text|subject|             date|label|
+--------------------+--------------------+-------+-----------------+-----+
| Donald Trump Sen...|Donald Trump just...|   News|December 31, 2017| fake|
| Drunk Bragging T...|House Intelligenc...|   News|December 31, 2017| fake|
| Sheriff David Cl...|On Friday, it was...|   News|December 30, 2017| fake|
| Trump Is So Obse...|On Christmas day,...|   News|December 29, 2017| fake|
| Pope Francis Jus...|Pope Francis used...|   News|December 25, 2017| fake|
+--------------------+--------------------+-------+-----------------+-----+
only showing top 5 rows



In [18]:
data = real.unionAll(fake)
data.printSchema()

root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)
 |-- label: string (nullable = false)



In [19]:
data.show(5)

+--------------------+--------------------+------------+------------------+-----+
|               title|                text|     subject|              date|label|
+--------------------+--------------------+------------+------------------+-----+
|As U.S. budget fi...|WASHINGTON (Reute...|politicsNews|December 31, 2017 | real|
|U.S. military to ...|WASHINGTON (Reute...|politicsNews|December 29, 2017 | real|
|Senior U.S. Repub...|WASHINGTON (Reute...|politicsNews|December 31, 2017 | real|
|FBI Russia probe ...|WASHINGTON (Reute...|politicsNews|December 30, 2017 | real|
|Trump wants Posta...|SEATTLE/WASHINGTO...|politicsNews|December 29, 2017 | real|
+--------------------+--------------------+------------+------------------+-----+
only showing top 5 rows



In [20]:
data.count()

44898

In [21]:
data.count() - data.distinct().count()

432

In [22]:
fake.count() - fake.distinct().count()

226

In [23]:
data_remove_dup = data.drop_duplicates()
data_remove_dup.count()

44466

* The data is now quite OK, we will creating new data with 2 columns text and title without subject and date

In [24]:
data_remove_dup = data_remove_dup.withColumn('text', concat(data_remove_dup.title, lit(' '), data_remove_dup.text))
data_remove_dup.printSchema()

root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)
 |-- label: string (nullable = false)



In [25]:
data_remove_dup.select('text').show(10, False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [26]:
data.select('title').show(10, False)

+----------------------------------------------------------------------------+
|title                                                                       |
+----------------------------------------------------------------------------+
|As U.S. budget fight looms, Republicans flip their fiscal script            |
|U.S. military to accept transgender recruits on Monday: Pentagon            |
|Senior U.S. Republican senator: 'Let Mr. Mueller do his job'                |
|FBI Russia probe helped by Australian diplomat tip-off: NYT                 |
|Trump wants Postal Service to charge 'much more' for Amazon shipments       |
|White House, Congress prepare for talks on spending, immigration            |
|Trump says Russia probe will be fair, but timeline unclear: NYT             |
|Factbox: Trump on Twitter (Dec 29) - Approval rating, Amazon                |
|Trump on Twitter (Dec 28) - Global Warming                                  |
|Alabama official to certify Senator-elect Jones tod

In [27]:
data_remove_dup = data_remove_dup.drop('title', 'subject', 'date')
data_remove_dup.printSchema()

root
 |-- text: string (nullable = true)
 |-- label: string (nullable = false)



In [28]:
data_remove_dup.show(5)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|'Dreamer' immigra...| real|
|U.S. top court ta...| real|
|Factbox: Republic...| real|
|Trump starts payi...| real|
|Senate drops prop...| real|
+--------------------+-----+
only showing top 5 rows



In [29]:
data_remove_dup = data_remove_dup.withColumn('length', length(data_remove_dup.text))
data_remove_dup.groupBy('label').mean().show()

+-----+------------------+
|label|       avg(length)|
+-----+------------------+
| real|2433.4058743104993|
| fake|2577.0937432810147|
+-----+------------------+



==> The length does not affect to the final result

*Change label to indexed type:

In [30]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import Tokenizer, RegexTokenizer, CountVectorizer, StringIndexer, StopWordsRemover

In [31]:
data_indexed = StringIndexer(inputCol = "label", outputCol = "category").fit(data_remove_dup).transform(data_remove_dup)
data_indexed.show(5)

+--------------------+-----+------+--------+
|                text|label|length|category|
+--------------------+-----+------+--------+
|'Dreamer' immigra...| real|  3507|     1.0|
|U.S. top court ta...| real|  3570|     1.0|
|Factbox: Republic...| real|  4104|     1.0|
|Trump starts payi...| real|  3498|     1.0|
|Senate drops prop...| real|  2666|     1.0|
+--------------------+-----+------+--------+
only showing top 5 rows



In [32]:
data_indexed = data_indexed.drop('label')

In [33]:
data_indexed = data_indexed.drop('length')

In [34]:
data_indexed.show(5)

+--------------------+--------+
|                text|category|
+--------------------+--------+
|'Dreamer' immigra...|     1.0|
|U.S. top court ta...|     1.0|
|Factbox: Republic...|     1.0|
|Trump starts payi...|     1.0|
|Senate drops prop...|     1.0|
+--------------------+--------+
only showing top 5 rows



*Change words in text to lower versions

In [35]:
data_lower = data_indexed.withColumn('text_lower', lower(col('text'))).drop('text')
data_lower.show(5)

+--------+--------------------+
|category|          text_lower|
+--------+--------------------+
|     1.0|'dreamer' immigra...|
|     1.0|u.s. top court ta...|
|     1.0|factbox: republic...|
|     1.0|trump starts payi...|
|     1.0|senate drops prop...|
+--------+--------------------+
only showing top 5 rows



*Tokenizer, Stopwords and Word2Vec

In [36]:
tokenizer = Tokenizer(inputCol = "text_lower", outputCol = "token_text")
data_token = tokenizer.transform(data_lower)
stopremove = StopWordsRemover(inputCol = "token_text", outputCol = "stop_tokens")
data_remove = stopremove.transform(data_token)

In [37]:
data_remove.show(5)

+--------+--------------------+--------------------+--------------------+
|category|          text_lower|          token_text|         stop_tokens|
+--------+--------------------+--------------------+--------------------+
|     1.0|'dreamer' immigra...|['dreamer', immig...|['dreamer', immig...|
|     1.0|u.s. top court ta...|[u.s., top, court...|[u.s., top, court...|
|     1.0|factbox: republic...|[factbox:, republ...|[factbox:, republ...|
|     1.0|trump starts payi...|[trump, starts, p...|[trump, starts, p...|
|     1.0|senate drops prop...|[senate, drops, p...|[senate, drops, p...|
+--------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [38]:
data_remove = data_remove.drop('text_lower', 'token_text')
data_remove.show(5)

+--------+--------------------+
|category|         stop_tokens|
+--------+--------------------+
|     1.0|['dreamer', immig...|
|     1.0|[u.s., top, court...|
|     1.0|[factbox:, republ...|
|     1.0|[trump, starts, p...|
|     1.0|[senate, drops, p...|
+--------+--------------------+
only showing top 5 rows



In [39]:
from pyspark.ml.feature import Word2Vec

word2Vec = Word2Vec(inputCol = 'stop_tokens', outputCol = 'vector')
data_w2v = word2Vec.fit(data_remove).transform(data_remove)

In [40]:
data_w2v.show(5)

+--------+--------------------+--------------------+
|category|         stop_tokens|              vector|
+--------+--------------------+--------------------+
|     1.0|['dreamer', immig...|[0.06352122189607...|
|     1.0|[u.s., top, court...|[-0.0509598407186...|
|     1.0|[factbox:, republ...|[-0.0018984051128...|
|     1.0|[trump, starts, p...|[0.06840749537214...|
|     1.0|[senate, drops, p...|[0.01207167519045...|
+--------+--------------------+--------------------+
only showing top 5 rows



In [41]:
data_w2v.write.parquet('vector.parquet', mode = 'overwrite')

In [42]:
data_w2v = spark.read.parquet('vector.parquet')
data_w2v.show(5)

+--------+--------------------+--------------------+
|category|         stop_tokens|              vector|
+--------+--------------------+--------------------+
|     1.0|[white, house,, c...|[0.08762225573708...|
|     1.0|[ex-twitter, work...|[0.04719146362873...|
|     1.0|[u.s., lawmaker, ...|[0.06663004976322...|
|     1.0|[u.s., consumer, ...|[0.00476759932867...|
|     1.0|[u.s., tax, panel...|[0.07997246308433...|
+--------+--------------------+--------------------+
only showing top 5 rows



See many relating words with W2V

In [43]:
model = word2Vec.fit(data_remove)

In [44]:
model.findSynonyms('iran', 10).show()

+---------+------------------+
|     word|        similarity|
+---------+------------------+
|   tehran|0.8855655789375305|
|   iran’s|0.8205986022949219|
|    iran.|0.8182442784309387|
| tehran’s|0.7415925860404968|
|  iranian| 0.740491509437561|
|  tehran,|0.7126408815383911|
|  nuclear|0.7056465148925781|
|  spirit”|0.7000223994255066|
|lebanon’s|0.6926752924919128|
|quarreled|0.6920217871665955|
+---------+------------------+



In [45]:
model.findSynonyms('trump', 10).show()

+-----------+------------------+
|       word|        similarity|
+-----------+------------------+
|     trump,|0.8085310459136963|
|     trump.|0.7444906234741211|
|    trump’s| 0.716338038444519|
|trump.trump|0.6696704030036926|
|  trump.the|0.6006836295127869|
|    trump,”|0.5995489358901978|
|   trump.it| 0.591926097869873|
|   trump.in| 0.581426203250885|
|     trump:|  0.57182377576828|
|trump.after|0.5503908395767212|
+-----------+------------------+



In [46]:
data_w2v.select('vector').show(5, False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------




In [47]:
final_data = data_w2v.drop('stop_tokens')

* Create features:

In [48]:
assembler = VectorAssembler(inputCols = ['vector'], outputCol = 'features')

In [49]:
final_data = assembler.transform(final_data).drop('vector')

In [50]:
final_data.show(5)

+--------+--------------------+
|category|            features|
+--------+--------------------+
|     1.0|[0.08762225573708...|
|     1.0|[0.04719146362873...|
|     1.0|[0.06663004976322...|
|     1.0|[0.00476759932867...|
|     1.0|[0.07997246308433...|
+--------+--------------------+
only showing top 5 rows



* Train many Machine Learning models:

In [51]:
train, test = final_data.randomSplit([0.7, 0.3])

# Support Vector Machines:

In [52]:
from pyspark.ml.classification import LinearSVC
lSVC = LinearSVC(maxIter=20, regParam=0.1, labelCol = 'category')

In [53]:
model = lSVC.fit(train)

In [54]:
results = model.transform(test)
results.show(5)

+--------+--------------------+--------------------+----------+
|category|            features|       rawPrediction|prediction|
+--------+--------------------+--------------------+----------+
|     0.0|[-0.1408423161134...|[1.57152366415625...|       0.0|
|     0.0|[-0.1403042609108...|[0.56450158372841...|       0.0|
|     0.0|[-0.1319853596805...|[-0.6371949892259...|       1.0|
|     0.0|[-0.1090847788660...|[0.22464434647313...|       0.0|
|     0.0|[-0.1063430423694...|[-0.5060617043174...|       1.0|
+--------+--------------------+--------------------+----------+
only showing top 5 rows



In [55]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
acc_eval = MulticlassClassificationEvaluator(labelCol = 'category')
acc = acc_eval.evaluate(results)
acc

0.9808902649405264

In [56]:
acc_eval = BinaryClassificationEvaluator(labelCol = "category",metricName="areaUnderROC")
acc = acc_eval.evaluate(results)
acc

0.9979880685815544

In [57]:
results.groupBy('prediction', 'category').count().show()

+----------+--------+-----+
|prediction|category|count|
+----------+--------+-----+
|       1.0|     1.0| 6397|
|       0.0|     1.0|   83|
|       1.0|     0.0|  173|
|       0.0|     0.0| 6741|
+----------+--------+-----+



Misclassifying is rare.

In [59]:
TN = results.filter('prediction = 0 AND category = prediction').count()
TP = results.filter('prediction = 1 AND category = prediction').count()
FN = results.filter('prediction = 0 AND category != prediction').count()
FP = results.filter('prediction = 1 AND category != prediction').count()

In [60]:
print('Precision =', TP/(TP + FP))
print('Recall =', TP/(TP + FN))
print('Accuracy = ', (TN + TP)/(TN + TP + FN + FP))

Precision = 0.9736681887366819
Recall = 0.9871913580246914
Accuracy =  0.9808869643123787


All things are good.

# Decision Tree and Random Forest

In [57]:
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier

dctree = DecisionTreeClassifier(labelCol = 'category')

In [59]:
model = dctree.fit(train)

In [60]:
results = model.transform(test)
results.show(5)

+--------+--------------------+---------------+--------------------+----------+
|category|            features|  rawPrediction|         probability|prediction|
+--------+--------------------+---------------+--------------------+----------+
|     0.0|[-0.2358881495893...|[12872.0,605.0]|[0.95510870371744...|       0.0|
|     0.0|[-0.2204790926278...|   [767.0,47.0]|[0.94226044226044...|       0.0|
|     0.0|[-0.1978858760040...|[12872.0,605.0]|[0.95510870371744...|       0.0|
|     0.0|[-0.1913294444519...|[12872.0,605.0]|[0.95510870371744...|       0.0|
|     0.0|[-0.1785149660427...|[12872.0,605.0]|[0.95510870371744...|       0.0|
+--------+--------------------+---------------+--------------------+----------+
only showing top 5 rows



In [61]:
acc_eval = MulticlassClassificationEvaluator(labelCol = 'category')
acc = acc_eval.evaluate(results)
acc

0.9287010233188528

In [62]:
acc_eval = BinaryClassificationEvaluator(labelCol = "category",metricName="areaUnderROC")
acc = acc_eval.evaluate(results)
acc

0.8612633492599507

In [63]:
rf = RandomForestClassifier(labelCol = "category", 
                           featuresCol = "features",
                           numTrees = 300)
model2 = rf.fit(train)
results = model2.transform(test)

In [64]:
acc_eval = MulticlassClassificationEvaluator(labelCol = 'category')
acc = acc_eval.evaluate(results)
acc

0.9491755629491408

In [65]:
acc_eval = BinaryClassificationEvaluator(labelCol = "category",metricName="areaUnderROC")
acc = acc_eval.evaluate(results)
acc

0.9888739175382496

They are not better than SVM

   # We use Logistic Regression

In [67]:
from pyspark.ml.classification import LogisticRegression

lg = LogisticRegression(maxIter = 20, regParam = 0.3, labelCol = 'category')
model3 = lg.fit(train)
results = model3.transform(test)

In [68]:
acc_eval = MulticlassClassificationEvaluator(labelCol = 'category')
acc = acc_eval.evaluate(results)
acc

0.9586348385957217

In [69]:
acc_eval = BinaryClassificationEvaluator(labelCol = "category",metricName="areaUnderROC")
acc = acc_eval.evaluate(results)
acc

0.9926969184270436

# Gradient Boosted Tree

In [70]:
from pyspark.ml.classification import GBTClassifier

gbt = GBTClassifier(labelCol = 'category', maxIter = 20)
model4 = lg.fit(train)
results = model4.transform(test)

In [71]:
acc_eval = MulticlassClassificationEvaluator(labelCol = 'category')
acc = acc_eval.evaluate(results)
acc

0.9586348385957217

In [72]:
acc_eval = BinaryClassificationEvaluator(labelCol = "category",metricName="areaUnderROC")
acc = acc_eval.evaluate(results)
acc

0.9926969184270437

# Conclusion: 
The most effective model is Support Vector Machines with 98% accuracy and 99.8% (area under the ROC curve) that dataset is applied Word2Vec technology. We can save the most effective model for prediction in the future.