# Important Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import re

In [3]:
import nltk
nltk.download("stopwords")
nltk.download("punkt")  # Punkt Sentence Tokenizer
nltk.download("averaged_perceptron_tagger")  # Part of Speech Tagger
nltk.download("wordnet")  # a lexical database of English; useful for synonyms, hyponyms, etc.

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/marneusz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/marneusz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/marneusz/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/marneusz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

In [5]:
import random

In [6]:
stop = set(stopwords.words('english'))
random.sample(stopwords.words('english'), 10)

['some', 'other', 'yourselves', 'do', 'does', 'yours', 'is', 'y', 'his', 'our']

Consider removing some stop words like _no_, _yes_, etc.

# Loading Data

In [7]:
DATASETS = {
    "FakeNews": "",
    "ISOT": ""
}

CUR_DATASET = "FakeNews"

In [8]:
train_dataset = pd.read_csv(f"./data/{CUR_DATASET}/train.csv.zip")
test_dataset = pd.read_csv(f"./data/{CUR_DATASET}/test.csv.zip")

In [9]:
train_dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [10]:
labels = train_dataset["label"].values

In [11]:
whole_dataset = pd.concat([train_dataset, test_dataset])

# Some More EDA

In [12]:
train_dataset.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [13]:
train_dataset.isnull().sum() / train_dataset.shape[0]

id        0.000000
title     0.026827
author    0.094087
text      0.001875
label     0.000000
dtype: float64

In [14]:
whole_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26000 entries, 0 to 5199
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      26000 non-null  int64  
 1   title   25320 non-null  object 
 2   author  23540 non-null  object 
 3   text    25954 non-null  object 
 4   label   20800 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 1.2+ MB


# Data Preprocessing and Data Preparation

In [15]:
train_dataset.fillna("null")
test_dataset.fillna("null")

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...
...,...,...,...,...
5195,25995,The Bangladeshi Traffic Jam That Never Ends - ...,Jody Rosen,Of all the dysfunctions that plague the world’...
5196,25996,John Kasich Signs One Abortion Bill in Ohio bu...,Sheryl Gay Stolberg,WASHINGTON — Gov. John Kasich of Ohio on Tu...
5197,25997,"California Today: What, Exactly, Is in Your Su...",Mike McPhate,Good morning. (Want to get California Today by...
5198,25998,300 US Marines To Be Deployed To Russian Borde...,,« Previous - Next » 300 US Marines To Be Deplo...


In [16]:
preprocessing_text_fn = {
    "no_punctuation": lambda txt: re.sub(r'[^\w\s]','', txt),
    # "no_special_symbols": lambda txt: re.sub('[$, #, &]', '', txt),
    # "no_digits": lambda txt: re.sub('\d*', '', txt),
    # "no_www": lambda txt: re.sub('w{3}', '', txt),
    # "no_urls": lambda txt: re.sub('http\S+', '', txt),
    # "no_spaces": lambda txt: re.sub('\s+', ' ', txt),
    # "no_single_chars": lambda txt: re.sub(r'\s+[a-zA-Z]\s+', '', txt)
}

In [17]:
def preprocess_text(text, pipeline = preprocessing_text_fn):
    text = str(text)
    for fn in pipeline.keys():
        text = pipeline[fn](text)
        
    return text

In [18]:
train_dataset["title"] = train_dataset["title"].apply(preprocess_text)
train_dataset["text"] = train_dataset["text"].apply(preprocess_text)
train_dataset.head(10)

Unnamed: 0,id,title,author,text,label
0,0,HouseDemAideWeDidntEvenSeeComeysLetterUntilJas...,Darrell Lucus,HouseDemAideWeDidntEvenSeeComeysLetterUntilJas...,1
1,1,FLYNNHillaryClintonBigWomanonCampusBreitbart,Daniel J. Flynn,Evergetthefeelingyourlifecirclestheroundaboutr...,0
2,2,WhytheTruthMightGetYouFired,Consortiumnews.com,WhytheTruthMightGetYouFiredOctoberThetensionbe...,1
3,3,CiviliansKilledInSingleUSAirstrikeHaveBeenIden...,Jessica Purkiss,VideosCiviliansKilledInSingleUSAirstrikeHaveBe...,1
4,4,Iranianwomanjailedforfictionalunpublishedstory...,Howard Portnoy,PrintAnIranianwomanhasbeensentencedtosixyearsi...,1
5,5,JackieMasonHollywoodWouldLoveTrumpifHeBombedNo...,Daniel Nussbaum,InthesetryingtimesJackieMasonistheVoiceofReaso...,0
6,6,LifeLifeOfLuxuryEltonJohnsFavoriteSharkPicture...,,EverwonderhowBritainsmosticonicpoppianistgetst...,1
7,7,BenotHamonWinsFrenchSocialistPartysPresidentia...,Alissa J. Rubin,PARISFrancechoseanidealistictraditionalcandida...,0
8,8,ExcerptsFromaDraftScriptforDonaldTrumpsQampAWi...,,DonaldJTrumpisscheduledtomakeahighlyanticipate...,0
9,9,ABackChannelPlanforUkraineandRussiaCourtesyofT...,Megan Twohey and Scott Shane,AweekbeforeMichaelTFlynnresignedasnationalsecu...,0
