# Fake News Capstone Project Data Wrangling and Preprocessing

In [1]:
import s3fs
import pandas as pd
pd.set_option('display.max_columns', 100000)
pd.set_option('display.max_row', 1000000)
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import re
import tldextract
from tqdm.autonotebook import tqdm
tqdm.pandas(desc="progress-bar", leave=False)
import string
import spacy
from spacy.lang import punctuation
import unicodedata  # might need to pip install unicodedate2 on aws sagemaker
import contractions
from contractions import contractions_dict 
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import STOPWORDS
import warnings
from afinn import Afinn
warnings.filterwarnings('ignore')

%matplotlib inline
sns.set(style='darkgrid',palette='Dark2',rc={'figure.figsize':(9,6),'figure.dpi':90})

# Extra punctuation characters on top of string.punctuation. This rebinds the
# name `punctuation` imported from spacy.lang above; the curly quotes appear
# more than once in the resulting string.
punctuation = string.punctuation + '”' + '“' + '–' + '““' + "’’" + '”'
# NLTK English stopword list, kept under the singular name `stopword`.
stopword = stopwords.words('english')
# NOTE: from here on `stopwords` is a set built from gensim's STOPWORDS and no
# longer the nltk.corpus module imported above, so the previous line must run first.
stopwords = set(STOPWORDS)
# Shared lemmatizer instance used by the lemmatization helper later on.
wordnet_lemmatizer = WordNetLemmatizer()

  from tqdm.autonotebook import tqdm
  from pandas import Panel


## Random seed

In [2]:
# Fixed random seed. NOTE(review): no cell shown in this notebook reads it — confirm it is used downstream.
seed = 123

## Read in the dataset

In [3]:
# Load the step-1 cleaned news dataset and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='FRL_Step_1_news_cleaned_2018_02_13.csv')
df.head(n=5)

Unnamed: 0,domain,type,content,title
0,nytimes.com,real,The stunning announcement by Japanese and Amer...,Behind the Stem Cell Breakthrough
1,nytimes.com,real,We are halfway between the lunatic and the ter...,Quotation of the Day
2,nytimes.com,real,There were Jews in Manhattan before there was ...,Celebrating Sounds Rooted In Gritty but Fertil...
3,nytimes.com,real,To the Editor:\n\n“How Apple Sidesteps Billion...,"What Apple Pays in Taxes, and Doesn't"
4,nytimes.com,real,"Katharine Winnifred Shergalis, a daughter of M...","Katharine Shergalis, Thomas Ewald"


In [4]:
# Summarise row count, column dtypes and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1789474 entries, 0 to 1789473
Data columns (total 4 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   domain   object
 1   type     object
 2   content  object
 3   title    object
dtypes: object(4)
memory usage: 54.6+ MB


## Clean up the domain and check the unique domains

In [5]:
def extract_domain(url):
    """
    Return only the domain label of a URL or hostname.

    The input is parsed with tldextract and the `domain` field of the result
    is returned, e.g. 'nytimes.com' becomes 'nytimes'.
    """
    return tldextract.extract(url).domain

In [6]:
# Reduce every entry to its bare domain label. Values are cast to str first,
# so a missing domain would be processed as the literal text 'nan'.
df['domain'] = df['domain'].astype(str).map(extract_domain)
df.head(n=3)

Unnamed: 0,domain,type,content,title
0,nytimes,real,The stunning announcement by Japanese and Amer...,Behind the Stem Cell Breakthrough
1,nytimes,real,We are halfway between the lunatic and the ter...,Quotation of the Day
2,nytimes,real,There were Jews in Manhattan before there was ...,Celebrating Sounds Rooted In Gritty but Fertil...


## Cleaning up the content and title columns by removing noise

In [19]:
# Portions of this are excerpts from Stack Overflow responses
def remove_special_characters(text): 
    """
    Remove special characters from the text document.

    Keeps ASCII letters, digits, whitespace and the punctuation . , ! ? / : ; " '
    Backslashes are kept as well, because remove_newlines() runs later in the
    pipeline and needs to see literal backslash-n / backslash-r sequences.
    Every other character is deleted (not replaced by a space).
    """
    # The class previously read `A-z`; that ASCII range also spans [ \ ] ^ _ and
    # the backtick, so those characters slipped through. `A-Z` is the intended
    # range; the backslash is now listed explicitly instead of by accident.
    # You can check the regex using this url https://regexr.com/
    pat = r'[^a-zA-Z0-9.,!?/:;\"\'\s\\]'
    return re.sub(pat, '', text)

def remove_extra_whitespace_tabs(text): 
    """
    Collapse every run of whitespace (spaces, tabs, newlines) into a single
    space and strip leading/trailing whitespace.
    """
    # A single `\s+` is enough: the former `^\s*` alternative matched the empty
    # string at the start of every input, inserting a space that strip() then
    # had to remove again.
    return re.sub(r'\s+', ' ', text).strip()

def remove_digits(text): 
    """
    Strip digit characters from the text, lowercase it and normalise spacing.

    Characters for which str.isdigit() is true are dropped (no regex is
    involved), the remainder is lowercased, and whitespace runs are collapsed
    to single spaces with nothing leading or trailing.
    """
    kept_chars = [ch for ch in text if not ch.isdigit()]
    lowered = ''.join(kept_chars).lower()
    return ' '.join(lowered.split())

def remove_newlines(text): 
    """
    Replace newline and carriage-return markers with spaces.

    Handles both the escaped two-character forms (backslash + n, backslash + r)
    and the real control characters; any backslash still left afterwards is
    turned into a space too.
    """
    # Order matters: escaped forms first, the bare backslash last.
    for token in ('\\n', '\\r', '\n', '\r', '\\'):
        text = text.replace(token, ' ')
    return text

#normalize to the NFKD (Normalization Form Compatibility Decomposition) form
#that present in the Unicode standard to remain compatible with other encodings
def remove_accented_chars(text): 
    """
    Strip accents from the text.

    The text is decomposed to NFKD, encoded to ASCII while ignoring anything
    that does not fit, and decoded back to str.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')


# (Leftover scratch code removed here: `contractions` is already imported in the
# first cell, and the result of contractions.fix(df['content'][10]) was discarded.)



#expands contractions found in the text
def expand_contractions(text):
    """
    Expand contractions in the text and drop any apostrophes that remain.

    Expansion is delegated to contractions.fix(); afterwards every "'"
    character still present is deleted.
    """
    return contractions.fix(text).replace("'", "")

# replace punctuation characters with spaces
# The translation table is built once here rather than on every call, since the
# function is applied to each row of two columns.
_PUNCTUATION_TO_SPACE = str.maketrans(
    {c: " " for c in string.punctuation + '”' + '“' + '–'}
)

def replace_punctuation(text):
    """
    Replace each punctuation character with a single space.

    Covers string.punctuation plus the curly double quotes and the en dash.
    Characters are substituted one-for-one, so the text length is unchanged.
    """
    return text.translate(_PUNCTUATION_TO_SPACE)

# Remove stopwords and remove words with 2 or less characters
_STOPS_CACHE = {}

def _combined_stopwords():
    """Return gensim's STOPWORDS merged with the NLTK list `stopword`, built on first use."""
    if 'all' not in _STOPS_CACHE:
        # `stopword` is a plain list; folding it into a frozenset turns the
        # per-token membership test into a hash lookup instead of a list scan.
        _STOPS_CACHE['all'] = frozenset(gensim.parsing.preprocessing.STOPWORDS).union(stopword)
    return _STOPS_CACHE['all']

def stops_letters(text):
    """
    Tokenize the text and drop stopwords and very short tokens.

    Tokens come from gensim.utils.simple_preprocess. A token is kept only if
    it is longer than two characters and appears in neither gensim's STOPWORDS
    nor the NLTK English list. Returns the surviving tokens joined by single
    spaces.

    Note: the combined stopword set is cached on first call, so rebinding
    `stopword` afterwards has no effect until the defining cell is re-run.
    """
    blocked = _combined_stopwords()
    return " ".join(
        token
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 2 and token not in blocked
    )

#Removes any word that starts with either http or https
def remove_urls(vTEXT):
    """
    Delete every http:// or https:// URL from the text.

    A URL is taken to run from the scheme up to the next whitespace character.
    Surrounding whitespace is left untouched.
    """
    # Raw string: `\S` in a normal literal is an invalid escape sequence that
    # newer Python versions warn about. The MULTILINE flag was dropped because
    # the pattern has no ^ or $ anchors for it to affect.
    return re.sub(r'https?://\S+', '', vTEXT)

#Remove words that starts with www
def remove_www(vTEXT):
    """
    Delete every word that starts with 'www' (e.g. www.example.com).

    The match runs from 'www' up to the next whitespace character.
    """
    # `\b` anchors the match to the start of a word; without it the pattern
    # also cut into ordinary words containing 'www' (e.g. 'awwwesome' -> 'a').
    # Raw string avoids the invalid `\S` escape in a normal literal.
    return re.sub(r'\bwww\S+', '', vTEXT)


### Convert Content and Title Fields to Lowercase

In [9]:
%%time
# Apply the functions to the dataframe

# Step 1 - convert the text to lower case
for text_col in ('content', 'title'):
    df[text_col] = df[text_col].apply(lambda value: value.lower())

CPU times: user 3.42 s, sys: 52.3 ms, total: 3.47 s
Wall time: 3.47 s


### Remove URLS from Content and Title Fields

In [10]:
%%time
# Step 2 - strip http/https URLs from both text columns
for text_col in ('content', 'title'):
    df[text_col] = df[text_col].apply(remove_urls)

CPU times: user 8.18 s, sys: 16.7 ms, total: 8.2 s
Wall time: 8.2 s


### Remove website www from Content and Title Fields

In [11]:
%%time
# Step 3 - strip www-style addresses from both text columns
for text_col in ('content', 'title'):
    df[text_col] = df[text_col].apply(remove_www)

CPU times: user 7.23 s, sys: 14.6 ms, total: 7.25 s
Wall time: 7.25 s


### Remove special characters from Content and Title Fields

In [12]:
%%time
# Step 4 - remove special characters from both text columns
for text_col in ('content', 'title'):
    df[text_col] = df[text_col].apply(remove_special_characters)

CPU times: user 38.6 s, sys: 31.6 ms, total: 38.6 s
Wall time: 38.6 s


### Remove whitespace from Content and Title Fields

In [13]:
%%time
# Step 5 - collapse extra whitespace and tabs in both text columns
for text_col in ('content', 'title'):
    df[text_col] = df[text_col].apply(remove_extra_whitespace_tabs)

CPU times: user 4min 42s, sys: 1.33 s, total: 4min 44s
Wall time: 4min 44s


### Remove newlines from Content and Title Fields

In [14]:
%%time
# Step 6 - replace newline / carriage-return markers and stray backslashes with spaces
for text_col in ('content', 'title'):
    df[text_col] = df[text_col].apply(remove_newlines)

CPU times: user 10.3 s, sys: 12.8 ms, total: 10.3 s
Wall time: 10.3 s


### Remove digits from Content and Title Fields

In [15]:
%%time
# Step 7 - remove digits from both text columns
for text_col in ('content', 'title'):
    df[text_col] = df[text_col].apply(remove_digits)

CPU times: user 6min 17s, sys: 389 ms, total: 6min 17s
Wall time: 6min 17s


### Remove accented characters from Content and Title Fields

In [16]:
%%time
# Step 8 - strip accented characters from both text columns
for text_col in ('content', 'title'):
    df[text_col] = df[text_col].apply(remove_accented_chars)

CPU times: user 10.2 s, sys: 536 ms, total: 10.7 s
Wall time: 10.7 s


### Expand Contractions within Content and Title Fields

In [20]:
%%time
# Step 9 - expand contractions in BOTH text columns.
# The title column was previously skipped here although every other cleaning
# step treats content and title alike.
# This must run before step 10: replace_punctuation turns apostrophes into
# spaces, after which contractions can no longer be recognised.
df['content'] = df['content'].apply(expand_contractions)
df['title'] = df['title'].apply(expand_contractions)

CPU times: user 4min 51s, sys: 1.74 s, total: 4min 53s
Wall time: 4min 54s


### Remove punctuation from Content and Title Fields, replacing each character with a single space

In [18]:
%%time
# Step 10 - replace punctuation characters with spaces in both text columns
for text_col in ('content', 'title'):
    df[text_col] = df[text_col].apply(replace_punctuation)

CPU times: user 31.9 s, sys: 6.06 s, total: 38 s
Wall time: 43.9 s


### Remove stopwords and short words from Content and Title Fields

In [21]:
%%time
# Step 11 - tokenize, then drop stopwords and words of two characters or fewer
for text_col in ('content', 'title'):
    df[text_col] = df[text_col].apply(stops_letters)

CPU times: user 27min 58s, sys: 2.68 s, total: 28min 1s
Wall time: 1h 11min 56s


## Affinity Score - Sentiment Analysis - Content Column

In [22]:
# AFINN scorer plus a helper that scores a list of documents
afinn = Afinn()

def get_affinity_scores(tweets):
    """
    Score each text with AFINN, normalised by the text's length.

    `tweets` is an iterable of strings (in this notebook: the cleaned article
    contents). Each score is afinn.score(text) divided by len(text), i.e. by
    the number of characters. Empty strings get a score of 0 so that no
    division by zero occurs.

    Returns a list with one score per input text, in input order.
    """
    # NOTE(review): normalising by character count rather than word count looks
    # unusual — confirm this is intended.
    return [afinn.score(t) / len(t) if len(t) > 0 else 0 for t in tweets]

In [23]:
# Length-normalised AFINN score for every cleaned article body, in row order.
new_affin = get_affinity_scores(df['content'].tolist())

In [24]:
# Attach the scores as a new column; the list is in the same order as the rows.
df['content_affin'] = new_affin

In [25]:
# Checkpoint: cleaned text plus the content_affin column, written without the index.
df.to_csv("./FRL_Step_2_news_cleaned_2018_02_13.csv", sep=',',index=False)

In [26]:
# Preview the cleaned text and the new content_affin column.
df.head()

Unnamed: 0,domain,type,content,title,content_affin
0,nytimes,real,stunning announcement japanese american resear...,stem cell breakthrough,-0.001577
1,nytimes,real,halfway lunatic terrorist line petty crime qae...,quotation day,-0.091603
2,nytimes,real,jews manhattan new york city boat jews arrived...,celebrating sounds rooted gritty fertile new turf,0.009639
3,nytimes,real,editor apple sidesteps billions taxes ieconomy...,apple pays taxes,0.022202
4,nytimes,real,katharine winnifred shergalis daughter mrs edw...,katharine shergalis thomas ewald,-0.001033


## Normalization - Lemmatize the title and content columns

In [31]:
import nltk
# Download the NLTK resources used by the lemmatization helper defined below
# (tokenizer data and the WordNet corpus).
nltk.download('punkt')
nltk.download('wordnet')
  

def lemmatized_word(text):
    """
    Lemmatize every token of the text.

    The text is tokenized with nltk.word_tokenize, each token is passed through
    the shared WordNet lemmatizer, and the results are joined back into one
    space-separated string that a vectorizer can accept.
    """
    lemmas = (wordnet_lemmatizer.lemmatize(tok) for tok in nltk.word_tokenize(text))
    return " ".join(lemmas)


[nltk_data] Downloading package punkt to /Users/ellemafa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ellemafa/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [32]:
# Lemmatize both text columns, showing a progress bar for each pass.
for text_col in ('content', 'title'):
    df[text_col] = df[text_col].progress_apply(lemmatized_word)


HBox(children=(FloatProgress(value=0.0, description='progress-bar', max=1789474.0, style=ProgressStyle(descrip…

HBox(children=(FloatProgress(value=0.0, description='progress-bar', max=1789474.0, style=ProgressStyle(descrip…

## Create additional features

In [33]:
%%time
# word counts
# split() with no argument returns [] for an empty string, whereas split(" ")
# returns [''] and therefore reported one word for empty texts.
df['c_word_count'] = df["content"].apply(lambda x: len(str(x).split()))
df['t_word_count'] = df["title"].apply(lambda x: len(str(x).split()))

# Character counts (whitespace excluded)
df['c_character_count'] = df["content"].apply(lambda x: sum(len(word) for word in str(x).split()))
df['t_character_count'] = df["title"].apply(lambda x: sum(len(word) for word in str(x).split()))

# average word length; empty texts give 0/0 = NaN, which is mapped back to 0
# so the value matches what this cell produced for empty texts before
df['c_avg_word_length'] = (df['c_character_count'] / df['c_word_count']).fillna(0)
df['t_avg_word_length'] = (df['t_character_count'] / df['t_word_count']).fillna(0)


CPU times: user 1min 6s, sys: 1.14 s, total: 1min 7s
Wall time: 1min 1s


In [34]:
# Checkpoint: lemmatized text plus the word/character count features, written without the index.
df.to_csv("./FRL_Step_2_1_news_cleaned_2018_02_13.csv", sep=',',index=False)

## Identify sentiment in the title

In [38]:
# Add a new plot that shows the distribution of scores 

def sentiment_check (text):
    polarity_score = TextBlob(text).sentiment.polarity
    df['title_sentiment_score'] = polarity_score
    if polarity_score < 0:
        return 'negative'
    elif polarity_score == 0:
        return 'neutral'
    else:
        return 'positive'

In [39]:
%%time
df['title_sentiment_label'] = df['title'].apply(sentiment_check)
df.head(3)

CPU times: user 1h 35min 23s, sys: 15min 40s, total: 1h 51min 4s
Wall time: 2h 59min 43s


Unnamed: 0,domain,type,content,title,content_affin,c_word_count,t_word_count,c_character_count,t_character_count,c_avg_word_length,t_avg_word_length,title_sentiment_label,title_sentiment_score
0,nytimes,real,stunning announcement japanese american resear...,stem cell breakthrough,-0.001577,154,3,1084,20,7.038961,6.666667,neutral,0.0
1,nytimes,real,halfway lunatic terrorist line petty crime qae...,quotation day,-0.091603,18,2,114,12,6.333333,6.0,neutral,0.0
2,nytimes,real,jew manhattan new york city boat jew arrived n...,celebrating sound rooted gritty fertile new turf,0.009639,214,7,1416,42,6.616822,6.0,positive,0.0


In [40]:
# Checkpoint with all engineered features. Note that the final cell of the
# notebook writes to this same path again after the rename/reorder steps.
df.to_csv("./FRL_Step_2_full_features_news_cleaned_2018_02_13.csv", sep=',',index=False)

## Rename the target column

In [42]:
# The target column `type` is renamed to `label`.
df = df.rename(columns={'type': 'label'})


## Rearrange the order of the columns

In [43]:
# Identifier and text columns first, engineered features next, target last.
column_order = [
    'domain',
    'title',
    'content',
    'content_affin',
    'c_word_count',
    't_word_count',
    'c_character_count',
    't_character_count',
    'c_avg_word_length',
    't_avg_word_length',
    'title_sentiment_label',
    'title_sentiment_score',
    'label',
]
df = df[column_order]
df.head(n=3)

Unnamed: 0,domain,title,content,content_affin,c_word_count,t_word_count,c_character_count,t_character_count,c_avg_word_length,t_avg_word_length,title_sentiment_label,title_sentiment_score,label
0,nytimes,stem cell breakthrough,stunning announcement japanese american resear...,-0.001577,154,3,1084,20,7.038961,6.666667,neutral,0.0,real
1,nytimes,quotation day,halfway lunatic terrorist line petty crime qae...,-0.091603,18,2,114,12,6.333333,6.0,neutral,0.0,real
2,nytimes,celebrating sound rooted gritty fertile new turf,jew manhattan new york city boat jew arrived n...,0.009639,214,7,1416,42,6.616822,6.0,positive,0.0,real


In [44]:
# Confirm the final column set and dtypes after renaming and reordering.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1789474 entries, 0 to 1789473
Data columns (total 13 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   domain                 object 
 1   title                  object 
 2   content                object 
 3   content_affin          float64
 4   c_word_count           int64  
 5   t_word_count           int64  
 6   c_character_count      int64  
 7   t_character_count      int64  
 8   c_avg_word_length      float64
 9   t_avg_word_length      float64
 10  title_sentiment_label  object 
 11  title_sentiment_score  float64
 12  label                  object 
dtypes: float64(4), int64(4), object(5)
memory usage: 177.5+ MB


In [45]:
# Preview the first four rows of the final frame.
df.head(4)

Unnamed: 0,domain,title,content,content_affin,c_word_count,t_word_count,c_character_count,t_character_count,c_avg_word_length,t_avg_word_length,title_sentiment_label,title_sentiment_score,label
0,nytimes,stem cell breakthrough,stunning announcement japanese american resear...,-0.001577,154,3,1084,20,7.038961,6.666667,neutral,0.0,real
1,nytimes,quotation day,halfway lunatic terrorist line petty crime qae...,-0.091603,18,2,114,12,6.333333,6.0,neutral,0.0,real
2,nytimes,celebrating sound rooted gritty fertile new turf,jew manhattan new york city boat jew arrived n...,0.009639,214,7,1416,42,6.616822,6.0,positive,0.0,real
3,nytimes,apple pay tax,editor apple sidestep billion tax ieconomy ser...,0.022202,142,3,907,11,6.387324,3.666667,neutral,0.0,real


## Saving the preprocessed dataframe to csv

In [46]:
# Path to save the preprocessed csv file. This is the same path written earlier
# in the notebook, so that file is overwritten with the renamed and reordered
# columns; the index is not written.
df.to_csv("./FRL_Step_2_full_features_news_cleaned_2018_02_13.csv", sep=',',index=False)