# Detecting Fake News - Cleaning Data

### Importing libraries

In [1]:
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import string
import seaborn as sns
pd.set_option('display.max_colwidth', 100)

### Import data, assign values & concatenate

In [3]:
data_fake = pd.read_csv("/Users/garethmoen/Documents/Data Science/Portfolio/Fake News Detection/dataset/Fake.csv")
data_fake['label'] = 'FALSE'
data_true = pd.read_csv("/Users/garethmoen/Documents/Data Science/Portfolio/Fake News Detection/dataset/True.csv")
data_true['label'] = 'TRUE'

### Shuffle concatenated data, reindex, cut & write to csv

In [3]:
data = pd.concat([data_fake, data_true]) # merge dataframes
data = data.sample(frac=1).reset_index(drop=True)
data = data.drop(columns=['title', 'subject', 'date']) # remove unnecessary columns
data = data[0:2000]
#data.to_csv("/Users/garethmoen/Documents/GitHub/Projects/Fake News Detection/dataset_reduced.csv")

### View the data

In [4]:
data.head(10)

Unnamed: 0,text,label
0,WASHINGTON (Reuters) - The White House on Friday said it will rely on international experts to e...,True
1,BANGKOK (Reuters) - Armed insurgents stopped and torched a Bangkok-bound passenger bus on a high...,True
2,A Princeton professor gives viewers of FOX News a shocking demonstration that proves how easily ...,False
3,"As we all remember last year, Pope Francis made a historic visit and was beloved by pretty much ...",False
4,(Reuters) - Donald Trump’s presidential campaign manager was arrested and charged with misdemean...,True
5,By Vin ArmaniHillary Clinton continues to blame Russia for the email leak even though it does...,False
6,Donald Trump held a press conference today to clarify every since bit of accusations made that h...,False
7,ISLAMABAD (Reuters) - Pakistani Finance Minister Ishaq Dar pleaded not guilty on Wednesday to ow...,True
8,https://www.youtube.com/watch?time_continue=2&v=IjWClQcKhD8,False
9,WASHINGTON (Reuters) - U.S. Senate Republican leader Mitch McConnell said on Sunday that he is o...,True


### Look at a single text

In [5]:
data.at[6, 'text'] # access a single value by index (row) and label (column name)

'Donald Trump held a press conference today to clarify every since bit of accusations made that he pocketed over six million dollars meant for veterans. Basically, Trump shut the liberal media down. It was awesome! What was even better was watching a New Hampshire lawmaker rip into the liberal press like I ve never seen! '

### Look at a second text

In [6]:
data.iloc[:, 1] # Select rows and all columns

0         TRUE
1         TRUE
2        FALSE
3        FALSE
4         TRUE
         ...  
44893    FALSE
44894    FALSE
44895    FALSE
44896    FALSE
44897    FALSE
Name: label, Length: 44898, dtype: object

### Shape of data

In [7]:
print("Dataset has {} rows and {} columns".format(len(data), len(data.columns)))

Dataset has 44898 rows and 2 columns


### Number of True & False values

In [8]:
print("Out of {} rows, {} are true, {} are false".format(len(data),
                                                       len(data[data['label']=='TRUE']),
                                                       len(data[data['label']=='FALSE'])))

Out of 44898 rows, 21417 are true, 23481 are false


### Missing values & short texts

In [9]:
print("Number of texts with less than 300 characters is: {}".format(sum(data['text'].str.len() <= 300)))
#sum(data['text'].str.len() <= 300)

Number of texts with less than 300 characters is: 2162


### Remove empty or short texts (less than 300 characters), shuffle texts & reset index

In [10]:
data = data.loc[data['text'].str.len() >= 300]
data = data.sample(frac=1).reset_index(drop=True)

In [11]:
data.head(20)

Unnamed: 0,text,label
0,WASHINGTON (Reuters) - Thousands of demonstrators are expected to turn out in Washington next we...,True
1,Is anyone else getting sick and tired of hearing all of the baseless lies being told about Trump...,False
2,"The pro-gun crowd loves guns first, and respect last. At least, that seems to be the case with w...",False
3,"WASHINGTON (Reuters) - Republican presidential candidate Donald Trump, a golf resort developer w...",True
4,MOSCOW (Reuters) - Russian oil major Rosneft has received court summons for its Chief Executive ...,True
5,CLEVELAND (Reuters) - People seeking a deeper understanding of Donald Trump’s economic policy ca...,True
6,WASHINGTON (Reuters) - The Senate voted overwhelmingly on Wednesday to confirm Dr. Robert Califf...,True
7,LONDON (Reuters) - Britain will be in the “front seat” to negotiate a new trade deal with the in...,True
8,"Marco Rubio might have been flying high after the results from Iowa, but he well and truly crash...",False
9,LIMA (Reuters) - Peru’s President Pedro Pablo Kuczynski could end up the surprise winner of an a...,True


In [12]:
data.at[2, 'text'] # access a single value by index (row) and label (column name)

'The pro-gun crowd loves guns first, and respect last. At least, that seems to be the case with whoever made a truly horrible meme out of an old black-and-white war photo. The photo depicts troops executing a couple of people in the back as they kneel in front of a mass grave. The troops  are just obeying orders,  and the dead in the mass grave are  citizens obeying gun laws. This is the meme:Image via TwitterWe don t know who created this, but we don t have enough middle fingers to express how we feel about them. The piece of shit behind this is no doubt someone who believes the lie that, had the Jewish citizens of Germany and elsewhere just had guns, the Holocaust wouldn t have happened.The photo in the meme is from the German-occupied Soviet Union, and depicts one of the mobile execution squads known as the Einsatzgr ppen. They were tasked with mass executions of Jews, communists, and anyone else that the Nazis decided weren t suitable to living their empire and master race. Tens of

### Recheck new data shape

In [13]:
print("Dataset has {} rows and {} columns".format(len(data), len(data.columns)))

Dataset has 42748 rows and 2 columns


### Start wrangling text

In [14]:
data.columns # now I have everything I need to start wrangling the texts

Index(['text', 'label'], dtype='object')

### Make text lowercase & reorder columns

In [15]:
data['text_lower'] = data['text'].str.lower()
data = data[['text','text_lower','label']] # organise columns in the order I want
data.head()

Unnamed: 0,text,text_lower,label
0,WASHINGTON (Reuters) - Thousands of demonstrators are expected to turn out in Washington next we...,washington (reuters) - thousands of demonstrators are expected to turn out in washington next we...,True
1,Is anyone else getting sick and tired of hearing all of the baseless lies being told about Trump...,is anyone else getting sick and tired of hearing all of the baseless lies being told about trump...,False
2,"The pro-gun crowd loves guns first, and respect last. At least, that seems to be the case with w...","the pro-gun crowd loves guns first, and respect last. at least, that seems to be the case with w...",False
3,"WASHINGTON (Reuters) - Republican presidential candidate Donald Trump, a golf resort developer w...","washington (reuters) - republican presidential candidate donald trump, a golf resort developer w...",True
4,MOSCOW (Reuters) - Russian oil major Rosneft has received court summons for its Chief Executive ...,moscow (reuters) - russian oil major rosneft has received court summons for its chief executive ...,True


### Remove non-letter characters

In [16]:
data['text_clean'] = data['text_lower'].map(lambda x: re.sub('^.+?[(Reuters)]+[\s]+\-', "", x)) # removes everything before the first '-' symbol 
data['text_clean'] = data['text_clean'].map(lambda x: re.sub('watch the full.+$', "", x))
data['text_clean'] = data['text_clean'].map(lambda x: re.sub('featured image.+$', "", x))
data['text_clean'] = data['text_clean'].map(lambda x: re.sub('writing by.+$', "", x))
data['text_clean'] = data['text_clean'].map(lambda x: re.sub('via:.+$', "", x))
data['text_clean'] = data['text_clean'].map(lambda x: re.sub('this artical originally.+$', "", x))
data['text_clean'] = data['text_clean'].map(lambda x: re.sub('read more.+$', "", x))
data['text_clean'] = data['text_clean'].map(lambda x: re.sub('wfb.+$', "", x))
data['text_clean'] = data['text_clean'].map(lambda x: re.sub('this version of the story.+$', "", x))
data['text_clean'] = data['text_clean'].map(lambda x: re.sub('photo by.+$', "", x))
data['text_clean'] = data['text_clean'].map(lambda x: re.sub('((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])', "", x)) # URLs
data['text_clean'] = data['text_clean'].map(lambda x: re.sub('@([A-Za-z0-9_]+)', "", x)) # Twitter usernames
data['text_clean'] = data['text_clean'].map(lambda x: re.sub('cnn politics.+$', "", x))
data['text_clean'] = data['text_clean'].map(lambda x: re.sub('daily mail.+$', "", x))

In [17]:
data.head()

Unnamed: 0,text,text_lower,label,text_clean
0,WASHINGTON (Reuters) - Thousands of demonstrators are expected to turn out in Washington next we...,washington (reuters) - thousands of demonstrators are expected to turn out in washington next we...,True,thousands of demonstrators are expected to turn out in washington next week for protests aiming...
1,Is anyone else getting sick and tired of hearing all of the baseless lies being told about Trump...,is anyone else getting sick and tired of hearing all of the baseless lies being told about trump...,False,is anyone else getting sick and tired of hearing all of the baseless lies being told about trump...
2,"The pro-gun crowd loves guns first, and respect last. At least, that seems to be the case with w...","the pro-gun crowd loves guns first, and respect last. at least, that seems to be the case with w...",False,"the pro-gun crowd loves guns first, and respect last. at least, that seems to be the case with w..."
3,"WASHINGTON (Reuters) - Republican presidential candidate Donald Trump, a golf resort developer w...","washington (reuters) - republican presidential candidate donald trump, a golf resort developer w...",True,"republican presidential candidate donald trump, a golf resort developer who has railed against ..."
4,MOSCOW (Reuters) - Russian oil major Rosneft has received court summons for its Chief Executive ...,moscow (reuters) - russian oil major rosneft has received court summons for its chief executive ...,True,russian oil major rosneft has received court summons for its chief executive igor sechin to app...


### Clean up text

In [18]:
wn = nltk.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')

def clean_text(text):
    text = "".join([word for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [word for word in tokens if word not in stopwords]
    return text

data['text_nostop'] = data['text_clean'].apply(lambda x: clean_text(x))

data.head()

Unnamed: 0,text,text_lower,label,text_clean,text_nostop
0,WASHINGTON (Reuters) - Thousands of demonstrators are expected to turn out in Washington next we...,washington (reuters) - thousands of demonstrators are expected to turn out in washington next we...,True,thousands of demonstrators are expected to turn out in washington next week for protests aiming...,"[, thousands, demonstrators, expected, turn, washington, next, week, protests, aiming, shut, ina..."
1,Is anyone else getting sick and tired of hearing all of the baseless lies being told about Trump...,is anyone else getting sick and tired of hearing all of the baseless lies being told about trump...,False,is anyone else getting sick and tired of hearing all of the baseless lies being told about trump...,"[anyone, else, getting, sick, tired, hearing, baseless, lies, told, trump, zero, evidence, trump..."
2,"The pro-gun crowd loves guns first, and respect last. At least, that seems to be the case with w...","the pro-gun crowd loves guns first, and respect last. at least, that seems to be the case with w...",False,"the pro-gun crowd loves guns first, and respect last. at least, that seems to be the case with w...","[progun, crowd, loves, guns, first, respect, last, least, seems, case, whoever, made, truly, hor..."
3,"WASHINGTON (Reuters) - Republican presidential candidate Donald Trump, a golf resort developer w...","washington (reuters) - republican presidential candidate donald trump, a golf resort developer w...",True,"republican presidential candidate donald trump, a golf resort developer who has railed against ...","[, republican, presidential, candidate, donald, trump, golf, resort, developer, railed, mexico, ..."
4,MOSCOW (Reuters) - Russian oil major Rosneft has received court summons for its Chief Executive ...,moscow (reuters) - russian oil major rosneft has received court summons for its chief executive ...,True,russian oil major rosneft has received court summons for its chief executive igor sechin to app...,"[, russian, oil, major, rosneft, received, court, summons, chief, executive, igor, sechin, appea..."


### Lemmatize words

In [19]:
# def lemmatizing(tokenized_text):
#     text = [wn.lemmatize(word) for word in tokenized_text]
#     return text

# data['text_lemmatized'] = data['text_nostop'].apply(lambda x: lemmatizing(x))

# data.head(10)

### Stem text

In [21]:
# Stem text if prefereable

# import nltk # if not already done

ps = nltk.PorterStemmer()

# dir(ps) # looks at the attributes of the PorterStemmer

def stemming(tokenized_text):
    text = [ps.stem(word) for word in tokenized_text] # returns the stemmed word in the list
    return text

data['text_stemmed'] = data['text_nostop'].apply(lambda x: stemming(x))

In [22]:
data.to_csv("/Users/garethmoen/Documents/GitHub/Projects/Fake News Detection/dataset_reduced.csv")