# Data Cleaning and Preprocessing

## Objective: 
Make sure that the data is in a suitable format for a classifier and that there are no missing values.

# 1 Import Data

## 1.1 Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import string
from sklearn.model_selection import train_test_split
from sklearn import base
from nltk import tokenize
from nltk.stem.porter import PorterStemmer
import gensim

## 1.2 Load data

In [2]:
# Load the two source files into separate dataframes: Fake.csv -> `fake`,
# True.csv -> `real`.
fake, real = pd.read_csv("Fake.csv"), pd.read_csv("True.csv")

# 2 Inspect The Data

## 2.1 Inspect The Heads Of The Dataframes

In [None]:
# Preview the first rows of the `fake` dataframe.
fake.head()

In [None]:
# Preview the first rows of the `real` dataframe.
real.head()

## 2.2 See If The Classes Are Balanced

In [None]:
# One bar per class; the bar height is the number of rows in that dataframe.
class_sizes = {"Fake": len(fake), "Real": len(real)}
plt.bar(list(class_sizes.keys()), list(class_sizes.values()), color=["red", "blue"])
plt.show()

The classes seem to be well balanced.

## 2.2 Examine The Distribution Of The "Subject" Values

In [None]:
# Plot how many articles fall under each subject, one panel per class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 4))

# Take both the labels and the heights from the same value_counts() result.
# Pairing unique() (order of appearance) with value_counts() (sorted by count)
# can attach a count to the wrong subject label.
fake_subject_counts = fake.subject.value_counts()
real_subject_counts = real.subject.value_counts()

ax1.set_title("Fake")
ax1.bar(fake_subject_counts.index, fake_subject_counts.values)

ax2.set_title("Real")
ax2.bar(real_subject_counts.index, real_subject_counts.values)

# Tilt the subject names so neighbouring labels do not overlap.
for tick in ax1.get_xticklabels() + ax2.get_xticklabels():
    tick.set_rotation(45)

plt.show()

It seems the subjects are unique to each dataframe making it impossible to gain meaningful insights from them.

## 2.3 Inspect The Contents Of The "Text" Columns

In [None]:
# fake: print the first five article bodies, each followed by blank lines
for body in fake.text.head(5):
    print(body, "\n\n\n")

In the data I can see that there are Twitter handles and hyperlinks. These should be removed because they can be giveaways, and because people/groups may be inconsistent in their behavior. For example, I would not want my model to give bad results because an influential person used to be reliable but then started spreading fake news.

In [None]:
# real: print the first five article bodies, each followed by blank lines
for body in real.text.head(5):
    print(body, "\n\n\n")

The data contains information on the publisher (notice "reuters"). This may be a giveaway of an article's class and should be removed.

## 2.4 Inspect The Content Of The "Title" Columns

In [None]:
# Print the first six titles of each class, alternating fake then real.
for row in range(6):
    for frame in (fake, real):
        print(frame.title.iloc[row], "\n")

It appears all of the fake news titles may begin with a space.

In [None]:
# Show the first character of each of those titles between quotes, so that a
# leading space becomes visible.
for row in range(6):
    for frame in (fake, real):
        first_char = frame.title.iloc[row][0]
        print("'", first_char, "'\n")

The fake titles do begin with a space.

## 2.5 Inspect The "Date" Columns

In [None]:
# Look at the raw date value of the real row labelled 0.
real.date[0]

In [None]:
# Look at the raw date value of the fake row labelled 0.
fake.date[0]

Here I can see the last character in the real dates is whitespace. Next I will see if there are any values that cannot be turned into a date (year), and if there are I will change them to a NaN value.

In [None]:
def deturmine_date(date, n):
    """Parse the tail of a raw date value as an integer.

    Args:
        date: Raw value from a ``date`` column; normally a string.
        n: Slice start applied as ``date[n:]`` (for example ``-2`` keeps the
            last two characters).

    Returns:
        ``int(date[n:])`` when that slice is a valid integer literal
        (surrounding whitespace is accepted by ``int``), otherwise ``np.nan``.
    """
    try:
        return int(date[n:])
    except (TypeError, ValueError):
        # TypeError: the value cannot be sliced (e.g. a float NaN).
        # ValueError: the sliced text is not a number.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return np.nan
    

In [None]:
# Replace each raw date with the integer parsed from its tail. The real dates
# need one extra character because they end with a whitespace character.
fake["date"] = fake["date"].map(lambda raw: deturmine_date(raw, n=-2))
real["date"] = real["date"].map(lambda raw: deturmine_date(raw, n=-3))

Now I will check to see how many NaNs there are.

In [None]:
# Count the fake rows whose date could not be parsed (now NaN).
fake.date.isna().sum()

Next I will drop the NaNs.

In [None]:
# Discard every fake row that has a missing value in any column.
fake = fake.dropna(axis=0, how="any")

In [None]:
# Count the real rows whose date could not be parsed (now NaN).
real.date.isna().sum()

Now I will plot histograms to see how the dates are distributed. Hopefully the real and fake news are around the same date range.

In [None]:
import seaborn as sns

# Overlay one semi-transparent histogram per class so the two date
# distributions can be compared on the same axes.
for years, colour, tag in ((fake.date, "red", "fake"), (real.date, "blue", "real")):
    plt.hist(years, color=colour, alpha=0.4, bins=3, label=tag)
plt.title("Distribution Of Dates Seperated By Class")
plt.legend()
plt.show()

# 3 Combine Data

## 3.1 Create A Column To Distinguish Class

In [None]:
# Tag each source frame with its class label: 1 for real news, 0 for fake.
for frame, label in ((real, 1), (fake, 0)):
    frame["real_news"] = label

## 3.2 Concatenate DataFrames

In [None]:
# Stack the real rows on top of the fake rows. ignore_index=True gives the
# combined frame a fresh 0..n-1 index; without it the row labels of both source
# frames are kept, so the same label would refer to two different rows.
data = pd.concat([real, fake], axis=0, ignore_index=True)
data.head()

# 4 Clean Data

## 4.1 Drop Date And Subject Columns

In [None]:
# Remove the two columns that are not passed on to the classifier.
data.drop(columns=["date", "subject"], inplace=True)

## 4.2 Lower Case Title and Text

In [None]:
# Lower-case both free-text columns.
for column in ("title", "text"):
    data[column] = data[column].map(lambda value: value.lower())

## 4.3 Remove Publishers

For articles that mention "reuters", I will remove the publisher information by searching for a hyphen in the first 150 characters of the text; if there is one, I will remove everything up to and including the hyphen and the character that follows it.

In [None]:
def remove_publisher(text):
    """Strip a leading publisher tag from an article body.

    Only texts containing ``"reuters"`` are touched (the match is
    case-sensitive, so the text is expected to be lower-cased already). For
    those, the first hyphen within the first 150 characters is treated as the
    end of the tag; the hyphen, everything before it and the single character
    after it are removed.

    Args:
        text: Article body.

    Returns:
        The text without the leading tag, or the text unchanged when it does
        not mention "reuters" or has no hyphen in its first 150 characters.
    """
    if "reuters" not in text:
        return text

    hyphen_at = text.find("-", 0, 150)
    if hyphen_at == -1:
        # No hyphen near the start: adding 2 to find()'s -1 would silently
        # drop the first character, so leave the text as it is.
        return text

    # Skip the hyphen and the character right after it.
    return text[hyphen_at + 2:]

In [None]:
# Run the publisher clean-up on every article body.
data["text"] = data["text"].map(remove_publisher)

In [None]:
# First real article as stored in `real`, for a before/after comparison.
real.text.iloc[0]

In [None]:
# First real article in the combined frame, after the cleaning steps so far.
data[data.real_news==1].text.iloc[0]

It seems to have worked successfully

## 4.4 Remove Twitter Handles

To remove the Twitter handles I will split the text into individual words and remove any words with an "@" symbol.

In [None]:
def remove_twitter(text):
    """Drop every space-separated chunk of ``text`` that contains an "@".

    The text is split on single spaces only, so other whitespace (newlines,
    tabs) stays inside its chunk and runs of spaces are preserved.

    Args:
        text: Article body.

    Returns:
        The remaining chunks joined back together with single spaces.
    """
    kept = [chunk for chunk in text.split(" ") if "@" not in chunk]
    return " ".join(kept)

In [None]:
# Strip "@"-containing words from every article body.
data["text"] = data["text"].map(remove_twitter)

In [None]:
# First fake article as stored in `fake`, for a before/after comparison.
fake.text.iloc[0]

In [None]:
# First fake article in the combined frame, after the cleaning steps so far.
data[data.real_news==0].text.iloc[0]

## 4.5 Check For And Deal With Empty Text And Title Values

In [None]:
# Titles that are empty or whitespace-only become NaN so they show up as missing.
data.title = data.title.map(lambda title: title if title.strip() else np.nan)

In [None]:
# Bodies that are empty or whitespace-only become NaN so they show up as missing.
data.text = data.text.map(lambda body: body if body.strip() else np.nan)

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis=0, how="any")

## 4.6 Combine Title And Text

In [None]:
# Prepend the title to the body. The space in between keeps the last word of
# the title from fusing with the first word of the body into one token.
data.text = data.title + " " + data.text

In [None]:
# The title column is no longer needed on its own.
data.drop(columns=["title"], inplace=True)

## 4.7 Remove Punctuation

Here I will remove punctuation from each string of text by looping over a string containing all punctuation marks; on each iteration I will replace that character with an empty string.

In [None]:
# Delete every character listed in string.punctuation. A translation table
# does this in one pass per document, instead of re-mapping the whole column
# once for each punctuation character.
punctuation_table = str.maketrans("", "", string.punctuation)
data.text = data.text.map(lambda x: x.translate(punctuation_table))

## 4.8 Tokenize

In [None]:
# Split each document into a list of word tokens with NLTK's word tokenizer.
data["tokens"] = data["text"].map(tokenize.word_tokenize)

## 4.9 Remove Stop Words

In [None]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words("english"))


def remove_stopwords(tokens):
    """Return the tokens that are not English stop words, in original order.

    Args:
        tokens: Iterable of word tokens.

    Returns:
        A new list holding every token that is absent from NLTK's English
        stop-word list.
    """
    # The stop-word list is cached and held in a set, so it is neither
    # reloaded per document nor scanned linearly per token.
    stop = _english_stopwords()
    return [word for word in tokens if word not in stop]

In [None]:
# Filter the stop words out of every token list.
data["tokens"] = data["tokens"].map(remove_stopwords)

## 4.10 Stem Tokens

In [None]:
# Reduce every token to its Porter stem; one stemmer instance is shared by
# all documents.
porter = PorterStemmer()
data["tokens"] = data["tokens"].map(lambda words: list(map(porter.stem, words)))

## 5 Train / Val / Test Split

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable.
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# Split the remaining rows into training and validation sets. No test_size is
# given, so scikit-learn's default applies (0.25 per its documentation —
# TODO confirm for the installed version).
train, val = train_test_split(train, random_state=42)

## 6 Save Data

In [None]:
# Write the training split to disk; index=False leaves the row index out of the file.
train.to_csv("train_clean.csv", index=False)

In [None]:
# Write the validation split to disk; index=False leaves the row index out of the file.
val.to_csv("val_clean.csv", index=False)

In [None]:
# Write the test split to disk; index=False leaves the row index out of the file.
test.to_csv("test_clean.csv", index=False)