## Produce preprocessed versions of the datasets

In [68]:
import numpy as np
import pandas as pd

In [69]:
# Load the raw train and test CSVs. header=None tells pandas the files have no
# header row, so the columns get integer labels instead of names.
train_path, test_path = './raw_data/fulltrain.csv', './raw_data/balancedtest.csv'
df, test_df = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (train_path, test_path)
)

print(f'Total rows, Total Columns: {df.shape}')
df.sample(5)  # peek at five random rows

Total rows, Total Columns: (48854, 2)


Unnamed: 0,0,1
41638,4,President and ruling Kuomintang (KMT) Chairman...
33934,3,7 Ways to Rekindle the Quest for TruthGaye Lev...
5085,1,Drawing thunderous applause and roars of appro...
20604,2,Supporter Falls Nearly Falls Asleep During Hil...
10554,1,According to a report released Monday by the s...


In [70]:
# Same sanity check for the test split: dimensions first, then a random peek.
print(f'Total rows, Total Columns: {test_df.shape}')
test_df.sample(5)  # peek at five random rows

Total rows, Total Columns: (3000, 2)


Unnamed: 0,0,1
2276,4,The newest U.S. texting champion has a message...
2897,4,A federal jury on Monday convicted an Indian-b...
482,1,Youve probably seen a lot of eerily similar ce...
1682,3,"Reishi mushrooms, also known as Ganoderma luc..."
309,1,The 164-year-old New York Times has never been...


### Text-Cleaning Functions

In [71]:
import string 
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# English stopwords from NLTK, built once and shared by every preprocessing
# function below; kept as a set for fast membership checks.
stopword_set = {*stopwords.words('english')}

def lemmatize_text(text):
    """Lemmatize ``text`` with WordNet, guided by part-of-speech tags.

    The text is tokenised with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag`` (Penn Treebank tags). Each tag is translated by
    ``get_wordnet_pos``; a token whose tag translates to ``''`` is dropped
    from the output instead of being lemmatized.

    Returns the lemmas of the remaining tokens joined by single spaces.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    wordnet_lemmatizer = WordNetLemmatizer()

    lemmas = []
    for token, treebank_tag in tagged_tokens:
        wordnet_tag = get_wordnet_pos(treebank_tag)
        # Only tokens with a WordNet-compatible POS are kept.
        if wordnet_tag != '':
            lemmas.append(wordnet_lemmatizer.lemmatize(word=token, pos=wordnet_tag))
    return ' '.join(lemmas)

def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into a WordNet POS constant.

    Only the leading letter of the tag decides the result:
    ``J`` -> ``wordnet.ADJ``, ``V`` -> ``wordnet.VERB``,
    ``N`` -> ``wordnet.NOUN``, ``R`` -> ``wordnet.ADV``.
    Any other tag yields the empty string ``''``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # No WordNet equivalent for this tag.
    return ''

def to_lower_case(text):
    """Return ``text`` folded to lower case (via ``str.lower``)."""
    lowered = text.lower()
    return lowered

def tokenise_text(text):
    """Tokenise ``text`` with ``nltk.word_tokenize`` and return the result."""
    return nltk.word_tokenize(text)

def remove_stopwords(text, stopword_set):
    """Drop every whitespace-delimited token of ``text`` found in ``stopword_set``.

    Matching is exact, so ``'The'`` is not removed by a set that only holds
    ``'the'``. The surviving tokens are re-joined with single spaces, which
    also collapses any run of whitespace in the input.
    """
    kept_tokens = [token for token in text.split() if token not in stopword_set]
    return ' '.join(kept_tokens)

def remove_punctuation(text):
    """Delete every character listed in ``string.punctuation`` from ``text``.

    Characters are removed outright rather than replaced by spaces, so an
    abbreviation such as ``U.S.A`` becomes ``USA``.

    Stopword removal has to run before this step (presumably because
    stopwords containing an apostrophe would no longer match afterwards).
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    return text.translate(punctuation_table)

def replace_hyphens(text):
    """Turn every ``-`` in ``text`` into a single space.

    Keeps hyphen-joined pieces (statistics, names) as separate words instead
    of letting them be concatenated.
    """
    return ' '.join(text.split('-'))

def preprocess_text(text):
    """Run the full lower-casing pipeline on one document.

    Per the original note, this is the default variant used for training.
    Steps, in order:

    1. replace hyphens with spaces,
    2. fold to lower case,
    3. remove stopwords (first pass, on whitespace-split tokens),
    4. tokenise with NLTK and re-join the tokens with spaces,
    5. strip punctuation,
    6. remove stopwords again (second pass, on the depunctuated text).

    Returns the cleaned text as a single space-separated string.
    """
    cleaned = to_lower_case(replace_hyphens(text))
    cleaned = remove_stopwords(cleaned, stopword_set)
    cleaned = ' '.join(tokenise_text(cleaned))
    cleaned = remove_punctuation(cleaned)
    return remove_stopwords(cleaned, stopword_set)

def preprocess_text_keep_punctuation(text):
    """Variant pipeline ("testing dataset 1"): lower-case, then drop stopwords.

    Hyphens and punctuation are left in place; only whitespace-delimited
    tokens that exactly match a stopword are removed.
    """
    return remove_stopwords(to_lower_case(text), stopword_set)

def remove_stopwords_two(text, stopword_set):
    """Remove stopwords case-insensitively while keeping the original casing.

    ``text`` is split on whitespace; a token is dropped when its lower-cased
    form is in ``stopword_set``. Tokens that survive are emitted unchanged
    and re-joined with single spaces.
    """
    survivors = [
        token
        for token in text.split()
        if token.lower() not in stopword_set
    ]
    return ' '.join(survivors)

def preprocess_text_capitalised(text):
    """Variant pipeline ("testing dataset 2") that preserves capitalisation.

    Same sequence as ``preprocess_text`` minus the lower-casing step, with
    stopwords matched case-insensitively via ``remove_stopwords_two``:

    1. replace hyphens with spaces,
    2. remove stopwords (first pass),
    3. tokenise with NLTK and re-join the tokens with spaces,
    4. strip punctuation,
    5. remove stopwords again (second pass).

    Returns the cleaned text as a single space-separated string.
    """
    cleaned = remove_stopwords_two(replace_hyphens(text), stopword_set)
    cleaned = ' '.join(tokenise_text(cleaned))
    cleaned = remove_punctuation(cleaned)
    return remove_stopwords_two(cleaned, stopword_set)

In [72]:
# Clean the training text (column 1) with the case-preserving pipeline.
# The column is overwritten in place, so `df` holds cleaned text from here on.
print('Cleaning training text...')
df[1] = df[1].map(preprocess_text_capitalised)
print('Preprocessing done!')
df.sample(10) # Random sample of cleaned rows to eyeball the result

Cleaning training text...
Preprocessing done!


Unnamed: 0,0,1
38295,3,NSA Blackmailing Corporate Media Youtube
43849,4,The Panamanian cop directions restaurant confu...
35050,3,BLM Attempts 90000 Acre Land Grab Texas Ranche...
14134,2,BREAKING Court Makes Decision On Whether Under...
47100,4,It beyond astonishing John Boehner real chance...
44565,4,A second seminar Taiwan Indonesia collaboratio...
10227,1,Saying product allow efficient thorough cleani...
7692,1,After improperly buttoning Gap wool blend peac...
40973,4,A fuel line tanker loading bunker oil barge ru...
35397,3,School Prison Pipeline Complete New Law Makes ...


In [73]:
# Apply the same case-preserving pipeline to the test text (column 1).
# The column is overwritten in place, so `test_df` holds cleaned text from here on.
print('Cleaning test text...')
test_df[1] = test_df[1].map(preprocess_text_capitalised)
print('Preprocessing done!')
test_df.sample(10) # Random sample of cleaned rows to eyeball the result

Cleaning test text...
Preprocessing done!


Unnamed: 0,0,1
2585,4,Royal Ahold NV Dutch owner Stop Shop Giant sup...
436,1,The following letter released today Lloyd Blan...
2529,4,A French drug company hoping offer American wo...
2685,4,He never charged case sent father prison thous...
556,1,Cristiano Ronaldo may well worlds favorite foo...
2568,4,The United States Russia may sign new nuclear ...
268,1,In observers calling largest merger ever two s...
2429,4,The great hope months ago recovery summer econ...
1671,3,As rapid devaluation American dollar rapidly a...
1523,3,While many people still need convincing choosi...


In [74]:
# Persist the cleaned training frame; no header row and no index column are written.
df.to_csv('./raw_data/capitalized_fulltrain.csv', header=False, index=False)

In [75]:
# Persist the cleaned test frame; no header row and no index column are written.
test_df.to_csv('./raw_data/capitalized_balancedtest.csv', header=False, index=False)

In [76]:
# Read the freshly written CSVs back in to check the round trip.
# NOTE(review): read_csv's default NA parsing turns empty fields into NaN, so a
# document that was cleaned down to an empty string would come back as NaN
# rather than '' — worth confirming none exist before using these frames.
clean_train_path = './raw_data/capitalized_fulltrain.csv'
clean_test_path = './raw_data/capitalized_balancedtest.csv'
clean_df, clean_test_df = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (clean_train_path, clean_test_path)
)

print(f'Total rows, Total Columns: {clean_df.shape}')
clean_df.sample(5)  # peek at five random rows

Total rows, Total Columns: (48854, 2)


Unnamed: 0,0,1
13925,1,Gathering members international science commun...
22578,3,How One Man Cracked Code Healing WaterKacper P...
40489,4,Dwight Howard scored 21 points JJ Redick added...
13627,1,This week deputy FDA commissioner announced Am...
3304,1,Explaining preventative measure totally unnece...


In [77]:
# Dimensions and a random peek at the reloaded, cleaned test split.
print(f'Total rows, Total Columns: {clean_test_df.shape}')
clean_test_df.sample(5)  # peek at five random rows

Total rows, Total Columns: (3000, 2)


Unnamed: 0,0,1
1434,2,Watch Tsunami Illegals A Government Sanctioned...
374,1,In event US government monitoring conversation...
991,2,Bush Shows Up For Jury Duty Like Americans sho...
2283,4,ACME Mich General Motors Chief Executive Offic...
1030,2,Shoulder To Shoulder Same Lies New Attack Go B...


In [78]:
# Print one complete cleaned test document (column 1, row label 2649) to
# inspect the pipeline's output on a full text rather than a truncated preview.
print(clean_test_df[1][2649])

The US government seeing hints adversaries targeting military networks remote sabotage head Pentagon recently launched Cyber Command said first public remarks since confirmed last month The potential sabotage destruction possible something must treat seriously said Gen Keith Alexander also heads National Security Agency nation largest intelligence agency Our Department Defense must able operate freely defend resources cyberspace Alexander spoke Thursday 300 people Center Strategic International Studies Washington In remarks afterward Alexander said concerned safety computer systems used war zones The concern I look could happen computer clearly sabotage destruction things yet come said If defend systems people able break James Lewis director CSIS Technology Public Policy Program said advanced militaries capable destroying US computer systems That true four years ago true Cyber Command deal said The Cyber Command launched last month Fort Meade Md created Defense Secretary Robert Gates s