## Produce Preprocessed versions of datasets

In [79]:
import numpy as np
import pandas as pd

In [80]:
# Load the raw train and test CSVs. Both are read with header=None, so the
# columns are addressed by the integer labels 0 and 1 in the rest of the notebook.
train_path = './raw_data/fulltrain.csv'
test_path = './raw_data/balancedtest.csv'
df, test_df = (pd.read_csv(csv_path, header=None) for csv_path in (train_path, test_path))

print(f'Total rows, Total Columns: {df.shape}')
df.sample(5)  # unseeded random rows, just for a quick visual check

Total rows, Total Columns: (48854, 2)


Unnamed: 0,0,1
11321,1,Citing the near infinite number of celestial b...
401,1,The airline industry is reeling following a sc...
25100,3,Basel III: How The Bank For International Sett...
14851,2,You Wont Believe The Dire Warning Netanyahu Ju...
24790,3,Anonymous Operation: Bureau of Land Mismanagem...


In [81]:
# Same shape report and random peek, this time for the test frame
print(f'Total rows, Total Columns: {test_df.shape}')
test_df.sample(5)  # unseeded random rows, just for a quick visual check

Total rows, Total Columns: (3000, 2)


Unnamed: 0,0,1
1034,2,[Watch] Cowardly Ferguson Punk Runs His Foul M...
6,1,"There are fans, and then there are super-fans...."
821,2,[Watch] John Bolton on Obama Hiding Russian Nu...
1158,2,"Science Says: Ladies, Just Say No To Hot Guys ..."
1440,2,Americans And The Global Warming Public Relati...


### Text-Cleaning Function

In [82]:
import string 
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Build the English stop-word collection once at module level; the
# preprocess_* functions below pass it to remove_stopwords /
# remove_stopwords_two. It is a set because those helpers test membership
# once per token.
stopword_set = set(stopwords.words('english'))

def lemmatize_text(text):
    """Tokenise, POS-tag and lemmatise `text`, returning a space-joined string.

    Each token is tagged with nltk.pos_tag and the tag is converted with
    get_wordnet_pos. Tokens whose tag converts to '' are dropped from the
    result entirely (they are not passed through unchanged); every other
    token is replaced by its WordNet lemma for the converted POS.

    NOTE(review): none of the cells visible in this notebook call this
    function - confirm whether it is still needed.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    wn_lemmatizer = WordNetLemmatizer()

    lemmas = []
    for token, treebank_tag in tagged_tokens:
        wn_pos = get_wordnet_pos(treebank_tag)
        if wn_pos != '':
            lemmas.append(wn_lemmatizer.lemmatize(word=token, pos=wn_pos))
    return ' '.join(lemmas)

def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank tag into the matching WordNet POS constant.

    The decision is made on the tag's leading letter:
    J -> wordnet.ADJ, V -> wordnet.VERB, N -> wordnet.NOUN, R -> wordnet.ADV.
    Any other tag yields the empty string, which lemmatize_text treats as
    "skip this token".
    """
    # Attribute names are resolved lazily so that `wordnet` is only touched
    # when a prefix actually matches.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, attr_name)
    return ''

def to_lower_case(text):
    """Return `text` folded to lower case via its own .lower() method."""
    lowered = text.lower()
    return lowered

def tokenise_text(text):
    """Split `text` into tokens with nltk.word_tokenize and return them."""
    return nltk.word_tokenize(text)

def remove_stopwords(text, stopword_set):
    """Drop every whitespace-delimited token of `text` found in `stopword_set`.

    Matching is exact and case-sensitive, so callers that want
    case-insensitive filtering must lower-case `text` first. The surviving
    tokens are re-joined with single spaces, which also collapses any run of
    whitespace in the input.
    """
    return ' '.join(
        word for word in text.split() if word not in stopword_set
    )

def remove_punctuation(text):
    """Delete every character listed in string.punctuation from `text`.

    Characters are removed, not replaced by spaces, so abbreviations such as
    "U.S.A" collapse to "USA". The original note on this helper says
    stop-word removal has to run before this step (presumably because stop
    words containing apostrophes no longer match afterwards).
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

def replace_hyphens(text):
    """Turn each '-' in `text` into a single space.

    Done so that hyphen-joined words and figures stay separate tokens instead
    of being fused together when punctuation is stripped later.
    """
    return ' '.join(text.split('-'))

def preprocess_text(text):
    """Lower-casing cleaning pipeline, intended to be mapped over a text column.

    Steps, in order: hyphens -> spaces, lower-case, stop-word removal,
    NLTK tokenisation (re-joined with spaces), punctuation removal, and a
    second stop-word removal. The second pass is there because the first one
    only sees whitespace-split tokens, so a stop word still attached to
    punctuation is not matched until it has been tokenised and stripped.

    NOTE(review): the original comment called this the default used for
    training, but the cells visible in this notebook apply
    preprocess_text_capitalised instead - confirm which is intended.
    """
    cleaned = to_lower_case(replace_hyphens(text))
    cleaned = remove_stopwords(cleaned, stopword_set)
    cleaned = ' '.join(tokenise_text(cleaned))
    cleaned = remove_punctuation(cleaned)
    return remove_stopwords(cleaned, stopword_set)

def preprocess_text_keep_punctuation(text):
    """Variant pipeline ("testing dataset 1"): lower-case, then drop stop words.

    Hyphens and punctuation are left untouched, and there is no tokenisation
    step; only whitespace-delimited tokens are compared with the stop-word set.
    """
    return remove_stopwords(to_lower_case(text), stopword_set)

def remove_stopwords_two(text, stopword_set):
    """Case-insensitive stop-word filter that keeps the original casing.

    Each whitespace-delimited token is lower-cased only for the membership
    test against `stopword_set`; tokens that survive are emitted exactly as
    they appeared in `text`, joined by single spaces.
    """
    survivors = [
        word for word in text.split() if word.lower() not in stopword_set
    ]
    return ' '.join(survivors)

def preprocess_text_capitalised(text):
    """Variant pipeline ("testing dataset 2") that preserves letter case.

    Same sequence as preprocess_text minus the lower-casing step: hyphens ->
    spaces, stop-word removal, NLTK tokenisation (re-joined with spaces),
    punctuation removal, and a second stop-word removal. Stop words are
    matched case-insensitively through remove_stopwords_two, so the tokens
    that remain keep their original capitalisation.
    """
    cleaned = remove_stopwords_two(replace_hyphens(text), stopword_set)
    cleaned = ' '.join(tokenise_text(cleaned))
    cleaned = remove_punctuation(cleaned)
    return remove_stopwords_two(cleaned, stopword_set)

In [83]:
# Run the case-preserving pipeline over column 1 of the training frame.
# The column is overwritten in place, so the raw text is no longer available
# from `df` after this cell; re-running the cell feeds already-cleaned text
# through the pipeline again.
print('Cleaning training text...')
df[1] = df[1].map(preprocess_text_capitalised)
print('Preprocessing done!')
df.sample(10) # Random sample values to see

Cleaning training text...
Preprocessing done!


Unnamed: 0,0,1
39222,4,Tennis Venus Williams Wimbledon semi finals
7603,1,Area resident Beatrice Sewell 49 affirmed fait...
18208,2,Scott Baio Loses Live TV Obama HES DUMB HES MU...
9504,1,Padres broadcaster Jerry Coleman took moments ...
7678,1,Two weeks hourly federal minimum wage raised 5...
11502,1,Revealing old fashioned small minded truly loc...
23586,3,ATT Helping Feds Spy EveryoneBy Michael Maharr...
39030,4,Dozens Israeli rabbis added names document cal...
25634,3,Brexit MatrixBy Jon Rappoport EU associated fi...
33810,3,Bulls Eye Smith Wesson Sales Skyrocket Maximum...


In [84]:
# Apply the same case-preserving pipeline to column 1 of the test frame so
# both frames go through identical cleaning. As with the training frame, the
# column is overwritten in place.
print('Cleaning test text...')
test_df[1] = test_df[1].map(preprocess_text_capitalised)
print('Preprocessing done!')
test_df.sample(10) # Random sample values to see

Cleaning test text...
Preprocessing done!


Unnamed: 0,0,1
1903,3,mainstream media whose editors reporters locks...
2926,4,Police said Wednesday arrested suspect connect...
2712,4,Results Friday French Open played clay Stade R...
1080,2,TOP 10 Smartest Fox News BABES Hottest Decide
1489,2,Hillary Clinton Secrets Ed Klein List Sending ...
2345,4,dollar edged Thursday breaking decline caused ...
214,1,Get ready mark calendars wont want miss Soon p...
2950,4,better Four Seasons end road said friend Rebec...
2671,4,Center Timofey Mozgov 25 points 11 rebounds le...
2622,4,comes figuring best ensconce Elizabeth Warren ...


In [85]:
# Persist the cleaned training frame. header=False / index=False keep the
# file in a bare two-column layout that can be read back with header=None.
df.to_csv('./raw_data/capitalized_fulltrain.csv', header=False, index=False)

In [86]:
# Persist the cleaned test frame in the same header-less, index-less layout
# as the cleaned training file.
test_df.to_csv('./raw_data/capitalized_balancedtest.csv', header=False, index=False)

In [87]:
# Read the cleaned CSVs back from disk to check that they round-trip.
# NOTE(review): a row whose text was reduced to an empty string by cleaning
# would come back from read_csv as NaN by default - confirm that downstream
# code handles (or cannot encounter) that case.
clean_train_path = './raw_data/capitalized_fulltrain.csv'
clean_test_path = './raw_data/capitalized_balancedtest.csv'
clean_df, clean_test_df = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (clean_train_path, clean_test_path)
)

print(f'Total rows, Total Columns: {clean_df.shape}')
clean_df.sample(5)  # unseeded random rows, just for a quick visual check

Total rows, Total Columns: (48854, 2)


Unnamed: 0,0,1
37119,3,Yanis Varoufakis European Constitution Economi...
1228,1,Citing unbelievable lack depth virtually every...
14217,2,Kid Rock Risks Entire Career Show America Real...
31877,3,7 Things Mainstream Media Want KnowMichael Sny...
26960,3,Cynthia McKinney Remarks UN International Meet...


In [88]:
# Shape report and random peek for the reloaded, cleaned test frame
print(f'Total rows, Total Columns: {clean_test_df.shape}')
clean_test_df.sample(5)  # unseeded random rows, just for a quick visual check

Total rows, Total Columns: (3000, 2)


Unnamed: 0,0,1
172,1,covert mission designed destroy remains Al Qae...
2233,3,take pride growing sprouts broccoli sprouts be...
347,1,Well could big problem Right middle campaign r...
1092,2,82 Children Recovered Federal Raid Discovered ...
2787,4,Joe Kaeser six months new job chief financial ...


In [89]:
# Print one cleaned test document in full (row label 2649, text column 1);
# the sampled frame display above truncates long strings.
print(clean_test_df.loc[2649, 1])

US government seeing hints adversaries targeting military networks remote sabotage head Pentagon recently launched Cyber Command said first public remarks since confirmed last month potential sabotage destruction possible something must treat seriously said Gen Keith Alexander also heads National Security Agency nation largest intelligence agency Department Defense must able operate freely defend resources cyberspace Alexander spoke Thursday 300 people Center Strategic International Studies Washington remarks afterward Alexander said concerned safety computer systems used war zones concern look could happen computer clearly sabotage destruction things yet come said defend systems people able break James Lewis director CSIS Technology Public Policy Program said advanced militaries capable destroying US computer systems true four years ago true Cyber Command deal said Cyber Command launched last month Fort Meade Md created Defense Secretary Robert Gates streamline military capabilities a