## Produce cleaned versions of datasets

In [12]:
import string
import unicodedata
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

In [13]:
# Load the raw train and test CSVs; header=None because the files are read without a header row
train_path = './raw_data/fulltrain.csv'
test_path = './raw_data/balancedtest.csv'
df, test_df = (pd.read_csv(csv_path, header=None) for csv_path in (train_path, test_path))

print(f'Total rows, Total Columns: {df.shape}')
df.sample(n=5)  # Random sample values to see

Total rows, Total Columns: (48854, 2)


Unnamed: 0,0,1
36563,3,Man Pays Parking Fines in CoinsYoutube
26510,3,Pupillometry The Cutting Edge of Mind ControlB...
5279,1,Local resident Carl Sutton was recognized by F...
38833,3,Is There Going To Be A Stock Market Crash In T...
26222,3,"Make a 55-Gallon Compost Tumbler Fast, Cheap a..."


In [14]:
# Report the test set's dimensions, then preview a few random rows
print(f'Total rows, Total Columns: {test_df.shape}')
test_df.sample(n=5)  # Random sample values to see

Total rows, Total Columns: (3000, 2)


Unnamed: 0,0,1
845,2,More Moolah Wasted As NIH Kills National Child...
1727,3,With preparations being made for the massive ...
997,2,EXPOSED! U.N. AGENDA: INFILTRATE AMERICA With ...
1771,3,Nichole Rolfe (formerly Bruff) was expelled f...
725,1,Republican Presidential candidate Mitt Romneys...


### Text-Cleaning Function

In [15]:
# English stopwords from NLTK, held in a set for fast membership checks during preprocessing
stopword_set = {word for word in stopwords.words('english')}

# Fold to lower case
def to_lower_case(text):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered

def tokenise_text(text):
    """Tokenise `text` with NLTK's word tokenizer and return the tokens."""
    return nltk.word_tokenize(text)

def remove_stopwords(text, stopword_set):
    """Drop every whitespace-delimited token of `text` that is in `stopword_set`.

    The surviving tokens are re-joined with single spaces, so any run of
    whitespace in the input collapses to one space in the result.
    """
    kept_tokens = [word for word in text.split() if word not in stopword_set]
    return ' '.join(kept_tokens)

# Remove all punctuation - Affects words such as U.S.A etc
# Removal of stop words has to be done prior to punctuation removal
class _PunctuationTable(dict):
    """Translation table for str.translate that deletes punctuation.

    Pre-seeded with every character of string.punctuation. Any other code
    point is classified the first time it is looked up, using its Unicode
    general category (categories starting with 'P' are punctuation), and the
    answer is cached in the dict so each distinct character is checked once.
    """

    def __missing__(self, codepoint):
        is_punctuation = unicodedata.category(chr(codepoint)).startswith('P')
        # None tells str.translate to delete the character; mapping a code
        # point to itself leaves the character unchanged.
        self[codepoint] = None if is_punctuation else codepoint
        return self[codepoint]


# Built once instead of on every call; the ASCII entries come from string.punctuation
_PUNCTUATION_TABLE = _PunctuationTable(str.maketrans('', '', string.punctuation))


def remove_punctuation(text):
    """Return `text` with punctuation characters deleted.

    Removes everything in string.punctuation (as before) and additionally any
    character whose Unicode category is punctuation, e.g. curly quotes,
    ellipses and non-ASCII dashes, which string.punctuation does not cover.
    Characters are deleted, not replaced by a space, so "U.S.A" becomes "USA".
    """
    depunctuated_text = text.translate(_PUNCTUATION_TABLE)
    return depunctuated_text

# Prevent concatenation of statistics and names
# Covers ASCII hyphen-minus plus the Unicode hyphen, non-breaking hyphen,
# figure dash, en dash, em dash, horizontal bar and minus sign.
_DASH_TO_SPACE = {
    ord(dash): ' '
    for dash in '-\u2010\u2011\u2012\u2013\u2014\u2015\u2212'
}


def replace_hyphens(text):
    """Return `text` with every hyphen or dash character replaced by a space.

    Each dash becomes exactly one space, so words or numbers joined by a dash
    (e.g. "55-Gallon", "2010\u20132011") stay separate tokens instead of being
    glued together when punctuation is stripped later.
    """
    return text.translate(_DASH_TO_SPACE)

# Combine all processes into a single preprocess text function to call on df
def preprocess_text(text):
    """Clean one document and return it as a single space-separated string.

    Steps, in order: replace hyphens with spaces, lower-case, remove
    stopwords, tokenise with NLTK, strip punctuation, then remove stopwords a
    second time (stripping punctuation can turn a token into a stopword).

    A missing value (None / NaN / pd.NA, as pandas produces for an empty CSV
    cell) yields an empty string instead of raising AttributeError. Any other
    non-string input still raises as before.
    """
    # Empty CSV cells arrive as NaN rather than str; treat them as empty text
    if not isinstance(text, str) and pd.api.types.is_scalar(text) and pd.isna(text):
        return ''

    dehyphenated_text = replace_hyphens(text)
    lowered_text = to_lower_case(dehyphenated_text)
    # First pass runs before tokenising so contractions such as "don't" are
    # matched whole, while their apostrophes are still present
    initial_stopword_pass = remove_stopwords(lowered_text, stopword_set)
    tokens = tokenise_text(initial_stopword_pass)
    tokenised_text = ' '.join(tokens)
    depunctuated_text = remove_punctuation(tokenised_text)
    second_stopword_pass = remove_stopwords(depunctuated_text, stopword_set)
    return second_stopword_pass

In [16]:
# Run the full cleaning pipeline over the text column (column 1) of the training set
print('Cleaning training text...')
df[1] = df[1].apply(preprocess_text)
print('Preprocessing done!')
df.sample(n=10)  # Random sample values to see

Cleaning training text...
Preprocessing done!


Unnamed: 0,0,1
16424,2,obama plants watchdog congress undermine trump...
31257,3,president obama signed dark act lawby heather ...
22689,3,dont let trade away internet freedomseff corpo...
10598,1,citing devastatingly uncomfortable makes peopl...
29271,3,dead disabled displaced destroyed democracy de...
44033,4,looking like matinee idol bygone era ryan moor...
47485,4,government information office gio rejected acc...
22052,3,house bill introduced us govt stop funding arm...
23863,3,theres reason called courts janet c phelan unl...
40388,4,britain limped longest ever recession fourth q...


In [17]:
# Run the same cleaning pipeline over the text column (column 1) of the test set
print('Cleaning test text...')
test_df[1] = test_df[1].apply(preprocess_text)
print('Preprocessing done!')
test_df.sample(n=10)  # Random sample values to see

Cleaning test text...
Preprocessing done!


Unnamed: 0,0,1
1576,3,tweet patients irritable bowel syndrome ibs to...
2860,4,supreme court justices signaled tuesday likely...
1222,2,hillary comey says charges online poll virgini...
2884,4,netherlands becomes first team qualify round 1...
642,1,serial junkies might want sit one whether your...
2421,4,ask david plouffe democrats recover electoral ...
1069,2,hands dont shoot true per witness multiple sit...
429,1,billionaire koch brothers approved controversi...
2829,4,record high water levels put capacity china ma...
884,2,roger stone organizing cleveland protests stop...


In [20]:
# Write the cleaned training set without header or index, matching the raw file layout.
# to_csv does not create missing directories, so make sure the output folder exists first.
clean_train_out = Path('./cleaned_data/clean_fulltrain.csv')
clean_train_out.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(clean_train_out, header=False, index=False)

In [21]:
# Write the cleaned test set without header or index, matching the raw file layout.
# to_csv does not create missing directories, so make sure the output folder exists first.
clean_test_out = Path('./cleaned_data/clean_balancedtest.csv')
clean_test_out.parent.mkdir(parents=True, exist_ok=True)
test_df.to_csv(clean_test_out, header=False, index=False)

In [25]:
# Read the cleaned CSV files back in to check what was written
clean_train_path = './cleaned_data/clean_fulltrain.csv'
clean_test_path = './cleaned_data/clean_balancedtest.csv'
# keep_default_na=False: by default read_csv converts empty fields and strings
# such as "nan" or "null" to float NaN. A document whose cleaned text is empty
# (or is exactly one of those words) would then come back as a float, not a str.
clean_df = pd.read_csv(clean_train_path, header=None, keep_default_na=False)
clean_test_df = pd.read_csv(clean_test_path, header=None, keep_default_na=False)

print('Total rows, Total Columns: ' + str(clean_df.shape))
clean_df.sample(5) # Random sample values to see

Total rows, Total Columns: (48854, 2)


Unnamed: 0,0,1
41063,4,icelandic farmers living areas near erupting v...
38807,3,iarpa us government spies us vimeo youtubesusa...
3884,1,dr andrew lassiter st luke medical center emer...
18466,2,everyone distracted super tuesday obama secret...
38286,3,3 tools building decentralized communities fre...


In [26]:
# Report the reloaded test set's dimensions, then preview a few random rows
print(f'Total rows, Total Columns: {clean_test_df.shape}')
clean_test_df.sample(n=5)  # Random sample values to see

Total rows, Total Columns: (3000, 2)


Unnamed: 0,0,1
2648,4,barnes noble virginia tysons corner center one...
2706,4,following editorial appeared thursday washingt...
404,1,aftermath sudden withdrawal presidential race ...
1231,2,us officer arrested aiding isis veteran washin...
150,1,construction sochi much track finished time 20...


In [27]:
# Show the full cleaned text of one test document (row label 2649, text column 1)
print(clean_test_df.loc[2649, 1])

us government seeing hints adversaries targeting military networks remote sabotage head pentagon recently launched cyber command said first public remarks since confirmed last month potential sabotage destruction possible something must treat seriously said gen keith alexander also heads national security agency nation largest intelligence agency department defense must able operate freely defend resources cyberspace alexander spoke thursday 300 people center strategic international studies washington remarks afterward alexander said concerned safety computer systems used war zones concern look could happen computer clearly sabotage destruction things yet come said defend systems people able break james lewis director csis technology public policy program said advanced militaries capable destroying us computer systems true four years ago true cyber command deal said cyber command launched last month fort meade md created defense secretary robert gates streamline military capabilities a