# Paths

In [1]:
# Folder holding the input dataset and every intermediate text dump.
EXAMPLE_ROOT = 'Example04'

# Input dataset.
NEWSGROUP_JSON_PATH = EXAMPLE_ROOT + '/newsgroups.json'

# Intermediate dumps; the numeric prefix follows the order in which the
# preprocessing cell writes them.
NEWSGROUP_CONTENT_RAW_TXT_PATH = EXAMPLE_ROOT + '/01_newsgroups_content_raw.txt'
NEWSGROUP_LINES_RAW_TXT_PATH = EXAMPLE_ROOT + '/02_newsgroups_lines_raw.txt'
NEWSGROUP_CLEAN_TXT_PATH = EXAMPLE_ROOT + '/03_newsgroups_clean.txt'
NEWSGROUP_LEMINIZED_PATH = EXAMPLE_ROOT + '/04_leminized.txt'
NEWSGROUP_LEMINIZED_NO_STOP_PATH = EXAMPLE_ROOT + '/05_leminized_no_stop.txt'

# Import Dataset

In [2]:
import pandas as pd

# Load the newsgroups dataset from its JSON dump into a DataFrame.
df = pd.read_json(path_or_buf=NEWSGROUP_JSON_PATH)

# Preview the first 15 rows (last expression, so the notebook renders it).
df.head(n=15)

Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
10,From: irwin@cmptrc.lonestar.org (Irwin Arnstei...,8,rec.motorcycles
100,From: tchen@magnus.acs.ohio-state.edu (Tsung-K...,6,misc.forsale
1000,From: dabl2@nlm.nih.gov (Don A.B. Lindbergh)\n...,2,comp.os.ms-windows.misc
10000,From: a207706@moe.dseg.ti.com (Robert Loper)\n...,7,rec.autos
10001,From: kimman@magnus.acs.ohio-state.edu (Kim Ri...,6,misc.forsale
10002,From: kwilson@casbah.acns.nwu.edu (Kirtley Wil...,2,comp.os.ms-windows.misc
10003,Subject: Re: Don't more innocents die without ...,0,alt.atheism
10004,From: livesey@solntze.wpd.sgi.com (Jon Livesey...,0,alt.atheism


# Data Preprocessing

In [3]:
import re, string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Shared NLTK helpers read by the preprocessing functions in this cell.
# NOTE(review): presumably needs the NLTK 'stopwords' and 'wordnet' corpora
# to be available locally already - confirm on a fresh environment.
stopWords = {*stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

def sentence_to_words(sentences):
    """Tokenize each sentence by splitting it on single space characters.

    Args:
        sentences: iterable of strings.

    Returns:
        A list with one list of tokens per input sentence. Because the split
        is on a literal ' ', consecutive spaces yield empty-string tokens and
        an empty sentence yields [''].
    """
    tokenized = []
    for sentence in sentences:
        tokenized.append(sentence.split(' '))
    return tokenized
        
def dump_to_file(filePath, data):
    """Write every item of ``data`` to ``filePath``, one item per line.

    The file is opened as UTF-8 text and each item is written as text.
    Previously each item was ``.encode("utf-8")``-ed before ``%s`` formatting,
    which on Python 3 writes the bytes repr (``b'...'`` with ``\\x..`` escapes)
    instead of the actual characters.

    Args:
        filePath: destination path; an existing file is overwritten.
        data: iterable of items (normally strings); each is formatted with
            ``%s`` and followed by a newline.
    """
    # Explicit encoding keeps the output identical across platforms whose
    # default locale encoding is not UTF-8.
    with open(filePath, 'w', encoding='utf-8') as f:
        for item in data:
            f.write("%s\n" % item)

def leminize_data_words(data_words):
    """Lemmatize each word with the module-level ``lemmatizer``.

    Args:
        data_words: iterable of word strings.

    Returns:
        A new list holding ``lemmatizer.lemmatize(word)`` for every input
        word, in the original order.
    """
    lemmas = []
    for token in data_words:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

def filter_out_stop_words(data_words):
    """Drop every word that is a member of the module-level ``stopWords``.

    Args:
        data_words: iterable of word strings.

    Returns:
        A new list with the remaining words in their original order.
    """
    kept = []
    for token in data_words:
        if token in stopWords:
            continue
        kept.append(token)
    return kept

def remove_empty_words(data_words):
    """Return the words of ``data_words`` with falsy entries removed.

    Args:
        data_words: iterable of word strings.

    Returns:
        A new list, in the original order, without empty strings (or any
        other falsy value).
    """
    # filter(None, ...) keeps exactly the truthy items.
    return list(filter(None, data_words))

def join(data_words_sentences):
    """Glue each list of words back into one space-separated string.

    Args:
        data_words_sentences: iterable of word lists.

    Returns:
        A list with one string per input word list.
    """
    return list(map(' '.join, data_words_sentences))

# Pull the raw message bodies out of the DataFrame as a plain Python list.
data = df.content.values.tolist()

dump_to_file(NEWSGROUP_CONTENT_RAW_TXT_PATH, data)

# Header noise to strip, applied to every message in the order listed.
# Patterns ending in '.*' drop the rest of the line ('.' does not match a
# newline by default); the others drop only the matched label.
# Each pattern is a raw string, has its literal dots escaped, and starts with
# \b so it cannot match inside a longer word - without it, the
# case-insensitive 'Re:' also ate the tail of words such as "software:".
HEADER_NOISE_PATTERNS = (
    r'\bRe:',
    r'\bFrom:.*',
    r'\bOrganization:.*',
    r'\bNntp-Posting-Host:.*',
    r'\bDistribution:.*',
    r'\bArticle-I\.D\.:.*',
    r'\bKeywords:.*',
    r'\bExpires:.*',
    r'\bSubject:',
    r'\bLines: [0-9]+',
)
for pattern in HEADER_NOISE_PATTERNS:
    data = [re.sub(pattern, '', sent, flags=re.I) for sent in data]

# Collapse every whitespace run (newlines included) into a single space.
data = [re.sub(r'\s+', ' ', sent) for sent in data]

dump_to_file(NEWSGROUP_LINES_RAW_TXT_PATH, data)

# Replace every character that is not an ASCII letter with a space.
data = [re.sub(r'[^A-Za-z]', ' ', sent) for sent in data]

# Transform to lowercase.
data = [sent.lower() for sent in data]

# Collapse the runs of spaces left behind by the replacement above.
data = [re.sub(r' +', ' ', sent) for sent in data]

# Trim leading and trailing whitespace.
data = [sent.strip() for sent in data]

dump_to_file(NEWSGROUP_CLEAN_TXT_PATH, data)

# Convert sentences to lists of words.
data_words_sentences = sentence_to_words(data)

# Perform lemmatization.
data_words_sentences = [leminize_data_words(data_words) for data_words in data_words_sentences]

dump_to_file(NEWSGROUP_LEMINIZED_PATH, join(data_words_sentences))

# Filter out stop words.
data_words_sentences = [filter_out_stop_words(data_words) for data_words in data_words_sentences]

# Remove empty words (a fully stripped message splits to ['']).
data_words_sentences = [remove_empty_words(data_words) for data_words in data_words_sentences]

dump_to_file(NEWSGROUP_LEMINIZED_NO_STOP_PATH, join(data_words_sentences))