## Step 1: Load the dataset

In [None]:
import pandas as pd
import numpy as np

# Read in the data. Malformed rows are skipped instead of aborting the load.
# on_bad_lines='skip' is the replacement for error_bad_lines=False, which
# recent pandas releases no longer accept.
df = pd.read_csv('abcnews-date-text.csv', on_bad_lines='skip')

# We only need the headline text column, limited to the first 300000 rows.
# .copy() makes 'data_text' an independent frame, so adding the 'index'
# column below does not raise SettingWithCopyWarning.
data_text = df[['headline_text']].iloc[:300000].copy()
data_text['index'] = data_text.index

documents = data_text

In [None]:
# Number of headlines retained in 'documents'.
print(len(documents))

In [None]:
# Preview the first few rows of 'documents' (headline text plus its 'index' column).
documents.head()

## Step 2: Data Preprocessing

* Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
* Words that have 3 or fewer characters are removed.
* All stopwords are removed.
* Words are lemmatized - words in third person are changed to first person and verbs in past and future tenses are changed into present.
* Words are stemmed - words are reduced to their root form.

In [None]:
#Tokenization

#call necessary libraries
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
nltk.download('wordnet')
np.random.seed(400)

In [None]:
'''
Function to perform the pre processing steps on the entire dataset
'''
#Lemmatize and stem
# Cached stemmer instance, created on the first call to lemmatize_stemming().
# The original code referenced a global 'stemmer' that was never defined.
_stemmer = None


def lemmatize_stemming(text):
    '''Lemmatize a single token as a verb, then stem the lemma.

    text: one token (a string), as produced by the tokenizer.
    Returns the stemmed form of the verb-lemmatized token.
    '''
    global _stemmer
    if _stemmer is None:
        # NOTE(review): 'english' is assumed because the headlines are
        # English-language text -- confirm if the dataset changes.
        _stemmer = SnowballStemmer('english')
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return _stemmer.stem(lemma)

# Tokenize and lemmatize
def preprocess(text):
    '''Tokenize the text and return its lemmatized, stemmed tokens.

    Tokens that are gensim stopwords, or that are 3 characters long or
    shorter, are dropped before lemmatizing and stemming.
    '''
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(tok)
        for tok in gensim.utils.simple_preprocess(text)
        if len(tok) > 3 and tok not in stopwords
    ]


In [None]:
'''
Preview a document after preprocessing
'''
document_num = 4310

# Select the row whose 'index' matches and take its first column,
# which is the headline text.
matching_rows = documents[documents['index'] == document_num]
doc_sample = matching_rows.values[0][0]

print("Original document: ")
# Splitting on single spaces already yields the list of raw words.
words = doc_sample.split(' ')
print(words)
print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))

In [None]:
# Preprocess all the headlines, saving the results as 'processed_docs':
# one token list (the output of preprocess) per headline.
processed_docs =documents['headline_text'].map(preprocess)

In [None]:
# Preview the first 10 processed documents (each is the token list returned by preprocess).
processed_docs[:10]

## Step 3: Perform Bag of Words on the Dataset

In [None]:
'''
Create a dictionary from 'processed_docs' containing the number of times a word appears 
in the training set using gensim.corpora.Dictionary and call it 'dictionary'
'''
# Build the gensim dictionary from the tokenized headlines; iterating it
# later yields (id, token) pairs.
dictionary = gensim.corpora.Dictionary(processed_docs)

In [None]:
'''
Checking dictionary created
'''
# Print the first 11 (id, token) entries of the dictionary as a sanity check.
for shown, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if shown > 10:
        break

In [None]:
'''
Remove very rare and very common words:

- words appearing in fewer than 15 documents
- words appearing in more than 10% of all documents

Of the words that remain, keep at most the 100000 most frequent.
'''
# filter_extremes() prunes the dictionary in place and returns None, so its
# result must not be assigned back to 'dictionary'.
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n=100000)

## Step 4: Perform TF-IDF on the Dataset

In [None]:
'''
Build the bag-of-words corpus for the headlines and apply tf-idf to it
'''
# Bag of words: each headline becomes a list of (token_id, count) pairs.
# Tokens that are not in 'dictionary' are ignored by doc2bow().
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Derive the idf weights from the document frequencies stored in the dictionary.
model = gensim.models.TfidfModel(dictionary=dictionary)

# Indexing the model with a corpus gives a lazily evaluated tf-idf corpus:
# each document is weighted when it is iterated over.
tfidf_corpus = model[bow_corpus]

In [None]:
'''
'''