In [3]:
# Import statements
import numpy as np
import pandas as pd
import os
import sys

In [4]:
# Load the data into a pandas frame
path = './data/articles1.csv'
if os.path.exists(path):
    chunk_list = []
    # Read the CSV 10,000 rows at a time; the pieces are stitched together below.
    reader_obj = pd.read_csv(path,chunksize=10000)
    try:
        for chunk in reader_obj:
            chunk_list.append(chunk)
        # 'Unnamed: 0' is dropped before use (presumably the row index written
        # out when the CSV was saved -- verify against the data file).
        data_frame = pd.concat(chunk_list).drop(['Unnamed: 0'],axis=1)
        print(data_frame.columns)
    except Exception:
        # Parsing / concatenation / missing-column errors are reported, not raised.
        # Catching `Exception` rather than using a bare `except:` lets
        # KeyboardInterrupt and SystemExit propagate normally.
        print(sys.exc_info())
else:
    # Report the missing file here instead of failing later with a NameError
    # on `data_frame`.
    print("Data file not found: " + path)

Index(['id', 'title', 'publication', 'author', 'date', 'year', 'month', 'url',
       'content'],
      dtype='object')


## Display a sample text

In [5]:
# Pick one article body (row label 10) to use as the running example
sample_content = data_frame.loc[10, 'content']
print(sample_content)

With Donald J. Trump about to take control of the White House, it would seem a dark time for the renewable energy industry. After all, Mr. Trump has mocked the science of global warming as a Chinese hoax, threatened to kill a global deal on climate change and promised to restore the coal industry to its former glory. So consider what happened in the middle of December, after investors had had a month to absorb the implications of Mr. Trump’s victory. The federal government opened bidding on a tract of the ocean floor off New York State as a potential site for a huge wind farm. Up, up and away soared the offers  —   interest from the bidders was so fevered that the auction went through 33 rounds and spilled over to a second day. In the end, the winning bidder offered the federal Treasury $42 million, more than twice what the government got in August for oil leases  —   oil leases  —   in the Gulf of Mexico. Who won the bid? None other than Statoil, the Norwegian oil company, which is in

# Tokenization
### Tokenization is the process of splitting a text into its individual tokens; here each token is a lowercased word (a run of letters, digits or underscores).

# Removal of Stop Words: 
### In this step we also remove stop words (very common words such as "the", "is" and "and") and purely numeric tokens, so that only the meaningful words remain.


In [6]:

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

def tokenization(content):
    r"""Split a piece of text into lowercase word tokens.

    Parameters
    ----------
    content : str
        Raw text to tokenize. ``None`` or a float NaN (the usual way a missing
        cell shows up in an object column) is treated as empty text.

    Returns
    -------
    list of str
        Every run of word characters (regex ``\w+``) found in the lowercased
        text, in order of appearance. Empty for missing input.
    """
    # `content != content` is only true for NaN; guarding missing values here
    # keeps a column-wide `.apply(tokenization)` from failing on `.lower()`.
    if content is None or (isinstance(content, float) and content != content):
        return []
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(content.lower())
# Tokenize the sample article and show how many tokens it produced
token_list = tokenization(sample_content)
print(len(token_list))

1184


In [36]:
from nltk.corpus import stopwords
def _english_stopword_set():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    cached = getattr(_english_stopword_set, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        # Stored on the function so repeated calls (one per article in the
        # pipeline) do not ask the corpus reader for the word list again.
        _english_stopword_set._cache = cached
    return cached


def stop_words_filter(token_list, stopword_set=None):
    """Drop stop words and purely numeric tokens from a token list.

    Parameters
    ----------
    token_list : iterable of str
        Tokens to filter.
    stopword_set : collection of str, optional
        Words to remove. Defaults to NLTK's English stop-word list.

    Returns
    -------
    list of str
        The tokens that are neither in ``stopword_set`` nor numeric
        (``str.isnumeric``), in their original order.
    """
    if stopword_set is None:
        stopword_set = _english_stopword_set()
    return [
        token for token in token_list
        if token not in stopword_set and not token.isnumeric()
    ]
# Filter the sample article's tokens and show how many are left
filtered_tokens = stop_words_filter(token_list)
print(len(filtered_tokens))

646


### As you can see we were able to remove a lot of unnecessary words from the tokens

### Stemming and lemmatization
#### For grammatical reasons, documents are going to use different forms of a word, such as organize, organizes, and organizing. Additionally, there are families of derivationally related words with similar meanings, such as democracy, democratic, and democratization. In many situations, it seems as if it would be useful for a search for one of these words to return documents that contain another word in the set.

#### The goal of both stemming and lemmatization is to reduce inflectional forms and sometimes derivationally related forms of a word to a common base form. For instance:

#### &emsp;  &emsp; am, are, is $\Rightarrow$ be
#### &emsp;  &emsp; car, cars, car's, cars' $\Rightarrow$ car
#### The result of this mapping of text will be something like:
#### &emsp;  &emsp; the boy's cars are different colors $\Rightarrow$
#### &emsp;  &emsp; the boy car be differ color
#### However, the two words differ in their flavor. Stemming usually refers to a crude heuristic process that chops off the ends of words in the hope of achieving this goal correctly most of the time, and often includes the removal of derivational affixes. Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma . If confronted with the token saw, stemming might return just s, whereas lemmatization would attempt to return either see or saw depending on whether the use of the token was as a verb or a noun. The two may also differ in that stemming most commonly collapses derivationally related words, whereas lemmatization commonly only collapses the different inflectional forms of a lemma. Linguistic processing for stemming or lemmatization is often done by an additional plug-in component to the indexing process, and a number of such components exist, both commercial and open-source.

#### For more information refer: https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html

### NOTE: If you uncomment the two print lines in the method below, you can see what the function does to each token. Lemmatization helps to improve the performance of count-based clustering techniques, and it also reduces the size of the sparse matrix.

#### For example, many tokens get lemmatized, such as:
##### eg 1 . rounds => round 
##### eg 2 . leases => lease
##### eg 3 . jobs => job
##### eg 4 . appointees => appointee


In [37]:
# Lemmatization 
from nltk.stem import WordNetLemmatizer
def lemmatize_tokens(filtered_tokens):
    """Lemmatize each token with WordNet and return them as one string.

    Parameters
    ----------
    filtered_tokens : iterable of str
        Tokens to lemmatize.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces, in input order.
    """
    wordnet = WordNetLemmatizer()
    base_forms = []
    for word in filtered_tokens:
        lemma = wordnet.lemmatize(word)
        #print("token: " + word)
        #print("Lemmantized " + lemma)
        base_forms.append(lemma)
    return ' '.join(base_forms)
# Lemmatize the sample article's filtered tokens into one space-separated string
lemmatized_string = lemmatize_tokens(filtered_tokens)

In [38]:
# Create a data pipeline to process the dataset 
from nltk.tokenize import word_tokenize
def data_preprocessing(data_frame, max_rows=1000):
    """Turn raw article text into cleaned, space-separated token strings.

    Each selected row of ``data_frame['content']`` is tokenized with
    ``tokenization``, filtered with ``stop_words_filter`` and joined back into
    a single string.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame with a ``'content'`` column holding the article text.
    max_rows : int or None, optional
        Number of leading rows to process (default 1000). ``None`` processes
        every row.

    Returns
    -------
    pandas.DataFrame
        One column, ``'content'``, with the cleaned text; the index is that of
        the processed input rows.
    """
    articles = data_frame['content']
    if max_rows is not None:
        articles = articles.iloc[:max_rows]

    def clean(text):
        # Step 1: tokenize, Step 2: remove stop words, Step 3: make a string.
        return ' '.join(stop_words_filter(tokenization(text)))

    # One pass per row: the token lists are never all held at once, so no
    # manual chunking is needed to limit memory use.
    return articles.apply(clean).to_frame(name='content')

# Run the preprocessing pipeline on the loaded articles and show the result
df = data_preprocessing(data_frame)
print(df)

                                               content
0    washington congressional republicans new fear ...
1    bullet shells get counted blood dries votive c...
2    walt disney bambi opened critics praised spare...
3    death may great equalizer necessarily evenhand...
4    seoul south korea north korea leader kim said ...
..                                                 ...
995  many world winter athletes preparing season ch...
996  mother jones named magazine year tuesday findi...
997  marissa alexander longer wears ankle monitor f...
998  support among american high school students fi...
999  damien chazelle writer director musical la la ...

[1000 rows x 1 columns]


In [40]:
from sklearn.feature_extraction.text import CountVectorizer
def create_sparse_mat(corpus):
    """Build a bag-of-words count matrix from a collection of documents.

    Parameters
    ----------
    corpus : iterable of str
        The documents to vectorize, one string per document.

    Returns
    -------
    The document-term count matrix returned by
    ``CountVectorizer().fit_transform(corpus)``.

    Side effects
    ------------
    Prints the learned vocabulary (feature names).
    """
    vectorizer = CountVectorizer()
    x = vectorizer.fit_transform(corpus)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer get_feature_names_out() and fall back only on old versions.
    if hasattr(vectorizer, 'get_feature_names_out'):
        # .tolist() yields plain Python strings, so the printout matches the
        # list the old API produced.
        word_features = vectorizer.get_feature_names_out().tolist()
    else:
        word_features = vectorizer.get_feature_names()
    print(word_features)
    return x

# Vectorize the cleaned articles and show the matrix dimensions
sparse_mat = create_sparse_mat(df['content'])
print(sparse_mat.shape)    


(1000, 37698)
