In [1]:
# Import statements
import numpy as np
import pandas as pd
import os
import sys

In [2]:
# Load the data into pandas frame
data_frame = pd.DataFrame()
for i in range(1,4,1):
    try:
        path = './data/articles'+str(i)+'.csv'
        if os.path.exists(path):
            chunk_list = []
            reader_obj = pd.read_csv(path,chunksize=10000) 
            for chunk in reader_obj:
                chunk_list.append(chunk)    
            data_frame = pd.concat([data_frame,pd.concat(chunk_list).drop(['Unnamed: 0'],axis=1)], ignore_index=True)
    except:
        # handle the file not found error
        print(sys.exc_info())
    
print(data_frame.columns)
print(data_frame.shape)

Index(['id', 'title', 'publication', 'author', 'date', 'year', 'month', 'url',
       'content'],
      dtype='object')
(142570, 9)


## Display a simple text

In [3]:
# Displaying a sample content
sample_content = data_frame['content'][10]
print(sample_content)

With Donald J. Trump about to take control of the White House, it would seem a dark time for the renewable energy industry. After all, Mr. Trump has mocked the science of global warming as a Chinese hoax, threatened to kill a global deal on climate change and promised to restore the coal industry to its former glory. So consider what happened in the middle of December, after investors had had a month to absorb the implications of Mr. Trump’s victory. The federal government opened bidding on a tract of the ocean floor off New York State as a potential site for a huge wind farm. Up, up and away soared the offers  —   interest from the bidders was so fevered that the auction went through 33 rounds and spilled over to a second day. In the end, the winning bidder offered the federal Treasury $42 million, more than twice what the government got in August for oil leases  —   oil leases  —   in the Gulf of Mexico. Who won the bid? None other than Statoil, the Norwegian oil company, which is in

# Tokenization
### Tokenizations is the process of separating each and every small letter of the sentence.

# Removal of Stop Words: 
### In this process we are also eliminating the stop words in order to extract only words 


In [4]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
def tokenization(content):
    tokenizer = RegexpTokenizer(r'\w+')
    token_list = tokenizer.tokenize(content.lower())
    return token_list
token_list = tokenization(sample_content)
print(token_list)
print(len(token_list))

['with', 'donald', 'j', 'trump', 'about', 'to', 'take', 'control', 'of', 'the', 'white', 'house', 'it', 'would', 'seem', 'a', 'dark', 'time', 'for', 'the', 'renewable', 'energy', 'industry', 'after', 'all', 'mr', 'trump', 'has', 'mocked', 'the', 'science', 'of', 'global', 'warming', 'as', 'a', 'chinese', 'hoax', 'threatened', 'to', 'kill', 'a', 'global', 'deal', 'on', 'climate', 'change', 'and', 'promised', 'to', 'restore', 'the', 'coal', 'industry', 'to', 'its', 'former', 'glory', 'so', 'consider', 'what', 'happened', 'in', 'the', 'middle', 'of', 'december', 'after', 'investors', 'had', 'had', 'a', 'month', 'to', 'absorb', 'the', 'implications', 'of', 'mr', 'trump', 's', 'victory', 'the', 'federal', 'government', 'opened', 'bidding', 'on', 'a', 'tract', 'of', 'the', 'ocean', 'floor', 'off', 'new', 'york', 'state', 'as', 'a', 'potential', 'site', 'for', 'a', 'huge', 'wind', 'farm', 'up', 'up', 'and', 'away', 'soared', 'the', 'offers', 'interest', 'from', 'the', 'bidders', 'was', 'so', 

In [5]:
import nltk
nltk.download('words')
words = set(nltk.corpus.words.words())
def remove_non_english(content):
    return " ".join(w for w in nltk.wordpunct_tokenize(content) 
            if w.lower() in words or not w.isalpha())

[nltk_data] Downloading package words to C:\Users\Dharmang
[nltk_data]     Solanki\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [6]:
from nltk.corpus import stopwords
def stop_words_filter(token_list):
    stopword_set = set(stopwords.words('english'))    
    filtered_tokens = []
    for token in token_list:
        if token not in stopword_set and token.isnumeric() == False:
            filtered_tokens.append(token)
    return filtered_tokens

filtered_tokens = stop_words_filter(token_list)
print(filtered_tokens)

['donald', 'j', 'trump', 'take', 'control', 'white', 'house', 'would', 'seem', 'dark', 'time', 'renewable', 'energy', 'industry', 'mr', 'trump', 'mocked', 'science', 'global', 'warming', 'chinese', 'hoax', 'threatened', 'kill', 'global', 'deal', 'climate', 'change', 'promised', 'restore', 'coal', 'industry', 'former', 'glory', 'consider', 'happened', 'middle', 'december', 'investors', 'month', 'absorb', 'implications', 'mr', 'trump', 'victory', 'federal', 'government', 'opened', 'bidding', 'tract', 'ocean', 'floor', 'new', 'york', 'state', 'potential', 'site', 'huge', 'wind', 'farm', 'away', 'soared', 'offers', 'interest', 'bidders', 'fevered', 'auction', 'went', 'rounds', 'spilled', 'second', 'day', 'end', 'winning', 'bidder', 'offered', 'federal', 'treasury', 'million', 'twice', 'government', 'got', 'august', 'oil', 'leases', 'oil', 'leases', 'gulf', 'mexico', 'bid', 'none', 'statoil', 'norwegian', 'oil', 'company', 'midst', 'major', 'campaign', 'turn', 'big', 'player', 'renewable', 

### As you can see we were able to remove a lot of unnecessary words from the tokens

### Stemming and lemmatization
#### For grammatical reasons, documents are going to use different forms of a word, such as organize, organizes, and organizing. Additionally, there are families of derivationally related words with similar meanings, such as democracy, democratic, and democratization. In many situations, it seems as if it would be useful for a search for one of these words to return documents that contain another word in the set.

#### The goal of both stemming and lemmatization is to reduce inflectional forms and sometimes derivationally related forms of a word to a common base form. For instance:

#### &emsp;  &emsp; am, are, is $\Rightarrow$ be
#### &emsp;  &emsp; car, cars, car's, cars' $\Rightarrow$ car
#### The result of this mapping of text will be something like:
#### &emsp;  &emsp; the boy's cars are different colors $\Rightarrow$
#### &emsp;  &emsp; the boy car be differ color
#### However, the two words differ in their flavor. Stemming usually refers to a crude heuristic process that chops off the ends of words in the hope of achieving this goal correctly most of the time, and often includes the removal of derivational affixes. Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma . If confronted with the token saw, stemming might return just s, whereas lemmatization would attempt to return either see or saw depending on whether the use of the token was as a verb or a noun. The two may also differ in that stemming most commonly collapses derivationally related words, whereas lemmatization commonly only collapses the different inflectional forms of a lemma. Linguistic processing for stemming or lemmatization is often done by an additional plug-in component to the indexing process, and a number of such components exist, both commercial and open-source.

#### For more information refer: https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html

### NOTE: In the below method if you uncomment the two print lines you will be able to see what this function is doing. This will help to improve the performance of count-based clustering techniques. Also it will reduce the size of sparse matrix.

#### Example there are many examples which are being lemmatized like 
##### eg 1 . rounds => round 
##### eg 2 . leases => lease
##### eg 3 . jobs => job
##### eg 4 . appointees => appointee


In [7]:
# Lemmatization 
from nltk.stem import WordNetLemmatizer
def lemmatize_tokens(filtered_tokens):
    lemmatized = []
    lemmatizer = WordNetLemmatizer()
    for token in filtered_tokens:
        lemmatized.append(lemmatizer.lemmatize(token))
        #print("token: " + token)
        #print("Lemmantized "+lemmatizer.lemmatize(token))
    lemmatized_string = ' '.join(lemmatized)
    return lemmatized_string
lemmatized_string = lemmatize_tokens(filtered_tokens)

In [8]:
# Create a data pipeline to process the dataset 
from nltk.tokenize import word_tokenize
def data_preprocessing(data_frame):

    res = pd.DataFrame()
    '''

    # Step 1: Tokenization
    tmp_data_frame = data_frame['content'][0:1000].apply(lambda row:tokenization(row))

    # Step 2: Remove stop words
    tmp_data_frame = tmp_data_frame.apply(lambda row: stop_words_filter(row))

    # Step 3: Make a string
    tmp_data_frame = tmp_data_frame.apply(lambda row: ' '.join(row))

    res = pd.concat([res,tmp_data_frame])

    '''
    total_records = len(data_frame)  #50000
    start = 0 
    interval = ((total_records - start) // 50 ) #1000
    # The processing in chunks will reduce the memory load
    for i in range(start,total_records,interval):
        
        print(i)
        # Step 1: Tokenization
        if(i+interval < total_records):
            tmp_data_frame = data_frame['content'][i:i+interval].apply(lambda row:tokenization(row))
        else:
            tmp_data_frame = data_frame['content'][i:total_records].apply(lambda row:tokenization(row))
            
        # Step 2: Remove stop words
        tmp_data_frame = tmp_data_frame.apply(lambda row: stop_words_filter(row))
        
        # Step 3: Make a string
        tmp_data_frame = tmp_data_frame.apply(lambda row: ' '.join(row))
        
        #Step 4: Reomve non-english words
        tmp_data_frame = tmp_data_frame.apply(lambda row: remove_non_english(row))
            
        res = pd.concat([res,tmp_data_frame])
        
    res.columns = ['content']
    return res
df = None
df = data_preprocessing(data_frame)
print(df.shape)

0
2851
5702
8553
11404
14255
17106
19957
22808
25659
28510
31361
34212
37063
39914
42765
45616
48467
51318
54169
57020
59871
62722
65573
68424
71275
74126
76977
79828
82679
85530
88381
91232
94083
96934
99785
102636
105487
108338
111189
114040
116891
119742
122593
125444
128295
131146
133997
136848
139699
142550
(142570, 1)


In [9]:
df.to_csv('intermediate_dataframe.csv', sep='\t', encoding='utf-8')

In [10]:
if df is None:
    df = df.read_csv('intermediate_dataframe.csv')

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
sparse_mat = vectorizer.fit_transform(df['content'])
bag_of_words = sparse_mat
word_features = vectorizer.get_feature_names()
print(sparse_mat.shape)    

(142570, 58626)


In [29]:
print(word_features)



In [30]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
vectorizer = TfidfTransformer()
sparse_mat = vectorizer.fit_transform(sparse_mat)


In [31]:
from sklearn.decomposition import TruncatedSVD
pca = TruncatedSVD(n_components=100)
sparse_mat = pca.fit_transform(sparse_mat)

In [32]:
from matplotlib import pyplot as plt
from sklearn import metrics
from collections import Counter
sse = []
davies_score =[]
'''
list_k = list(range(2, 20))
for k in list_k:
    kmeans = KMeans(n_clusters=k, random_state = 0).fit(sparse_mat)
    l = kmeans.labels_
    result = metrics.davies_bouldin_score(sparse_mat, l)
    davies_score.append(result)
    sse.append(kmeans.inertia_)

'''
kmeans = KMeans(20,random_state=0).fit(sparse_mat)
labels = kmeans.labels_


In [37]:
# Plot sse against k
'''
plt.figure(figsize=(6, 6))
plt.plot(list_k, sse, '-o')
plt.xlabel(r'Number of clusters *k*')
plt.ylabel('Sum of squared distance');
plt.savefig('results/20ClustersSSE.png')
'''

"\nplt.figure(figsize=(6, 6))\nplt.plot(list_k, sse, '-o')\nplt.xlabel(r'Number of clusters *k*')\nplt.ylabel('Sum of squared distance');\nplt.savefig('results/20ClustersSSE.png')\n"

In [38]:
'''
plt.figure(figsize=(6, 6))
plt.plot(list_k, davies_score, '-o')
plt.xlabel(r'Number of clusters *k*')
plt.ylabel('Davies Scores');
plt.savefig('results/DaviesScore_20Clusters.png')
'''

"\nplt.figure(figsize=(6, 6))\nplt.plot(list_k, davies_score, '-o')\nplt.xlabel(r'Number of clusters *k*')\nplt.ylabel('Davies Scores');\nplt.savefig('results/DaviesScore_20Clusters.png')\n"

In [39]:
#print("Optimal Cluster size: "+ str(list_k[np.argmin(davies_score)]))

In [172]:
#find the topic name
clusters = []
for i in range(0,20,1):
    cluster_tuple = np.where(labels == i)
    clusters.append(cluster_tuple[0])

print(clusters[4])


[  1109   2653   2767 ... 141385 142018 142523]
13thderek


In [149]:
sum_words = bag_of_words.sum(axis=0)

In [150]:
print(sum_words)

[[1 2 2 ... 1 1 1]]


In [151]:
vec = CountVectorizer().fit(df['content'])

In [152]:
word_freq = {}
for word, idx in vec.vocabulary_.items():
    word_freq[word] = sum_words[0, idx]

In [153]:
word_freq = {k: v for k, v in sorted(word_freq.items(), key=lambda item: item[1],reverse=True)}
word_list = list(word_freq.keys())

In [200]:
print(len(word_freq))


58626


In [201]:
import spacy


284460417


In [217]:
# Try to find the proper nouns 
nlp = spacy.load("en_core_web_sm")
from collections import Counter
tags = []


for i in range(len(clusters)):
    # find the frequecy of each word in that cluster 
    temp = ""
    cnt = 0
    for doc_id in clusters[i]:
        temp = ' '.join(df['content'][doc_id])
    
    print(tokens_of_cluster)    
    '''
    nouns = []
    for token in doc:
            if token.pos_ == 'NOUN':
                tokens_of_cluster.append(token.text)
    '''
        

set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
set()
