In [1]:
# Import statements
import numpy as np
import pandas as pd
import os
import sys
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import spacy
nltk.download('words')
from gensim.summarization.summarizer import summarize 
from gensim.summarization import keywords


[nltk_data] Downloading package words to C:\Users\Dharmang
[nltk_data]     Solanki\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
# Load the data into pandas frame
data_frame = pd.DataFrame()
nlp = spacy.load("en_core_web_sm")
for i in range(1,4,1):
    try:
        path = './data/articles'+str(i)+'.csv'
        if os.path.exists(path):
            chunk_list = []
            reader_obj = pd.read_csv(path,chunksize=10000) 
            for chunk in reader_obj:
                chunk_list.append(chunk)    
            data_frame = pd.concat([data_frame,pd.concat(chunk_list).drop(['Unnamed: 0'],axis=1)], ignore_index=True)
    except:
        # handle the file not found error
        print(sys.exc_info())
    
print(data_frame.columns)
print(data_frame.shape)

Index(['id', 'title', 'publication', 'author', 'date', 'year', 'month', 'url',
       'content'],
      dtype='object')
(142570, 9)


## Display a simple text

In [3]:
# Displaying a sample content
sample_content = data_frame['content'][0]
print(len(sample_content))

5607


# Tokenization
### Tokenizations is the process of separating each and every small letter of the sentence.

# Removal of Stop Words: 
### In this process we are also eliminating the stop words in order to extract only words 


In [4]:
def tokenization(content):
    tokenizer = RegexpTokenizer(r'\w+')
    token_list = tokenizer.tokenize(content.lower())
    return token_list
token_list = tokenization(sample_content)
print(len(token_list))

880


In [5]:
from nltk.corpus import stopwords
from nltk.corpus import wordnet
def stop_words_filter(token_list):
    stopword_set = set(stopwords.words('english'))    
    filtered_tokens = []
    for token in token_list:
        if (token not in stopword_set 
        and token.isnumeric() == False 
        and wordnet.synsets(token) != [] 
        and token.isalpha()
        and len(token)>1):
            filtered_tokens.append(token)
            
    return filtered_tokens

print(len(token_list))
filtered_tokens = stop_words_filter(token_list)
print(len(filtered_tokens))

880
471


### As you can see we were able to remove a lot of unnecessary words from the tokens

### Stemming and lemmatization
#### For grammatical reasons, documents are going to use different forms of a word, such as organize, organizes, and organizing. Additionally, there are families of derivationally related words with similar meanings, such as democracy, democratic, and democratization. In many situations, it seems as if it would be useful for a search for one of these words to return documents that contain another word in the set.

#### The goal of both stemming and lemmatization is to reduce inflectional forms and sometimes derivationally related forms of a word to a common base form. For instance:

#### &emsp;  &emsp; am, are, is $\Rightarrow$ be
#### &emsp;  &emsp; car, cars, car's, cars' $\Rightarrow$ car
#### The result of this mapping of text will be something like:
#### &emsp;  &emsp; the boy's cars are different colors $\Rightarrow$
#### &emsp;  &emsp; the boy car be differ color
#### However, the two words differ in their flavor. Stemming usually refers to a crude heuristic process that chops off the ends of words in the hope of achieving this goal correctly most of the time, and often includes the removal of derivational affixes. Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma . If confronted with the token saw, stemming might return just s, whereas lemmatization would attempt to return either see or saw depending on whether the use of the token was as a verb or a noun. The two may also differ in that stemming most commonly collapses derivationally related words, whereas lemmatization commonly only collapses the different inflectional forms of a lemma. Linguistic processing for stemming or lemmatization is often done by an additional plug-in component to the indexing process, and a number of such components exist, both commercial and open-source.

#### For more information refer: https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html

### NOTE: In the below method if you uncomment the two print lines you will be able to see what this function is doing. This will help to improve the performance of count-based clustering techniques. Also it will reduce the size of sparse matrix.

#### Example there are many examples which are being lemmatized like 
##### eg 1 . rounds => round 
##### eg 2 . leases => lease
##### eg 3 . jobs => job
##### eg 4 . appointees => appointee


In [6]:
# Lemmatization 
def lemmatize_tokens(filtered_tokens):
    lemmatized = []
    lemmatizer = WordNetLemmatizer()
    for token in filtered_tokens:
        lemmatized.append(lemmatizer.lemmatize(token))
        #print("token: " + token)
        #print("Lemmantized "+lemmatizer.lemmatize(token))
    return lemmatized
lemmatized = lemmatize_tokens(filtered_tokens)

In [7]:
def summarize_text(data):
    summ = summarize(data, ratio = 0.05) 

In [8]:
def find_nouns(content,isTokens=True):
    if isTokens:
        doc = nlp(' '.join(content))
    else:
        doc = nlp(content)
    tokens = []
    for index in range(len(doc)):
        if doc[index].pos_ == 'NOUN' or doc[index].pos_ == 'PROPN':
            tokens.append(doc[index].text)
    return tokens
nouns = find_nouns(lemmatized)
print(nouns)

['washington', 'congressional', 'republican', 'new', 'fear', 'health', 'care', 'lawsuit', 'administration', 'trump', 'administration', 'branch', 'suit', 'challenge', 'administration', 'authority', 'dollar', 'health', 'insurance', 'subsidy', 'american', 'handing', 'house', 'republican', 'victory', 'issue', 'loss', 'subsidy', 'health', 'care', 'program', 'implode', 'people', 'health', 'insurance', 'republican', 'replacement', 'lead', 'chaos', 'insurance', 'market', 'backlash', 'republican', 'control', 'government', 'stave', 'outcome', 'republican', 'position', 'sum', 'health', 'care', 'law', 'voter', 'end', 'law', 'year', 'twist', 'trump', 'administration', 'branch', 'prerogative', 'choose', 'republican', 'ally', 'house', 'central', 'question', 'dispute', 'eager', 'avoid', 'pileup', 'republican', 'capitol', 'hill', 'trump', 'transition', 'team', 'lawsuit', 'election', 'put', 'limbo', 'february', 'united', 'state', 'court', 'appeal', 'district', 'columbia', 'circuit', 'divulge', 'strategy

In [28]:
# Create a data pipeline to process the dataset 

def data_preprocessing(data_frame):

    res = pd.DataFrame(columns=['id','content'])
    '''

    # Step 1: Tokenization
    tmp_data_frame = data_frame['content'][0:1000].apply(lambda row:tokenization(row))

    # Step 2: Remove stop words
    tmp_data_frame = tmp_data_frame.apply(lambda row: stop_words_filter(row))

    # Step 3: Make a string
    tmp_data_frame = tmp_data_frame.apply(lambda row: ' '.join(row))

    res = pd.concat([res,tmp_data_frame])

    '''
    total_records = len(data_frame)  #50000
    start = 0 
    interval = ((total_records - start) // 50 ) #1000
    # The processing in chunks will reduce the memory load
    for i in range(start,total_records,interval):
        print(i)
        # Step 1: Shorten the text
        if(i+interval < total_records):
            tmp_data_frame = data_frame['content'][i:i+interval].apply(lambda row:row[:150])
        else:
            tmp_data_frame = data_frame['content'][i:total_records].apply(lambda row:row[:150])
        
        # Step 2: find Nouns and Pronouns
        tmp_data_frame = tmp_data_frame.apply(lambda row:find_nouns(row))
        
        # Step 3: Tokenize the text
        tmp_data_frame = tmp_data_frame.apply(lambda row:tokenization(row))
        
        # Step 4: Remove stop words
        tmp_data_frame = tmp_data_frame.apply(lambda row: stop_words_filter(row))
        
        # Step 5: Lemmitize tokens
        tmp_data_frame = tmp_data_frame.apply(lambda row: lemmatize_tokens(row))
                
        # Step 6: Make a string
        tmp_data_frame = tmp_data_frame.apply(lambda row: ' '.join(row))

        #print(tmp_data_frame.to_frame())
        res = pd.concat([res,tmp_data_frame.to_frame()])
        
    res.columns = ['doc_id','content']
    res['doc_id'] = range(0,total_records)
    return res

df = data_preprocessing(data_frame)


0


AttributeError: 'list' object has no attribute 'lower'

In [11]:
df.to_csv('intermediate_dataframe.csv', sep=',', encoding='utf-8',index=False)

In [12]:
df = pd.read_csv('intermediate_dataframe.csv')

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
sparse_mat = vectorizer.fit_transform(df['content'].apply(lambda x: np.str_(x)))
bag_of_words = sparse_mat
word_features = vectorizer.get_feature_names()
print(sparse_mat.shape)    

(142570, 32029)


In [14]:
print(sparse_mat)

  (0, 31147)	1
  (0, 5747)	1
  (0, 23463)	1
  (0, 18771)	1
  (0, 10446)	1
  (0, 5350)	1
  (0, 12953)	1
  (0, 4118)	1
  (0, 15951)	1
  (0, 363)	1
  (0, 17679)	1
  (1, 3647)	1
  (1, 25363)	1
  (1, 11845)	1
  (1, 6236)	1
  (1, 2922)	1
  (1, 8681)	1
  (1, 30971)	1
  (1, 3990)	1
  (1, 3734)	1
  (1, 20469)	1
  (1, 20393)	1
  (1, 31545)	1
  (1, 24998)	1
  (1, 6482)	1
  :	:
  (142568, 12150)	1
  (142568, 16451)	1
  (142568, 30466)	1
  (142568, 17629)	1
  (142568, 30372)	1
  (142568, 10484)	1
  (142568, 2151)	1
  (142568, 21847)	1
  (142568, 14071)	1
  (142568, 23121)	1
  (142569, 19406)	1
  (142569, 10745)	1
  (142569, 31887)	1
  (142569, 584)	1
  (142569, 3025)	1
  (142569, 28665)	1
  (142569, 18571)	1
  (142569, 18631)	1
  (142569, 17088)	1
  (142569, 23504)	1
  (142569, 12423)	1
  (142569, 5310)	1
  (142569, 25464)	1
  (142569, 25524)	1
  (142569, 22893)	1


In [15]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
vectorizer = TfidfVectorizer()
sparse_mat = vectorizer.fit_transform(df['content'].apply(lambda x: np.str_(x)))


In [16]:
print(sparse_mat)
word_features = vectorizer.get_feature_names()


  (0, 17679)	0.31733068766883377
  (0, 363)	0.2861069135025976
  (0, 15951)	0.35504476391642836
  (0, 4118)	0.3229931454598027
  (0, 12953)	0.3048624260763353
  (0, 5350)	0.2944498605403032
  (0, 10446)	0.3790106499719746
  (0, 18771)	0.19166000289408724
  (0, 23463)	0.21743586085268013
  (0, 5747)	0.3540623685833741
  (0, 31147)	0.23323753808648456
  (1, 5239)	0.22289481572704153
  (1, 12086)	0.21367200037936315
  (1, 24681)	0.20803979449563142
  (1, 6482)	0.19778754162123005
  (1, 24998)	0.17446285866402936
  (1, 31545)	0.23510451101650157
  (1, 20393)	0.2672835677747418
  (1, 20469)	0.12850395121404842
  (1, 3734)	0.24544329635226422
  (1, 3990)	0.30173824393281884
  (1, 30971)	0.3701943686846686
  (1, 8681)	0.2723164458119942
  (1, 2922)	0.2312460517975244
  (1, 6236)	0.27835084003259675
  :	:
  (142568, 2151)	0.3133381152352469
  (142568, 10484)	0.32206390997016526
  (142568, 30372)	0.2353047940668551
  (142568, 17629)	0.26268120776376275
  (142568, 30466)	0.2281840039080983
  (14

In [21]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=200)
sparse_mat = svd.fit_transform(sparse_mat)

In [22]:
from matplotlib import pyplot as plt
from sklearn import metrics
from collections import Counter
sse = []
davies_score =[]
'''
list_k = list(range(2, 20))
for k in list_k:
    kmeans = KMeans(n_clusters=k, random_state = 0).fit(sparse_mat)
    l = kmeans.labels_
    result = metrics.davies_bouldin_score(sparse_mat, l)
    davies_score.append(result)
    sse.append(kmeans.inertia_)
'''
kmeans = KMeans(20,init='k-means++', max_iter=100, n_init=1,random_state=0).fit(sparse_mat)
labels = kmeans.labels_

In [23]:
result = metrics.davies_bouldin_score(sparse_mat,labels)
print(result)

2.989149438856667


In [24]:
import pickle
filename = 'kmeans.pickle'
pickle.dump(kmeans, open(filename, 'wb'))

In [25]:
# load the model from disk
kmeans = pickle.load(open(filename, 'rb'))
print(result)

2.989149438856667


In [27]:
print("Top terms per cluster:")
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
nouns = find_nouns(terms)

print(order_centroids)

for i in range(20):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :50]:
        if(terms[])
        print(' %s' % terms[ind]),
    print

print("\n")



Top terms per cluster:
[[17  1 14 ...  3 51 13]
 [42 41 92 ... 44 78 45]
 [ 1  2 19 ...  6  3  4]
 ...
 [ 2 13 15 ... 20 10 19]
 [ 3  1  0 ...  2  4  5]
 [ 1 21 13 ... 19  4 15]]
Cluster 0:
 abc
 aaa
 abbey
 abbot
 abortion
 abatement
 abandon
 abolition
 abortifacient
 aa
Cluster 1:
 abomination
 abolitionist
 accentuate
 aaa
 abysmal
 acceleration
 abound
 abyss
 academic
 accompany
Cluster 2:
 aaa
 aahs
 abdication
 aa
 abbe
 abstruse
 ab
 abode
 abhorrent
 abdomen
Cluster 3:
 aahs
 aaa
 aarp
 aaron
 abate
 abandonment
 abdication
 aa
 abdomen
 aba
Cluster 4:
 aboriginal
 abnormality
 aaa
 absurdity
 ability
 acapulco
 abortifacient
 acacia
 abortive
 abomination
Cluster 5:
 aa
 aarp
 abandonment
 aaron
 abate
 actin
 adage
 acrimonious
 abet
 acclaim
Cluster 6:
 accustomed
 accumulation
 aaa
 accreditation
 accent
 acquirer
 accountability
 accra
 acapulco
 accessory
Cluster 7:
 abound
 aaa
 abscess
 abortionist
 abaya
 abdication
 abhors
 abortifacient
 absolute
 absence
Cluster 8

In [29]:
print(sparse_mat)
print(len(word_features))

[[ 1.07344369e-02  1.57601380e-01 -1.56058372e-02 ...  3.21213624e-02
  -3.73841500e-02 -2.13398307e-02]
 [ 2.15175786e-03  2.29269456e-02  1.12625651e-02 ...  2.17262805e-03
   1.15333696e-02  3.26789865e-03]
 [ 8.61380978e-04  1.08273697e-02  1.63818347e-03 ... -5.60145157e-05
   1.68493318e-03 -5.63649570e-03]
 ...
 [ 3.12336727e-03  4.34743464e-02  2.16035155e-03 ...  6.38126359e-02
   4.27860801e-03  3.11875877e-02]
 [ 4.59887526e-02  4.88799498e-02  1.43258030e-02 ... -3.59895575e-03
  -4.34719733e-02 -3.14120768e-02]
 [ 4.60258284e-03  6.18189004e-02  2.19000390e-02 ...  7.86091462e-03
  -4.21572858e-02 -4.21243906e-03]]
32029


In [30]:
# Plot sse against k
'''
plt.figure(figsize=(6, 6))
plt.plot(list_k, sse, '-o')
plt.xlabel(r'Number of clusters *k*')
plt.ylabel('Sum of squared distance');
plt.savefig('results/20ClustersSSE.png')
'''

"\nplt.figure(figsize=(6, 6))\nplt.plot(list_k, sse, '-o')\nplt.xlabel(r'Number of clusters *k*')\nplt.ylabel('Sum of squared distance');\nplt.savefig('results/20ClustersSSE.png')\n"

In [31]:
'''
plt.figure(figsize=(6, 6))
plt.plot(list_k, davies_score, '-o')
plt.xlabel(r'Number of clusters *k*')
plt.ylabel('Davies Scores');
plt.savefig('results/DaviesScore_20Clusters.png')
'''

"\nplt.figure(figsize=(6, 6))\nplt.plot(list_k, davies_score, '-o')\nplt.xlabel(r'Number of clusters *k*')\nplt.ylabel('Davies Scores');\nplt.savefig('results/DaviesScore_20Clusters.png')\n"

In [32]:
#print("Optimal Cluster size: "+ str(list_k[np.argmin(davies_score)]))

In [33]:
#find the topic name
clusters = []
for i in range(0,20,1):
    cluster_tuple = np.where(labels == i)
    clusters.append(cluster_tuple[0])

In [34]:
print(clusters[4])

[    17     31    198 ... 142440 142477 142562]


In [35]:
sum_words = bag_of_words.sum(axis=0)

In [36]:
print(sum_words)

[[4 3 2 ... 2 8 1]]


In [37]:
#vec = CountVectorizer().fit(df['content'])

In [38]:
#word_freq = {}
#for word, idx in vec.vocabulary_.items():
#    word_freq[word] = sum_words[0, idx]

In [39]:
#word_freq = {k: v for k, v in sorted(word_freq.items(), key=lambda item: item[1],reverse=True)}
#word_list = list(word_freq.keys())

In [40]:
#print(len(word_freq))

In [41]:
import spacy


In [42]:
clusters[0]


array([     5,     20,     82, ..., 142508, 142543, 142544], dtype=int64)

In [None]:
# Try to find the proper nouns 
nlp = spacy.load("en_core_web_sm")
from collections import Counter
import nltk
tags = []
for i in range(len(clusters)): 
    print(i)
    all_nouns = []
    # for each document in the cluster
    for doc_id in range(len(clusters[i])):
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(str(df['content'][doc_id]))
        #tokens = tokenization(df['content'][doc_id])
        # select first 50 and last 50 words from preprocessed data
        # This is the assumption that first 200 characters of news articles will cover the 
        # abstract idea of the complete passage
        
        doc = nlp(' '.join(tokens))
        nouns = []
        for index in range(len(doc)):
            if doc[index].pos_ == 'NOUN' or doc[index].pos_ == 'PROPN':
                nouns.append(doc[index].text)
        nouns = Counter(nouns) 
        nouns = {k: v for k, v in sorted(nouns.items(), key=lambda item: item[1],reverse=True)}
        i = 0
        all_nouns += nouns
            
    all_nouns = Counter(all_nouns) 
    all_nouns = {k: v for k, v in sorted(all_nouns.items(), key=lambda item: item[1])}
    i = 0
    top_20 = [] 
    for key in all_nouns.keys():
        if(i<20):
            top_20.append(key)
            i+=1
        else:
            break
    print(top_20)
    tags.append(top_20)

0
['trump', 'washington', 'president', 'year', 'state', 'new', 'week', 'day', 'united', 'tuesday', 'time', 'monday', 'thursday', 'wednesday', 'friday', 'city', 'email', 'people', 'morning', 'york']
1
['trump', 'president', 'washington', 'year', 'state', 'new', 'united', 'day', 'time', 'week', 'tuesday', 'morning', 'people', 'wednesday', 'friday', 'city', 'monday', 'house', 'sunday', 'york']
2


In [None]:
pd.to_csv('tags.csv')

In [None]:
df