In [1]:
# loading dataset

import pandas as pd

data = pd.read_csv("abcnews-date-text.csv",error_bad_lines=False,usecols =["headline_text"])
data.head()

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers


In [2]:
# getting the info about the dataset

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1186018 entries, 0 to 1186017
Data columns (total 1 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   headline_text  1186018 non-null  object
dtypes: object(1)
memory usage: 9.0+ MB


In [3]:
# deleting duplicate headlines in dataset

data = data.drop_duplicates('headline_text')

In [4]:
# removing meaningless words and vectorizing the dataset

from sklearn.feature_extraction import text

punc = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}',"%"]
stop_words = text.ENGLISH_STOP_WORDS.union(punc)
desc = data['headline_text'].values
vectorizer = text.TfidfVectorizer(stop_words = stop_words)
X = vectorizer.fit_transform(desc)

In [5]:
# getting feature names i.e. keywords from vectored mappings

word_features = vectorizer.get_feature_names()
print(len(word_features))
print(word_features[10000:10100])

101629
['bands', 'bandstravaganza', 'bandt', 'bandts', 'banducci', 'banduk', 'bandung', 'bandwagon', 'bandwidth', 'bandy', 'bandyup', 'bane', 'banerjee', 'banerji', 'banesto', 'banfield', 'banfields', 'bang', 'banga', 'bangaldesh', 'bangaldeshi', 'bangalore', 'bangalow', 'bangarang', 'bangarra', 'bangarras', 'bangaru', 'bangas', 'bangay', 'banged', 'banger', 'bangers', 'banging', 'bangka', 'bangkok', 'bangkoks', 'bangla', 'bangladesh', 'bangladeshi', 'bangladeshis', 'bangladeshs', 'bangladsh', 'bangles', 'bango', 'bangor', 'bangs', 'bangtail', 'banh', 'banham', 'bani', 'banish', 'banished', 'banishes', 'banishing', 'banishment', 'banivanua', 'baniyala', 'baniyas', 'banjawarn', 'banjima', 'banjo', 'banjos', 'banjup', 'bank', 'banka', 'bankcard', 'bankcards', 'bankcruptcy', 'banked', 'banker', 'bankers', 'bankgok', 'bankholders', 'bankia', 'banking', 'banknote', 'banknotes', 'bankrobbers', 'bankroll', 'bankrolled', 'bankrolling', 'bankrolls', 'bankrupcy', 'bankrupt', 'bankruptcies', 'ban

In [6]:
# creating tokenizer and stemmer

from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')
tokenizer = RegexpTokenizer(r'[a-zA-Z\']+')

In [7]:
# defining the tokenization and stemming function

def tokenize (text):
    return [stemmer.stem(word) for word in tokenizer.tokenize(text.lower())]

In [8]:
# concentrating the dataset by stemming the tokenized words in dataset

vectorizer2 = text.TfidfVectorizer(stop_words = stop_words, tokenizer = tokenize, max_features = 1000)
X2 = vectorizer2.fit_transform(desc)
word_features2 = vectorizer2.get_feature_names()
print(word_features2[901:1000]) 



['timor', 'tip', 'titl', 'told', 'toll', 'toni', 'tough', 'tour', 'tourism', 'tourist', 'town', 'townsvill', 'track', 'trade', 'train', 'transport', 'travel', 'treatment', 'tree', 'tri', 'trial', 'tribut', 'troop', 'truck', 'trump', 'tsunami', 'turn', 'turnbul', 'tv', 'uk', 'uni', 'union', 'unit', 'univers', 'unveil', 'upgrad', 'urg', 'use', 'valley', 'veteran', 'vic', 'victim', 'victori', 'victoria', 'victorian', 'video', 'violenc', 'visit', 'volunt', 'vote', 'vow', 'vs', 'wa', 'wait', 'walk', 'wall', 'wallabi', 'want', 'war', 'warn', 'warrior', 'wast', 'watch', 'water', 'way', 'weather', 'week', 'weekend', 'welcom', 'welfar', 'west', 'western', 'whale', 'whi', 'white', 'wife', 'wild', 'william', 'win', 'wind', 'wine', 'winner', 'wit', 'withdraw', 'woe', 'woman', 'women', 'wont', 'work', 'worker', 'world', 'worri', 'year', 'yo', 'young', 'youth', 'zealand', 'zimbabw', 'zone']


In [9]:
# fitting data into kmeans

from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters = 5, n_init = 2, n_jobs = 4) 
kmeans.fit(X2)



KMeans(n_clusters=5, n_init=2, n_jobs=4)

In [10]:
# We look at 5 the clusters generated by k-means.
common_words = kmeans.cluster_centers_.argsort()[:,-1:-26:-1]
for num, centroid in enumerate(common_words):
    print(str(num) + ' : ' + ', '.join(word_features2[word] for word in centroid))

0 : polic, man, new, say, charg, australia, court, kill, govt, report, death, face, crash, nsw, attack, fund, murder, year, sydney, urg, water, wa, interview, jail, hit
1 : council, plan, consid, rate, fund, urg, seek, new, merger, water, land, develop, reject, say, mayor, vote, elect, chang, rise, citi, meet, park, push, want, local
2 : australian, warn, issu, open, polic, prompt, threat, south, share, year, dollar, spark, danger, new, market, risk, resid, farmer, health, flood, rise, weather, china, say, driver
3 : plan, govt, new, water, say, group, develop, unveil, hous, chang, park, govern, labor, expans, public, resid, health, urg, centr, green, murray, opposit, reveal, shire, reject
4 : win, award, cup, titl, open, gold, stage, world, final, tour, elect, australia, lead, seri, aussi, claim, second, australian, grand, england, big, race, record, m, battl
