In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np

In [3]:
#loading the dataset
articles = pd.read_csv('articles1.csv')
print(articles)
print(articles.describe())

       Unnamed: 0     id                                              title  \
0               0  17283  House Republicans Fret About Winning Their Hea...   
1               1  17284  Rift Between Officers and Residents as Killing...   
2               2  17285  Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...   
3               3  17286  Among Deaths in 2016, a Heavy Toll in Pop Musi...   
4               4  17287  Kim Jong-un Says North Korea Is Preparing to T...   
5               5  17288  Sick With a Cold, Queen Elizabeth Misses New Y...   
6               6  17289  Taiwan’s President Accuses China of Renewed In...   
7               7  17290  After ‘The Biggest Loser,’ Their Bodies Fought...   
8               8  17291  First, a Mixtape. Then a Romance. - The New Yo...   
9               9  17292  Calling on Angels While Enduring the Trials of...   
10             10  17293  Weak Federal Powers Could Limit Trump’s Climat...   
11             11  17294  Can Carbon Capture Technol

In [4]:
#we only need the id and content of the articles
dataset = articles[['id','content']].copy()
print(dataset)

          id                                            content
0      17283  WASHINGTON  —   Congressional Republicans have...
1      17284  After the bullet shells get counted, the blood...
2      17285  When Walt Disney’s “Bambi” opened in 1942, cri...
3      17286  Death may be the great equalizer, but it isn’t...
4      17287  SEOUL, South Korea  —   North Korea’s leader, ...
5      17288  LONDON  —   Queen Elizabeth II, who has been b...
6      17289  BEIJING  —   President Tsai   of Taiwan sharpl...
7      17290  Danny Cahill stood, slightly dazed, in a blizz...
8      17291  Just how   is Hillary Kerr, the    founder of ...
9      17292  Angels are everywhere in the Muñiz family’s ap...
10     17293  With Donald J. Trump about to take control of ...
11     17294  THOMPSONS, Tex.  —   Can one of the most promi...
12     17295  WEST PALM BEACH, Fla.  —   When   Donald J. Tr...
13     17296  This article is part of a series aimed at help...
14     17297  It’s the season for family

In [5]:
#initialising the tfidf vectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(list(dataset['content']))

In [10]:
print(X)

  (0, 164662)	0.017461889978376093
  (0, 34552)	0.08039074274790155
  (0, 127563)	0.32217931045218556
  (0, 105792)	0.022667294215878556
  (0, 54256)	0.025593243008011437
  (0, 33512)	0.020545081896424486
  (0, 67868)	0.24188552046847497
  (0, 27110)	0.17399812047582613
  (0, 87459)	0.061418382177486935
  (0, 108860)	0.0865282155566942
  (0, 6132)	0.2431094260559799
  (0, 166780)	0.02195233287240728
  (0, 74140)	0.03380414960464999
  (0, 156192)	0.11532791512874031
  (0, 30642)	0.05858490261761543
  (0, 90551)	0.0238373634843474
  (0, 40763)	0.028350389454790775
  (0, 52282)	0.10479693339134626
  (0, 22910)	0.16755594706693613
  (0, 146944)	0.06383218928869207
  (0, 29113)	0.02901817550428822
  (0, 14095)	0.05607018438204655
  (0, 143126)	0.02817225591675304
  (0, 19615)	0.033556597132458774
  (0, 45171)	0.02631454397194243
  :	:
  (49999, 44193)	0.06037839233014294
  (49999, 102118)	0.039003783052110046
  (49999, 90228)	0.03574598580737414
  (49999, 110656)	0.05461313122917939
  (4999

In [7]:
#initializing KMeans
model = KMeans(n_clusters=5, max_iter=3000)
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=3000,
    n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [13]:
order_centroids = model.cluster_centers_.argsort()[:,::-1]
terms = vectorizer.get_feature_names()

for i in range(5):
    print('_______Cluster', i, '_______')
    for ind in order_centroids[i, :10]:
        print(terms[ind])


_______Cluster 0 _______
said
people
new
like
just
year
company
news
time
twitter
_______Cluster 1 _______
police
said
isis
officers
attack
people
city
killed
state
syria
_______Cluster 2 _______
trump
said
president
donald
republican
cruz
house
campaign
obama
people
_______Cluster 3 _______
mr
trump
said
president
mrs
clinton
ms
obama
campaign
republican
_______Cluster 4 _______
clinton
hillary
trump
sanders
campaign
said
democratic
state
presidential
voters
