In [1]:
# Data Source from Kaggle 
# https://www.kaggle.com/socialmedianews/how-news-appears-on-social-media

In [2]:
import numpy as np
import pandas as pd

In [3]:
import os
# Keep only the regular files in the working directory, then list them.
files = list(filter(os.path.isfile, os.listdir('.')))
for f in files:
    print(f)

kaggle_news.csv
Untitled.ipynb


In [4]:
# Load the Kaggle export found in the working directory.
df = pd.read_csv(filepath_or_buffer="kaggle_news.csv")

In [5]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(1200, 6)

In [6]:
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Date,Time,Score,Number of Comments
0,0,British man charged after US gamer is shot by ...,2017-04-12,11:24:14,17472,3339
1,1,Ukraine ledger reveals $1.2 million in off-the...,2017-04-12,13:26:48,4798,415
2,2,United passenger threatened with handcuffs to ...,2017-04-12,02:29:44,31802,3243
3,3,Oregon lawmakers vote to shield marijuana user...,2017-04-12,09:10:29,1988,102
4,4,DeVos Undoes Obama Student Loan Protections,2017-04-12,00:42:00,26072,4459


In [7]:
# Count how often each headline occurs, then move the headlines out of the
# index so they become an ordinary column.
unique_title = (
    df['Title']
    .value_counts()
    .to_frame()
    .reset_index()
)

In [8]:
# Rename both columns by position: headline text first, occurrence count second.
unique_title.columns =['title','value']
unique_title.head()

Unnamed: 0,title,value
0,US intercepts 2 Russian bombers off Alaskas coast,5
1,Detroit doctor charged with female genital mut...,4
2,United CEO says no one will be fired for dragg...,4
3,Smoke pot in Oregon? Your name now protected f...,4
4,Indian firms Zika virus vaccine 100% efficient...,4


In [9]:
# Work with the headline column on its own; bracket access avoids any clash
# between the column name and DataFrame attributes.
title = unique_title['title']
title.head()

0    US intercepts 2 Russian bombers off Alaskas coast
1    Detroit doctor charged with female genital mut...
2    United CEO says no one will be fired for dragg...
3    Smoke pot in Oregon? Your name now protected f...
4    Indian firms Zika virus vaccine 100% efficient...
Name: title, dtype: object

In [10]:
# Import Lib for Text Cleaning

import re
from nltk.stem import PorterStemmer, SnowballStemmer
stemmer = SnowballStemmer('english')

In [11]:
# Clean Text function

def clean_text(text, stemming=False):
    """Normalise a headline for vectorisation.

    Square brackets are removed, runs of whitespace are collapsed to single
    spaces, and the result is lower-cased.

    Parameters
    ----------
    text : str
        Raw headline text.
    stemming : bool, default False
        When True, every whitespace-separated token is passed through the
        notebook-level ``stemmer`` before the tokens are re-joined.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # Raw-string character class: the previous non-raw '\[' and '\]' patterns
    # are invalid string escapes and trigger warnings on current Python.
    tokens = re.sub(r'[\[\]]', '', text).split()
    if stemming:
        tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(tokens).lower()

In [12]:
# Fill missing headlines on a fresh Series instead of in place: `title` is
# taken from `unique_title`, so an in-place fill may or may not write through
# to that frame depending on the pandas version.
title_clean = title.fillna('').map(lambda x: clean_text(x, stemming=True))

In [13]:
# Spot-check the first five cleaned headlines.
title_clean.head(5)

0       us intercept 2 russian bomber off alaska coast
1    detroit doctor charg with femal genit mutil or...
2      unit ceo say no one will be fire for drag incid
3    smoke pot in oregon? your name now protect fro...
4    indian firm zika virus vaccin 100% effici in a...
Name: title, dtype: object

In [14]:
# Import TFIDF

from sklearn.feature_extraction.text import TfidfVectorizer
# Unigram TF-IDF over lower-cased text with the built-in English stop-word list.
tfidf_model = TfidfVectorizer(
    ngram_range=(1, 1),
    stop_words='english',
    lowercase=True,
)

In [15]:
# Fit the vectorizer on the cleaned headlines and encode them in one step;
# the result is a sparse matrix with one row per headline.
title_tfidf = tfidf_model.fit_transform(title_clean)
title_tfidf

<741x2659 sparse matrix of type '<class 'numpy.float64'>'
	with 6106 stored elements in Compressed Sparse Row format>

In [16]:
# Import KMeans

from sklearn.cluster import KMeans
# Fix the seed so cluster labels are reproducible across kernel restarts;
# without it k-means starts from a different random initialisation each run.
cluster = KMeans(n_clusters=7, random_state=42)

In [17]:
# Fit k-means on the TF-IDF matrix and attach each headline's label.
# Labels are matched to rows by position.
cluster_predict = cluster.fit_predict(title_tfidf)
unique_title = unique_title.assign(cluster=cluster_predict)

In [19]:
# Preview the frame now that it carries a cluster column.
unique_title.head()

Unnamed: 0,title,value,cluster
0,US intercepts 2 Russian bombers off Alaskas coast,5,2
1,Detroit doctor charged with female genital mut...,4,6
2,United CEO says no one will be fired for dragg...,4,3
3,Smoke pot in Oregon? Your name now protected f...,4,2
4,Indian firms Zika virus vaccine 100% efficient...,4,2


In [24]:
# Number of headlines assigned to each cluster label.
unique_title.cluster.value_counts()

2    450
1    81 
3    64 
4    49 
0    42 
5    38 
6    17 
Name: cluster, dtype: int64

In [20]:
# Show all text in columns

# None turns off column-width truncation; the legacy -1 sentinel is
# deprecated and rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)

In [25]:
# Inspect every headline assigned to cluster 6.
unique_title.loc[unique_title['cluster'].eq(6)]

Unnamed: 0,title,value,cluster
1,Detroit doctor charged with female genital mutilation ordered to jail,4,6
176,Army charges retired major general with rape against a minor in the 1980s,2,6
198,"In rare fumble, Goldman stuns Wall Street with weak trading",2,6
208,Veterans with PTSD are suing the Army to have their discharges upgraded,2,6
243,Federal judge orders Fort Collins to free the nipple from city regulation,2,6
301,U.S. says Iran complies with nuke deal but orders review on lifting sanctions,2,6
338,Gag order keeps Oregon from telling public about cancer-causing pollutant,1,6
357,Kim Jong-un orders evacuation of Pyongyang,1,6
385,"US Army base on lockdown, sixty four soldiers busted in cocaine drug ring",1,6
400,Naked woman slugs Highland Park officer who tried to remove her from Oak Lawn street,1,6
