In [1]:
import csv
import pandas as pd
import numpy as np
import nltk
from nltk.stem import *
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans


## Task 1:

In [2]:
# Load the news dataset. The path is relative, so the CSV has to be in the
# notebook's working directory.
news_categorizer = pd.read_csv('NewsCategorizer.csv')

In [3]:
# Preview the first rows of the raw frame to check the available columns.
print(news_categorizer.head())

   category                                           headline  \
0  WELLNESS              143 Miles in 35 Days: Lessons Learned   
1  WELLNESS       Talking to Yourself: Crazy or Crazy Helpful?   
2  WELLNESS  Crenezumab: Trial Will Gauge Whether Alzheimer...   
3  WELLNESS                     Oh, What a Difference She Made   
4  WELLNESS                                   Green Superfoods   

                                               links  \
0  https://www.huffingtonpost.com/entry/running-l...   
1  https://www.huffingtonpost.com/entry/talking-t...   
2  https://www.huffingtonpost.com/entry/crenezuma...   
3  https://www.huffingtonpost.com/entry/meaningfu...   
4  https://www.huffingtonpost.com/entry/green-sup...   

                                   short_description  \
0  Resting is part of training. I've confirmed wh...   
1  Think of talking to yourself as a tool to coac...   
2  The clock is ticking for the United States to ...   
3  If you want to be busy, keep trying to 

In [4]:
# Keep only the label and the text column. The .copy() gives `data` its own
# storage, so the column added to it later does not touch `news_categorizer`.
data = news_categorizer[['category','short_description']].copy()

## Task 2:

In [5]:
# Tokenise every description: drop each character that is neither a letter
# nor whitespace, lower-case what is kept, then split on whitespace.
# Punctuation is removed rather than replaced by a space, so "I've" ends up
# as the single token "ive".
short_description = data['short_description'].to_list()
short_description_preprocessed = [
    ''.join(
        # Lower-casing is done per character, on the kept characters only.
        char.lower()
        for char in description
        if char.isalpha() or char.isspace()
    ).split()
    for description in short_description
]


In [6]:
# WordNetLemmatizer is brought in by the `from nltk.stem import *` star import.
# NOTE(review): presumably the NLTK 'wordnet' corpus must already be installed
# for lemmatize() to work; no nltk.download(...) call is visible here. Confirm.
lemmatizer = WordNetLemmatizer()

In [7]:
# Lemmatise every token of every description, keeping the list-of-token-lists
# shape. lemmatize() is called with the token only, so the lemmatizer's
# default part of speech applies.
lammatized_descriptions = [
    [lemmatizer.lemmatize(token) for token in tokens]
    for tokens in short_description_preprocessed
]

In [8]:
# English stop words from NLTK, de-duplicated by going through a set.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be installed;
# no download call is visible here. Confirm.
stop_words = list(set(stopwords.words('english')))

In [9]:
# Normalise the stop words the same way as the description tokens (keep only
# letters/whitespace, lower-case, lemmatise) so that e.g. "don't" becomes
# "dont" and can match the cleaned tokens.
preprocessed_stop_words = []

for stop_word in stop_words:
    cleaned = ''.join(
        char.lower()
        for char in stop_word
        if char.isalpha() or char.isspace()
    )
    # A stop word with no letters or whitespace leaves nothing to match.
    # Skip it instead of re-appending the previous word's lemma.
    if not cleaned:
        continue
    # Lemmatise once per word, after the whole word has been cleaned.
    preprocessed_stop_words.append(lemmatizer.lemmatize(cleaned))

In [10]:
# Remove stop words from every lemmatised description.
# Membership is tested against a set: the list form is scanned linearly for
# every single token, which is needlessly slow over the whole corpus. The
# filtered output is identical.
stop_word_lookup = set(preprocessed_stop_words)

filtered_descriptions = [
    [token for token in tokens if token not in stop_word_lookup]
    for tokens in lammatized_descriptions
]

In [11]:
# Re-join each filtered token list into one space-separated string, giving
# one plain-text document per description.
proprocessed_descriptions = [' '.join(tokens) for tokens in filtered_descriptions]

In [12]:
# Sanity check: number of documents and the first ten cleaned strings.
print(len(proprocessed_descriptions),proprocessed_descriptions[:10])

50000 ['resting part training ive confirmed sort already knew im built running streak im built hard workout three five day week lot cross training physical therapy foam rolling ive also confirmed im stubborn', 'think talking tool coach challenge narrate experience case treat respect may find enjoy company', 'clock ticking united state find cure team working study dr francisco lopera', 'want busy keep trying perfect want happy focus making difference', 'first bad news soda bread corned beef beer highly nutritious meal make luck irish would', 'carey moss youbeautycom love romcoms love song breakup song lesson teach u might', 'nation general scored scale little bit score', 'also worth remembering water seaweed come contaminated say toxic metal arsenic', 'look culture eating behavior certainly look like addiction situation opportunity eat weve started eating seem know stop even want even know hurting', 'françoismarie arouet th century french author iconoclast better known voltaire quipped 

## Task 3:

In [13]:
# TF-IDF vectorizer with all default settings; the text was already cleaned
# and stop-word filtered in the cells above.
vectorizer = TfidfVectorizer()

In [14]:
# Learn the vocabulary from the cleaned descriptions and transform them into
# the feature matrix used for clustering (one row per description).
X = vectorizer.fit_transform(proprocessed_descriptions)

## Task 4:

In [15]:
# Cluster the TF-IDF vectors into 10 groups with a fixed seed.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so leaving it implicit makes the cluster assignment,
# and the conclusion drawn from it, depend on the installed version.
kmeans = KMeans(n_clusters=10, random_state=0, n_init=10).fit(X)

In [16]:
# Cluster index assigned to each row of X by the fitted model.
kmeans_labels = kmeans.labels_

In [17]:
# Attach the cluster index to each description. The array is assigned
# positionally, which lines up because X was built from
# data['short_description'] in its original row order.
data['clustered_labels'] = kmeans_labels

In [18]:
# Preview the frame with the new cluster column next to the true category.
print(data.head())

   category                                  short_description  \
0  WELLNESS  Resting is part of training. I've confirmed wh...   
1  WELLNESS  Think of talking to yourself as a tool to coac...   
2  WELLNESS  The clock is ticking for the United States to ...   
3  WELLNESS  If you want to be busy, keep trying to be perf...   
4  WELLNESS  First, the bad news: Soda bread, corned beef a...   

   clustered_labels  
0                 7  
1                 0  
2                 0  
3                 0  
4                 2  


In [43]:
# Cross-tabulate true category against assigned cluster: count() gives the
# number of non-null values of the remaining column for every
# (category, cluster) pair that occurs.
results = data.groupby(['category','clustered_labels']).count()

In [47]:
# Show the last 50 (category, cluster) rows of the count table.
print(results.tail(50))

                                 short_description
category       clustered_labels                   
POLITICS       5                                85
               6                               159
               7                               117
               9                               371
SPORTS         0                               467
               1                               146
               2                               323
               3                              3575
               4                                93
               5                                43
               6                               186
               7                                74
               9                                93
STYLE & BEAUTY 0                               342
               1                               344
               2                               302
               3                              3165
               4               

The clusters do not match the categories very well. Grouping the cluster labels by category shows that cluster 3 contains the most observations in every category. In the ideal case, each category would instead be dominated by a different cluster, so that every cluster corresponds to exactly one category.