In [1]:
%%time

import pandas as pd
import numpy as np
import nltk
from collections import defaultdict

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

file_name = 'News_Category_Dataset.csv'

# Load the raw dataset, parsing the `date` column as datetimes on read.
df = pd.read_csv(file_name, parse_dates=['date'])
# Cleaning: keep only the rows that actually have a headline.
df = df.loc[~df['headline'].isna()]
# Model input is the headline text; the target is the news category.
features, labels = df['headline'], df['category']

Wall time: 5.47 s


#### Feature Engineering:

In [2]:
%%time

# Tokenize every headline with NLTK; yields one list of tokens per headline.
split_features = list(map(nltk.word_tokenize, features))

Wall time: 44.5 s


In [3]:
%%time

# Tokens must occur strictly more than `min_f` times to be kept.
min_f = 100

# Tally how often each token occurs across all tokenized headlines.
word_freq = {}
for tokens in split_features:
    for token in tokens:
        word_freq[token] = word_freq.get(token, 0) + 1

# Re-order the tally from most to least frequent token.
word_freq = dict(sorted(word_freq.items(), key=lambda pair: pair[1], reverse=True))
print(f'{len(word_freq)} words in corpus')

# Vocabulary = tokens above the frequency threshold, in descending frequency order.
words = [token for token, count in word_freq.items() if count > min_f]
percentage = len(words)/len(word_freq)*100
print(f'Reduced to {len(words)} keeping those with freq > {min_f} ({percentage:.2f} %)')

74319 words in corpus
Reduced to 2676 keeping those with freq > 100 (3.60 %)
Wall time: 1.49 s


In [4]:
%%time

# Vocabulary lookup: kept words get ids 1..len(words); any other key falls
# back to the defaultdict's default of 0.
word_to_idx = defaultdict(lambda: 0)
word_to_idx.update(zip(words, range(1, len(words) + 1)))
idx = len(words) + 1  # next unused id, the value the running counter ends on

Wall time: 0 ns


In [5]:
# Binary bag-of-words encoding: one row per headline and one column per
# vocabulary word, with column 0 shared by every out-of-vocabulary token.
num_samples = features.shape[0]
# Size the matrix from the largest assigned index instead of len(word_to_idx):
# the width then stays correct even if unknown tokens (all mapped to 0) have
# been added to the mapping by an earlier run, so the cell is safe to re-run.
num_cols = max(word_to_idx.values(), default=0) + 1

features_vec = np.zeros((num_samples, num_cols), dtype=np.int8)
for sample_idx, tokens in enumerate(split_features):
    for word in tokens:
        # .get() reads without inserting. Indexing the defaultdict with []
        # would store every unseen token as a new key and silently grow the
        # mapping (and therefore num_cols on the next execution).
        word_idx = word_to_idx.get(word, 0)
        features_vec[sample_idx, word_idx] = 1

#### Train / Test Split

In [6]:
%%time

# Hold out 10% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features_vec,
    labels,
    test_size=0.1,
    random_state=1,
)

Wall time: 412 ms


#### Training

In [7]:
%%time

# Shallow decision tree (depth capped at 5). random_state is pinned because
# scikit-learn randomly permutes the candidate features at every split, so
# without a seed the fitted tree (and every metric below) can change from
# one run to the next.
tree = DecisionTreeClassifier(max_depth=5, random_state=1)
tree.fit(X_train, y_train)

Wall time: 2min 23s


DecisionTreeClassifier(max_depth=5)

#### Evaluation

In [8]:
# Predicted category for every held-out sample.
preds_test = tree.predict(X=X_test)

In [9]:
def total_accuracy(y_pred, y_true):
    """Return the overall accuracy as a percentage in the range 0-100.

    Both inputs are converted to NumPy arrays and compared position by
    position, so lists, arrays and pandas Series (whatever their index)
    are all accepted.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels.
    y_true : array-like
        Ground-truth labels, same length and order as ``y_pred``.

    Returns
    -------
    float
        ``100 * (number of matching positions) / (number of samples)``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    ZeroDivisionError
        If the inputs are empty.
    """
    pred_arr = np.asarray(y_pred)
    true_arr = np.asarray(y_true)
    if pred_arr.shape != true_arr.shape:
        raise ValueError(
            f'y_pred and y_true must have the same shape, '
            f'got {pred_arr.shape} and {true_arr.shape}'
        )

    # Count with Python ints so an empty input still raises ZeroDivisionError
    # instead of quietly producing NaN.
    n_correct = int((pred_arr == true_arr).sum())
    return n_correct / len(true_arr) * 100

def accuracy_by_group(y_pred, y_true):
    """Return, for each true class, the fraction of its samples predicted correctly.

    This is per-class recall: for every label that occurs in ``y_true``, the
    number of positions where the prediction equals that label is divided by
    the number of samples that truly carry the label.

    Both inputs are converted to NumPy arrays and compared position by
    position, so lists, arrays and pandas Series (whatever their index)
    are all accepted.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels.
    y_true : array-like
        Ground-truth labels, same length and order as ``y_pred``.

    Returns
    -------
    dict
        ``{label: fraction_correct}`` sorted from best to worst score.
        Labels with equal scores keep the order of their first appearance
        in ``y_true``. Empty input yields an empty dict.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    pred_arr = np.asarray(y_pred)
    true_arr = np.asarray(y_true)
    if pred_arr.shape != true_arr.shape:
        raise ValueError(
            f'y_pred and y_true must have the same shape, '
            f'got {pred_arr.shape} and {true_arr.shape}'
        )

    hits = pred_arr == true_arr

    # Single pass: per true label, count its samples and its correct predictions.
    totals = {}
    correct = {}
    for label, hit in zip(true_arr.tolist(), hits.tolist()):
        totals[label] = totals.get(label, 0) + 1
        correct[label] = correct.get(label, 0) + hit

    results = {label: correct[label] / totals[label] for label in totals}

    # sorted() is stable, so ties stay in first-appearance order.
    return dict(sorted(results.items(), key=lambda item: item[1], reverse=True))

In [10]:
# Overall test-set accuracy, as a percentage.
total_accuracy(y_pred=preds_test, y_true=y_test)

22.36992780682101

In [11]:
# Fraction of each true category's test samples that were predicted correctly.
accuracy_by_group(y_pred=preds_test, y_true=y_test)

{'POLITICS': 0.9174422095466827,
 'STYLE & BEAUTY': 0.4172661870503597,
 'ENTERTAINMENT': 0.4108574953732264,
 'WEDDINGS': 0.37637362637362637,
 'DIVORCE': 0.27627627627627627,
 'HOME & LIVING': 0.08823529411764706,
 'COMEDY': 0.07283464566929133,
 'FOOD & DRINK': 0.054249547920433995,
 'QUEER VOICES': 0.05,
 'WEIRD NEWS': 0.0,
 'EDUCATION': 0.0,
 'GOOD NEWS': 0.0,
 'WORLDPOST': 0.0,
 'TASTE': 0.0,
 'BUSINESS': 0.0,
 'IMPACT': 0.0,
 'MEDIA': 0.0,
 'MONEY': 0.0,
 'LATINO VOICES': 0.0,
 'WOMEN': 0.0,
 'BLACK VOICES': 0.0,
 'SPORTS': 0.0,
 'ENVIRONMENT': 0.0,
 'STYLE': 0.0,
 'WELLNESS': 0.0,
 'THE WORLDPOST': 0.0,
 'HEALTHY LIVING': 0.0,
 'CRIME': 0.0,
 'ARTS & CULTURE': 0.0,
 'TECH': 0.0,
 'TRAVEL': 0.0,
 'FIFTY': 0.0,
 'ARTS': 0.0,
 'WORLD NEWS': 0.0,
 'PARENTING': 0.0,
 'SCIENCE': 0.0,
 'GREEN': 0.0,
 'RELIGION': 0.0,
 'PARENTS': 0.0,
 'CULTURE & ARTS': 0.0,
 'COLLEGE': 0.0}

In [12]:
# Number of training samples per category, to set beside the per-category scores above.
y_train.value_counts()

POLITICS          29407
WELLNESS          16081
ENTERTAINMENT     14437
TRAVEL             8929
STYLE & BEAUTY     8676
PARENTING          7812
HEALTHY LIVING     6027
FOOD & DRINK       5673
QUEER VOICES       5653
BUSINESS           5329
COMEDY             4667
SPORTS             4400
BLACK VOICES       4061
HOME & LIVING      3787
PARENTS            3578
WEDDINGS           3287
THE WORLDPOST      3252
WOMEN              3155
IMPACT             3109
DIVORCE            3093
CRIME              3083
MEDIA              2501
WEIRD NEWS         2400
GREEN              2327
WORLDPOST          2317
RELIGION           2307
STYLE              2048
SCIENCE            1978
WORLD NEWS         1957
TASTE              1883
TECH               1879
MONEY              1529
ARTS               1357
GOOD NEWS          1260
FIFTY              1255
ENVIRONMENT        1182
ARTS & CULTURE     1178
COLLEGE            1037
LATINO VOICES      1014
CULTURE & ARTS      950
EDUCATION           907
Name: category, 