In [23]:
!pip install xgboost --quiet
!pip install keras --quiet

In [150]:
import os
import glob
import pandas as pd
import numpy as np
import operator 
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler 
from sklearn import metrics
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import gensim
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from itertools import combinations
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [266]:
def check_missing_vals(dframe) -> list:
    return sorted([(c, dframe[c].dtype, dframe[c].isnull().sum()) 
                        for c in dframe.columns if dframe[c].isnull().sum() > 0], 
                      key=lambda x: x[1], reverse=True
                )

def metric_report(yTest, yPred):
    print(f'accuracy: {accuracy_score(yTest, yPred)}')
    print('')
    print("Classification report: \n\n", metrics.classification_report(yTest, yPred))
    print('')
    confusion_matrix = metrics.confusion_matrix(yTest, yPred)
    print("Confusion matrix: \n\n", confusion_matrix)

    
def clean_txt(txtCol: pd.Series) -> pd.Series:
    sw = set(stopwords.words('English'))

    text = txtCol.apply(gensim.utils.simple_preprocess, min_len=3)
    text = text.apply(lambda s: [w for w in s if w not in sw]) 
    text = text.apply(lambda s: [SnowballStemmer("english", ignore_stopwords=True).stem(w) for w in s])
    text = text.apply(lambda s: ['_'.join(x) for x in nltk.bigrams(s)] + s)
    
    return text

In [267]:
def get_data(path, exclude) -> pd.DataFrame:
    """
    get all txt data, put in list of dicts and return a dataframe
    """
    data = []

    for file in os.listdir(path):
        if file not in (exclude):
            full_filename = os.path.join(path, file)
            for news in os.listdir(full_filename):
                with open(os.path.join(full_filename, news), 'rb') as txt_file:
                    data.append({'NewsText': txt_file.read(), 'NewsType': file})

    return pd.DataFrame(data)

import pickle

def picklefy(txtCol: pd.DataFrame) -> pd.DataFrame:
    if [f for f in os.listdir('.') if f.endswith('p')]:
        return pickle.load(open('tfidf.p','rb'))
    return pickle.dump(clean_txt(txtCol['NewsText']), open('tfidf.p', 'wb')) 

In [268]:
# preemtively drop duplicates
df = get_data('bbc/', 'README.TXT').drop_duplicates(); df.head(2)

Unnamed: 0,NewsText,NewsType
0,b'Musicians to tackle US red tape\n\nMusicians...,entertainment
1,"b'U2\'s desire to be number one\n\nU2, who hav...",entertainment


In [269]:
df.describe()

Unnamed: 0,NewsText,NewsType
count,2127,2127
unique,2127,5
top,b'Musicians to tackle US red tape\n\nMusicians...,sport
freq,1,505


In [270]:
# vectorize
tfidf = TfidfVectorizer(sublinear_tf=True, 
                        min_df=5, norm='l2', 
                        encoding='latin-1', 
                        #ngram_range=(1, 2),
                        stop_words='english')

In [271]:
cleanTxtCol = picklefy(df).apply(lambda x: ' '.join(i for i in x))

features = tfidf.fit_transform(cleanTxtCol).toarray() 
labels = df.NewsType 

In [272]:
from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()

X = features
y = LE.fit_transform(labels)

# train/test/split
X_train, X_test, y_train, y_test = train_test_split(
                                            X,y, test_size=0.2
                                    )

In [273]:
# decision tree

from sklearn import tree

clf = tree.DecisionTreeClassifier()

In [274]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
metric_report(y_test, y_pred)

accuracy: 0.8544600938967136

Classification report: 

               precision    recall  f1-score   support

           0       0.82      0.87      0.84        98
           1       0.75      0.82      0.78        55
           2       0.89      0.83      0.86        94
           3       0.93      0.91      0.92       101
           4       0.85      0.82      0.84        78

    accuracy                           0.85       426
   macro avg       0.85      0.85      0.85       426
weighted avg       0.86      0.85      0.86       426


Confusion matrix: 

 [[85  5  2  0  6]
 [ 1 45  4  3  2]
 [12  0 78  2  2]
 [ 1  5  2 92  1]
 [ 5  5  2  2 64]]


In [275]:
import xgboost as xgb
from xgboost import XGBClassifier
xg_clf = XGBClassifier(random_state=42, use_label_encoder=False)

In [276]:
xgb_clf = xg_clf.fit(X_train, y_train)

y_pred = xgb_clf.predict(X_test)
metric_report(y_test, y_pred)

accuracy: 0.9389671361502347

Classification report: 

               precision    recall  f1-score   support

           0       0.89      0.93      0.91        98
           1       0.93      0.93      0.93        55
           2       0.96      0.95      0.95        94
           3       0.98      0.99      0.99       101
           4       0.93      0.88      0.91        78

    accuracy                           0.94       426
   macro avg       0.94      0.94      0.94       426
weighted avg       0.94      0.94      0.94       426


Confusion matrix: 

 [[ 91   3   3   0   1]
 [  1  51   0   1   2]
 [  3   0  89   0   2]
 [  1   0   0 100   0]
 [  6   1   1   1  69]]


In [None]:
from xgboost import cv

params = {"objective":"binary:logistic",
          'colsample_bytree': 0.3,
          'learning_rate': 0.1,
          'max_depth': 5, 
          'alpha': 10}

xgb_cv = cv(dtrain=data_dmatrix, 
            params=params, 
            nfold=5,
            num_boost_round=50,
            early_stopping_rounds=10, 
            metrics="auc", 
            as_pandas=True, 
            seed=123)