In [27]:
!pip install xgboost --quiet
!pip install keras --quiet

In [28]:
import os
import glob
import pandas as pd
import numpy as np
import operator 
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler 
from sklearn import metrics
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import gensim
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from itertools import combinations
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
def check_missing_vals(dframe) -> list:
    return sorted([(c, dframe[c].dtype, dframe[c].isnull().sum()) 
                        for c in dframe.columns if dframe[c].isnull().sum() > 0], 
                      key=lambda x: x[1], reverse=True
                )

def metric_report(yTest, yPred):
    print(f'accuracy: {accuracy_score(yTest, yPred)}')
    print('')
    print("Classification report: \n\n", metrics.classification_report(yTest, yPred))
    print('')
    confusion_matrix = metrics.confusion_matrix(yTest, yPred)
    print("Confusion matrix: \n\n", confusion_matrix)

    
def clean_txt(txtCol: pd.Series) -> pd.Series:
    sw = set(stopwords.words('English'))

    text = txtCol.apply(gensim.utils.simple_preprocess, min_len=3)
    text = text.apply(lambda s: [w for w in s if w not in sw]) 
    text = text.apply(lambda s: [SnowballStemmer("english", ignore_stopwords=True).stem(w) for w in s])
    text = text.apply(lambda s: ['_'.join(x) for x in nltk.bigrams(s)] + s)
    
    return text

In [30]:
def get_data(path, exclude) -> pd.DataFrame:
    """
    get all txt data, put in list of dicts and return a dataframe
    """
    data = []

    for file in os.listdir(path):
        if file not in (exclude):
            full_filename = os.path.join(path, file)
            for news in os.listdir(full_filename):
                with open(os.path.join(full_filename, news), 'rb') as txt_file:
                    data.append({'NewsText': txt_file.read(), 'NewsType': file})

    return pd.DataFrame(data)

import pickle

def picklefy(txtCol: pd.DataFrame) -> pd.DataFrame:
    if [f for f in os.listdir('.') if f.endswith('p')]:
        return pickle.load(open('tfidf.p','rb'))
    return pickle.dump(clean_txt(txtCol['NewsText']), open('tfidf.p', 'wb')) 

In [31]:
# preemtively drop duplicates
df = get_data('bbc/', 'README.TXT').drop_duplicates(); df.head(2)

Unnamed: 0,NewsText,NewsType
0,b'Ad sales boost Time Warner profit\r\n\r\nQua...,business
1,b'Dollar gains on Greenspan speech\r\n\r\nThe ...,business


In [32]:
df.head(2)

Unnamed: 0,NewsText,NewsType
0,b'Ad sales boost Time Warner profit\r\n\r\nQua...,business
1,b'Dollar gains on Greenspan speech\r\n\r\nThe ...,business


In [33]:
# vectorize
tfidf = TfidfVectorizer(sublinear_tf=True, 
                        min_df=5, norm='l2', 
                        encoding='latin-1', 
                        #ngram_range=(1, 2),
                        stop_words='english')

In [34]:
#cleanTxtCol = picklefy(df).apply(lambda x: ' '.join(i for i in x))

features = tfidf.fit_transform(df.NewsText).toarray() 
labels = df.NewsType 

In [35]:
from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()

X = features
y = LE.fit_transform(labels)

# train/test/split
X_train, X_test, y_train, y_test = train_test_split(
                                            X,y, test_size=0.2
                                    )

In [36]:
# RandomForest
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier(max_leaf_nodes=7, n_estimators=50, random_state=1)

In [37]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
metric_report(y_test, y_pred)

accuracy: 0.8544600938967136

Classification report: 

               precision    recall  f1-score   support

           0       0.82      0.94      0.88       107
           1       0.96      0.58      0.72        78
           2       0.96      0.94      0.95        71
           3       0.75      1.00      0.85       103
           4       1.00      0.72      0.83        67

    accuracy                           0.85       426
   macro avg       0.90      0.84      0.85       426
weighted avg       0.88      0.85      0.85       426


Confusion matrix: 

 [[101   0   1   5   0]
 [ 13  45   2  18   0]
 [  0   0  67   4   0]
 [  0   0   0 103   0]
 [  9   2   0   8  48]]


In [40]:
from sklearn.model_selection import GridSearchCV
params = { 
              'criterion':['gini','entropy'],
              'max_features':['auto', 'sqrt'],
              'max_depth': np.arange(3,15),
              'max_leaf_nodes': np.arange(3, 15),
        }
rfc_f = RandomForestClassifier()
gs_rfc_f = GridSearchCV(clf, params , cv=5)
gs_rfc_f.fit(X_train, y_train)
gs_rfc_f.best_params_

{'criterion': 'entropy',
 'max_depth': 13,
 'max_features': 'auto',
 'max_leaf_nodes': 14}

In [41]:
rfc_f = RandomForestClassifier(n_estimators=100,**gs_rfc_f.best_params_)

In [42]:
rfc_f.fit(X_train, y_train)
y_pred = rfc_f.predict(X_test)
metric_report(y_test, y_pred)

accuracy: 0.9272300469483568

Classification report: 

               precision    recall  f1-score   support

           0       0.90      0.95      0.93       107
           1       0.92      0.88      0.90        78
           2       0.97      0.94      0.96        71
           3       0.90      0.98      0.94       103
           4       0.98      0.84      0.90        67

    accuracy                           0.93       426
   macro avg       0.94      0.92      0.93       426
weighted avg       0.93      0.93      0.93       426


Confusion matrix: 

 [[102   1   1   2   1]
 [  6  69   0   3   0]
 [  2   0  67   2   0]
 [  1   0   1 101   0]
 [  2   5   0   4  56]]
