# Data Preprocess

In [1]:
import json
with open('../datas/bbc_preprocessed.json') as f:
    datas = json.load(f)

In [2]:
import random
random.seed(5220)
random.shuffle(datas)

In [3]:
cate = [data['category'] for data in datas]
mp = {'Technology':0,'Entertainment & Arts':1,'Business':2,'Health':3,'Science & Environment':4}
inv_mp = {v:k for k, v in mp.items()}
cat = [mp[x] for x in cate]
cont = [' '.join(data['content']) for data in datas]

## Devide into train / valid / test set

In [4]:
from sklearn.model_selection import train_test_split

X_tv, X_test, y_tv, y_test = train_test_split(
    cont, cat, test_size=0.3, random_state=5220)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_tv, y_tv, test_size=0.5, random_state=5220)

# Models

## Naive Bayes without tfidf

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

nbw_clf = Pipeline([('vect', CountVectorizer()),
                      ('clf', MultinomialNB()),
])

In [25]:
nbw_clf.fit(X_train, y_train)
nbw_valid_predicts = nbw_clf.predict(X_valid)

In [26]:
from sklearn.metrics import confusion_matrix

nbw_cm = confusion_matrix(y_valid, nbw_valid_predicts)
nbw_cm

array([[1538,   11,   55,    2,   27],
       [  32,  960,   33,    0,    0],
       [ 246,   24, 1931,   11,   23],
       [   7,    0,    7,  263,    3],
       [   8,    4,   27,   10,  365]], dtype=int64)

## Naive Bayes

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

nb_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB()),
])

In [6]:
nb_clf.fit(X_train, y_train)
nb_valid_predicts = nb_clf.predict(X_valid)

In [7]:
from sklearn.metrics import confusion_matrix

nb_cm = confusion_matrix(y_valid, nb_valid_predicts)
nb_cm

array([[1415,    0,  218,    0,    0],
       [  46,  842,  137,    0,    0],
       [ 142,    1, 2091,    1,    0],
       [  44,    0,  219,   17,    0],
       [  74,    2,  284,    0,   54]], dtype=int64)

## SVM

In [22]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier

svm_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', SGDClassifier(tol=1e-3)),
])

In [23]:
svm_clf.fit(X_train, y_train)
svm_valid_predicts = svm_clf.predict(X_valid)

In [10]:
from sklearn.metrics import confusion_matrix

svm_cm = confusion_matrix(y_valid, svm_valid_predicts)
svm_cm

array([[1527,   14,   76,    4,   12],
       [   8, 1004,   13,    0,    0],
       [ 140,   19, 2052,   10,   14],
       [   5,    0,    4,  266,    5],
       [  13,    3,   14,    8,  376]], dtype=int64)

In [73]:
vocab = tfiVect.vocabulary_
rev_vocab = {v:k for k,v in vocab.items()}
indexes = coef[2].argsort()[::-1][:100]
print([rev_vocab[i] for i in indexes])
print(sorted(coef[4])[::-1][:10])
print(len(vocab))

['busi', 'bank', 'chief', 'compani', 'quarter', 'deal', 'economi', 'trade', 'mr', 'share', 'enabl', 'econom', 'big', 'advertis', 'sharehold', 'argu', 'manag', 'davo', 'forecast', 'competit', 'tax', 'boe', 'founder', 'rise', 'execut', 'plant', 'insur', 'person', 'transpar', 'profit', 'singapor', 'corpor', 'openreach', 'verizon', 'global', 'obr', 'fire', 'rose', 'iot', 'pension', 'sfo', 'food', 'credit', 'interact', 'debt', 'switch', 'penalti', 'student', 'jump', 'interdigit', 'banknot', 'foreign', 'ban', '1bn', 'oil', 'buy', 'farmer', 'suspend', 'ms', 'cabl', 'class', 'rent', 'buyer', 'month', 'car', 'tuesday', 'strong', 'worker', 'vw', 'regul', 'so', 'tradit', 'don', 'rb', 'fall', 'carmak', 'ski', 'hmrc', 'supermarket', 'discount', 'reader', 'work', 'earlier', 'york', 'html5', 'ashley', 'malvern', 'infring', 'board', 'monthli', 'porn', 'giant', 'custom', 'august', 'holiday', 'toy', 'save', 'edf', 'bp', 'cuba']
[2.4331771592378799, 2.4138683809985273, 2.2274573302851985, 2.0212643524676

## Multinomial Logistic Regression

In [11]:
## To be continued

# Evaluation

## Number of classes in training and validation set

In [12]:
from collections import Counter

counter_y_train = Counter(y_train)
counter_y_valid = Counter(y_valid)

def format1(s1, s2, s3):
    return "{:22} {:>8} {:>10}".format(s1, s2, s3)

print(format1("Categories", "Training", "Validation"))
for k,v in mp.items():
    print(format1(k, counter_y_train[v], counter_y_valid[v]))

Categories             Training Validation
Technology                 1656       1633
Entertainment & Arts        973       1025
Business                   2279       2235
Health                      285        280
Science & Environment       394        414


In [13]:
def format2(s1, s2, s3):
    return "{:22} {:8.2f}% {:9.2f}%".format(s1, s2, s3)

print(format1("Categories", "Training", "Validation"))
for k,v in mp.items():
    print(format2(k, 100*counter_y_train[v]/sum([v for k,v in counter_y_train.items()]), 100*counter_y_valid[v]/sum([v for k,v in counter_y_valid.items()])))

Categories             Training Validation
Technology                29.64%     29.23%
Entertainment & Arts      17.42%     18.35%
Business                  40.79%     40.00%
Health                     5.10%      5.01%
Science & Environment      7.05%      7.41%


## Effectiveness for each model

### Naive Bayes without tfidf

In [27]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
nbw_precisions, nbw_recalls, nbw_f1s, _ = precision_recall_fscore_support(y_valid, nbw_valid_predicts)
print(classification_report(y_valid, nbw_valid_predicts))

             precision    recall  f1-score   support

          0       0.84      0.94      0.89      1633
          1       0.96      0.94      0.95      1025
          2       0.94      0.86      0.90      2235
          3       0.92      0.94      0.93       280
          4       0.87      0.88      0.88       414

avg / total       0.91      0.91      0.91      5587



### Naive Bayes

In [18]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
nb_precisions, nb_recalls, nb_f1s, _ = precision_recall_fscore_support(y_valid, nb_valid_predicts)
print(classification_report(y_valid, nb_valid_predicts))

             precision    recall  f1-score   support

          0       0.82      0.87      0.84      1633
          1       1.00      0.82      0.90      1025
          2       0.71      0.94      0.81      2235
          3       0.94      0.06      0.11       280
          4       1.00      0.13      0.23       414

avg / total       0.83      0.79      0.76      5587



### SVM

In [19]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
svm_precisions, svm_recalls, svm_f1s, _ = precision_recall_fscore_support(y_valid, svm_valid_predicts)
print(classification_report(y_valid, svm_valid_predicts))

             precision    recall  f1-score   support

          0       0.90      0.94      0.92      1633
          1       0.97      0.98      0.97      1025
          2       0.95      0.92      0.93      2235
          3       0.92      0.95      0.94       280
          4       0.92      0.91      0.92       414

avg / total       0.94      0.94      0.94      5587



### Multinomial Logistic Regression

In [16]:
## To be continued