# Data Preprocessing

In [135]:
import json

# Load the records from disk; later cells read each record's 'category' and
# 'content' fields. The encoding is passed explicitly because JSON text is
# UTF-8, whereas open() otherwise decodes with the platform's locale encoding,
# which is not UTF-8 on every system.
with open('../datas/bbc_preprocessed.json', encoding='utf-8') as f:
    datas = json.load(f)

In [136]:
import random

# Seed the global RNG so the shuffle below is repeatable on a fresh run.
# Note: the shuffle is in place, so running this cell a second time without
# reloading `datas` re-shuffles the already shuffled list into a new order.
SHUFFLE_SEED = 5220
random.seed(SHUFFLE_SEED)
random.shuffle(datas)

In [137]:
# Category name of every record, in the current order of `datas`.
cate = [record['category'] for record in datas]

# Integer label assigned to each category name.
mp = {
    'Technology': 0,
    'Entertainment & Arts': 1,
    'Business': 2,
    'Health': 3,
    'Science & Environment': 4,
}
# Reverse lookup: integer label -> category name.
inv_mp = dict((label, name) for name, label in mp.items())

# Integer label of every record (raises KeyError for a name missing from mp).
cat = [mp[name] for name in cate]
# Each record's 'content' pieces joined with single spaces into one string.
cont = [' '.join(record['content']) for record in datas]

## Divide into train / validation / test sets

In [138]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 30% of the samples as the test set.
X_tv, X_test, y_tv, y_test = train_test_split(
    cont,
    cat,
    random_state=5220,
    test_size=0.3,
)

# Step 2: split the remaining 70% in half, so training and validation each
# end up with about 35% of the full data set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_tv,
    y_tv,
    random_state=5220,
    test_size=0.5,
)

# Models

## Naive Bayes

In [139]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Token counts -> TF-IDF weighting -> multinomial Naive Bayes, every step
# constructed with its default settings.
nb_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [140]:
# Fit on the training split, then label the validation split.
# Pipeline.fit returns the fitted pipeline, so the two calls can be chained.
nb_valid_predicts = nb_clf.fit(X_train, y_train).predict(X_valid)

In [141]:
from sklearn.metrics import confusion_matrix

# Rows are the true validation labels, columns the Naive Bayes predictions.
nb_cm = confusion_matrix(y_true=y_valid, y_pred=nb_valid_predicts)
nb_cm

array([[1415,    0,  218,    0,    0],
       [  46,  842,  137,    0,    0],
       [ 142,    1, 2091,    1,    0],
       [  44,    0,  219,   17,    0],
       [  74,    2,  284,    0,   54]], dtype=int64)

## SVM

In [142]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier

# Token counts -> TF-IDF weighting -> linear classifier trained with SGD.
# SGDClassifier is left on its default loss, which per the scikit-learn
# documentation is the hinge loss, i.e. a linear SVM.
# random_state is fixed because SGD shuffles the training data between epochs;
# without a seed the fitted model, and therefore the confusion matrix and the
# scores computed from it, change from one run to the next.
svm_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', SGDClassifier(tol=1e-3, random_state=5220)),
])

In [143]:
# Fit on the training split, then label the validation split.
# Pipeline.fit returns the fitted pipeline, so the two calls can be chained.
svm_valid_predicts = svm_clf.fit(X_train, y_train).predict(X_valid)

In [144]:
from sklearn.metrics import confusion_matrix

# Rows are the true validation labels, columns the SVM predictions.
svm_cm = confusion_matrix(y_true=y_valid, y_pred=svm_valid_predicts)
svm_cm

array([[1528,   14,   74,    4,   13],
       [  10, 1001,   10,    0,    4],
       [ 147,   20, 2044,   10,   14],
       [   5,    0,    3,  266,    6],
       [  13,    3,   15,    8,  375]], dtype=int64)

## Multinomial Logistic Regression

In [145]:
## To be continued

# Evaluation

## Number of samples per class in the training and validation sets

In [165]:
from collections import Counter

# How often each integer label occurs in the training and validation targets.
counter_y_train, counter_y_valid = Counter(y_train), Counter(y_valid)

def format1(s1, s2, s3):
    """Lay out one row of the three-column count table.

    s1 is padded to a width of 22, s2 is right-aligned in a width of 8 and
    s3 is right-aligned in a width of 10; the columns are separated by a
    single space. Returns the formatted row as a string.
    """
    columns = [format(s1, "22"), format(s2, ">8"), format(s3, ">10")]
    return " ".join(columns)

# Header row, followed by one row per category with its sample count in the
# training and validation targets.
print(format1("Categories", "Training", "Validation"))
for name, label in mp.items():
    train_count = counter_y_train[label]
    valid_count = counter_y_valid[label]
    print(format1(name, train_count, valid_count))

Categories             Training Validation
Technology                 1656       1633
Entertainment & Arts        973       1025
Business                   2279       2235
Health                      285        280
Science & Environment       394        414


In [167]:
def format2(s1, s2, s3):
    """Lay out one row of the three-column percentage table.

    s1 is padded to a width of 22; s2 and s3 are rendered as fixed-point
    numbers with two decimals in widths of 8 and 9 respectively, each followed
    by a literal '%'. Columns are separated by a single space.
    """
    label_col = format(s1, "22")
    first_pct = format(s2, "8.2f") + "%"
    second_pct = format(s3, "9.2f") + "%"
    return " ".join([label_col, first_pct, second_pct])

# Share of each category within the training and validation targets.
# The totals do not change inside the loop, so they are computed once up
# front instead of being rebuilt for every printed row.
train_total = sum(counter_y_train.values())
valid_total = sum(counter_y_valid.values())

print(format1("Categories", "Training", "Validation"))
for name, label in mp.items():
    train_pct = 100*counter_y_train[label]/train_total
    valid_pct = 100*counter_y_valid[label]/valid_total
    print(format2(name, train_pct, valid_pct))

Categories             Training Validation
Technology                29.64%     29.23%
Entertainment & Arts      17.42%     18.35%
Business                  40.79%     40.00%
Health                     5.10%      5.01%
Science & Environment      7.05%      7.41%


## Effectiveness of each model

### Naive Bayes

In [147]:
from sklearn.metrics import precision_recall_fscore_support

# Per-class precision, recall and F1 of the Naive Bayes predictions on the
# validation split; the fourth returned value is discarded.
nb_precisions, nb_recalls, nb_f1s, _ = precision_recall_fscore_support(
    y_true=y_valid,
    y_pred=nb_valid_predicts,
)

### SVM

In [148]:
from sklearn.metrics import precision_recall_fscore_support

# Per-class precision, recall and F1 of the SVM predictions on the
# validation split; the fourth returned value is discarded.
svm_precisions, svm_recalls, svm_f1s, _ = precision_recall_fscore_support(
    y_true=y_valid,
    y_pred=svm_valid_predicts,
)

### Multinomial Logistic Regression

In [149]:
## To be continued