# Text Classification

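Classifying news articles as good (label `0`) or bad (label `1`) using Naive Bayes, comparing the `tf` and `tfidf` strategies.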

In [1]:
import pickle
from pathlib import Path
import os

import pandas as pd
import numpy as np
from sklearn import metrics

from nlp.nb import NBClassifier

### Loading articles

In [2]:
# Base path for persisted artifacts; the shuffled index is stored next to it
# as "<STORAGE_PATH>.shuffled_index".
STORAGE_PATH = './storage/nb.model'

# Every line of each file is read as one row of the `body` column.
# NOTE(review): newer pandas releases appear to reject sep='\n' in
# read_table/read_csv — confirm against the installed pandas version.
good_articles = pd.read_table('./articles/good.articles', sep='\n', names=['body'])
good_articles['label'] = 0 # good
bad_articles  = pd.read_table('./articles/bad.articles', sep='\n', names=['body'])
bad_articles['label'] = 1 # bad

# joining (ignore_index=True gives the combined frame a fresh 0..n-1 index)
articles = pd.concat([good_articles, bad_articles], ignore_index=True)

# persistently shuffling: the permutation is generated once and cached on disk
# so that every later run reuses exactly the same row order.
index_path = f"{STORAGE_PATH}.shuffled_index"
if Path(index_path).is_file():
  file_size = os.path.getsize(index_path)
  print(f"Loading index from stored file ({file_size} bytes)...")
  with open(index_path, 'rb') as fp:
    # SECURITY: pickle.load can execute arbitrary code; only load index files
    # that were written by this notebook.
    shuffled_index = pickle.load(fp)

  # A cached index built for a different set of articles would make reindex()
  # silently insert all-NaN rows or drop articles, so fail loudly instead.
  if not np.array_equal(np.sort(np.asarray(shuffled_index)), articles.index.values):
    raise ValueError(
      f"Stored shuffled index '{index_path}' is not a permutation of the "
      f"current articles index ({len(articles)} rows). "
      "Delete the file to reshuffle (this changes the train/test split)."
    )
else:
  print('Shuffling index for the first time ...')
  shuffled_index = np.random.permutation(articles.index)
  print('Saving index on disk for further access...')
  # The storage directory may not exist yet on a fresh checkout.
  Path(index_path).parent.mkdir(parents=True, exist_ok=True)
  with open(index_path, 'wb') as fp:
    pickle.dump(shuffled_index, fp)
  file_size = os.path.getsize(index_path)
  print(f"Done. It took {file_size} bytes on the disk.")

# Reorder the rows according to the (cached) permutation.
articles = articles.reindex(shuffled_index)

print(f"Counts:\n{articles['label'].value_counts()}")
articles.head()

Loading index from stored file (113239 bytes)...
Counts:
1    7518
0    6617
Name: label, dtype: int64


Unnamed: 0,body,label
10624,Во Франции цементный концерн подозревают в спо...,1
10193,Полиция Ирландии арестовала двух человек после...,1
2894,"Российские медицинские туристы, которые ездили...",0
9087,Следователь московского полицейского главка вы...,1
7888,МЧС предупредило москвичей об ухудшении погоды...,1


### Splitting between train and test

In [3]:
# Position of the first test row: the leading 80% of the rows are used for
# training, the remainder is held out for evaluation.
n_articles = articles.shape[0]
test_first_index = int(n_articles * 0.8)

train_data, test_data = articles[:test_first_index], articles[test_first_index:]

# Show the (rows, columns) shape of each subset, train first.
for subset in (train_data, test_data):
  print(subset.shape)

(11308, 2)
(2827, 2)


### Training

In [4]:
# Train one classifier per strategy on the same training subset.
# The resulting list keeps the strategy order: 'tf' first, then 'tfidf'.
nbcs = []
for strategy_name in ('tf', 'tfidf'):
  classifier = NBClassifier(strategy=strategy_name)
  classifier.train(train_data)
  nbcs.append(classifier)

Removing word сообща for [2982, 2597]
Removing word дан for [1473, 1480]
Removing word москв for [1483, 1395]
Removing word дом for [1406, 1178]
Removing word дел for [1140, 1050]
Removing word наход for [918, 970]
Removing word пресс for [980, 805]
Removing word местн for [724, 712]
Removing word информац for [678, 648]
Removing word жител for [570, 620]
Removing word метр for [560, 603]
Removing word агентств for [471, 500]
Removing word суд for [476, 459]
Removing word петербург for [391, 456]
Removing word штат for [406, 430]
Removing word состоян for [396, 437]
Removing word воен for [354, 430]
Removing word чег for [391, 359]
Removing word жил for [340, 359]
Removing word ма for [372, 327]
Removing word аэропорт for [314, 379]
Removing word июн for [303, 369]
Removing word санкт for [353, 299]
Removing word скор for [351, 300]
Removing word безопасн for [339, 305]
Removing word июл for [315, 327]
Removing word станц for [331, 310]
Removing word август for [308, 326]
Removing word

Removing word возмутител for [1, 1]
Removing word кряд for [1, 1]
Removing word плейбо for [1, 1]
Removing word плейб for [1, 1]
Removing word откушен for [1, 1]
Removing word бай for [1, 1]
Removing word инструктирова for [1, 1]
Removing word отмычк for [1, 1]
Removing word курьерск for [1, 1]
Removing word строев for [1, 1]
Removing word картотек for [1, 1]
Removing word землячеств for [1, 1]
Removing word медвежьегорск for [1, 1]
Removing word колесников for [1, 1]
Removing word прокопьевск for [1, 1]
Removing word девятнадцат for [1, 1]
Removing word акта for [1, 1]
Removing word рублевк for [1, 1]
Removing word потапов for [1, 1]
Removing word токсиколог for [1, 1]
Removing word аккурат for [1, 1]
Removing word гвардейск for [1, 1]
Removing word графф for [1, 1]
Removing word толкнул for [1, 1]
Removing word киноактрис for [1, 1]
Removing word магдал for [1, 1]
Removing word кольчуг for [1, 1]
Removing word фунша for [1, 1]
Removing word лежак for [1, 1]
Removing word предстанут f

Removing word слезт for [1, 1]
Removing word развращен for [1, 1]
Removing word переизбира for [1, 1]
Removing word антикоммунистическ for [1, 1]
Removing word афанасьев for [1, 1]
Removing word бессил for [1, 1]
Removing word солнцевск for [1, 1]
Removing word востряковск for [1, 1]
Removing word дестабилизир for [1, 1]
Removing word алиев for [1, 1]
Removing word приятельск for [1, 1]
Removing word рвал for [1, 1]
Removing word непромока for [1, 1]
Removing word гесс for [1, 1]
Removing word москвин for [1, 1]
Removing word метеонаблюден for [1, 1]
Removing word режимн for [1, 1]
Removing word шифрован for [1, 1]
Removing word кай for [1, 1]
Removing word актюбинск for [1, 1]
Removing word боз for [1, 1]
Removing word ип for [1, 1]
Removing word полкилограмм for [1, 1]
Removing word шульг for [1, 1]
Removing word широкомасштабн for [1, 1]
Removing word размет for [1, 1]
Removing word авиаотрасл for [1, 1]
Removing word наткнувш for [1, 1]
Removing word ованнися for [1, 1]
Removing wo

Built IDF


### Predicting

In [5]:
# For every trained classifier (same order as `nbcs`), collect the predicted
# label of each test article body.
predictions_by_strategy = []
for classifier in nbcs:
  predicted_labels = [classifier.predict(body).label for body in test_data['body']]
  predictions_by_strategy.append(predicted_labels)

In [6]:
# Print a classification report followed by the accuracy for each set of
# predictions, in the same order as `predictions_by_strategy`.
true_labels = test_data['label']
for predictions in predictions_by_strategy:
  accuracy = metrics.accuracy_score(true_labels, predictions)
  report = metrics.classification_report(true_labels, predictions)

  print(report + f"\nAccuracy score: {accuracy:.3f}")

             precision    recall  f1-score   support

          0       0.94      1.00      0.97      1329
          1       1.00      0.95      0.97      1498

avg / total       0.97      0.97      0.97      2827

Accuracy score: 0.971
             precision    recall  f1-score   support

          0       0.93      1.00      0.96      1329
          1       1.00      0.94      0.97      1498

avg / total       0.97      0.96      0.96      2827

Accuracy score: 0.964
