# Text Classification

Using SVM

In [1]:
import pickle
from pathlib import Path
import os
from collections import Counter

import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

from nlp.cleaner import Cleaner
from nlp.tfidfer import TFIDFer

### Loading articles

In [2]:
STORAGE_PATH = './storage/nb.model'

good_articles = pd.read_table('./articles/good.articles', sep='\n', names=['body'])
good_articles['label'] = 0 # good
bad_articles  = pd.read_table('./articles/bad.articles', sep='\n', names=['body'])
bad_articles['label'] = 1 # bad

# joining
articles = pd.concat([good_articles, bad_articles], ignore_index=True)

# persistently shuffling
index_path = f"{STORAGE_PATH}.shuffled_index"
if Path(index_path).is_file():
  file_size = os.path.getsize(index_path)
  print(f"Loading index from stored file ({file_size} bytes)...")
  with open(index_path, 'rb') as fp:
    shuffled_index = pickle.load(fp)
else:
  print('Shuffling index for the first time ...')
  shuffled_index = np.random.permutation(articles.index)
  print('Saving index on disk for further access...')
  with open(index_path, 'wb') as fp:
    pickle.dump(shuffled_index, fp)
  file_size = os.path.getsize(index_path)
  print(f"Done. It took {file_size} bytes on the disk.")

articles = articles.reindex(shuffled_index)

print(f"Counts:\n{articles['label'].value_counts()}")
articles.head()

Loading index from stored file (113239 bytes)...
Counts:
1    7518
0    6617
Name: label, dtype: int64


Unnamed: 0,body,label
10624,Во Франции цементный концерн подозревают в спо...,1
10193,Полиция Ирландии арестовала двух человек после...,1
2894,"Российские медицинские туристы, которые ездили...",0
9087,Следователь московского полицейского главка вы...,1
7888,МЧС предупредило москвичей об ухудшении погоды...,1


### Splitting between train and test

In [3]:
train_data, test_data = train_test_split(articles, train_size = 0.8)

print(f"Train size: {train_data.shape}")
print(f"Test size: {test_data.shape}")

Train size: (11308, 2)
Test size: (2827, 2)


### Build dictionary and compute features

In [5]:
vocabulary_size = 2000
cleaner = Cleaner()

def build_documents_as_words(documents, cleaner = cleaner):
  return [cleaner.words(document) for document in documents['body']]

def build_dictionary(documents_as_words, vocabulary_size):
  words = [word for words in documents_as_words for word in words]
  count = Counter(words).most_common(vocabulary_size)
  dictionary = dict()
  for word, _ in count: dictionary[word] = len(dictionary)
  return dictionary, dict(zip(dictionary.values(), dictionary.keys()))

# building
documents_as_words = build_documents_as_words(articles)
dictionary, reverse_dictionary = build_dictionary(documents_as_words, vocabulary_size)
tfidfer = TFIDFer(dictionary, reverse_dictionary)
tf = tfidfer.compute_tf(documents_as_words)

feature = tf

### Tokenizing for training and testing

In [8]:
def tokenized(text, cleaner = cleaner, dictionary = dictionary):
  return [dictionary[word] for word in cleaner.words(text) if word in dictionary]

def sparsify(dense, vocabulary_size = vocabulary_size):
  sparse = np.zeros(vocabulary_size)
  for key in dense: sparse[key] = dense[key]
  return sparse

train_data['sparse_body'] = train_data['body'].apply(lambda body: sparsify(Counter(tokenized(body))))
test_data['sparse_body']  = test_data['body'].apply(lambda body: sparsify(Counter(tokenized(body))))

print(train_data['sparse_body'].head())
print(train_data['label'].head())

10925    [6.0, 4.0, 3.0, 2.0, 1.0, 5.0, 1.0, 1.0, 0.0, ...
5183     [110.0, 62.0, 59.0, 15.0, 40.0, 23.0, 14.0, 9....
9870     [5.0, 5.0, 6.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, ...
13631    [6.0, 10.0, 7.0, 4.0, 0.0, 0.0, 0.0, 0.0, 1.0,...
9255     [8.0, 5.0, 2.0, 3.0, 2.0, 2.0, 1.0, 1.0, 0.0, ...
Name: sparse_body, dtype: object
10925    1
5183     0
9870     1
13631    1
9255     1
Name: label, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


### Training

In [7]:
clf = SVC()
clf.fit(list(train_data['sparse_body']), list(train_data['label']))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

### Predicting

In [9]:
predictions = clf.predict(list(test_data['sparse_body']))

report = metrics.classification_report(test_data['label'], predictions)
print(report)

             precision    recall  f1-score   support

          0       0.98      0.95      0.97      1312
          1       0.96      0.99      0.97      1515

avg / total       0.97      0.97      0.97      2827



### Results
