In [1]:
import os

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from machine_learning.utils.utils_nlp import remove_email, remove_newline_char, remove_single_quote
from machine_learning.utils.utils_io import read_lines_from_text_file

In [None]:
# Directory (relative path) that holds the NLP data files read below:
# the stop-word list and the newsgroups JSON dump.
# NOTE(review): this cell has no execution count in the saved notebook, yet later
# cells read `data_nlp_dirpath` — confirm the notebook passes Restart Kernel -> Run All.
data_nlp_dirpath = "../data/nlp"

In [2]:
# Load the English stop-word file from the NLP data directory; the result is handed to
# TfidfVectorizer(stop_words=...) in the vectorization section.
# presumably read_lines_from_text_file returns one list entry per line — confirm in utils_io
stopwords_filename = "stopwords_english.txt"
stopwords_filepath = os.path.join(data_nlp_dirpath, stopwords_filename)
stopwords = read_lines_from_text_file(stopwords_filepath)

### data

In [3]:
# get newsgroups.json from https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json
# it is not included in this package to save 22.2MB space

newgroups_filename = "newsgroups.json"
newgroups_filepath = os.path.join(data_nlp_dirpath, newgroups_filename)

# The dataset must be downloaded by hand, so fail early with an actionable message
# rather than relying on whatever error pandas raises for a missing path.
if not os.path.isfile(newgroups_filepath):
    raise FileNotFoundError(
        f"{newgroups_filepath} not found; download it from "
        "https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json"
    )

# Parse the JSON dump into a frame (columns shown below: content, target, target_names)
# and preview the first rows.
df = pd.read_json(newgroups_filepath)
df.head()

Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


In [4]:
# Pull the raw posts and their integer class labels out of the frame as plain Python lists.
docs = df["content"].tolist()
targets = df["target"].tolist()

### preprocessing

In [5]:
# Run remove_email over every document (presumably strips e-mail addresses — see utils_nlp).
docs = list(map(remove_email, docs))

In [6]:
# Run remove_newline_char over every document (presumably drops newline characters — see utils_nlp).
docs = list(map(remove_newline_char, docs))

In [7]:
# Run remove_single_quote over every document (presumably drops ' characters — see utils_nlp).
docs = list(map(remove_single_quote, docs))

In [8]:
# Hold out 30% of the documents for evaluation; the fixed seed keeps the split reproducible.
doc_train, doc_test, target_train, target_test = train_test_split(
    docs,
    targets,
    test_size=0.3,
    random_state=0,
)

### vectorization

In [9]:
# Fit the TF-IDF vocabulary and weights on the training documents only, excluding the
# custom stop words, then show the resulting (documents, terms) matrix shape.
# NOTE(review): the docs were passed through remove_single_quote earlier; if the stop-word
# file contains entries with apostrophes (e.g. "don't") they may no longer match — confirm.
vectorizer = TfidfVectorizer(stop_words=stopwords)
X_train = vectorizer.fit_transform(doc_train)
X_train.shape

(7919, 88666)

In [10]:
# Encode the test documents with the already-fitted vectorizer (transform only, no refit),
# so the test matrix has the same term columns as the training matrix.
X_test = vectorizer.transform(doc_test)
X_test.shape

(3395, 88666)

### model

In [11]:
# Multinomial naive Bayes on the TF-IDF features. alpha is the additive smoothing
# parameter, set to 0.01 here rather than scikit-learn's default of 1.0.
model = MultinomialNB(alpha=.01)
# fit() returns the estimator, so the cell displays its repr.
model.fit(X_train, target_train)

MultinomialNB(alpha=0.01)

### prediction and evaluation

In [12]:
# Predict a class label for every test document.
target_test_pred = model.predict(X_test)

In [13]:
# First ten true labels, for a quick visual comparison with the predictions.
target_test[:10]

[1, 12, 13, 14, 9, 9, 11, 8, 14, 11]

In [14]:
# First ten predicted labels, to compare with the first ten true labels.
target_test_pred[:10]

array([ 1, 12, 13, 14,  9, 10, 11,  8, 14, 11])

In [15]:
# Macro-averaged F1 on the held-out set: per-class F1 scores averaged with equal weight.
score = f1_score(
    y_true=target_test,
    y_pred=target_test_pred,
    average="macro",
)
score

0.9093606110443136