# News article classification

In [35]:
%matplotlib inline

from pathlib import Path

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

In [36]:
# Root folder that is searched recursively for the article .txt files.
DATA_DIR = Path('data')

In [37]:
# Read every article under DATA_DIR into [topic, heading, body] records.
# The name of the directory that contains a file is used as its topic label.
path = DATA_DIR
files = sorted(path.glob('**/*.txt'))  # sorted() consumes the generator directly
# Fail here with a clear message rather than later inside train_test_split.
assert files, 'No .txt files found under %s' % path
doc_list = []
for file in files:
    topic = file.parts[-2]
    article = file.read_text(encoding='latin1').split('\n')
    heading = article[0].strip()  # the first line of the file is kept as the heading
    # Remaining lines are stripped and joined into one space-separated string.
    body = ' '.join(line.strip() for line in article[1:])
    doc_list.append([topic, heading, body])

In [38]:
# One row per article; the column order mirrors the [topic, heading, body] records.
column_names = ['topic', 'heading', 'body']
docs = pd.DataFrame(data=doc_list, columns=column_names)
docs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   topic    2 non-null      object
 1   heading  2 non-null      object
 2   body     2 non-null      object
dtypes: object(3)
memory usage: 176.0+ bytes


### Create stratified train-test split

In [39]:
# Features are the article bodies; targets are the topics encoded as integer codes.
X = docs['body']
y = pd.factorize(docs['topic'])[0]
# Stratify on the codes so both splits keep the topic proportions;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

### Vectorize text data

In [40]:
# Learn the vocabulary from the training text only, then encode both splits
# with that same fitted vectorizer so the train and test matrices share columns.
vectorizer = CountVectorizer()
X_train_dtm = vectorizer.fit_transform(X_train)
X_test_dtm = vectorizer.transform(X_test)

In [41]:
# Show the shapes of the train and test matrices produced by the vectorizer.
X_train_dtm.shape, X_test_dtm.shape

((1, 1925), (1, 1925))

### Train Multi-class Naive Bayes model

In [42]:
# Fit a multinomial Naive Bayes classifier on the training counts
# (fit returns the estimator itself), then predict a label per test document.
nb = MultinomialNB().fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)

### Evaluate Results

#### Accuracy

In [43]:
# Share of test documents whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred_class)

1.0

#### Confusion matrix

In [44]:
# Count test documents per (true label, predicted label) pair and wrap the
# resulting array in a DataFrame so it is displayed as a table.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_class)
pd.DataFrame(cm)

Unnamed: 0,0
0,1
