# nltk NLP classifier model training
Import project dependencies for model training

In [19]:
from structures import Text, Tag
from training import Trainer

Graphing dependencies.

In [20]:
import seaborn as sb
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

Select the train corpus and the output folder.

In [21]:
# Training corpus directory. The path is relative, so it resolves against the
# working directory in effect when the notebook changes into it; its
# sub-directory names are later used as the tag names.
corpus = "TextClassification/dataset/newsg/train"
# Output directory that receives one pickled '<tag name>.tag' file per tag.
out = "TextClassification/out"

In [22]:
import os

# Remember where the notebook started so later cells can return here.
# Reuse an existing value when the cell is re-run: by then the working
# directory is no longer the notebook root, and capturing os.getcwd() again
# would both lose the root and make the relative chdir below fail.
init_dir = globals().get('init_dir') or os.getcwd()
os.chdir(init_dir)
os.chdir(corpus)

trainer = Trainer('english')

# Each sub-directory of the corpus is one tag. Drop stray files and hidden
# entries (e.g. .DS_Store, .ipynb_checkpoints) that would break the
# per-tag directory access later, and sort because os.listdir() returns
# names in arbitrary order.
tags = sorted(
    entry for entry in os.listdir()
    if os.path.isdir(entry) and not entry.startswith('.')
)

Load each tag and its training texts into the Trainer.

In [23]:
# Register every tag with the trainer and attach its training texts.
# Paths are built with os.path.join instead of chdir-ing into each tag
# directory, so an exception mid-loop cannot strand the kernel inside a
# sub-directory. Tag names are resolved against the current working
# directory, which is the train corpus at this point.
for t in tags:
    if not os.path.isdir(t):
        # Stray files next to the tag directories are not tags.
        continue

    tag = Tag(t)
    trainer.addTag(tag)

    # Sorted so texts are added in a reproducible order.
    for file in sorted(os.listdir(t)):
        path = os.path.join(t, file)
        if not os.path.isfile(path):
            continue
        # errors='ignore' silently drops bytes that cannot be decoded.
        with open(path, 'r', errors='ignore') as fd:
            trainer.addText(Text(fd.read(), tag))

Start training (this will take a while)

In [24]:
# Kick off training; the trainer was given its tags and texts by the
# loading loop earlier in the notebook.
trainer.train()

Return to the root directory and save the trained tags' features.

In [25]:
import pickle

os.chdir(init_dir)
# Create the output folder on first use; without this, os.chdir(out) raises
# FileNotFoundError on a fresh checkout where the folder does not exist yet.
os.makedirs(out, exist_ok=True)
os.chdir(out)

# One '<tag name>.tag' pickle per tag, written into the output folder.
for tag in trainer.tags:
    with open(tag.name + '.tag', 'wb') as dump_fd:
        pickle.dump(tag, dump_fd)

Comparing test corpus against trained model, extracting confusion matrix.

In [26]:
from TextClassification.model import Model
# NOTE: `tags` is rebound here, from the directory-name strings listed
# earlier to the trainer's tag objects; later cells read their .name.
tags = trainer.tags
model = Model(tags)

Select test corpus path and load files.

In [27]:
# Rebinds `corpus` (previously the train path) to the test split; the path is
# relative and is resolved after the notebook returns to its start directory.
corpus = "TextClassification/dataset/newsg/test"

In [28]:
# Confusion counts keyed by (expected tag name, predicted tag name).
cm = Counter()

# Work from the notebook root and address files by joined path rather than
# chdir-ing into every tag directory: if classification raises mid-loop the
# kernel is still left in init_dir, which is also where this cell ends.
os.chdir(init_dir)

for tag in tags:
    tag_dir = os.path.join(corpus, tag.name)

    for file in sorted(os.listdir(tag_dir)):
        path = os.path.join(tag_dir, file)
        if not os.path.isfile(path):
            # Nested directories cannot be opened as texts.
            continue
        # errors='ignore' silently drops bytes that cannot be decoded.
        with open(path, 'r', errors='ignore') as fd:
            text = Text(fd.read())
        # text.tag is read right after classify(), so classify() is expected
        # to assign the predicted tag onto the Text it is given.
        model.classify(text)
        cm[(tag.name, text.tag.name)] += 1

Display heatmap of confusion matrix

In [29]:
labels = [t.name for t in tags]

# Rows are the expected tag (the test sub-directory a file came from),
# columns are the tag the model assigned; missing pairs count as 0.
cm_array = np.array([[cm[(expected, predicted)] for predicted in labels]
                     for expected in labels])

df_cm = pd.DataFrame(cm_array, index=labels, columns=labels)

fig, ax = plt.subplots(figsize=(15, 10))
# fmt='d' prints the counts as plain integers; seaborn's default '.2g'
# would render e.g. 389 as 3.9e+02.
sb.heatmap(df_cm, annot=True, fmt='d', ax=ax)
ax.set(xlabel='Predicted tag', ylabel='Expected tag',
       title='Confusion matrix on the test corpus')
plt.show()