# Objectives
* YWBAT parse xml files
* YWBAT Create a BOW df
* YWBAT Train/Test a MultiNomial Classifier (NB)

In [77]:
import json
import numpy as np
import pandas as pd
import glob

from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.decomposition import PCA
from bs4 import BeautifulSoup

import matplotlib.pyplot as plt

In [58]:
files = glob.glob("reuters21578/*.sgm")

In [3]:
reuters_list = []
for file in files:
    print("Opening file - {}".format(file))
    try:
        f = open(file, 'r').read()
        soup = BeautifulSoup(f, "lxml")
        reuters = soup.find_all("reuters")
        for reuter in reuters:
            d = dict()
            topic = reuter.find("topics")
            d["topic"] = topic.text if topic.text is not None else "nothing"
            d["train_test"] = reuter.attrs["lewissplit"].lower()
            date_line = reuter.find("text").find("dateline")
            date_line = date_line.text if date_line else 'dateline'
            title = reuter.find("text").find("title")
            title = title.text if title else "tiitttle"
            d["txt"] = reuter.find("text").text.replace(title, "").replace(date_line, "")
            reuters_list.append(d)
    except:
        pass

Opening file - reuters21578/reut2-004.sgm
Opening file - reuters21578/reut2-010.sgm
Opening file - reuters21578/reut2-011.sgm
Opening file - reuters21578/reut2-005.sgm
Opening file - reuters21578/reut2-013.sgm
Opening file - reuters21578/reut2-007.sgm
Opening file - reuters21578/reut2-006.sgm
Opening file - reuters21578/reut2-012.sgm
Opening file - reuters21578/reut2-016.sgm
Opening file - reuters21578/reut2-002.sgm
Opening file - reuters21578/reut2-003.sgm
Opening file - reuters21578/reut2-017.sgm
Opening file - reuters21578/reut2-001.sgm
Opening file - reuters21578/reut2-015.sgm
Opening file - reuters21578/reut2-014.sgm
Opening file - reuters21578/reut2-000.sgm
Opening file - reuters21578/reut2-019.sgm
Opening file - reuters21578/reut2-018.sgm
Opening file - reuters21578/reut2-020.sgm
Opening file - reuters21578/reut2-008.sgm
Opening file - reuters21578/reut2-009.sgm
Opening file - reuters21578/reut2-021.sgm


In [4]:
df = pd.DataFrame(reuters_list)

In [60]:
df.head()

Unnamed: 0,topic,train_test,txt,topic_category,vec
0,,train,\n\nInco Ltd said it did not expect its\nearli...,0,"[2.7918600967024747, -0.120579673331519]"
1,,train,"\n\nMason Benson, former president and\nchief ...",0,"[-2.2637614266081556, -1.1942673321327046]"
2,,train,\n\nIn a discovery that could complicate the\n...,0,"[-0.14117471403925372, -1.9846415052029434]"
3,,train,\n\nDoctors at the Centers for Disease\nContro...,0,"[0.19896013321065256, -2.0433263988709123]"
4,interestretailipi,train,"\n\n By Brad Schade, Reuters\nU.S. economic...",301,"[12.391053935848863, -1.4788300836056565]"


In [61]:
df.txt[0]

'\n\nInco Ltd said it did not expect its\nearlier reported removal from the Dow Jones industrial index to\nmake a major impact on the company\'s stock.\n    "We don\'t think that individuals or institutions buy our\nshares because we were one of the Dow Jones industrials,"\nspokesman Ken Cherney said in reply to a query.\n    Inco closed 1-3/8 lower at 19-3/8 in second most active\ntrading on the Toronto Stock Exchange.\n    The Wall Street Journal, which selects the index, said Inco\nwas dropped to make the index more representative of the\nmarket. Inco, the non-Communist world\'s largest nickel\nproducer, was a member of the index since 1928.\n    Replacing Inco and Owens-Illinois Inc will be Coca-Cola Co\nand Boeing Co, effective tomorrow.\n    Nickel analyst Ilmar Martens at Walwyn Stodgell Cochran\nMurray Ltd said Inco\'s removal from the index would likely\nspark short-term selling pressure on the stock.\n    "Some investors who have Inco may suddenly say, \'well,\nbecause it\'s 

In [62]:
def clean_text(txt):
    punctuation = "?.';:!()" + '"'
    txt = txt.split("\n")
    txt = [t.strip(" ") for t in txt]
    txt = "\n".join(txt)
    txt = txt.strip("\n")
    txt = txt.replace("\n", " ")
    for p in punctuation:
        txt = txt.replace(p, "")
    txt = txt.lower()
    return txt

In [63]:
clean_text(df.txt[0])

'inco ltd said it did not expect its earlier reported removal from the dow jones industrial index to make a major impact on the companys stock we dont think that individuals or institutions buy our shares because we were one of the dow jones industrials, spokesman ken cherney said in reply to a query inco closed 1-3/8 lower at 19-3/8 in second most active trading on the toronto stock exchange the wall street journal, which selects the index, said inco was dropped to make the index more representative of the market inco, the non-communist worlds largest nickel producer, was a member of the index since 1928 replacing inco and owens-illinois inc will be coca-cola co and boeing co, effective tomorrow nickel analyst ilmar martens at walwyn stodgell cochran murray ltd said incos removal from the index would likely spark short-term selling pressure on the stock some investors who have inco may suddenly say, well, because its not now a dow stock, we should eliminate that investment, said marte

In [7]:
sw = ENGLISH_STOP_WORDS
mod_vectorizer = CountVectorizer(stop_words=sw, preprocessor=clean_text)

dtm = mod_vectorizer.fit_transform(df.txt).toarray()
vocab = np.array(mod_vectorizer.get_feature_names())

In [8]:
vocab.shape

(50725,)

In [9]:
dtm.shape

(20578, 50725)

In [10]:
df.shape

(20578, 3)

In [11]:
df_vocab = pd.DataFrame(data=dtm, columns=vocab)

In [12]:
df["topic"] = [t.lower().strip() for t in df.topic]
df["topic"] = df["topic"].astype('category')
df["topic_category"] = df["topic"].cat.codes

In [13]:
df_vocab["topic_category"] = df.topic_category

In [14]:
df_vocab["train_test"] = df.train_test

In [15]:
df_vocab.head()

Unnamed: 0,00,000,0000,00000,0001,0003,000501,0006,0006910,0006913,...,zurack,zurich,zurn,zuyuan,zverev,zweig,zwermann,zy,topic_category,train_test
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,train
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,train
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,train
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,train
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,301,train


In [16]:
X_train = df_vocab[df["train_test"]=="train"].drop(labels=["topic_category", "train_test"], axis=1)
y_train = df_vocab[df["train_test"]=="train"]["topic_category"]

In [17]:
X_test = df_vocab[df["train_test"]=="test"].drop(labels=["topic_category", "train_test"], axis=1)
y_test = df_vocab[df["train_test"]=="test"]["topic_category"]

In [18]:
nb = MultinomialNB()

In [19]:
nb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [20]:
nb.score(X_test, y_test)

0.6668453976764969

In [22]:
pca = PCA(n_components=2)

In [24]:
vecs = pca.fit_transform(df_vocab.drop(axis=1, labels=["topic_category", "train_test"]))

In [27]:
df_vocab["topic_category"].value_counts()[0:2]

0      9801
101    3897
Name: topic_category, dtype: int64

In [29]:
df.topic.value_counts()[0:2]

        9801
earn    3897
Name: topic, dtype: int64

In [32]:
vecs.shape

(20578, 2)

In [50]:
top2.head()

Unnamed: 0,topic,train_test,txt,topic_category,vec
0,,train,\n\nInco Ltd said it did not expect its\nearli...,0,"[2.7918600967024747, -0.120579673331519]"
1,,train,"\n\nMason Benson, former president and\nchief ...",0,"[-2.2637614266081556, -1.1942673321327046]"
2,,train,\n\nIn a discovery that could complicate the\n...,0,"[-0.14117471403925372, -1.9846415052029434]"
3,,train,\n\nDoctors at the Centers for Disease\nContro...,0,"[0.19896013321065256, -2.0433263988709123]"
5,,train,"\n\nThe Reagan administration,\nresponding to ...",0,"[1.9683588186667766, -2.1856799453021414]"


In [33]:
vec_insert = [v for v in vecs]

In [36]:
df["vec"] = vec_insert

In [37]:
df.iloc[(df)]

In [49]:
top2 = df[df["topic_category"]<=1]

In [40]:
color_dict={0:'b', 1:'r'}

In [None]:
fig = plt.figure(figsize=(10, 10))
for vec, category, topic in zip(top2.vec, top2.topic_category, top2.topic):
    label = topic if topic is not None else "Zero"
    plt.scatter(vec[0], vec[1], c=color_dict[category], label=topic, alpha=0.5)
plt.legend()
plt.show()

In [75]:
len(df.topic.unique())

599

In [None]:
topic_dict = {"econ": ["econ", "retail"]}

In [None]:
models = []
for topic in df.topic.unique():
    ones = df[df["topic"] == topic] # ylabel=1
    zeros = df[df["topic"] != topic] # ylabel=0
    label = [1 if t==topic else 0 for t in df.topic]
    nb = GaussianNB()
    x = # slice from df
    y = # slice from df
    # train-test-split
    #fit and score
    models.append(nb)
    
    # creates a list of 600 models

In [None]:
for text in df.txt:
    c = []
    for model in models:
        p = model.predict(text)
        model.confidence()
        c.append(confidence)
        # normalization 