In [74]:
import pandas as pd
from sklearn.decomposition import TruncatedSVD


In [64]:


# Location of the training corpus, given relative to the current working directory.
filepath = "./trainingdata.txt"

def read_data(filepath:str) -> pd.DataFrame:
    """Load the labelled training corpus into a two-column DataFrame.

    The first line of the file is skipped and never parsed as a record
    (presumably a header / record count -- confirm against the data file).
    Every remaining non-blank line is split at its first space: the text
    before the space becomes ``category`` and everything after it becomes
    ``document``. The trailing newline is removed before splitting, so it
    never leaks into either column, and blank lines are ignored.

    Parameters
    ----------
    filepath : str
        Path of the text file to read.

    Returns
    -------
    pd.DataFrame
        One row per record with string columns ``category`` and
        ``document``, indexed 0..n-1. When the file holds no records the
        frame is empty but still has both columns.
    """
    records = []
    with open(filepath) as f:
        # Discard the first line; the default keeps an empty file from raising.
        next(f, None)
        for raw_line in f:
            line = raw_line.rstrip("\n")
            if not line.strip():
                # A blank line carries no label and would break the later int cast.
                continue
            # partition() splits at the first space only, leaving the document intact.
            category, _, document = line.partition(" ")
            records.append((category, document))

    # Build the frame once instead of appending rows one at a time.
    return pd.DataFrame(records, columns=["category", "document"])

# Parse the training file into a DataFrame with `category` and `document` columns.
df = read_data(filepath)


In [65]:
from sklearn.feature_extraction.text import CountVectorizer

def train_vectorizer(df:pd.DataFrame) -> CountVectorizer:
    """Build a default CountVectorizer and fit it on ``df["document"]``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the raw text in its ``document`` column.

    Returns
    -------
    CountVectorizer
        The vectorizer after fitting on every document in ``df``.
    """
    corpus = df["document"].tolist()
    # fit() returns the estimator itself, so the fitted instance can be returned directly.
    return CountVectorizer().fit(corpus)

# Fit the vectorizer on every document in the training frame.
vectorizer = train_vectorizer(df)

In [67]:
# Encode every document with the fitted vectorizer. Despite its name,
# `bow_model` is the resulting sparse document-term count matrix (one row per
# document, one column per vocabulary term), not a fitted model.
bow_model = vectorizer.transform(df["document"].tolist())

In [72]:
# Show the matrix repr: shape, dtype and number of stored entries.
bow_model

<5485x19956 sparse matrix of type '<class 'numpy.int64'>'
	with 320397 stored elements in Compressed Sparse Row format>

In [75]:


def train_svd(X, n_components, random_state=0) -> TruncatedSVD:
    """Fit a TruncatedSVD on ``X`` and return the fitted estimator.

    Parameters
    ----------
    X : array-like or sparse matrix
        Feature matrix passed unchanged to ``TruncatedSVD.fit``.
    n_components : int
        Number of components to keep.
    random_state : int, RandomState instance or None, default=0
        Seed forwarded to ``TruncatedSVD``. The estimator's default solver is
        randomized, so without a fixed seed the components (and everything
        computed from them) differ between runs. Pass ``None`` to restore the
        unseeded behaviour.

    Returns
    -------
    TruncatedSVD
        The estimator after fitting on ``X``.
    """
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    svd.fit(X)

    return svd

# Keep the features sparse. The count matrix is 5485 x 19956 with only ~320k
# stored entries; calling .toarray() on it would allocate a dense int64 array of
# roughly 0.9 GB. TruncatedSVD fits on and transforms scipy sparse input
# directly, so no dense copy is needed.
X = bow_model
# Integer class labels parsed from the string `category` column.
y = df.category.astype(int).values


In [76]:
# Fit a 50-component decomposition on the bag-of-words counts.
# NOTE: despite its name, `pca` holds the TruncatedSVD returned by train_svd;
# per the scikit-learn docs it does not mean-center the data the way PCA does.
pca = train_svd(bow_model, n_components=50)
# Project the documents onto the fitted components.
X_preprocessed = pca.transform(X)

In [55]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the SVD-reduced features. random_state is pinned to 0;
# max_iter is set to 1000 (presumably so the solver converges -- confirm).
model = LogisticRegression(random_state=0, max_iter=1000)
# Fit on the full dataset; no hold-out split is made in this notebook.
model.fit(X_preprocessed, y)


In [61]:
# Mean accuracy on the same data the model was fitted on: this is a training
# score, not an estimate of performance on unseen documents.
model.score(X_preprocessed, y)

0.9549680948040109

In [57]:
# Predicted category label for each training document.
model.predict(X_preprocessed)

array([1, 2, 1, ..., 1, 8, 4])