In [59]:
import numpy as np
import json
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder

In [13]:
def load_data(path: str = "./training.json") -> pd.DataFrame:
    """Load a JSON Lines file (one JSON document per line) into a DataFrame.

    Args:
        path: Location of the file to read. Defaults to ``./training.json``,
            the path that used to be hard-coded, so ``load_data()`` keeps
            working unchanged.

    Returns:
        A DataFrame built from the parsed records, in file order.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8; pin the encoding instead of relying on the
    # platform default, which differs between operating systems.
    with open(path, encoding="utf-8") as f:
        # Iterate the handle directly (no readlines() copy of the whole file)
        # and skip blank lines, which json.loads would reject.
        data = [json.loads(line) for line in f if line.strip()]

    # `data` is a list of parsed records, so the plain constructor is the
    # direct way to build the frame.
    return pd.DataFrame(data)

# Read the training file into `df` and preview its last rows as the cell output.
df = load_data()
df.tail()

Unnamed: 0,topic,question,excerpt
20214,wordpress,How to set a Custom Post Type as the parent of...,I have a Custom Post Type called Recipe with p...
20215,wordpress,Tracking last login and last visit,I'm using the code below to track when a user ...
20216,wordpress,How to exclude the particular category from th...,"add_action( 'pre_get_posts', 'custom_pre_get_p..."
20217,wordpress,display sub categories assoccited with each po...,i have wordpress blog with many posts. each po...
20218,wordpress,Lost of query parameter when using permalink,"I have many issues with the use of rewriting, ..."


In [15]:


def train_label_encoder(df:pd.DataFrame, target:str="topic") -> LabelEncoder:
    """Fit a ``LabelEncoder`` on one column of ``df`` and return it.

    Args:
        df: Frame that contains the column to encode.
        target: Name of the column whose values are fitted
            (defaults to ``"topic"``).

    Returns:
        The fitted encoder.
    """
    labels = df[target]
    # fit() hands back the estimator itself, so the result can be returned
    # without an intermediate variable.
    return LabelEncoder().fit(labels)

# Fit the encoder on the "topic" column (the function's default target) and
# store the encoded topics in a new "y" column of `df`.
encoder = train_label_encoder(df)
df["y"] = encoder.transform(df["topic"])

In [62]:
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus; NLTK logs the download status as cell output.
nltk.download('stopwords')
# English stop words kept as a set; preprocess_sentence tests tokens for
# membership in it.
stop_words = set(stopwords.words('english'))

@np.vectorize
def preprocess_sentence(sentence:str) -> str:
    """Lower-case a sentence and drop its stop words.

    The function is wrapped in ``np.vectorize``: it is applied to every
    element of an array-like of strings and the results come back as a NumPy
    array (a 0-d array when called with a single string).

    Args:
        sentence: Text to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # split() without an argument breaks on any run of whitespace. A line
    # break therefore separates two words instead of gluing them into one
    # token (deleting "\n" outright turned "foo\nbar" into "foobar"), and
    # repeated spaces no longer produce empty tokens.
    tokens = sentence.lower().split()
    # The text is already lower-cased, so tokens are compared as they are.
    # `stop_words` is the notebook-level set of NLTK English stop words.
    return " ".join(token for token in tokens if token not in stop_words)

# Model input text: question and excerpt joined by one space, then cleaned
# element-wise by preprocess_sentence and stored in the "sentence" column.
df["sentence"] = preprocess_sentence(df["question"] + " " + df["excerpt"])


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joao.guedes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [47]:
from sklearn.feature_extraction.text import CountVectorizer

def train_vectorizer(df:pd.DataFrame, column:str="sentence") -> CountVectorizer:
    """Fit a ``CountVectorizer`` on the texts stored in one column of ``df``.

    Args:
        df: Frame that contains the text column.
        column: Name of the column holding the documents
            (defaults to ``"sentence"``).

    Returns:
        The fitted vectoriser.
    """
    documents = df[column].tolist()
    # fit() returns the estimator, so it can be returned in the same expression.
    return CountVectorizer().fit(documents)

# Fit the vectoriser on the "sentence" column (the function's default column).
vectorizer = train_vectorizer(df)

In [48]:
# Vectorise every sentence with the fitted vectoriser. Despite its name,
# `bow_model` holds the transformed data (the output of `transform`), not a
# fitted model object.
bow_model = vectorizer.transform(df["sentence"].tolist())

In [49]:


def train_svd(X, n_components, random_state=0) -> TruncatedSVD:
    """Fit a ``TruncatedSVD`` on ``X`` and return it.

    Args:
        X: Training matrix, passed straight to ``TruncatedSVD.fit``.
        n_components: Number of components to keep.
        random_state: Seed forwarded to ``TruncatedSVD``. Defaults to 0, the
            same seed this notebook passes to ``LogisticRegression``, so
            re-running the notebook yields the same projection. Pass ``None``
            to get the previous unseeded behaviour.

    Returns:
        The fitted decomposition.
    """
    # TruncatedSVD's default solver is randomised; without a seed the fitted
    # components (and everything trained on them) change from run to run.
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    svd.fit(X)
    return svd

# Fit the SVD with 200 components on the vectorised sentences, then project
# that same matrix to get the feature matrix used for training.
decomposition = train_svd(bow_model, n_components=200)
X_preprocessed = decomposition.transform(bow_model)

In [37]:
# Training target: the "y" column, i.e. the label-encoded topics.
y = df["y"]

In [50]:
from sklearn.linear_model import LogisticRegression

# Fit the classifier on the SVD-reduced features with a fixed seed.
# NOTE(review): max_iter=1000 was presumably chosen so the solver converges
# on this data — confirm.
model = LogisticRegression(random_state=0, max_iter=1000)
# Left as the cell's last expression so the fitted estimator is displayed.
model.fit(X_preprocessed, y)


LogisticRegression(max_iter=1000, random_state=0)

In [65]:
def predict(sentence: "str | pd.Series") -> "str | np.ndarray":
    """Predict the topic of one sentence or of a batch of sentences.

    Runs the notebook-level pipeline: ``preprocess_sentence`` ->
    ``vectorizer`` -> ``decomposition`` -> ``model`` -> ``encoder``.

    Args:
        sentence: Either a single string, or an array-like of strings
            (Series, list, ndarray).

    Returns:
        The predicted label for a single string; for an array-like, the array
        of labels returned by ``encoder.inverse_transform`` (one per input).
    """
    is_single = isinstance(sentence, str)
    # preprocess_sentence is np.vectorize'd: given a bare string it returns a
    # 0-d array, which vectorizer.transform cannot iterate. Wrapping the
    # string in a list keeps the whole pipeline on 1-d input.
    sentences = [sentence] if is_single else sentence

    cleaned = preprocess_sentence(sentences)
    # Local names differ from the notebook-level `bow_model` and
    # `X_preprocessed` on purpose, to avoid shadowing the training data.
    counts = vectorizer.transform(cleaned)
    features = decomposition.transform(counts)
    labels = encoder.inverse_transform(model.predict(features))

    return labels[0] if is_single else labels

# NOTE(review): df["sentence"] is the same column the vectoriser, SVD and
# classifier were fitted on, so "y_pred" shows fit on the training data, not
# held-out performance. The column is also already preprocessed; predict()
# runs preprocess_sentence on it a second time.
df["y_pred"] = predict(df["sentence"])



In [67]:
# Display the true topic next to the predicted topic for every row.
df[["topic", "y_pred"]]

Unnamed: 0,topic,y_pred
0,electronics,electronics
1,electronics,electronics
2,electronics,electronics
3,electronics,electronics
4,electronics,electronics
...,...,...
20214,wordpress,wordpress
20215,wordpress,wordpress
20216,wordpress,wordpress
20217,wordpress,wordpress
