In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

from utils.file_utils import get_cleaned_tickets_as_df

In [None]:
# Load the tickets into a DataFrame via the project helper, reading from the
# v4 cleaned CSV under ./data.
tickets_df = get_cleaned_tickets_as_df(path="./data/cleaned_tickets_v4.csv")
# Bare trailing expression so the notebook renders the frame for inspection.
tickets_df

In [None]:
!python3 -m spacy download en_core_web_sm

In [None]:
from utils.text_utils import clean_text, lemmmatize_text, get_pos_tags, clean_stop_words

# Rows whose processed message has a `.str.len()` of this value or less are dropped.
MIN_MESSAGE_LENGTH = 50

# Text preprocessing chain. Every stage reads the column written by the
# previous stage and stores its result in a new column, so intermediate
# results stay available for inspection:
#   message -> message_cleaned -> message_lemmatized -> message_pos -> message_stop
tickets_df["message_cleaned"] = tickets_df.message.apply(clean_text)
tickets_df["message_lemmatized"] = tickets_df.message_cleaned.apply(lemmmatize_text)
tickets_df["message_pos"] = tickets_df.message_lemmatized.apply(get_pos_tags)
tickets_df["message_stop"] = tickets_df.message_pos.apply(clean_stop_words)

# Keep only rows with enough remaining content. The explicit .copy() makes the
# result an independent frame: without it, re-running this cell would assign
# the columns above onto a filtered slice and trigger SettingWithCopyWarning.
tickets_df = tickets_df[tickets_df["message_stop"].str.len() > MIN_MESSAGE_LENGTH].copy()
tickets_df

In [None]:
# Reduce the frame to the model input column and the label column.
# The explicit .copy() makes df_clean an independent frame, so assigning to
# its columns (as the disabled mapping below would) does not trigger
# SettingWithCopyWarning.
df_clean = tickets_df[["message_stop", "tags"]].copy()

# Disabled: optional encoding of tag names as string ids.
# reverse_topic_mapping = {
#     "Spam": "0",
#     "Sales": "1",
#     "Feature Request": "2",
#     "Bug": "3",
#     "Product Question": "4",
# }
# # Replace tag names with their ids
# df_clean["tags"] = df_clean["tags"].map(reverse_topic_mapping)

df_clean

In [None]:
# Model input is the `message_stop` column; the target is the `tags` column.
X, y = df_clean["message_stop"], df_clean["tags"]

# Hold out 10% of the rows for evaluation, with a fixed seed so the split is
# repeatable. On a very small dataset this split is for demonstration only.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.1)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Token counts -> TF-IDF weighting -> logistic regression.
# NOTE(review): recent scikit-learn releases warn when the liblinear solver is
# used with more than two classes - confirm against the installed version.
pipeline = Pipeline(
    steps=[
        ("count_vectorizer", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("classifier", LogisticRegression(solver="liblinear", random_state=40)),
    ]
)

# Fit on the training split, then label the held-out messages.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Mean accuracy on the training split, then on the test split (one per line),
# followed by the per-class precision/recall/F1 report for the test split.
print(pipeline.score(X_train, y_train), pipeline.score(X_test, y_test), sep="\n")
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Token counts -> TF-IDF weighting -> multinomial naive Bayes, fitted on the
# training split in the same statement (fit returns the pipeline itself).
pipeline = Pipeline(
    steps=[
        ("count_vectorizer", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        # Disabled alternative: a single TfidfVectorizer step.
        # ("tfidf", TfidfVectorizer(min_df=2, max_df=0.95, stop_words="english")),
        ("classifier", MultinomialNB()),
    ]
).fit(X_train, y_train)

# Label the held-out messages.
y_pred = pipeline.predict(X_test)

# Mean accuracy on the training split, then on the test split (one per line),
# followed by the per-class report for the test split.
print(pipeline.score(X_train, y_train), pipeline.score(X_test, y_test), sep="\n")
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.tree import DecisionTreeClassifier

# TF-IDF features -> decision tree.
# The vectorizer ignores terms that appear in fewer than 2 documents or in
# more than 95% of documents, and removes English stop words.
pipeline = Pipeline(
    [
        # ("count_vectorizer", CountVectorizer()),
        ("tfidf", TfidfVectorizer(min_df=2, max_df=0.95, stop_words="english")),
        # Seed the tree: features are randomly permuted at each split, so an
        # unseeded tree can produce different scores on every run.
        ("classifier", DecisionTreeClassifier(random_state=42)),
    ]
)

# Fit on the training split.
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Mean accuracy on train and test, then the per-class report for the test split.
print(pipeline.score(X_train, y_train))
print(pipeline.score(X_test, y_test))
print(classification_report(y_test, y_pred))