In [None]:
from utils.file_utils import get_tickets_as_df

# Data exploration

In [None]:
# Load the tickets through the project helper imported from utils.file_utils.
tickets_df = get_tickets_as_df()

# Preview the first 100 rows to get a feel for the columns and their values.
tickets_df.head(n=100)

In [None]:
# Question: can a single ticket carry more than one tag?
# List the distinct "tags" values recorded for each ticket_number.
(
    tickets_df
    .groupby("ticket_number")["tags"]
    .unique()
)
# Conclusion:
# yes, one ticket can have multiple tags.

In [None]:
# How often does each distinct "tags" value occur?
(
    tickets_df["tags"]
    .value_counts()
)
# Conclusion:
# many tag values occur only a handful of times, so they are not
# very useful and can be dropped.

# Feature engineering

In [None]:
# Labels we keep. They are matched as case-sensitive substrings of the "tags"
# value, so a value that combines one of these labels with other text is kept
# too. None of the labels contains a regex metacharacter, so joining them with
# "|" yields a plain "contains any of these" pattern.
KEPT_TAG_LABELS = ("Spam", "Bug", "Product Question", "Feature Request", "Sales")

cleaned_tickets_df = (
    tickets_df
    # Drop rows where either the tag or the message is missing.
    .dropna(subset=["tags", "message"])
    # Keep only rows whose tags mention at least one of the kept labels.
    .loc[lambda df: df["tags"].str.contains("|".join(KEPT_TAG_LABELS))]
    # Take an explicit copy: without it the result is flagged as a slice of
    # tickets_df, and assigning to one of its columns later raises
    # SettingWithCopyWarning.
    .copy()
)

cleaned_tickets_df["tags"].value_counts()
# We can still see a lot of tag values with low frequency.

In [None]:
def map_categories(cat: str) -> str:
    """Collapse a raw tag string into a single category label.

    The labels are checked in a fixed priority order and the first one that
    occurs as a (case-sensitive) substring of ``cat`` is returned, so a value
    mentioning several labels maps to the highest-priority one.

    Args:
        cat: The raw tag text to classify.

    Returns:
        One of "Bug", "Product Question", "Feature Request", "Sales" or
        "Spam"; "Other" when none of those labels occurs in ``cat``.
    """
    # Order matters: earlier labels win when several are present.
    priority = ("Bug", "Product Question", "Feature Request", "Sales", "Spam")
    return next((label for label in priority if label in cat), "Other")


# Replace every raw tag value with its single mapped category.
cleaned_tickets_df["tags"] = (
    cleaned_tickets_df["tags"]
    .map(map_categories)
)

cleaned_tickets_df["tags"].value_counts()
# Only 5 categories remain:
# Bug, Product Question, Feature Request, Sales, Spam

In [None]:
# Quick look at the first rows of the cleaned frame.
cleaned_tickets_df.head(n=5)

# Save to CSV

In [None]:
# Persist the cleaned tickets; the path is relative to the working directory.
cleaned_tickets_df.to_csv(
    "data/cleaned_tickets_v1.csv",
    index=False,  # do not write the DataFrame index as an extra column
)