## In this notebook, we reduce the ChatGPT vocab by discarding words that appear in fewer than 1% of the documents
## (Note: we do this for the full 350k dataset, as well as for each cluster)

In [None]:
import json
import time

import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

# Random seed, kept so that any sampling of the data is reproducible.
seed = 1

In [None]:
# Read the cleaned training spreadsheet; its first column becomes the row index.
load_path = "../data/clean/training_data_cleaned.xlsx"
df_full = pd.read_excel(io=load_path, index_col=0)

In [None]:
# Optionally work on a random subset for quick experiments.
#   n_samples = None    -> use every row (default; `df` is the same object as `df_full`)
#   n_samples = 10_000  -> draw that many rows, reproducibly, using `seed`
n_samples = None
if n_samples is None:
    df = df_full
else:
    df = df_full.sample(n=int(n_samples), random_state=seed, ignore_index=True)

Unnamed: 0,label,text
0,troubleshooting,agent say contact good buy start phone number ...
1,schedule installation,agent say phone number customer say marjorie s...
2,trade in inquiry,agent say thank call good buy telephone number...
3,account security,agent say um sorry ask complete phone number p...
4,product availability and stock,agent say thank contact good buy chad start ph...
...,...,...
379323,rewards or discounts,agent say thank call good buy date ask phone n...
379324,product details inquiry,agent say moment phone number please.customer ...
379325,schedule repair,agent say call phone number please.customer sa...
379326,troubleshooting,agent say hi thank call good buy start phone n...


In [None]:
# clusters
# note: cluster names are not representative of the classes they contain
#
# NOTE(review): two labels are listed under more than one cluster
# ("account cancellation" and "incomplete installation"). The reverse lookup
# built below can hold only one cluster per label, so the cluster that is
# listed LAST wins. Confirm which cluster each of them should belong to and
# remove the other entry.
clusters = {
    "authorization": [
        "account cancellation",  # also listed under "order related and payments" (which wins)
        "account security",
        "login issues",
        "forgot my password",
        "software update",
    ],
    "order related and payments": [
        "best buy credit card",
        "payment failed",
        "billing or charge disputes",
        "cancel order",
        "unauthorized charge or payment",
        "refund request",
        "fraud concerns",
        "return request",
        "cancellation of a plan subscription or membership",
        "account cancellation",  # also listed under "authorization"
        "change or update order",
        "schedule order pickup",
        "change shipping time",
        "delivery tracking",
        "refund status",
        "change payment method",
        "payment method",
        "change shipping address",
        "delivery or parts of delivery items missing",
        "renewal of a plan subscription or membership",
        "reschedule delivery",
        "reschedule order pickup",
        "rewards or discounts",
        "schedule delivery",
        "trade in inquiry",
        "delivery delays",
    ],
    "warranty": [
        "check warranty coverage",
        "damaged product",
        "warranty claim",
        "reschedule repair",
        "device damaged",
        "incomplete installation",  # also listed under "queries regarding website" (which wins)
        "lost or forgot items",
        "reschedule installation",
        "schedule repair",
        "screen issues",
        "software error",
        "software installation",
        "schedule installation",
        "troubleshooting",
        "performance issues",
        "defective product",
    ],
    "queries regarding website": [
        "employment or career inquiries",
        "website or app complaints",
        "incomplete installation",  # also listed under "warranty"
        "miscellaneous inquiries",
        "network or connectivity issues",
        "customer feedback",
        "bad customer service",
    ],
    "product queries": [
        "price match",  # 6759
        "product availability and stock",  # 37972
        "product compatibility",  # 10897
        "product details inquiry",  # 42698
        "transfer call to the right department or store",  # 5869
    ],
}

# create a reverse lookup dict,
# i.e. keys are labels and values are cluster labels
cluster_lookup_dict = {}
# label -> every cluster that lists it, in listing order (only labels listed more than once)
duplicate_labels = {}
for cluster_label, labels in clusters.items():
    for label in labels:
        if label in cluster_lookup_dict:
            # Remember the collision instead of overwriting silently.
            duplicate_labels.setdefault(label, [cluster_lookup_dict[label]]).append(
                cluster_label
            )
        # Last cluster listed wins (unchanged behaviour).
        cluster_lookup_dict[label] = cluster_label

if duplicate_labels:
    print(
        "WARNING: labels assigned to more than one cluster "
        f"(the last cluster listed wins): {duplicate_labels}"
    )

In [None]:
# Tag every row with the cluster of its label; an unknown label raises KeyError.
df["cluster_label"] = df["label"].map(cluster_lookup_dict.__getitem__)

In [None]:
# Number of rows per cluster, largest first.
cluster_sizes = df["cluster_label"].value_counts()
cluster_sizes

cluster_label
order related and payments    126932
warranty                      124098
product queries               104195
queries regarding website      15504
authorization                   8599
Name: count, dtype: int64

### Apply TF-IDF with the custom vocab

In [None]:
# Read the full custom vocabulary from its JSON file.
vocab_path = "../data/interim/vocabs/vocab_full_5352words.json"
with open(vocab_path) as vocab_file:
    vocab = json.load(vocab_file)

### Dataframes for each cluster

In [None]:
# Map each cluster name to the rows of `df` that belong to that cluster
# (cluster_name -> filtered_df).
dfs = {
    cluster_name: df.loc[df["cluster_label"] == cluster_name]
    for cluster_name in clusters
}

# dfs

### Reduce Vocab for each cluster

In [None]:
# For every cluster, keep only the vocabulary terms that occur in at least
# `percent` % of that cluster's documents, and save the reduced vocab as JSON.
percent = 1  # minimum share of documents (in %) a term must appear in to be kept
cluster_vocab = {}  # cluster name -> DataFrame describing the retained terms
for cluster, df_cluster in dfs.items():
    # TF-IDF restricted to the custom vocab. The result is a sparse
    # (n_documents, n_terms) matrix and is deliberately kept sparse: a dense
    # copy of the largest cluster (126,932 docs x 5,352 terms, float64) would
    # need roughly 5.4 GB before any further work is done on it.
    vectorizer = TfidfVectorizer(ngram_range=(1, 3), vocabulary=vocab)
    tfidf = vectorizer.fit_transform(df_cluster["text"])
    print(tfidf.shape)
    length_cluster = len(df_cluster)

    # Terms in matrix-column order, read back from the fitted vectorizer so the
    # row labels below line up with the columns of `tfidf` whether `vocab` was
    # given as a list or as a term -> index mapping.
    terms = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

    # Same layout as before: columns 0 and 1 hold the TF-IDF weights of the
    # first two documents. Only those rows are densified, and a cluster with
    # fewer than two documents no longer raises a KeyError.
    first_docs = tfidf[:2]
    df_tfidf = pd.DataFrame(
        first_docs.T.toarray(),
        index=terms,
        columns=list(range(first_docs.shape[0])),
    )

    # Document frequency: in how many documents each term has a non-zero weight.
    # Computed on the sparse matrix; the sum over axis 0 is flattened to 1-D.
    df_tfidf["non_zero_tfs"] = np.asarray((tfidf > 0).sum(axis=0)).ravel()

    # reduce vocab
    # NOTE(review): int() rounds the threshold down, so e.g. with 8,599 documents
    # a term found in 85 of them (0.99 %) is still kept. Left as is so existing
    # vocab files stay reproducible; switch to a ceiling if "at least 1 %" must
    # be strict. The max(1, ...) guard stops clusters with fewer than 100
    # documents from getting a threshold of 0, which would keep terms that
    # never occur at all.
    count = max(1, int(length_cluster * percent / 100))
    high_freq_words = df_tfidf[df_tfidf["non_zero_tfs"] >= count]
    cluster_vocab[cluster] = high_freq_words
    high_freq_vocab = high_freq_words.index.to_list()

    # save vocab
    save_path = f"../data/interim/vocabs/vocab_{cluster}_{percent}percent_{len(high_freq_vocab)}words.json"
    with open(save_path, "w") as f:
        json.dump(high_freq_vocab, f)

(8599, 5352)
converted to dense
