### Import Libraries

In [164]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import statistics
import os
import spacy
import gensim
from gensim.models import KeyedVectors

from sklearn.metrics.pairwise import cosine_similarity

### Read the train and validation data

In [161]:
# Collect every ``train_*.xlsx`` workbook in the current working directory.
# os.listdir() returns names in arbitrary order, so the names are sorted to
# keep the row order of train_df (and anything sliced from it by position
# later on) the same from run to run.
train_files = sorted(
    name for name in os.listdir()
    if name.startswith('train_') and name.endswith('.xlsx')
)

# Read each workbook once and stack them with a single concat call instead of
# re-copying the accumulated frame on every loop iteration. Each workbook keeps
# its own row index, as before.
train_frames = [pd.read_excel(name) for name in train_files]

# With no matching workbook the result is an empty DataFrame
# (pd.concat raises on an empty list).
train_df = pd.concat(train_frames) if train_frames else pd.DataFrame()

In [162]:
# Collect every ``val_*.xlsx`` workbook in the current working directory.
# os.listdir() returns names in arbitrary order, so the names are sorted to
# keep the row order of val_df the same from run to run.
val_files = sorted(
    name for name in os.listdir()
    if name.startswith('val_') and name.endswith('.xlsx')
)

# Read each workbook once and stack them with a single concat call instead of
# re-copying the accumulated frame on every loop iteration. Each workbook keeps
# its own row index, as before.
val_frames = [pd.read_excel(name) for name in val_files]

# With no matching workbook the result is an empty DataFrame
# (pd.concat raises on an empty list).
val_df = pd.concat(val_frames) if val_frames else pd.DataFrame()

### Take a data subset with up to 50 observations from each category and apply lemmatization and stop-word removal with spaCy. The "bad customer service" and "customer feedback" classes are removed

In [165]:
# Labels that are left out of the subset entirely.
excluded_labels = ['bad customer service', 'customer feedback']

# First 50 rows (at most) of every remaining label, in order of first
# appearance of the label in train_df.
per_label_frames = [
    train_df[train_df['label'] == label].head(50)
    for label in train_df['label'].unique()
    if label not in excluded_labels
]

# The pieces are stacked in reverse, so the label encountered last comes first
# in df50. With no label left, df50 is an empty DataFrame.
df50 = pd.concat(per_label_frames[::-1]) if per_label_frames else pd.DataFrame()

In [None]:
# Raw sentences of the 50-per-label subset, in df50 row order.
sent_list = df50['text'].tolist()

import spacy

# spaCy English pipeline used for tokenisation, lemmas and stop-word flags.
nlp = spacy.load("en_core_web_sm")

# NOTE(review): vocab_words is not defined in the cells shown here; confirm it
# is created before this cell runs (presumably the embedding vocabulary) and
# that it is a set/dict so the membership test below stays cheap.
filtered_sent_list = []
for raw_sentence in sent_list:
    # Lemma of every token that spaCy does not flag as a stop word.
    lemmas = (tok.lemma_ for tok in nlp(raw_sentence) if not tok.is_stop)

    # Keep only lemmas present in vocab_words, then rebuild a
    # space-separated sentence from what is left.
    kept_lemmas = [lemma for lemma in lemmas if lemma in vocab_words]
    filtered_sent_list.append(" ".join(kept_lemmas))

### Apply tfidf method to get the top-10 words

In [None]:
# TF-IDF model with scikit-learn's default settings (no arguments overridden).
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the preprocessed sentences and
# score them in the same call; tfidf_matrix has one row per entry of
# filtered_sent_list.
tfidf_matrix = tfidf_vectorizer.fit_transform(filtered_sent_list)

# Vocabulary words as a NumPy array so that column indices of the TF-IDF
# output can be mapped back to words by indexing.
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())

In [None]:
def get_top_n_words(sentence, n):
    """Return up to ``n`` words of ``sentence`` ranked by TF-IDF weight.

    The sentence is scored with the module-level ``tfidf_vectorizer`` and the
    selected column indices are mapped to words through the module-level
    ``feature_names`` array.

    Args:
        sentence: Text to score with the already fitted vectorizer.
        n: Maximum number of words to return.

    Returns:
        A list of words ordered from highest to lowest TF-IDF weight. Words
        with a weight of zero are never returned, so the list is shorter
        than ``n`` when the sentence has fewer than ``n`` scored words, and
        empty when ``n`` is not positive.
    """
    if n <= 0:
        # A slice such as [-0:] would select the whole vocabulary.
        return []

    # Work on a flat ndarray. The np.matrix returned by todense() stays 2-D
    # under indexing, so a trailing [::-1] would flip its single row instead
    # of reversing the ranking.
    weights = tfidf_vectorizer.transform([sentence]).toarray().ravel()

    # Stable sort on the negated weights: descending by weight, ties kept in
    # vocabulary (column) order so the result is deterministic.
    ranked = np.argsort(-weights, kind='stable')[:n]

    # Drop zero-weight columns: those words do not occur in the sentence.
    return [feature_names[i] for i in ranked if weights[i] > 0]


## Generate top 10 words list

# One get_top_n_words(..., 10) result per preprocessed sentence, kept in the
# same order as filtered_sent_list.
top10_words_list = [get_top_n_words(text, 10) for text in filtered_sent_list]

### Use word2vec embeddings to compute similarities and create a similarity summary

In [None]:
# Location of the pre-trained vectors, resolved relative to the current
# working directory.
model_path = 'GoogleNews-vectors-negative300.bin.gz'

# Load the pre-trained Word2Vec model
# binary=True: the file is read in the binary word2vec format rather than the
# text format.
word2vec_model = KeyedVectors.load_word2vec_format(model_path, binary=True)

In [None]:
# True label of every sentence, aligned with top10_words_list.
label_list = df50['label'].tolist()

# Word -> index table of the embedding model, used for membership tests.
w2v_vocab = word2vec_model.key_to_index

# A label embedding does not depend on the sentence, so each one is computed
# once up front instead of once per (sentence, word) pair. A label is embedded
# as the mean of its word vectors; a label with a word missing from the
# vocabulary, or with no words at all, gets None and scores (0, 0) everywhere.
# Sorting the labels fixes the key order of every sim_dict, which keeps
# downstream tie-breaking identical from run to run.
label_vectors = {}
for lab in sorted(set(label_list)):
    lab_tokens = lab.split()
    if lab_tokens and all(tok in w2v_vocab for tok in lab_tokens):
        label_vectors[lab] = word2vec_model[lab_tokens].mean(axis=0)
    else:
        label_vectors[lab] = None

# final_sim_list[i] maps every label to (max similarity, mean similarity)
# between the label embedding and the in-vocabulary top words of sentence i.
final_sim_list = []
for idx, words in enumerate(top10_words_list):
    print(idx)  # progress indicator for the long-running loop

    # Vectors of this sentence's top words that the embedding model knows;
    # looked up once and reused for every label.
    known_vecs = [word2vec_model[w] for w in words if w in w2v_vocab]

    sim_dict = {}
    for lab, lab_vec in label_vectors.items():
        if lab_vec is None or not known_vecs:
            # Nothing comparable for this (sentence, label) pair.
            sim_dict[lab] = (0, 0)
            continue

        sim_list = [
            cosine_similarity([word_vec], [lab_vec])[0][0]
            for word_vec in known_vecs
        ]
        sim_dict[lab] = (max(sim_list), statistics.mean(sim_list))

    final_sim_list.append(sim_dict)

In [None]:
# One summary per sentence:
#   'predicted' - the five labels with the highest max-similarity (first
#                 element of each score tuple), as (label, scores) pairs
#   'actual'    - the true label paired with its own scores
final_list = []
for idx, scores in enumerate(final_sim_list):
    ranked = sorted(scores.items(), key=lambda item: item[1][0], reverse=True)
    true_label = label_list[idx]
    final_list.append({
        'predicted': ranked[:5],
        'actual': (true_label, scores[true_label]),
    })

### Final Cluster Categories after creation

In [9]:
# Final grouping of the labels into five clusters.
# NOTE(review): 'account cancellation' is listed in both cluster_1 and
# cluster_2, and 'incomplete installation' in both cluster_3 and cluster_4;
# confirm whether the overlap is intended.

cluster_1 = [
    'account cancellation',
    'account security',
    'login issues',
    'forgot my password',
    'software update',
]

cluster_2 = [
    'best buy credit card',
    'payment failed',
    'billing or charge disputes',
    'cancel order',
    'unauthorized charge or payment',
    'refund request',
    'fraud concerns',
    'return request',
    'cancellation of a plan subscription or membership',
    'account cancellation',
    'change or update order',
    'schedule order pickup',
    'change shipping time',
    'delivery tracking',
    'refund status',
    'change payment method',
    'payment method',
    'change shipping address',
    'delivery or parts of delivery items missing',
    'renewal of a plan subscription or membership',
    'reschedule delivery',
    'reschedule order pickup',
    'rewards or discounts',
    'schedule delivery',
    'trade in inquiry',
    'delivery delays',
]

cluster_3 = [
    'check warranty coverage',
    'damaged product',
    'warranty claim',
    'reschedule repair',
    'device damaged',
    'incomplete installation',
    'lost or forgot items',
    'reschedule installation',
    'schedule repair',
    'screen issues',
    'software error',
    'software installation',
    'schedule installation',
    'troubleshooting',
    'performance issues',
    'defective product',
]

cluster_4 = [
    'employment or career inquiries',
    'website or app complaints',
    'incomplete installation',
    'miscellaneous inquiries',
    'network or connectivity issues',
    'customer feedback',
    'bad customer service',
]

cluster_5 = [
    'price match',
    'product availability and stock',
    'product compatibility',
    'product details inquiry',
    'transfer call to the right department or store',
]

