In [207]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re


# Constants
CSV_FILE_PATH = "utterance.csv"
KEYWORDS_CSV_FILE_PATH = 'keywords_tfidf.csv'
TOP_N_KEYWORDS = 50
TOP_N = 5
NUM_TESTS = 18000
CUSTOM_STOPWORDS = {}
ENABLE_LEMMATIZATION = True

# Download the stopwords if not already downloaded
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('stopwords')

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


# Function to map NLTK POS tags to WordNet POS tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# Preprocessing function
def preprocess_text(text):
    try:
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\s+', ' ', text)
        # Remove punctuation and stopwords
        tokens = [word for word in text.split() if word.isalnum() and word not in stop_words]
        # POS tagging
        pos_tags = nltk.pos_tag(tokens)
        # Lemmatize tokens with POS tags
        lemmatized_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
        return ' '.join(lemmatized_tokens)
    except Exception as e:
        print(f"Error processing text: {e}")
        return text

# Load the keywords and utterance data
keywords_df = pd.read_csv('keywords.csv')
utterance_df = pd.read_csv('utterance.csv')

utterance_df['utterance'] = utterance_df['utterance'].apply(preprocess_text)

# Split the data into training and testing sets (80% training, 20% testing)
train_data, test_data = train_test_split(utterance_df, test_size=0.8, random_state=41)


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\czer3\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\czer3\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\czer3\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [208]:
sample_size = min(10, len(intent_data))
display(utterance_df['utterance'].sample(n=sample_size))

13500                                           make order
15832                              issue make payment card
6552                need information talk customer service
19062    ask agent could inform fucking issue make paym...
13516                                ask agent place order
20520                         event postpone obtain refund
15304      could u ask agent inform problem payment please
Name: utterance, dtype: object

In [210]:
display(utterance_df['utterance'])

0                          dont online account register
1        tell regisger two account single email address
2                        online account open one please
3                   could ask agent open account please
4                        want online account create one
                              ...                      
21529                                  ship address set
21530                         ship address want set one
21531                         want set shipping address
21532                         dont ship address set one
21533                        wanna set shipping address
Name: utterance, Length: 21534, dtype: object

In [211]:
test = preprocess_text("i  wanna set up my shipping address")
display(test)

'wanna set shipping address'

In [212]:
# Step 3: TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=1000, stop_words=list(stop_words), ngram_range=(1, 3))
X = vectorizer.fit_transform(train_data['utterance'])
y = train_data['intent']

# Step 4: Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41)

# Step 5: Train Model
model = MultinomialNB()
model.fit(X_train, y_train)

# Step 6: Evaluate Model to get score predictions
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))

# Step 4: Keyword Extraction
keywords_list = []

for (category, intent) in train_data[['category', 'intent']].drop_duplicates().itertuples(index=False):
    intent_data = train_data[(train_data['category'] == category) & (train_data['intent'] == intent)]
    intent_vectorizer = TfidfVectorizer(max_features=20, stop_words=list(stop_words), ngram_range=(1, 3))
    intent_X = intent_vectorizer.fit_transform(intent_data['utterance'])
    feature_names = intent_vectorizer.get_feature_names_out()
    tfidf_scores = intent_X.mean(axis=0).A1  # Average TF-IDF scores for each keyword

    for keyword, score in zip(feature_names, tfidf_scores):
        keywords_list.append({
            'category': category,
            'intent': intent,
            'keyword': keyword,
            'tfidf score': score
        })

# Step 5: Save Keywords
keywords_df = pd.DataFrame(keywords_list)
keywords_df.to_csv('keywords.csv', index=False)

print("Keywords have been extracted and saved to 'keywords.csv'.")


                          precision    recall  f1-score   support

            change_order       0.95      1.00      0.97        38
 change_shipping_address       1.00      0.67      0.80         6
  check_cancellation_fee       1.00      1.00      1.00        13
          check_invoices       0.94      0.88      0.91        34
   check_payment_methods       1.00      0.29      0.44         7
     check_refund_policy       1.00      0.82      0.90        17
               complaint       1.00      1.00      1.00        31
contact_customer_service       0.99      1.00      0.99        88
     contact_human_agent       1.00      1.00      1.00        38
          create_account       0.92      0.99      0.96        87
          delete_account       0.94      1.00      0.97        48
        delivery_options       0.83      1.00      0.91        15
         delivery_period       1.00      0.11      0.20         9
            edit_account       1.00      0.57      0.73         7
         

In [213]:
TOP_PREDICT = 1

# Create a dictionary for quick keyword lookup
keywords_dict = {}
for _, row in keywords_df.iterrows():
    if row['intent'] not in keywords_dict:
        keywords_dict[row['intent']] = set()
    keywords_dict[row['intent']].add(row['keyword'])

# Function to get top N predicted intents based on keywords
def get_top_n_intents(text, keywords_dict, n=1):
    #text = preprocess_text(text)
    intent_scores = {}
    keyword_used = {}
    for intent, keywords in keywords_dict.items():
        score = 0
        for keyword in keywords:
            if keyword in text:
                score += 1
                keyword_used[intent] = keyword
        if score > 0:
            intent_scores[intent] = score
    sorted_intents = sorted(intent_scores, key=intent_scores.get, reverse=True)
    return sorted_intents[:n], [keyword_used[intent] for intent in sorted_intents[:n]]

# Predict the top N intents for the test data
test_data['top_predict_intents'], test_data['keywords_used'] = zip(*test_data['utterance'].apply(lambda x: get_top_n_intents(x, keywords_dict, n=TOP_PREDICT)))

# Check if the actual intent is in the top N predicted intents
test_data['is_accurate'] = test_data.apply(lambda row: row['intent'] in row['top_predict_intents'], axis=1)

# Calculate the accuracy score
accuracy_score = test_data['is_accurate'].mean()

# Create the final table with all the required fields
final_table = test_data[['utterance', 'category', 'intent', 'top_predict_intents', 'keywords_used', 'is_accurate']]

# Display the final table and accuracy score
display(final_table)
print(f"Accuracy Score: {accuracy_score * 100:.2f}%")


Unnamed: 0,utterance,category,intent,top_predict_intents,keywords_used,is_accurate
4848,wanna view cancellation penalty,CANCELLATION_FEE,check_cancellation_fee,[check_cancellation_fee],[cancellation],True
7387,speak human,CONTACT,contact_human_agent,[contact_human_agent],[speak],True
16039,problem try pay online order ant notify,PAYMENT,payment_issue,[payment_issue],[pay],True
5559,want know number client service,CONTACT,contact_customer_service,[contact_customer_service],[number],True
111,tell could create two user account single emai...,ACCOUNT,create_account,[create_account],[email],True
...,...,...,...,...,...,...
12255,wanna request information see bill,INVOICES,get_invoice,[get_invoice],[bill],True
16583,problem try pay notify,PAYMENT,payment_issue,[payment_issue],[pay],True
5638,u show information email client service,CONTACT,contact_customer_service,[contact_customer_service],[email],True
20246,u ask agent request reimbursement,REFUNDS,get_refund,[get_invoice],[ask],False


Accuracy Score: 91.94%


In [223]:
import pandas as pd
from sklearn.metrics import classification_report

# Calculate the accuracy rate for each intent
intent_accuracy = test_data.groupby('intent')['is_accurate'].mean().sort_values(ascending=False)
intent_accuracy_df = intent_accuracy.reset_index()
intent_accuracy_df.columns = ['Intent', 'Accuracy Rate']
intent_accuracy_df['Accuracy Rate'] = intent_accuracy_df['Accuracy Rate'].round(2)

# Generate classification report for the model predictions
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred, zero_division=0, output_dict=True)

# Convert the classification report to a DataFrame
report_df = pd.DataFrame(report).transpose().reset_index()
report_df.columns = ['Intent', 'Precision', 'Recall', 'F1-Score', 'Support']
report_df[['Precision', 'Recall', 'F1-Score']] = report_df[['Precision', 'Recall', 'F1-Score']].round(2)
report_df['Support'] = report_df['Support']

# Merge the accuracy rate DataFrame with the classification report DataFrame
merged_df = pd.merge(intent_accuracy_df, report_df, on='Intent', how='left')

# Display the merged DataFrame
display(merged_df)


Unnamed: 0,Intent,Accuracy Rate,Precision,Recall,F1-Score,Support
0,review,0.99,1.0,1.0,1.0,20.0
1,newsletter_subscription,0.99,1.0,0.29,0.44,7.0
2,delivery_options,0.99,0.83,1.0,0.91,15.0
3,check_cancellation_fee,0.99,1.0,1.0,1.0,13.0
4,contact_customer_service,0.98,0.99,1.0,0.99,88.0
5,switch_account,0.98,1.0,0.85,0.92,13.0
6,payment_issue,0.97,0.94,1.0,0.97,171.0
7,get_invoice,0.97,0.86,1.0,0.92,48.0
8,cancel_order,0.97,,,,
9,contact_human_agent,0.96,1.0,1.0,1.0,38.0
