In [53]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re


# Constants
CSV_FILE_PATH = "utterance.csv"
KEYWORDS_CSV_FILE_PATH = 'keywords.csv'
TOP_N_KEYWORDS = 200
TOP_PREDICT = 1
ENABLE_LEMMATIZATION = True

# Download the stopwords if not already downloaded
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('stopwords')

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to map NLTK POS tags to WordNet POS tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# Preprocessing function
def preprocess_text(text):
    try:
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\s+', ' ', text)
        # Remove punctuation and stopwords
        tokens = [word for word in text.split() if word.isalnum() and word not in stop_words]
        # POS tagging
        pos_tags = nltk.pos_tag(tokens)
        # Lemmatize tokens with POS tags
        lemmatized_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
        return ' '.join(lemmatized_tokens)
    except Exception as e:
        print(f"Error processing text: {e}")
        return text

# Load the keywords and utterance data
#keywords_df = pd.read_csv('keywords.csv')
utterance_df = pd.read_csv('utterance.csv')

utterance_df['utterance'] = utterance_df['utterance'].apply(preprocess_text)




[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\czer3\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\czer3\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\czer3\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [54]:
from sklearn.model_selection import StratifiedShuffleSplit

# Split the data into training and testing sets (80% training, 20% testing) while maintaining the distribution of each intent type
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=41)
for train_index, test_index in split.split(utterance_df, utterance_df['intent']):
    train_data = utterance_df.loc[train_index]
    test_data = utterance_df.loc[test_index]

In [55]:
# Step 3: TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=200, stop_words=list(stop_words), ngram_range=(1, 1))
X = vectorizer.fit_transform(train_data['utterance'])
y = train_data['intent']

# Step 4: Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41)

# Step 5: Train Model
model = MultinomialNB()
model.fit(X_train, y_train)

# Step 6: Evaluate Model to get score predictions
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))

                          precision    recall  f1-score   support

            cancel_order       0.00      0.00      0.00         8
            change_order       0.94      0.99      0.96       151
 change_shipping_address       1.00      1.00      1.00        19
  check_cancellation_fee       1.00      1.00      1.00        59
          check_invoices       0.98      0.81      0.89       154
   check_payment_methods       1.00      0.80      0.89        45
     check_refund_policy       1.00      1.00      1.00        74
               complaint       0.98      0.99      0.99       117
contact_customer_service       1.00      1.00      1.00       324
     contact_human_agent       1.00      0.98      0.99       166
          create_account       0.97      0.99      0.98       367
          delete_account       0.94      0.95      0.94       132
        delivery_options       1.00      0.98      0.99        63
         delivery_period       1.00      1.00      1.00        22
         

In [56]:
def extract_and_save_keywords(train_data, stop_words, max_features, ngram_range, output_file):
    keywords_list = []

    for (category, intent) in train_data[['category', 'intent']].drop_duplicates().itertuples(index=False):
        intent_data = train_data[(train_data['category'] == category) & (train_data['intent'] == intent)]
        intent_vectorizer = TfidfVectorizer(max_features=max_features, stop_words=list(stop_words), ngram_range=ngram_range)
        intent_X = intent_vectorizer.fit_transform(intent_data['utterance'])
        feature_names = intent_vectorizer.get_feature_names_out()
        tfidf_scores = intent_X.mean(axis=0).A1  # Average TF-IDF scores for each keyword

        for keyword, score in zip(feature_names, tfidf_scores):
            keywords_list.append({
                'category': category,
                'intent': intent,
                'keyword': keyword,
                'tfidf score': score
            })

    keywords_df = pd.DataFrame(keywords_list)
    keywords_df.to_csv(output_file, index=False)


    print(f"Keywords have been extracted and saved to '{output_file}'.")

# Example usage
extract_and_save_keywords(train_data, stop_words, max_features=TOP_N_KEYWORDS, ngram_range=(1, 1), output_file=KEYWORDS_CSV_FILE_PATH)
keywords_df = pd.read_csv(KEYWORDS_CSV_FILE_PATH)
display(keywords_df)

Keywords have been extracted and saved to 'keywords.csv'.


Unnamed: 0,category,intent,keyword,tfidf score
0,ACCOUNT,delete_account,acccount,0.000890
1,ACCOUNT,delete_account,accopunt,0.001221
2,ACCOUNT,delete_account,accounnt,0.001138
3,ACCOUNT,delete_account,account,0.193698
4,ACCOUNT,delete_account,accountg,0.001128
...,...,...,...,...
2341,ORDER,cancel_order,please,0.055878
2342,ORDER,cancel_order,purchase,0.222163
2343,ORDER,cancel_order,wanna,0.031066
2344,ORDER,cancel_order,want,0.228852


In [57]:
# Create a dictionary for quick keyword lookup
keywords_dict = {}
for _, row in keywords_df.iterrows():
    if row['intent'] not in keywords_dict:
        keywords_dict[row['intent']] = set()
    keywords_dict[row['intent']].add(row['keyword'])

# Function to get top N predicted intents based on keywords
def get_top_n_intents(text, keywords_dict, n=1):
    #text = preprocess_text(text)
    intent_scores = {}
    keyword_used = {}
    for intent, keywords in keywords_dict.items():
        score = 0
        for keyword in keywords:
            if keyword in text:
                score += 1
                keyword_used[intent] = keyword
        if score > 0:
            intent_scores[intent] = score
    sorted_intents = sorted(intent_scores, key=intent_scores.get, reverse=True)
    return sorted_intents[:n], [keyword_used[intent] for intent in sorted_intents[:n]]

# Predict the top N intents for the test data
test_data['top_predict_intents'], test_data['keywords_used'] = zip(*test_data['utterance'].apply(lambda x: get_top_n_intents(x, keywords_dict, n=TOP_PREDICT)))

# Check if the actual intent is in the top N predicted intents
test_data['is_accurate'] = test_data.apply(lambda row: row['intent'] in row['top_predict_intents'], axis=1)

# Calculate the accuracy score
accuracy_score = test_data['is_accurate'].mean()

# Create the final table with all the required fields
final_table = test_data[['utterance', 'category', 'intent', 'top_predict_intents', 'keywords_used', 'is_accurate']]

# Display the final table and accuracy score
display(final_table)
print(f"Accuracy Score: {accuracy_score * 100:.2f}%")


Unnamed: 0,utterance,category,intent,top_predict_intents,keywords_used,is_accurate
10955,informatipn tell request invoice,INVOICES,get_invoice,[contact_customer_service],[fo],False
6931,question help send email customer service,CONTACT,contact_customer_service,[contact_customer_service],[se],True
9724,im happy service make review,FEEDBACK,review,[complaint],[service],False
9350,help leave comment,FEEDBACK,review,[track_order],[hel],False
3841,someone steal fuck account password could retr...,ACCOUNT,recover_password,[create_account],[please],False
...,...,...,...,...,...,...
14538,check status ordet,ORDER,track_order,[track_refund],[check],False
9521,write review service,FEEDBACK,review,[check_invoices],[se],False
12608,order product need help modify order,ORDER,change_order,[change_order],[od],True
4886,ask agent check cancellation fee,CANCELLATION_FEE,check_cancellation_fee,[check_cancellation_fee],[check],True


Accuracy Score: 81.54%


In [58]:
def calculate_intent_accuracy(test_data):
    # Calculate the accuracy rate for each intent
    intent_accuracy = test_data.groupby('intent')['is_accurate'].mean().sort_values(ascending=False)
    intent_accuracy_df = intent_accuracy.reset_index()
    intent_accuracy_df.columns = ['Intent', 'Accuracy Rate']
    intent_accuracy_df['Accuracy Rate'] = intent_accuracy_df['Accuracy Rate'].round(2)
    return intent_accuracy_df

def generate_classification_report(y_test, y_pred):
    # Generate classification report for the model predictions
    report = classification_report(y_test, y_pred, zero_division=0, output_dict=True)
    # Convert the classification report to a DataFrame
    report_df = pd.DataFrame(report).transpose().reset_index()
    report_df.columns = ['Intent', 'Precision', 'Recall', 'F1-Score', 'Support']
    report_df[['Precision', 'Recall', 'F1-Score']] = report_df[['Precision', 'Recall', 'F1-Score']].round(2)
    return report_df

def merge_reports(intent_accuracy_df, report_df, train_data):
    # Merge the accuracy rate DataFrame with the classification report DataFrame
    merged_df = pd.merge(intent_accuracy_df, report_df, on='Intent', how='left')
    # Add the category of the issue
    intent_category = train_data[['intent', 'category']].drop_duplicates()
    intent_category.columns = ['Intent', 'Category']
    # Merge with the existing merged_df
    final_report_df = pd.merge(merged_df, intent_category, on='Intent', how='left')
    return final_report_df

# Calculate intent accuracy
intent_accuracy_df = calculate_intent_accuracy(test_data)

# Generate classification report
report_df = generate_classification_report(y_test, y_pred)

# Merge reports
final_report_df = merge_reports(intent_accuracy_df, report_df, train_data)

# Display the final DataFrame
display(final_report_df)


Unnamed: 0,Intent,Accuracy Rate,Precision,Recall,F1-Score,Support,Category
0,change_shipping_address,1.0,1.0,1.0,1.0,19.0,SHIPPING
1,contact_customer_service,1.0,1.0,1.0,1.0,324.0,CONTACT
2,payment_issue,1.0,0.98,1.0,0.99,736.0,PAYMENT
3,contact_human_agent,0.96,1.0,0.98,0.99,166.0,CONTACT
4,complaint,0.96,0.98,0.99,0.99,117.0,FEEDBACK
5,set_up_shipping_address,0.95,1.0,1.0,1.0,14.0,SHIPPING
6,create_account,0.93,0.97,0.99,0.98,367.0,ACCOUNT
7,check_invoices,0.92,0.98,0.81,0.89,154.0,INVOICES
8,change_order,0.82,0.94,0.99,0.96,151.0,ORDER
9,delete_account,0.78,0.94,0.95,0.94,132.0,ACCOUNT


In [59]:
from collections import Counter

# Step 1: Identify the incorrect predictions
incorrect_predictions = test_data[~test_data['is_accurate']]

# Step 2: Extract the keywords used in those incorrect predictions
incorrect_keywords = incorrect_predictions.explode('keywords_used')

# Step 3: Count the frequency of each keyword in the incorrect predictions
keyword_counts = incorrect_keywords.groupby('intent')['keywords_used'].apply(lambda x: Counter(x).most_common())

# Step 4: Create a DataFrame to display the keywords that caused the most inaccurate matches per intent
most_inaccurate_keywords = keyword_counts.apply(lambda x: x[0] if x else ('None', 0)).apply(pd.Series)
most_inaccurate_keywords.columns = ['Keyword', 'Count']

# Include the intent in the DataFrame
most_inaccurate_keywords['Intent'] = most_inaccurate_keywords.index

# Display the result
display(most_inaccurate_keywords)

# Extract the most common inaccurate keywords to use as stopwords for each intent
new_stopwords_dict = most_inaccurate_keywords.set_index('Intent')['Keyword'].to_dict()

# Save the new stopwords to a file
with open('new_stopwords.txt', 'w') as f:
    for intent, word in new_stopwords_dict.items():
        f.write(f"{intent}: {word}\n")

print("New stopwords have been saved to 'new_stopwords.txt'.")
display(new_stopwords_dict)

Unnamed: 0_level_0,Keyword,Count,Intent
intent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cancel_order,se,3,cancel_order
change_order,se,14,change_order
check_cancellation_fee,se,15,check_cancellation_fee
check_invoices,bil,5,check_invoices
check_payment_methods,pay,20,check_payment_methods
check_refund_policy,iew,7,check_refund_policy
complaint,hel,3,complaint
contact_human_agent,help,3,contact_human_agent
create_account,se,5,create_account
delete_account,dont,14,delete_account


New stopwords have been saved to 'new_stopwords.txt'.


{'cancel_order': 'se',
 'change_order': 'se',
 'check_cancellation_fee': 'se',
 'check_invoices': 'bil',
 'check_payment_methods': 'pay',
 'check_refund_policy': 'iew',
 'complaint': 'hel',
 'contact_human_agent': 'help',
 'create_account': 'se',
 'delete_account': 'dont',
 'delivery_options': 'se',
 'delivery_period': 'tel',
 'edit_account': 'information',
 'get_invoice': 'se',
 'get_refund': 'reimbursemen',
 'newsletter_subscription': 'fo',
 'payment_issue': 'hel',
 'place_order': 'ake',
 'recover_password': 'accoun',
 'registration_problems': 'accoun',
 'review': 'se',
 'set_up_shipping_address': 'se',
 'switch_account': 'accoun',
 'track_order': 'know',
 'track_refund': 'reimbursemen'}

In [60]:
# Update the stopwords with the new stopwords per intent
updated_stop_words = stop_words.union(new_stopwords)

# Preprocess the text with the updated stopwords
def preprocess_text_with_new_stopwords(text, stop_words):
    try:
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\s+', ' ', text)
        # Remove punctuation and stopwords
        tokens = [word for word in text.split() if word.isalnum() and word not in stop_words]
        # POS tagging
        pos_tags = nltk.pos_tag(tokens)
        # Lemmatize tokens with POS tags
        lemmatized_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
        return ' '.join(lemmatized_tokens)
    except Exception as e:
        print(f"Error processing text: {e}")
        return text

# Apply the updated preprocessing function to the utterance data
utterance_df['utterance'] = utterance_df['utterance'].apply(lambda x: preprocess_text_with_new_stopwords(x, updated_stop_words))

# Split the data again
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=41)
for train_index, test_index in split.split(utterance_df, utterance_df['intent']):
    train_data = utterance_df.loc[train_index]
    test_data = utterance_df.loc[test_index]

# TF-IDF Vectorization with updated stopwords
vectorizer = TfidfVectorizer(max_features=200, stop_words=list(updated_stop_words), ngram_range=(1, 1))
X = vectorizer.fit_transform(train_data['utterance'])
y = train_data['intent']

# Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41)

# Train Model
model = MultinomialNB()
model.fit(X_train, y_train)

# Evaluate Model to get score predictions
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))
display(utterance_df['utterance'])

                          precision    recall  f1-score   support

            cancel_order       0.00      0.00      0.00         8
            change_order       0.94      0.99      0.97       151
 change_shipping_address       1.00      1.00      1.00        19
  check_cancellation_fee       1.00      1.00      1.00        59
          check_invoices       0.98      0.80      0.88       154
   check_payment_methods       1.00      0.91      0.95        45
     check_refund_policy       1.00      1.00      1.00        74
               complaint       0.98      0.99      0.99       117
contact_customer_service       0.99      1.00      1.00       324
     contact_human_agent       1.00      0.98      0.99       166
          create_account       0.97      0.99      0.98       367
          delete_account       0.95      0.96      0.96       132
        delivery_options       1.00      0.97      0.98        63
         delivery_period       1.00      1.00      1.00        22
         

0                               online account register
1        tell regisger two account single email address
2                        online account open one please
3                   could ask agent open account please
4                        want online account create one
                              ...                      
21529                                  ship address set
21530                         ship address want set one
21531                         want set shipping address
21532                              ship address set one
21533                        wanna set shipping address
Name: utterance, Length: 21534, dtype: object