In [63]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
import re
import nltk

# Download the stopwords if not already downloaded
nltk.download('stopwords')

# Constants
CSV_FILE_PATH = "utterance.csv"
KEYWORDS_CSV_FILE_PATH = 'keywords_tfidf.csv'
TOP_N_KEYWORDS = 50
TOP_N = 5
NUM_TESTS = 18000
CUSTOM_STOPWORDS = {}

# Load the keywords and utterance data
keywords_df = pd.read_csv('keywords.csv')
utterance_df = pd.read_csv('utterance.csv')

# Split the data into training and testing sets (80% training, 20% testing)
train_data, test_data = train_test_split(utterance_df, test_size=0.8, random_state=41)

def preprocess_text(text, stop_words):
    """
    Preprocess the input text by converting to lowercase, removing punctuation, and stopwords.
    """
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\czer3\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [91]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import re
import nltk

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Step 1: Load Data
data = train_data

# Step 2: Preprocess Data
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

data['utterance'] = data['utterance'].apply(preprocess_text)

# Step 3: TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=1000, stop_words=list(stop_words), ngram_range=(1, 3))
X = vectorizer.fit_transform(data['utterance'])
y = data['intent']

# Step 4: Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Train Model
model = MultinomialNB()
model.fit(X_train, y_train)

# Step 6: Evaluate Model to get score predictions
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))

# Step 4: Keyword Extraction
keywords_list = []

for (category, intent) in data[['category', 'intent']].drop_duplicates().itertuples(index=False):
    intent_data = data[(data['category'] == category) & (data['intent'] == intent)]
    intent_vectorizer = TfidfVectorizer(max_features=20, stop_words=list(stop_words), ngram_range=(1, 3))
    intent_X = intent_vectorizer.fit_transform(intent_data['utterance'])
    feature_names = intent_vectorizer.get_feature_names_out()
    tfidf_scores = intent_X.mean(axis=0).A1  # Average TF-IDF scores for each keyword

    for keyword, score in zip(feature_names, tfidf_scores):
        keywords_list.append({
            'category': category,
            'intent': intent,
            'keyword': keyword,
            'tfidf score': score
        })

# Step 5: Save Keywords
keywords_df = pd.DataFrame(keywords_list)
keywords_df.to_csv('keywords.csv', index=False)

print("Keywords have been extracted and saved to 'keywords.csv'.")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\czer3\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                          precision    recall  f1-score   support

            cancel_order       0.00      0.00      0.00         1
            change_order       0.95      0.97      0.96        37
 change_shipping_address       1.00      0.80      0.89         5
  check_cancellation_fee       1.00      0.93      0.96        14
          check_invoices       0.97      0.85      0.91        40
   check_payment_methods       1.00      0.50      0.67        12
     check_refund_policy       1.00      0.85      0.92        27
               complaint       1.00      1.00      1.00        28
contact_customer_service       0.97      1.00      0.98        86
     contact_human_agent       1.00      0.97      0.99        38
          create_account       0.91      0.99      0.95        87
          delete_account       0.94      0.88      0.91        34
        delivery_options       0.81      1.00      0.90        13
         delivery_period       0.00      0.00      0.00         2
         

In [75]:
TOP_PREDICT = 3

# Create a dictionary for quick keyword lookup
keywords_dict = {}
for _, row in keywords_df.iterrows():
    if row['intent'] not in keywords_dict:
        keywords_dict[row['intent']] = set()
    keywords_dict[row['intent']].add(row['keyword'])

# Function to get top N predicted intents based on keywords
def get_top_n_intents(text, keywords_dict, n=1):
    text = preprocess_text(text)
    intent_scores = {}
    keyword_used = {}
    for intent, keywords in keywords_dict.items():
        score = 0
        for keyword in keywords:
            if keyword in text:
                score += 1
                keyword_used[intent] = keyword
        if score > 0:
            intent_scores[intent] = score
    sorted_intents = sorted(intent_scores, key=intent_scores.get, reverse=True)
    return sorted_intents[:n], [keyword_used[intent] for intent in sorted_intents[:n]]

# Predict the top N intents for the test data
test_data['top_predict_intents'], test_data['keywords_used'] = zip(*test_data['utterance'].apply(lambda x: get_top_n_intents(x, keywords_dict, n=TOP_PREDICT)))

# Check if the actual intent is in the top N predicted intents
test_data['is_accurate'] = test_data.apply(lambda row: row['intent'] in row['top_predict_intents'], axis=1)

# Calculate the accuracy score
accuracy_score = test_data['is_accurate'].mean()

# Create the final table with all the required fields
final_table = test_data[['utterance', 'category', 'intent', 'top_predict_intents', 'keywords_used', 'is_accurate']]

# Display the final table and accuracy score
display(final_table)
print(f"Accuracy Score: {accuracy_score * 100:.2f}%")


Unnamed: 0,utterance,category,intent,top_predict_intents,keywords_used,is_accurate
4848,"i wanna view the cancellation penalty, what sh...",CANCELLATION_FEE,check_cancellation_fee,"[check_cancellation_fee, get_invoice, delivery...","[cancellation, wanna, view]",True
7387,can I speak with a human?,CONTACT,contact_human_agent,[contact_human_agent],[speak],True
16039,I have a problem when trying to pay for my on...,PAYMENT,payment_issue,"[payment_issue, registration_problems, create_...","[pay, problem, online]",True
5559,i want to know what the number of client servi...,CONTACT,contact_customer_service,"[contact_customer_service, get_invoice, track_...","[number, want, want]",True
111,can you tell me if I could create two user acc...,ACCOUNT,create_account,"[create_account, switch_account, contact_custo...","[email, account, email]",True
...,...,...,...,...,...,...
12255,I wanna request information about seeing some ...,INVOICES,get_invoice,"[get_invoice, check_invoices, delivery_options]","[bill, bill, information]",True
16583,"I have an problem when trying to pay, what can...",PAYMENT,payment_issue,"[payment_issue, registration_problems, check_p...","[pay, problem, pay]",True
5638,can u show me information about the email of t...,CONTACT,contact_customer_service,"[contact_customer_service, get_refund, create_...","[email, show, email]",True
20246,can u ask an agent to request a reimbursement?,REFUNDS,get_refund,"[get_invoice, review, track_refund]","[ask, ask, ask]",False


Accuracy Score: 96.40%


In [76]:
# Calculate the accuracy rate for each intent
intent_accuracy = test_data.groupby('intent')['is_accurate'].mean().sort_values(ascending=False)

# Convert the Series to a DataFrame for better display
intent_accuracy_df = intent_accuracy.reset_index()
intent_accuracy_df.columns = ['Intent', 'Accuracy Rate']

# Display the table
display(intent_accuracy_df)


Unnamed: 0,Intent,Accuracy Rate
0,cancel_order,1.0
1,change_shipping_address,1.0
2,delete_account,1.0
3,delivery_options,1.0
4,review,1.0
5,check_cancellation_fee,0.996416
6,newsletter_subscription,0.994737
7,get_invoice,0.991424
8,switch_account,0.990826
9,contact_customer_service,0.989247
