In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import re

print("Loading and preparing the dataset...")


df = pd.read_csv('twcs.csv', nrows=50000)


df_customer = df[df['inbound']==True].copy()
df_customer.dropna(subset=['text'], inplace=True)
df_customer['cleaned_text'] = df_customer['text'].apply(lambda x: re.sub(r'@\w+', '', x.lower())) # Remove mentions and lowercase
df_customer['cleaned_text'] = df_customer['cleaned_text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x)) # Remove special chars


vectorizer = CountVectorizer(stop_words='english', max_df=0.9, min_df=5)
X = vectorizer.fit_transform(df_customer['cleaned_text'])


num_topics = 7
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(X)


def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = f"Topic #{topic_idx}: "
        message += " ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]])
        print(message)

print("\n--- Discovered Topics from Customer Tweets ---")
display_topics(lda, vectorizer.get_feature_names_out(), 10)

Loading and preparing the dataset...

--- Discovered Topics from Customer Tweets ---
Topic #0: amazon order delivery service days ive customer day received prime
Topic #1: thank ticket que train good today la morning seat trains
Topic #2: help flight number card need like thanks make im account
Topic #3: phone update fix iphone internet ios working im problem issue
Topic #4: just im got yes going waiting halloween minutes long time
Topic #5: thanks service customer time just love got dont im amp
Topic #6: just sent account dm email app know help tried message


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

topic_labels = lda.transform(X).argmax(axis=1)
df_customer['topic'] = topic_labels

X_text = df_customer['cleaned_text']
y_topic = df_customer['topic']

tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')

X_tfidf = tfidf_vectorizer.fit_transform(X_text)

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_topic, test_size=0.2, random_state=42)

classifier = LogisticRegression(random_state=42)
classifier.fit(X_train, y_train)

print("--- Topic classification model has been trained. ---\n")

predictions = classifier.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, predictions))

print("\n--- Testing the model with new, unseen text ---")
new_complaint = "my package is late and the delivery date keeps changing"
new_complaint_vec = tfidf_vectorizer.transform([new_complaint])
predicted_topic = classifier.predict(new_complaint_vec)

print(f"Complaint: '{new_complaint}'")
print(f"Predicted Topic #: {predicted_topic[0]}")

--- Topic classification model has been trained. ---

Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.80      0.74      1091
           1       0.76      0.59      0.66       459
           2       0.68      0.65      0.66       744
           3       0.74      0.77      0.75       774
           4       0.70      0.70      0.70       771
           5       0.70      0.69      0.69       852
           6       0.74      0.68      0.71       792

    accuracy                           0.71      5483
   macro avg       0.71      0.70      0.70      5483
weighted avg       0.71      0.71      0.71      5483


--- Testing the model with new, unseen text ---
Complaint: 'my package is late and the delivery date keeps changing'
Predicted Topic #: 0


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
