In [19]:
import pandas as pd

train_df = pd.read_csv("/kaggle/input/customer-support-on-twitter/twcs/twcs.csv")
test_df  = pd.read_csv("/kaggle/input/customer-support-on-twitter/sample.csv")

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

train_df.head()


Train shape: (2811774, 7)
Test shape: (93, 7)


Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2.0,3.0
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3.0,5.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0


In [20]:
train_df = train_df[train_df["inbound"] == True].copy()
train_df = train_df.dropna(subset=["text"])

train_df.head()


Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0
6,8,115712,True,Tue Oct 31 21:45:10 +0000 2017,@sprintcare is the worst customer service,9610.0,
8,12,115713,True,Tue Oct 31 22:04:47 +0000 2017,@sprintcare You gonna magically change your co...,111314.0,15.0


In [21]:
def label_text(text):
    text = text.lower()

    billing_words = ["charge", "charged", "payment", "invoice", "refund", "billing", "bill", "price", "subscription", "money", "paid"]
    technical_words = ["error", "issue", "problem", "not working", "bug", "crash", "failed", "slow", "down", "cannot access", "can't access"]
    account_words = ["login", "log in", "password", "account", "sign in", "reset", "locked", "username", "email", "verify"]

    if any(word in text for word in billing_words):
        return "Billing"
    elif any(word in text for word in technical_words):
        return "Technical"
    elif any(word in text for word in account_words):
        return "Account"
    else:
        return None


In [22]:
train_df["category"] = train_df["text"].apply(label_text)
train_labeled = train_df.dropna(subset=["category"])

print("Total inbound tweets:", len(train_df))
print("Labeled tweets:", len(train_labeled))

train_labeled["category"].value_counts()


Total inbound tweets: 1537843
Labeled tweets: 343986


category
Technical    152325
Billing      117157
Account       74504
Name: count, dtype: int64

In [23]:
from sklearn.model_selection import train_test_split

X = train_labeled["text"]
y = train_labeled["category"]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [24]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

model = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english", ngram_range=(1,2))),
    ("clf", LogisticRegression(max_iter=1000))
])

model.fit(X_train, y_train)

print("Training finished!")


Training finished!


In [25]:
from sklearn.metrics import classification_report, accuracy_score

y_val_pred = model.predict(X_val)

print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))


Validation Accuracy: 0.969315968487456
              precision    recall  f1-score   support

     Account       0.94      0.98      0.96     14901
     Billing       0.99      0.95      0.97     23432
   Technical       0.97      0.97      0.97     30465

    accuracy                           0.97     68798
   macro avg       0.97      0.97      0.97     68798
weighted avg       0.97      0.97      0.97     68798



In [26]:
test_df = test_df.dropna(subset=["text"]).copy()

test_df["predicted_category"] = model.predict(test_df["text"])

test_df[["text", "predicted_category"]].head(20)


Unnamed: 0,text,predicted_category
0,@AppleSupport causing the reply to be disregar...,Technical
1,@105835 Your business means a lot to us. Pleas...,Technical
2,@76328 I really hope you all change but I'm su...,Billing
3,@105836 LiveChat is online at the moment - htt...,Technical
4,@VirginTrains see attached error message. I've...,Technical
5,"@105836 Have you tried from another device, Mi...",Technical
6,"@VirginTrains yep, I've tried laptop too sever...",Technical
7,"@105836 It's working OK from here, Miriam. Doe...",Technical
8,@VirginTrains I still haven't heard &amp; the ...,Technical
9,@105836 That's what we're here for Miriam ðŸ˜Š T...,Account


In [27]:
test_df[["tweet_id", "text", "predicted_category"]].to_csv("predictions.csv", index=False)
print("/kaggle/working/Saved predictions.csv")


/kaggle/working/Saved predictions.csv
