# Customer Support on Twitter

In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
df = pd.read_csv(r"C:\Users\HP\Desktop\Vikrant\twitter_support_balanced_1L.csv")
df.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,2922414,AskLyft,False,Wed Nov 29 01:32:40 +0000 2017,@808773 To be immediately connected with our C...,,2922415.0
1,652595,ArgosHelpers,False,Thu Nov 02 11:14:30 +0000 2017,"@145112 Hi Marlon, would you like to DM us wit...",652596.0,652597.0
2,1845261,120576,True,Mon Nov 13 15:36:07 +0000 2017,"@444383 @VirginTrains Apologies Mitch, our ser...",1845262.0,1845260.0
3,955948,Ask_Spectrum,False,Sat Oct 21 21:40:41 +0000 2017,@343058 Your services are back up and working?...,955947.0,955949.0
4,27403,AmazonHelp,False,Wed Nov 01 10:17:00 +0000 2017,@122007 Hi Daniel. Please use this link to con...,,27404.0


In [4]:
df.columns

Index(['tweet_id', 'author_id', 'inbound', 'created_at', 'text',
       'response_tweet_id', 'in_response_to_tweet_id'],
      dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   tweet_id                 100000 non-null  int64  
 1   author_id                100000 non-null  object 
 2   inbound                  100000 non-null  bool   
 3   created_at               100000 non-null  object 
 4   text                     100000 non-null  object 
 5   response_tweet_id        60833 non-null   object 
 6   in_response_to_tweet_id  74104 non-null   float64
dtypes: bool(1), float64(1), int64(1), object(4)
memory usage: 4.7+ MB


In [6]:
df = df[['text', 'inbound']]

In [7]:
df.dropna(subset=["text"], inplace=True)

In [None]:
df["text"] = df["text"].astype(str)           # Convert Text to String

In [15]:
df["text"] = df["text"].str.lower()     # Convert Text to Lowercase

In [16]:
df["text"] = df["text"].apply((lambda x:re.sub(r'http\S+|www\S+', '', x)))    # Remove URLs

In [17]:
df["text"] = df["text"].apply((lambda x: re.sub(r'@\w+', '', x)))  # Remove Twitter Mentions (@username)

In [18]:
df["text"] = df["text"].apply(lambda x:re.sub(r'#', '', x))   # Remove #

In [19]:
df["text"] = df["text"].apply(lambda x: re.sub(r"\d+", "", x))   # remove numbers

In [20]:
df["text"] = df["text"].apply(lambda x:re.sub(r'[^a-zA-Z\s]', '', x))  # Remove Spectial characters

In [21]:
df["text"] = df["text"].apply(lambda x: re.sub(r'\s+', ' ', x).strip()) # Remove Extra Whitespaces

In [22]:
df = df[df["text"] != ""] # Remove Empty Text Rows

In [27]:
df.head()

Unnamed: 0,text,inbound
0,to be immediately connected with our critical ...,False
1,hi marlon would you like to dm us with more in...,False
2,apologies mitch our services can get very busy...,True
3,your services are back up and working jh,False
4,hi daniel please use this link to contact us a...,False


In [26]:
x = df["text"]
y = df["inbound"].map({True: 1, False: 0})

In [29]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=42)

In [30]:
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))

x_train_tfidf = tfidf.fit_transform(x_train)

x_test_tfidf = tfidf.transform(x_test)

In [35]:
tokenizer = Tokenizer(num_words=20000, oov_token= "<OOV>")

tokenizer.fit_on_texts(x_train)

x_train_seq = tokenizer.texts_to_sequences(x_train)

x_test_seq = tokenizer.texts_to_sequences(x_test)

In [37]:
max_length = 50 # tweet length (safe value)

x_train_pad = pad_sequences(x_train_seq, maxlen=max_length, padding="post", truncating="post")

x_test_pad = pad_sequences(x_test_seq, maxlen=max_length, padding="post", truncating="post")

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

lo = LogisticRegression(max_iter=1000, n_jobs=-1)

lo.fit(x_train_tfidf, y_train)

loy_pred = lo.predict(x_test_tfidf)

print("AUC_ROC_SCORE: ", metrics.roc_auc_score(y_test, loy_pred))
print("Confusion matrix: \n", metrics.confusion_matrix(y_test, loy_pred))
print("Classification Report: \n", metrics.classification_report(y_test, loy_pred))

AUC_ROC_SCORE:  0.9447623820120208
Confusion matrix: 
 [[9366  612]
 [ 483 9346]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.95      0.94      0.94      9978
           1       0.94      0.95      0.94      9829

    accuracy                           0.94     19807
   macro avg       0.94      0.94      0.94     19807
weighted avg       0.94      0.94      0.94     19807



# Deep Learning - RNN

In [40]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [41]:
vocab_size = 20000
embedding_dim = 128
max_length = 50

rnn_model = Sequential()

rnn_model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length = max_length))

rnn_model.add(SimpleRNN(units=64, return_sequences=False))

rnn_model.add(Dropout(0.5))

rnn_model.add(Dense(1, activation="sigmoid"))



In [42]:
rnn_model.compile(loss="binary_crossentropy", optimizer=Adam(learning_rate=0.001), metrics=["accuracy"])

rnn_model.summary()

In [None]:
hist = rnn_model.fit(x_train_pad, y_train, epochs=5, batch_size=32, validation_split=0.2, verbose=1)


Epoch 1/5
[1m1981/1981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 28ms/step - accuracy: 0.8178 - loss: 0.4239 - val_accuracy: 0.8659 - val_loss: 0.3445
Epoch 2/5
[1m1981/1981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 28ms/step - accuracy: 0.8834 - loss: 0.3081 - val_accuracy: 0.8954 - val_loss: 0.2850
Epoch 3/5
[1m1981/1981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 31ms/step - accuracy: 0.9195 - loss: 0.2316 - val_accuracy: 0.9090 - val_loss: 0.2442
Epoch 4/5
[1m1981/1981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 34ms/step - accuracy: 0.8213 - loss: 0.4253 - val_accuracy: 0.8295 - val_loss: 0.4026
Epoch 5/5
[1m1981/1981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 34ms/step - accuracy: 0.9129 - loss: 0.2500 - val_accuracy: 0.9161 - val_loss: 0.2298
[1m619/619[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.9185 - loss: 0.2288
Test Accuracy: 0.9185136556625366


In [44]:
y_prob = rnn_model.predict(x_test_pad)

y_pred = (y_prob > 0.5).astype(int)

print(metrics.roc_auc_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

[1m619/619[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step
0.9184938526167683
[[9191  787]
 [ 827 9002]]
              precision    recall  f1-score   support

           0       0.92      0.92      0.92      9978
           1       0.92      0.92      0.92      9829

    accuracy                           0.92     19807
   macro avg       0.92      0.92      0.92     19807
weighted avg       0.92      0.92      0.92     19807



# Deep Learning - LSTM

In [45]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.optimizers import Adam

In [57]:
vvocab_size = 20000
embedding_dim = 128
max_length = 50

lstm_model = Sequential()

lstm_model.add(Embedding(input_dim=vvocab_size, output_dim=embedding_dim, input_length = max_length))

lstm_model.add(LSTM(units=64, return_sequences=False))

lstm_model.add(Dropout(0.5))

lstm_model.add(Dense(1, activation="sigmoid"))



In [58]:
lstm_model.compile(loss="binary_crossentropy", optimizer=Adam(learning_rate=0.001), metrics=["accuracy"])

lstm_model.summary()

In [59]:
hist_lstm = lstm_model.fit(x_train_pad, y_train, epochs=5, batch_size=32, validation_split=0.2, verbose=1)

Epoch 1/5


[1m1981/1981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 50ms/step - accuracy: 0.7471 - loss: 0.5494 - val_accuracy: 0.7914 - val_loss: 0.5117
Epoch 2/5
[1m1981/1981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 50ms/step - accuracy: 0.8258 - loss: 0.4367 - val_accuracy: 0.8879 - val_loss: 0.3013
Epoch 3/5
[1m1981/1981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 44ms/step - accuracy: 0.9361 - loss: 0.1755 - val_accuracy: 0.9481 - val_loss: 0.1459
Epoch 4/5
[1m1981/1981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 35ms/step - accuracy: 0.9683 - loss: 0.0989 - val_accuracy: 0.9484 - val_loss: 0.1505
Epoch 5/5
[1m1981/1981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 34ms/step - accuracy: 0.9775 - loss: 0.0720 - val_accuracy: 0.9520 - val_loss: 0.1562


In [60]:
y_prob_lstm = lstm_model.predict(x_test_pad)

y_pred_lstm = (y_prob_lstm > 0.5).astype(int)

print(metrics.roc_auc_score(y_test, y_pred_lstm))
print(metrics.confusion_matrix(y_test, y_pred_lstm))
print(metrics.classification_report(y_test, y_pred_lstm))

[1m619/619[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step
0.9485403751515111
[[9482  496]
 [ 523 9306]]
              precision    recall  f1-score   support

           0       0.95      0.95      0.95      9978
           1       0.95      0.95      0.95      9829

    accuracy                           0.95     19807
   macro avg       0.95      0.95      0.95     19807
weighted avg       0.95      0.95      0.95     19807



# Deep Learning - GRU

In [62]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Embedding, Dropout
from tensorflow.keras.optimizers import Adam

In [65]:
gvocab_size = 20000
embedding_dim = 128
max_length = 50

gru_model = Sequential()

gru_model.add(Embedding(input_dim=gvocab_size, output_dim=embedding_dim, input_length= max_length))

gru_model.add(GRU(units=64, return_sequences=False))

gru_model.add(Dropout(0.5))

gru_model.add(Dense(1, activation="sigmoid"))

gru_model.compile(loss="binary_crossentropy", optimizer= Adam(learning_rate=0.001), metrics = ["accuracy"])




In [66]:
gru_hist = gru_model.fit(x_train_pad, y_train, epochs=5, batch_size=32, validation_split=0.2, verbose=1)

Epoch 1/5
[1m1981/1981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 38ms/step - accuracy: 0.7203 - loss: 0.4457 - val_accuracy: 0.9484 - val_loss: 0.1442
Epoch 2/5
[1m1981/1981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 36ms/step - accuracy: 0.9650 - loss: 0.1023 - val_accuracy: 0.9587 - val_loss: 0.1133
Epoch 3/5
[1m1981/1981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 37ms/step - accuracy: 0.9805 - loss: 0.0605 - val_accuracy: 0.9561 - val_loss: 0.1318
Epoch 4/5
[1m1981/1981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 38ms/step - accuracy: 0.9873 - loss: 0.0398 - val_accuracy: 0.9555 - val_loss: 0.1457
Epoch 5/5
[1m1981/1981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 37ms/step - accuracy: 0.9911 - loss: 0.0282 - val_accuracy: 0.9566 - val_loss: 0.1564


In [67]:
y_prob_gru = gru_model.predict(x_test_pad)

y_pred_gru = (y_prob_gru > 0.5).astype(int)

print(metrics.roc_auc_score(y_test, y_pred_gru))
print(metrics.confusion_matrix(y_test, y_pred_gru))
print(metrics.classification_report(y_test, y_pred_gru))

[1m619/619[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step
0.9560113845739904
[[9558  420]
 [ 451 9378]]
              precision    recall  f1-score   support

           0       0.95      0.96      0.96      9978
           1       0.96      0.95      0.96      9829

    accuracy                           0.96     19807
   macro avg       0.96      0.96      0.96     19807
weighted avg       0.96      0.96      0.96     19807



# Validation

In [76]:
df2 = pd.read_csv(r"C:\Users\HP\Desktop\Vikrant\twitter_25k.csv")
df2.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,2340825,677118,True,Sun Nov 12 20:14:58 +0000 2017,"@NortonSupport With repect Steffi, unless you ...",2340824.0,2340826.0
1,2045224,AppleSupport,False,Wed Oct 04 18:30:00 +0000 2017,@604835 Your data is important! Let us know wh...,,2045225.0
2,292170,185589,True,Sun Oct 08 01:44:54 +0000 2017,@AskeBay also @AskeBay when I talked to someon...,,292167.0
3,524138,ATVIAssist,False,Sat Dec 02 05:07:14 +0000 2017,"@241175 Apologies for the delay, things should...",524139.0,524140.0
4,2410783,XboxSupport,False,Sun Nov 26 17:48:59 +0000 2017,@681385 We don't have any info on suspensions ...,,2410784.0


In [77]:
def clean_text(text):
    """
    Cleans Twitter text for NLP / DL models
    """
    if pd.isna(text):
        return ""

    text = str(text).lower()                          # Convert to string & lowercase
    text = re.sub(r'http\S+|www\S+', '', text)        # Remove URLs
    text = re.sub(r'@\w+', '', text)                  # Remove mentions
    text = re.sub(r'#', '', text)                     # Remove hashtag symbol
    text = re.sub(r'\d+', '', text)                   # Remove numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)           # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()          # Remove extra spaces

    return text


In [78]:
df2 = df2[['text', 'inbound']]
df2.dropna(subset=["text"], inplace=True)

df2["text"] = df2["text"].apply(clean_text)

df2 = df2[df2["text"] != ""]


In [79]:
df2.head()

Unnamed: 0,text,inbound
0,with repect steffi unless you are on the board...,True
1,your data is important let us know which versi...,False
2,also when i talked to someone over fb they sai...,True
3,apologies for the delay things should be clear...,False
4,we dont have any info on suspensions here im a...,False


In [81]:
x_val = df2["text"]
y_val = df2["inbound"].map({True: 1, False: 0})


In [82]:
x_val_tfidf = tfidf.transform(x_val)

In [83]:
x_val_seq = tokenizer.texts_to_sequences(x_val)

In [84]:
max_length = 50

x_val_pad = pad_sequences(
    x_val_seq,
    maxlen=max_length,
    padding="post",
    truncating="post"
)

In [88]:
print("---------- Logistic Model ----------")
y_prob_lo_val = lo.predict(x_val_tfidf)

y_pred_lo_val = (y_prob_lo_val > 0.5).astype(int)

print("AUC_ROC_SCORE: ", metrics.roc_auc_score(y_val, y_pred_lo_val))
print("Confusion Matrix: \n",metrics.confusion_matrix(y_val, y_pred_lo_val))
print("Classification Report: \n",metrics.classification_report(y_val, y_pred_lo_val))

---------- Logistic Model ----------
AUC_ROC_SCORE:  0.9530201503089759
Confusion Matrix: 
 [[11805   662]
 [  502 11784]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.96      0.95      0.95     12467
           1       0.95      0.96      0.95     12286

    accuracy                           0.95     24753
   macro avg       0.95      0.95      0.95     24753
weighted avg       0.95      0.95      0.95     24753



In [90]:
print("---------- RNN Model ----------")
y_prob_rnn_val = rnn_model.predict(x_val_pad)

y_pred_rnn_val = (y_prob_rnn_val > 0.5).astype(int)

print("AUC_ROC_SCORE: ", metrics.roc_auc_score(y_val, y_pred_rnn_val))
print("Confusion Matrix: \n",metrics.confusion_matrix(y_val, y_pred_rnn_val))
print("Classification Report: \n",metrics.classification_report(y_val, y_pred_rnn_val))

---------- RNN Model ----------
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step
AUC_ROC_SCORE:  0.9372918589399635
Confusion Matrix: 
 [[11700   767]
 [  785 11501]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.94      0.94     12467
           1       0.94      0.94      0.94     12286

    accuracy                           0.94     24753
   macro avg       0.94      0.94      0.94     24753
weighted avg       0.94      0.94      0.94     24753



In [None]:
print("---------- LSTM Model ----------")
y_prob_lstm_val = lstm_model.predict(x_val_pad)

y_pred_lstm_val = (y_prob_lstm_val > 0.5).astype(int)

print("AUC_ROC_SCORE: ", metrics.roc_auc_score(y_val, y_pred_lstm_val))
print("Confusion Matrix: \n",metrics.confusion_matrix(y_val, y_pred_lstm_val))
print("Classification Report: \n",metrics.classification_report(y_val, y_pred_lstm_val))

---------- LSTM Model ----------
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step
AUC_ROC_SCORE:  0.9729535069115102
Confusion Matrix: 
 [[12163   304]
 [  365 11921]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.97      0.98      0.97     12467
           1       0.98      0.97      0.97     12286

    accuracy                           0.97     24753
   macro avg       0.97      0.97      0.97     24753
weighted avg       0.97      0.97      0.97     24753



In [86]:
print("---------- GRU Model ----------")
y_prob_gru_val = gru_model.predict(x_val_pad)

y_pred_gru_val = (y_prob_gru_val > 0.5).astype(int)

print("AUC_ROC_SCORE: ", metrics.roc_auc_score(y_val, y_pred_gru_val))
print("Confusion Matrix: \n",metrics.confusion_matrix(y_val, y_pred_gru_val))
print("Classification Report: \n",metrics.classification_report(y_val, y_pred_gru_val))

---------- GRU Model ----------
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step
AUC_ROC_SCORE:  0.9807913598394961
Confusion Matrix: 
 [[12260   207]
 [  268 12018]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.98      0.98      0.98     12467
           1       0.98      0.98      0.98     12286

    accuracy                           0.98     24753
   macro avg       0.98      0.98      0.98     24753
weighted avg       0.98      0.98      0.98     24753



In [91]:
gru_model.save("gru_model.h5")

import pickle
pickle.dump(tokenizer, open("tokenizer.pkl", "wb"))





In [92]:
gru_model.save("gru_model.h5")


