#### Naive Bayes with Twitter customer support text
- `author_id` contains both company and non-company accounts. Non-company accounts are identified by a numeric ID.

In [203]:
import os

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [204]:
# Location of the Twitter customer-support CSV. Set the TWITTER_CSV environment
# variable to point at your own copy of the file; when it is not set, the
# original hard-coded path is used so existing runs keep working.
TWITTER_CSV_PATH = os.environ.get('TWITTER_CSV', '/Users/gracechongzuting/Desktop/twitter.csv')

# Load the raw tweets into a DataFrame.
twitter = pd.read_csv(TWITTER_CSV_PATH)

In [205]:
# Preview the first five rows of the raw data.
twitter.head(5)

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,119237,105834,True,Wed Oct 11 06:55:44 +0000 2017,@AppleSupport causing the reply to be disregar...,119236.0,
1,119238,ChaseSupport,False,Wed Oct 11 13:25:49 +0000 2017,@105835 Your business means a lot to us. Pleas...,,119239.0
2,119239,105835,True,Wed Oct 11 13:00:09 +0000 2017,@76328 I really hope you all change but I'm su...,119238.0,
3,119240,VirginTrains,False,Tue Oct 10 15:16:08 +0000 2017,@105836 LiveChat is online at the moment - htt...,119241.0,119242.0
4,119241,105836,True,Tue Oct 10 15:17:21 +0000 2017,@VirginTrains see attached error message. I've...,119243.0,119240.0


In [206]:
# Keep only the columns used in this analysis. `.copy()` makes the result an
# independent DataFrame, so the later assignment to the `inbound` column works
# on its own data instead of on a slice that pandas may flag with
# SettingWithCopyWarning.
twitter = twitter[['tweet_id','author_id','inbound','created_at','text']].copy()
twitter.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text
0,119237,105834,True,Wed Oct 11 06:55:44 +0000 2017,@AppleSupport causing the reply to be disregar...
1,119238,ChaseSupport,False,Wed Oct 11 13:25:49 +0000 2017,@105835 Your business means a lot to us. Pleas...
2,119239,105835,True,Wed Oct 11 13:00:09 +0000 2017,@76328 I really hope you all change but I'm su...
3,119240,VirginTrains,False,Tue Oct 10 15:16:08 +0000 2017,@105836 LiveChat is online at the moment - htt...
4,119241,105836,True,Tue Oct 10 15:17:21 +0000 2017,@VirginTrains see attached error message. I've...


In [207]:
# Count the missing values in each column.
twitter.isna().sum()

tweet_id      0
author_id     0
inbound       0
created_at    0
text          0
dtype: int64

##### Let's use Naive Bayes to predict whether a tweet is "inbound", i.e. sent to a company doing customer support on Twitter.

#### Step 1: Encode the boolean `inbound` target as an integer, where 1 indicates `True` (inbound) and 0 indicates `False`.

In [208]:
# Cast the boolean `inbound` flag to an integer column (True -> 1, False -> 0).
inbound_as_int = twitter["inbound"].astype(int)
twitter["inbound"] = inbound_as_int

# Show how many tweets fall into each class.
twitter["inbound"].value_counts()


1    49
0    44
Name: inbound, dtype: int64

In [209]:
# Preview the first five rows again, now with `inbound` stored as an integer.
twitter.head(5)

Unnamed: 0,tweet_id,author_id,inbound,created_at,text
0,119237,105834,1,Wed Oct 11 06:55:44 +0000 2017,@AppleSupport causing the reply to be disregar...
1,119238,ChaseSupport,0,Wed Oct 11 13:25:49 +0000 2017,@105835 Your business means a lot to us. Pleas...
2,119239,105835,1,Wed Oct 11 13:00:09 +0000 2017,@76328 I really hope you all change but I'm su...
3,119240,VirginTrains,0,Tue Oct 10 15:16:08 +0000 2017,@105836 LiveChat is online at the moment - htt...
4,119241,105836,1,Tue Oct 10 15:17:21 +0000 2017,@VirginTrains see attached error message. I've...


#### Step 2: Split the data into the feature matrix X (independent variable) and the target vector y (dependent variable).

In [210]:
# Features: the tweet text, kept as a one-column DataFrame.
X = twitter.loc[:, ['text']]
# Target: the `inbound` label as a Series.
y = twitter.loc[:, 'inbound']

#### Step 3: Use train_test_split to divide X and y into train and test sets.

In [211]:
# Hold out 35% of the rows for testing. Stratifying on y keeps the class
# proportions similar in both splits; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.35, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

#### Step 4: Turn our text into features using CountVectorizer

In [212]:
# Bag-of-words vectorizer: drops English stop words and keeps at most 500 terms.
countvector = CountVectorizer(
    stop_words='english',
    max_features=500,
)

In [219]:
# Fit the CountVectorizer on the training text only, then reuse the learned
# vocabulary to transform the test text.
train_counts = countvector.fit_transform(X_train['text'])
test_counts = countvector.transform(X_test['text'])
feature_names = countvector.get_feature_names_out()

# `.toarray()` yields a plain ndarray (`.todense()` returns the discouraged
# np.matrix). Reusing the original row index keeps every feature row aligned
# with the matching label in y_train / y_test.
X_train_countvector = pd.DataFrame(train_counts.toarray(),
                                   columns=feature_names,
                                   index=X_train.index)
X_test_countvector = pd.DataFrame(test_counts.toarray(),
                                  columns=feature_names,
                                  index=X_test.index)




#### Step 5: Fit the Naive Bayes model

In [214]:
# Train a multinomial Naive Bayes classifier on the training word counts.
nb = MultinomialNB()
model = nb.fit(X_train_countvector, y_train)

# Predict the test labels and report the mean accuracy on the test set.
predictions = model.predict(X_test_countvector)
test_accuracy = model.score(X_test_countvector, y_test)
print(test_accuracy)



0.8484848484848485


#### Step 6: Compute the confusion matrix

In [215]:
# Confusion matrix: rows are the actual classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[16,  0],
       [ 5, 12]])

In [216]:
# Flatten the confusion matrix row by row and unpack its four cells.
tn, fp, fn, tp = confusion_matrix(y_true=y_test, y_pred=predictions).flatten()


In [217]:
# Print each confusion-matrix cell next to its label.
for template, count in (
    ("True Negative : %s ", tn),
    ("False Positive : %s ", fp),
    ("False Negative : %s ", fn),
    ("True Positive : %s ", tp),
):
    print(template % count)

True Negative : 16 
False Positive : 0 
False Negative : 5 
True Positive : 12 
