In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

import pandas as pd
import string
import re

path = 'data/all_tickers.csv'
tickers = pd.read_csv(path,header=None)

path = 'data/twt_sample.csv'
df = pd.read_csv(path,header=None,names=['created_at','text', 'label'])
df['label'] = df.label.map({'positive':1,'negative':0})
df = df.drop(['created_at'],axis=1)

def processText(text):
    user_words = ['URL','user'] + list(string.punctuation)
    nltk_stops = stopwords.words('english')
    stop_words = set(nltk_stops).union(user_words)
    x = re.sub("\d+|[^a-zA-Z0-9_]"," ",text.replace("'","").replace("_",''))
    x = ' '.join([word.lower() for word in x.split() 
                  if word not in set(tickers[0].tolist())
                  if word not in set(stop_words)])
    return x

df['text'] = df['text'].apply(processText)
df = df.drop_duplicates('text')
df = df[df['text'].str.split().str.len() > 2]

In [3]:
X = df.text
y = df.label

In [4]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [5]:
# instantiate the vectorizer
# vect = CountVectorizer() # stop_words='english'
from sklearn.pipeline import Pipeline

text_classifier = Pipeline([('vect',CountVectorizer()),('tfidf', TfidfTransformer()),('clf',MultinomialNB())])

In [6]:
# fit and transform into a single step[]
X_train_dtm = vect.fit_transform(X_train)

In [7]:
# examine the document-term matrix
X_train_dtm.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [8]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

<1079x5220 sparse matrix of type '<class 'numpy.int64'>'
	with 7729 stored elements in Compressed Sparse Row format>

In [9]:
# import and instantiate a Multinomial Naive Bayes model
nb = MultinomialNB()

In [10]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
%time nb.fit(X_train_dtm, y_train)

CPU times: user 3.28 ms, sys: 1.63 ms, total: 4.9 ms
Wall time: 3.36 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [11]:
# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)

In [12]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.7720111214087118

In [13]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

array([[169, 157],
       [ 89, 664]])

In [14]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated)
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]

In [15]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)

0.8074858032084342