In [85]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

In [86]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

import pandas as pd
import string
import re

path = 'data/all_tickers.csv'
tickers = pd.read_csv(path,header=None)

path = 'data/twt_sample.csv'
df = pd.read_csv(path,header=None,names=['created_at','text', 'label'])
df['label'] = df.label.map({'positive':1,'negative':0})
df = df.drop(['created_at'],axis=1)

In [87]:
def processText(text):
    nltk_stops = stopwords.words('english')
    avoid_words = set(['URL','user'] + 
                  list(string.punctuation)).union(nltk_stops)
    lemma = WordNetLemmatizer()
    x = re.sub("\d+|[^a-zA-Z0-9]"," ",text)
    return ' '.join([lemma.lemmatize(word.lower()) 
                     for word in x.split() 
                         if word not in set(tickers[0].tolist())
                         if word not in set(avoid_words)
                    ])

df['text'] = df['text'].apply(processText)
df = df.drop_duplicates('text')
df = df[df['text'].str.split().str.len() > 3]

In [88]:
# split the X data(text) and y data(label)
X, y = df['text'], df['label']

In [89]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

In [90]:
# pipeline building a text classifier, way easier!
pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(fit_prior=False)) ])

In [91]:
# grid search classifier building, no parameters yet!
from sklearn.grid_search import GridSearchCV

parameters={'vect__ngram_range': [(1, 1), (1, 3)],
            'tfidf__use_idf': (True, False),
            'clf__alpha': (1e-2, 1e-3)}
gsClassifier = GridSearchCV(pipeline, parameters, n_jobs=-1)

In [92]:
# fit the training data to the vectorizer and model!
gsClassifier.fit(X_train,y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        ...near_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'vect__ngram_range': [(1, 1), (1, 3)], 'tfidf__use_idf': (True, False), 'clf__alpha': (0.01, 0.001)},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [93]:
# machine prediction results for X_test
y_pred_class = gsClassifier.predict(X_test)
print(gsClassifier.best_score_)
print(gsClassifier.best_params_)

0.7387158296249206
{'clf__alpha': 0.001, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 3)}


In [94]:
# calculate accuracy of class predictions
from sklearn import metrics

print ('Accuracy:', metrics.accuracy_score(y_test, y_pred_class))
print ('Conf Matrx:', metrics.confusion_matrix(y_test, y_pred_class))
print ('Clsf Reprt:', metrics.classification_report(y_test, y_pred_class))

Accuracy: 0.7331639135959339
Conf Matrx: [[135 103]
 [107 442]]
Clsf Reprt:              precision    recall  f1-score   support

        0.0       0.56      0.57      0.56       238
        1.0       0.81      0.81      0.81       549

avg / total       0.73      0.73      0.73       787

