In [834]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

In [835]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

import pandas as pd
import string
import re

path = 'data/all_tickers.csv'
tickers = pd.read_csv(path,header=None)

path = 'data/twt_sample2.csv'
df = pd.read_csv(path,header=None,names=['created_at','text', 'label'])
df['label'] = df.label.map({'positive':1,'negative':0})
df = df.drop(['created_at'],axis=1)

In [836]:
def processText(text):
    nltk_stops = stopwords.words('english')
    avoid_words = set(['URL','user'] + 
                  list(string.punctuation)).union(nltk_stops)
    lemma = WordNetLemmatizer()
    x = re.sub("\d+|[^a-zA-Z0-9]"," ",text)
    return ' '.join([lemma.lemmatize(word.lower()) 
                     for word in x.split() 
                         if word not in set(tickers[0].tolist())
                         if word not in set(avoid_words)
                    ])

df['text'] = df['text'].apply(processText)
df = df.drop_duplicates('text')
df = df[df['text'].str.split().str.len() > 2]

In [837]:
# split the X data(text) and y data(label)
X, y = df['text'], df['label']

In [838]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3)

In [839]:
# pipeline building a text classifier, way easier!
# tfid --- ,sublinear_tf=True
pipeline = Pipeline([
    ('vect', TfidfVectorizer(min_df=3, max_features=None,
                             strip_accents='unicode', analyzer='word', 
                             token_pattern=r'\w{1,}', ngram_range=(1, 3), 
                             use_idf=1, smooth_idf=1, sublinear_tf=1,
                             stop_words='english')),
    ('clf', MultinomialNB()) ])

In [840]:
# grid search classifier building, no parameters yet!
from sklearn.grid_search import GridSearchCV

parameters={}
gsClassifier = GridSearchCV(pipeline, parameters, n_jobs=2)

In [841]:
# fit the training data to the vectorizer and model!
gsClassifier.fit(X_train,y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=1,
     ...f=1,
        vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params={}, iid=True, n_jobs=2, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [842]:
# machine prediction results for X_test
y_pred_class = gsClassifier.predict(X_test)

In [843]:
# calculate accuracy of class predictions
from sklearn import metrics

print ('Accuracy:', metrics.accuracy_score(y_test, y_pred_class))
print ('Conf Matrx:', metrics.confusion_matrix(y_test, y_pred_class))
print ('Clsf Reprt:', metrics.classification_report(y_test, y_pred_class))

Accuracy: 0.7314814814814815
Conf Matrx: [[20 16]
 [13 59]]
Clsf Reprt:              precision    recall  f1-score   support

        0.0       0.61      0.56      0.58        36
        1.0       0.79      0.82      0.80        72

avg / total       0.73      0.73      0.73       108

