In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Shell out to `ls` and print the contents of the ../input directory.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

In this notebook I explore whether the sentiments of the top 20 headlines can be used as features to predict the stock market. The first step is to read the file and store it in a pandas DataFrame.

In [2]:
# Read the combined news/DJIA CSV from the input directory into a DataFrame.
news_csv_path = '../input/Combined_News_DJIA.csv'
df_news = pd.read_csv(news_csv_path)

After reading the .csv file, I use TextBlob to assign a sentiment to each of the headlines in each row. (I took the functions from a Stack Overflow answer. Not rocket science, but why reinvent the wheel?)

In [3]:
from textblob import TextBlob
import re

def clean_headline(headline):
    '''
    Utility function to clean the text of a headline by removing
    @-mentions, links and special characters using regex.

    Parameters
    ----------
    headline : str
        Raw headline text. Any non-string value (for example None, or the
        NaN pandas produces for an empty CSV cell) is treated as an empty
        headline instead of raising a TypeError.

    Returns
    -------
    str
        The headline with @-mentions, URLs and every character other than
        ASCII letters, digits, spaces and tabs replaced by a space, and
        with runs of whitespace collapsed to single spaces.
    '''
    if not isinstance(headline, str):
        return ''
    # Raw string: \w, \/ and \S must reach the regex engine untouched. In a
    # normal string literal they are invalid escape sequences and trigger a
    # warning on recent Python versions.
    pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"
    # split()/join collapses the padding spaces left behind by the substitution.
    return ' '.join(re.sub(pattern, " ", headline).split())

def analize_sentiment(headline):
    '''
    Classify the sentiment of a headline using textblob.

    The headline is first passed through clean_headline; the TextBlob
    polarity of the cleaned text is then reduced to a class label:
    1 when the polarity is positive, 0 when it is exactly zero, and
    -1 otherwise.
    '''
    polarity = TextBlob(clean_headline(headline)).sentiment.polarity
    if polarity > 0:
        return 1
    if polarity == 0:
        return 0
    return -1

In [4]:
# Score the first `topn_headlines` headline columns (Top1, Top2, ...) and keep
# a per-row running total of the scores in a 'sum_sent<N>' column.
topn_headlines = 20
top_num = range(1, topn_headlines + 1)
dataframe_col = 'sum_sent{}'.format(max(top_num))
df_news[dataframe_col] = 0
for headline_rank in top_num:
    source_col = 'Top{}'.format(headline_rank)
    sentiment_col = source_col + '_sent'
    # One sentiment label (-1, 0 or 1) per row for this headline column.
    df_news[sentiment_col] = df_news[source_col].map(analize_sentiment)
    df_news[dataframe_col] += df_news[sentiment_col]
df_news

Now we have the sentiment of every headline as a column in each row. The next step is to use the sentiments of the headlines as a feature vector of length 20, together with the labels, to train a classifier.

In [5]:
# Build the feature matrix: one row per DataFrame row, one column per
# headline sentiment (Top1_sent ... Top20_sent), plus the label vector.
headlines_columns = range(1, 21)
sentiment_columns = ['Top' + str(i) + '_sent' for i in headlines_columns]
# Select all sentiment columns in one step rather than looking up each cell
# through the `.ix` indexer, which no longer exists in pandas >= 1.0.
X_data = df_news[sentiment_columns].values
# Plain list-of-rows view of the same data; the train/test split slices it.
X_data_list = X_data.tolist()
y_label = df_news['Label']

In [6]:
# Ordered hold-out split: the first 65% of the rows form the training set and
# the remaining 35% the test set (rows are not shuffled).
train_sample_perc = .65
n_samples = len(y_label)
split_at = int(train_sample_perc * n_samples)
X_train, X_test = X_data_list[:split_at], X_data_list[split_at:]
y_train, y_test = y_label[:split_at], y_label[split_at:]

Now let's throw a bunch of classifiers at the dataset in a voting-classifier setup and see what we can get out of it! (Not scientific, but it shows that blind use of sklearn probably won't get us far!)

In [7]:
from sklearn import datasets, neighbors, linear_model
from sklearn.metrics import (brier_score_loss, precision_score, recall_score,
                             f1_score,accuracy_score,confusion_matrix,classification_report)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
# Candidate base classifiers; every estimator that accepts a seed gets
# random_state=1 so repeated runs are comparable.
gnb = GaussianNB()
rf = RandomForestClassifier(random_state=1)
logistic = LogisticRegression(random_state=1)
# probability=True makes the SVM expose class probabilities.
svm = SVC(probability=True, random_state=1)

In [8]:
# Soft-voting ensemble over logistic regression, Gaussian naive Bayes and the
# SVM, all with equal weight.
eclf1 = VotingClassifier(
    estimators=[('lr', logistic), ('gnb', gnb), ('svm', svm)],
    voting='soft',
    weights=[1, 1, 1],
    flatten_transform=True,
)
eclf1 = eclf1.fit(X_train, y_train)
y_predict_vote = eclf1.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order; passing them the
# other way round swaps precision and recall in the report.
print(classification_report(y_test, y_predict_vote))
print(accuracy_score(y_test, y_predict_vote))

The result is not good! Why?
1. The training and test data are highly imbalanced toward class 1 --> to improve, maybe we should down-sample class 1.
2. We could also divide the dataset by quarter/season/beginning of the year/end of the year and train four different models, one for each time period.
3. We can also augment the BOW model with sentiment vectors and examine the performance against pure BOW or sentiment vector alone.
4. We could also consider delayed (lagged) versions of the sentiment vectors as features (or even add memory to the features, e.g. with an LSTM, maybe?).