# Stock Market Predictor using train_test_split, Logistic Regression, K-fold cross-validation and Naive Bayes
I created a kernel that uses a random train/test split to predict the `'Label'`, rather than creating the datasets from specified date ranges.

In [None]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.naive_bayes import MultinomialNB

# Load the combined news/DJIA CSV into a DataFrame.
# The path is relative to the notebook's working directory.
path = '../input/Combined_News_DJIA.csv'
data = pd.read_csv(path)
# Trailing expression: the notebook displays the first rows as a sanity check.
data.head()

In [None]:
# Build the response vector (y) and the feature matrix (X).
# y is the 'Label' column; X is the block of columns at positions 2..26
# (presumably the 25 headline columns -- confirm against the CSV header).
y = data['Label']
X = data.iloc[:, 2:27]

In [None]:
# Randomly partition X and y into training and testing subsets.
# 19% of the rows are held out for testing; the fixed random_state makes
# the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.19,
    random_state=13,
)

In [None]:
# Show the number of held-out test rows.
# Per the original author, this should match the size of the testing set
# suggested by the dataset's creator (not verifiable from this notebook alone).
len(y_test)

In [None]:
# Build one text document per training row: every headline cell in the row is
# converted to a string and the pieces are joined with single spaces.
trainheadlines = []
for row in range(len(X_train.index)):
    trainheadlines.append(' '.join(map(str, X_train.iloc[row, 0:25])))

In [None]:
# instantiate and fit the CountVectorizer
# The vocabulary is learned from the training headlines only; fit_transform
# also returns the document-term count matrix for those same documents.
vect = CountVectorizer()
vect_train = vect.fit_transform(trainheadlines)
# (number of training documents, vocabulary size)
print(vect_train.shape)

In [None]:
# instantiate and fit the LogisticRegression model
# Default hyperparameters are used; the model is trained on the vectorized
# training headlines against the training labels.
logreg = LogisticRegression()
# Trailing expression: the notebook displays the fitted estimator.
logreg.fit(vect_train, y_train)

In [None]:
# Prepare the test rows exactly like the training rows: one space-joined
# string of headlines per row.
testheadlines = []
for row in range(len(X_test.index)):
    testheadlines.append(' '.join(map(str, X_test.iloc[row, 0:25])))

# Reuse the vocabulary learned on the training data (transform, not fit),
# then predict a label for every test document.
vect_test = vect.transform(testheadlines)
predictions = logreg.predict(vect_test)

In [None]:
# calculate accuracy
# Fraction of test rows whose predicted label equals the true label.
metrics.accuracy_score(y_test, predictions)

In [None]:
# Use crosstab to look at the results
# Rows are the true labels ('Actual'), columns are the model's predictions
# ('Predicted'); each cell counts the test rows with that combination.
pd.crosstab(y_test, predictions, rownames=['Actual'], colnames=['Predicted'])

In [None]:
# create vector transforms of X
# Build one space-joined headline document for EVERY row of the full feature
# matrix X. The index used to select the row must be the loop variable itself;
# indexing with a stale variable from an earlier cell would repeat a single
# row for the whole dataset and make the cross-validation meaningless.
headlines = [
    ' '.join(str(x) for x in X.iloc[row, 0:25])
    for row in range(len(X.index))
]
# Vectorize with the vocabulary already learned by `vect` (transform only).
# NOTE(review): that vocabulary was fitted on the training split, so words
# that only occur in the held-out rows are ignored here.
vect_headlines = vect.transform(headlines)

In [None]:
# split the dataset using K-folds with shuffle=False
# (10 folds taken in the existing row order) and calculate the
# cross_val_scores.
# NOTE(review): KFold(n, n_folds=...) is the legacy sklearn.cross_validation
# signature; that module was removed in scikit-learn 0.20 in favour of
# sklearn.model_selection -- confirm the installed version before running.
kf = KFold(len(y), n_folds=10, shuffle=False)
# Run the cross-validation once and reuse the result for both printouts
# instead of refitting all ten models a second time just to take the mean.
scores_unshuffled = cross_val_score(logreg, vect_headlines, y, cv=kf)
print(scores_unshuffled)
print(scores_unshuffled.mean())

In [None]:
# split the dataset using K-folds with shuffle=True
# Rows are shuffled before being divided into 10 folds. No random_state is
# passed, so the fold assignment (and therefore the scores) can differ
# between runs.
kf = KFold(len(y), n_folds=10, shuffle=True)
# Run the cross-validation once and reuse the result for both printouts
# instead of refitting all ten models a second time just to take the mean.
scores_shuffled = cross_val_score(logreg, vect_headlines, y, cv=kf)
print(scores_shuffled)
print(scores_shuffled.mean())

In [None]:
# Building and evaluating a model
# Instantiate and fit a Multinomial Naive Bayes model (MultinomialNB is
# imported at the top of the notebook). alpha=1.0 is the smoothing parameter.
nb = MultinomialNB(alpha=1.0)
# `%time` is an IPython magic that reports how long the fit takes; this line
# only runs inside IPython/Jupyter, not as plain Python.
%time nb.fit(vect_train, y_train)

In [None]:
# make class predictions for vect_test
# This rebinds `predictions`, replacing the logistic-regression predictions
# computed earlier with the Naive Bayes ones.
predictions = nb.predict(vect_test)

In [None]:
# calculate accuracy of class predictions
# Fraction of test rows where the Naive Bayes prediction equals the true label.
metrics.accuracy_score(y_test, predictions)

In [None]:
# print the confusion matrix
# Rows correspond to the true labels and columns to the predicted labels.
metrics.confusion_matrix(y_test, predictions)