In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import operator
import re # regex

import nltk
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/stocknews/upload_DJIA_table.csv
/kaggle/input/stocknews/Combined_News_DJIA.csv
/kaggle/input/stocknews/RedditNews.csv


# comments
Some words, such as "inflation", will carry a higher weight than others.

Binary classification: the `Label` column takes the values 0 and 1.

In [5]:
# Reference: https://stackabuse.com/text-classification-with-python-and-scikit-learn/

# Dataset directory, kept in one place so it only has to be changed once
# when the data is mounted somewhere else.
DATA_DIR = "/kaggle/input/stocknews"

# djia: daily index prices (Date, Open, High, Low, Close, Volume, Adj Close).
djia = pd.read_csv(os.path.join(DATA_DIR, "upload_DJIA_table.csv"))
# combined: one row per date with a Label column and headline columns Top1..Top25.
combined = pd.read_csv(os.path.join(DATA_DIR, "Combined_News_DJIA.csv"))

# (1) Exploratory data analysis (EDA)

In [6]:
# Show the first rows of the price table, then of the headline table.
for frame in (djia, combined):
    print(frame.head())

         Date          Open          High           Low         Close  \
0  2016-07-01  17924.240234  18002.380859  17916.910156  17949.369141   
1  2016-06-30  17712.759766  17930.609375  17711.800781  17929.990234   
2  2016-06-29  17456.019531  17704.509766  17456.019531  17694.679688   
3  2016-06-28  17190.509766  17409.720703  17190.509766  17409.720703   
4  2016-06-27  17355.210938  17355.210938  17063.080078  17140.240234   

      Volume     Adj Close  
0   82160000  17949.369141  
1  133030000  17929.990234  
2  106380000  17694.679688  
3  112190000  17409.720703  
4  138740000  17140.240234  
         Date  Label                                               Top1  \
0  2008-08-08      0  b"Georgia 'downs two Russian warplanes' as cou...   
1  2008-08-11      1  b'Why wont America and Nato help us? If they w...   
2  2008-08-12      0  b'Remember that adorable 9-year-old who sang a...   
3  2008-08-13      0  b' U.S. refuses Israel weapons to attack Iran:...   
4  2008-08-1

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would add a stray "None" line after each summary.
djia.info()
combined.info()

In [10]:
# Per-column count of missing values: price table first, headline table second.
for table in (djia, combined):
    missing_per_column = table.isnull().sum()
    print(missing_per_column)

Date         0
Open         0
High         0
Low          0
Close        0
Volume       0
Adj Close    0
dtype: int64
Date     0
Label    0
Top1     0
Top2     0
Top3     0
Top4     0
Top5     0
Top6     0
Top7     0
Top8     0
Top9     0
Top10    0
Top11    0
Top12    0
Top13    0
Top14    0
Top15    0
Top16    0
Top17    0
Top18    0
Top19    0
Top20    0
Top21    0
Top22    0
Top23    1
Top24    3
Top25    3
dtype: int64


In [None]:
# Combine the DJIA price table with the daily headline table.
# Both Date columns are parsed to datetimes so the join key has one dtype.
djia.Date = pd.to_datetime(djia.Date)
combined.Date = pd.to_datetime(combined.Date)

# sort_values() and dropna() return NEW frames. Called as bare statements
# their results are thrown away, which leaves the merged frame unsorted and
# keeps rows with missing headlines (the null check on `combined` reports
# nulls in Top23-Top25); a missing value is not a string and cannot be
# tokenized. Chaining the calls makes head(1000) select the 1000 earliest
# complete rows.
final = (
    pd.merge(djia, combined, how='inner', on='Date')
    .sort_values(by=["Date"], ascending=True)
    .dropna(axis=0, how='any')
    .head(1000)
)

# Select the headline columns by name rather than by a magic position.
headline_columns = [col for col in final.columns if col.startswith("Top")]

# Built once here instead of once per word / once per title inside the loop.
# The raw string avoids the invalid "\w" escape of a plain string literal.
regex = re.compile(r'\w+')
lemmatizer = nltk.stem.WordNetLemmatizer()

# First letter of the POS tags to keep (noun, adjective, adverb, verb).
KEPT_POS_INITIALS = ("N", "J", "R", "V")


def _extract_title_words(title):
    """Return the cleaned content words of a single headline.

    The headline is tokenized, every token is lemmatized, and the lemmas are
    POS-tagged. A lemma is kept when its tag starts with one of
    ``KEPT_POS_INITIALS``, it is longer than one character, and it begins
    with a word character. Kept lemmas are stripped of every character
    outside ``[a-zA-Z0-9]`` and lower-cased; lemmas that end up empty after
    stripping are dropped.

    Args:
        title: headline text (a ``str``).

    Returns:
        list[str]: cleaned words in their original order.
    """
    # NOTE(review): the printed preview shows headlines wrapped like b'...';
    # that wrapper is not removed here - consider stripping it before
    # tokenizing so the leading "b" cannot end up glued to the first word.
    lemmas = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(title)]
    kept = [
        word
        for word, tag in nltk.pos_tag(lemmas)
        if tag[0] in KEPT_POS_INITIALS and len(word) != 1 and regex.match(word)
    ]
    cleaned = (re.sub(r'[^a-zA-Z0-9]', '', word).lower() for word in kept)
    return [word for word in cleaned if word]


# One space-joined document per row of `final` (i.e. per date), built from
# all of that row's headlines. The name is kept because later cells read it.
filtered_all_years = [
    ' '.join(word for title in titles for word in _extract_title_words(title))
    for titles in final[headline_columns].itertuples(index=False, name=None)
]
# TODO: use the nltk library to collect words and store them in a dict
# instead of using regex.

In [None]:
# bag of words model

tfidfconverter = TfidfTransformer()
# Keep at most 500 terms; a term must occur in at least 5 documents and in no
# more than 70% of them. English stop words are removed.
vectorizer = CountVectorizer(max_features=500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))

# The count matrix stays sparse between the two steps: TfidfTransformer
# accepts sparse input, so converting to a dense array first only costs
# memory. Only the final TF-IDF matrix is densified.
# NOTE(review): the vocabulary and IDF weights are fit on every document,
# including those later held out by the train/test split - consider fitting
# on the training rows only.
term_counts = vectorizer.fit_transform(filtered_all_years)
X = tfidfconverter.fit_transform(term_counts).toarray()

In [None]:
# Split the data, fit the classifier, and predict on the held-out part.

# Target: the Label column of the merged frame.
y = final['Label']

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 500-tree random forest; fit() returns the estimator, so it can be chained.
classifier = RandomForestClassifier(n_estimators=500, random_state=42).fit(X_train, y_train)

y_pred = classifier.predict(X_test)
print(y_pred, y_test)

In [None]:
# evaluation
# The sklearn.metrics names used below are imported in the notebook's top
# import cell, together with every other dependency.

# Confusion matrix, per-class report, and overall accuracy on the test split.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))