In [1]:
# Common imports
import numpy as np
import pandas as pd

# To plot pretty figures
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Additional tools
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import datetime
from datetime import datetime
from dateutil.relativedelta import relativedelta
from statistics import mean
import unicodedata
from sklearn.feature_extraction.text import CountVectorizer
import random
from IPython.display import clear_output

#NLTK
import nltk
nltk.download('vader_lexicon')
from nltk.tokenize import TweetTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer


# For regular expressions
import re
# For handling string
import string
# For performing mathematical operations
import math

# to make this notebook's output identical at every run
np.random.seed(42)

# Ignore useless warnings
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/dustin.ellis/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


#### Problem: Our goal is going to be to use the tweets and headlines to predict DayReturn. Classifiction problem is just positive or negative DayReturn, Regression problem is predicting the actual number.

### Get the data
#### We decided to use Twitter data and stock prediction headlines to predict day return. Twitter data news headlines were combined into a stock prediction dataset.


In [2]:
# Repeat this for the stock prediction datafile, which is being read from a pickle. 
sp = pd.read_pickle('stock_prediction_data2.p')
#Preview dataframe
#sp.head()

In [1]:
#Some early preprocessing

#lowercase Ticker, NewsHeadlineList, NewsSourceList, NewsSummaryList, & TweetList Columns
sp = sp.apply(lambda x: x.astype(str).str.lower())
sp.head()

NameError: name 'sp' is not defined

In [4]:
Ticker = sp['Ticker']

In [5]:
#Feature engineering to get number of tweets and number of news headslines
sp['NumTweets'] = sp['TweetList'].apply(len)
sp['NumHeadlines'] = sp['NewsHeadlineList'].apply(len)

In [2]:
sp.head()

In [7]:
#Cleaning and engineering time variables for machine learning algorithms.

In [3]:
sp['Date'] = pd.to_datetime(sp['Date'])
sp['Date'] = sp['Date'].dt.strftime('%d.%m.%Y')
sp['year'] = pd.DatetimeIndex(sp['Date']).year
sp['month'] = pd.DatetimeIndex(sp['Date']).month
sp['day'] = pd.DatetimeIndex(sp['Date']).day
sp['dayofyear'] = pd.DatetimeIndex(sp['Date']).dayofyear
sp['weekofyear'] = pd.DatetimeIndex(sp['Date']).weekofyear
sp['weekday'] = pd.DatetimeIndex(sp['Date']).weekday
sp['quarter'] = pd.DatetimeIndex(sp['Date']).quarter
sp['is_month_start'] = pd.DatetimeIndex(sp['Date']).is_month_start
sp['is_month_end'] = pd.DatetimeIndex(sp['Date']).is_month_end
print(sp.info())

NameError: name 'pd' is not defined

In [9]:
sp = sp.drop(['Date'], axis = 1) 

In [4]:
#dummy encoding of dates
sp = pd.get_dummies(sp, columns=['year'], drop_first=True, prefix='year')

sp = pd.get_dummies(sp, columns=['month'], drop_first=True, prefix='month')

sp = pd.get_dummies(sp, columns=['weekday'], drop_first=True, prefix='wday')
sp = pd.get_dummies(sp, columns=['quarter'], drop_first=True, prefix='qrtr')

sp = pd.get_dummies(sp, columns=['is_month_start'], drop_first=True, prefix='m_start')

sp = pd.get_dummies(sp, columns=['is_month_end'], drop_first=True, prefix='m_end')

sp.info()

NameError: name 'pd' is not defined

In [5]:
#Cleaning and lemmatizing the tweets, news headlines, and news summaries. 

#I didn’t think URLs would help with sentiment analysis so I wanted to remove them.
#a second regex was used to remove placeholders from removed links or videos
#a third regex was used to remove HTML reference characters
#a fourth was used to remove non-letter characters
# a fifth regex was used to remove mentions @

# Do for tweets
sp['clean_tweets'] = sp['TweetList'].apply(lambda x: re.sub(r"http\S+", "", x))
sp['clean_tweets'].apply(lambda x: re.sub(r'{link}', '', x))
sp['clean_tweets'].apply(lambda x: re.sub(r'&[a-z]+;', '', x))
sp['clean_tweets'].apply(lambda x: re.sub(r"[^a-z\s\(\-:\)\\\/\];='#]", '', x))
sp['clean_tweets'].apply(lambda x: re.sub(r'@mention', '', x))
sp['clean_tweets'][333]

NameError: name 'sp' is not defined

In [6]:
# Dictionary of English Contractions
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not", 
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have"}

# Regular expression for finding contractions
contractions_re=re.compile('(%s)' % '|'.join(contractions_dict.keys()))

# Function for expanding contractions
def expand_contractions(text,contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, text)

# Expanding Contractions in the tweets, news headlines, and news summaries
sp['clean_tweets'].apply(lambda x:expand_contractions(x))

# Remove digits and words containing digits in the tweets, news headlines, and news summaries
sp['clean_tweets'].apply(lambda x: re.sub('\w*\d\w*','',x))

# Remove punctuations in the tweets, news headlines, and news summaries
sp['clean_tweets'].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x))

# Removing extra spaces in the tweets, news headlines, and news summaries
sp['clean_tweets'].apply(lambda x: re.sub(' +',' ',x))

NameError: name 're' is not defined

In [13]:
pip install -U pip setuptools wheel

Note: you may need to restart the kernel to use updated packages.


In [14]:
#Installation of spaCy to access language analytical tools 
import spacy

In [15]:
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl

Collecting en-core-web-sm==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl (13.7 MB)
[K     |████████████████████████████████| 13.7 MB 13.7 MB/s eta 0:00:01
Note: you may need to restart the kernel to use updated packages.


In [16]:
# Loading model
nlp = spacy.load('en_core_web_lg',disable=['parser', 'ner'])
nlp.max_length = 1030000 # or even higher

In [17]:
# Lemmatization with stopwords removal in the tweets, news headlines, and news summaries
sp['lemmatized_tweets'] = sp['clean_tweets'].apply(lambda x: ' '.join([token.lemma_ for token in list(nlp(x)) if (token.is_stop==False)]))

In [18]:
#check for nulls
sp.isnull().sum()
#drop nulls
sp.dropna(inplace=True)
sp.isnull().sum()

Ticker                   0
NewsHeadlineList         0
NewsSourceList           0
NewsSummaryList          0
NewsUrlList              0
NewsHeadlineCountList    0
TweetList                0
TweetTimeList            0
Open                     0
High                     0
Low                      0
Close                    0
AdjClose                 0
Volume                   0
TomorrowClose            0
DayReturn                0
NumTweets                0
NumHeadlines             0
day                      0
dayofyear                0
weekofyear               0
month_2                  0
month_3                  0
month_4                  0
month_5                  0
month_6                  0
month_7                  0
month_8                  0
month_9                  0
month_10                 0
month_11                 0
month_12                 0
wday_1                   0
wday_2                   0
wday_3                   0
wday_4                   0
wday_5                   0
w

In [19]:
#Switching columns from object to integer
sp["Open"] = sp["Open"].astype(str).astype(float)
sp["Open"] = sp["Open"].astype(float).astype(int)

sp["High"] = sp["High"].astype(str).astype(float)
sp["High"] = sp["High"].astype(float).astype(int)

sp["Low"] = sp["Low"].astype(str).astype(float)
sp["Low"] = sp["Low"].astype(float).astype(int)

sp["Close"] = sp["Close"].astype(str).astype(float)
sp["Close"] = sp["Close"].astype(float).astype(int)

sp["AdjClose"] = sp["AdjClose"].astype(str).astype(float)
sp["AdjClose"] = sp["AdjClose"].astype(float).astype(int)

sp["Volume"] = sp["Volume"].astype(str).astype(float)
sp["Volume"] = sp["Volume"].astype(float).astype(int)

sp["TomorrowClose"] = sp["TomorrowClose"].astype(str).astype(float)

sp["DayReturn"] = sp["DayReturn"].astype(str).astype(float)

              Imaging: Product WordClouds to Visualize Data from Tweets, Summaries, and Headlines

In [20]:
#group Tweets according to Ticker
sp_grouped=sp[['Ticker','lemmatized_tweets']].groupby(by='Ticker').agg(lambda x:' '.join(x))
sp_grouped.head()

Unnamed: 0_level_0,lemmatized_tweets
Ticker,Unnamed: 1_level_1
amd,[ ' rt @eliteoptions2 : $ tsla - trade idea - ...
ge,[ ' rt @newlowobserver : @tlbenson17 @cvpayne ...
nvda,[ ' rt @eliteoptions2 : $ tsla - trade idea - ...
tsla,[ ' @teslaarmy @elonmusk congrats ! ! ! $ ts...


In [21]:
# Creating Document Term Matrix
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(analyzer='word')
data=cv.fit_transform(sp_grouped['lemmatized_tweets'])
sp_dtm = pd.DataFrame(data.toarray(), columns=cv.get_feature_names())
sp_dtm.index=sp_grouped.index
sp_dtm.head(3)

Unnamed: 0_level_0,00,000,0000,00000005,000001,000003636,0000055,00001,00003,000041,...,𝟬𝟯,𝟬𝟰,𝟭𝟲,𝟮𝟬,𝟮𝟬𝟮𝟬,𝟮𝟭,𝟮𝟲,𝟯𝟱,𝟲𝟭,𝟳𝟮
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
amd,1710,1121,0,0,0,0,0,0,0,0,...,0,0,0,0,3,0,0,0,0,0
ge,792,885,0,0,0,0,0,0,0,0,...,0,1,3,0,0,0,1,1,0,1
nvda,1659,947,0,0,0,0,0,0,0,0,...,1,0,0,2,0,0,0,0,1,0


In [22]:
pip install wordcloud

Note: you may need to restart the kernel to use updated packages.


In [23]:
pip install -U textwrap3

Note: you may need to restart the kernel to use updated packages.


In [7]:
# Importing wordcloud for plotting word clouds and textwrap for wrapping longer text
from wordcloud import WordCloud
from textwrap import wrap

# Function for generating word clouds
def generate_wordcloud(data,title):
  wc = WordCloud(width=400, height=330, max_words=150,colormap="Dark2").generate_from_frequencies(data)
  plt.figure(figsize=(10,8))
  plt.imshow(wc, interpolation='bilinear')
  plt.axis("off")
  plt.title('\n'.join(wrap(title,60)),fontsize=13)
  plt.show()
    
    
  # Transposing document term matrix
sp_dtm=sp_dtm.transpose()

# Plotting word cloud for each product. Run if you would like to see these!
for index, Ticker in enumerate(sp_dtm.columns):
    generate_wordcloud(sp_dtm[Ticker].sort_values(ascending=False),Ticker)

NameError: name 'sp_dtm' is not defined

In [25]:
#Polarity
from textblob import TextBlob
sp['tweet_polarity']= sp['lemmatized_tweets'].apply(lambda x:TextBlob(x).sentiment.polarity)

In [26]:
#Finding tweets with buy or sell

In [27]:
lst = sp['lemmatized_tweets'][0]

In [28]:
lst.count('buy')

178

In [29]:
lst.count('sell')

110

In [30]:
#For tweets
sp['BuyCount_Tweets'] = sp['lemmatized_tweets'].apply(lambda x: x.count('buy'))
sp['SellCount_Tweets'] = sp['lemmatized_tweets'].apply(lambda x: x.count('sell'))

In [31]:
tweet = sp.drop(['NumHeadlines','NewsHeadlineList', 'NewsSourceList','NewsSummaryList','NewsUrlList',
'NewsHeadlineCountList', 'TweetList', 'TweetTimeList','clean_tweets'],axis = 1)

In [8]:
tweet.head()

In [9]:
#Remove emojis from dataframe column by column
tweet.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))

In [34]:
#look for correlations in regards to tweet polarity for the numerical data
corr_matrix = tweet.corr()
corr_matrix["DayReturn"].sort_values(ascending=False)

DayReturn           1.000000
m_end_True          0.368586
wday_4              0.121774
month_4             0.106567
qrtr_2              0.087435
day                 0.085488
TomorrowClose       0.032846
Volume              0.012089
dayofyear           0.005775
tweet_polarity      0.002754
weekofyear         -0.000635
month_6            -0.007113
month_7            -0.007652
month_10           -0.007744
month_11           -0.010276
month_12           -0.010718
month_8            -0.010796
month_9            -0.011299
month_5            -0.012743
wday_6             -0.013441
wday_5             -0.013470
m_start_True       -0.015421
qrtr_4             -0.017378
qrtr_3             -0.018011
month_2            -0.022753
wday_3             -0.024942
wday_1             -0.025047
month_3            -0.026486
wday_2             -0.029939
SellCount_Tweets   -0.034979
BuyCount_Tweets    -0.042152
Open               -0.059134
High               -0.059141
Low                -0.059153
AdjClose      

In [35]:
import seaborn as sns

In [10]:
# Plotting correlation matrix between numeric variables
plt.figure(figsize = (50,50))
sns.heatmap(tweet.corr(method="pearson"), cmap='Blues', annot = True)

NameError: name 'plt' is not defined

In [38]:
tweet = tweet.dropna()

In [39]:
tweet.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 324 entries, 333 to 116
Data columns (total 39 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Ticker             324 non-null    object 
 1   Open               324 non-null    int64  
 2   High               324 non-null    int64  
 3   Low                324 non-null    int64  
 4   Close              324 non-null    int64  
 5   AdjClose           324 non-null    int64  
 6   Volume             324 non-null    int64  
 7   TomorrowClose      324 non-null    float64
 8   DayReturn          324 non-null    float64
 9   NumTweets          324 non-null    int64  
 10  day                324 non-null    int64  
 11  dayofyear          324 non-null    int64  
 12  weekofyear         324 non-null    int64  
 13  month_2            324 non-null    uint8  
 14  month_3            324 non-null    uint8  
 15  month_4            324 non-null    uint8  
 16  month_5            324 n

In [40]:
tweet = tweet.drop(['month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9',
         'month_10', 'month_11', 'month_12', 'wday_1','wday_2','wday_3','wday_4','wday_5','wday_6',
            'qrtr_2','qrtr_3','qrtr_4'], axis=1)

In [42]:
tweet = tweet.drop(['m_start_True','m_end_True'], axis = 1)

In [12]:
tweet.head()

In [44]:
corr_matrix = tweet.corr()
corr_matrix["DayReturn"].sort_values(ascending=False)

DayReturn           1.000000
day                 0.085488
TomorrowClose       0.032846
Volume              0.012089
dayofyear           0.005775
tweet_polarity      0.002754
weekofyear         -0.000635
SellCount_Tweets   -0.034979
BuyCount_Tweets    -0.042152
Open               -0.059134
High               -0.059141
Low                -0.059153
AdjClose           -0.059202
Close              -0.059203
NumTweets          -0.060423
Name: DayReturn, dtype: float64

In [46]:
twit = TweetTokenizer()

In [47]:
tweet['tokenized_tweets'] = tweet['lemmatized_tweets'].apply(lambda x: twit.tokenize(x))

In [None]:
##Machine Learning

In [54]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(tweet, test_size = 0.2, random_state = 42)

#Set labels
train_labels = train_set["DayReturn"] 
test_labels = test_set["DayReturn"]

#Drop target from training and testing set
true_train_set = train_set.drop("DayReturn", axis = 1)
true_test_set = test_set.drop("DayReturn", axis = 1)

In [13]:
true_train_set.head()

In [56]:
#dropping lemmatized tweets and tokenized tweets since it is continuous data and the algorithms will not like it.

true_train_set = true_train_set.drop(["lemmatized_tweets", "tokenized_tweets"], axis = 1)
true_test_set = true_test_set.drop(["lemmatized_tweets", "tokenized_tweets"], axis = 1)

In [59]:
#Separate categorical and numerical variables for pipeline

cat_attr = ["Ticker"]
num_attr = ["Open", "High", "Low", "Close", "AdjClose", "Volume", "TomorrowClose", "NumTweets", "day","dayofyear",
           "weekofyear","tweet_polarity","BuyCount_Tweets","SellCount_Tweets"]

In [60]:
#Combining categorical and numerical pipeline

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

combine_pipeline = ColumnTransformer([
    ("num", StandardScaler(), num_attr),
    ("cat", OneHotEncoder(handle_unknown = "ignore"), cat_attr),
    ])

In [61]:
#Fit and Transform Train
trans_train = combine_pipeline.fit_transform(true_train_set)


#Transform Test
trans_test = combine_pipeline.transform(true_test_set)

In [65]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [66]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(trans_train, train_labels)

dayreturn_predictions = tree_reg.predict(trans_train)
tree_mse = mean_squared_error(train_labels, dayreturn_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

In [71]:
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(trans_test, test_labels)

dayreturn_predictions = tree_reg.predict(trans_test)
tree_mse = mean_squared_error(test_labels, dayreturn_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

In [69]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(trans_train, train_labels)

LinearRegression()

In [70]:
dayreturn_predictions = lin_reg.predict(trans_train)
lin_mse = mean_squared_error(train_labels, dayreturn_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

1.2013286148054467

In [72]:
lin_reg = LinearRegression()
lin_reg.fit(trans_test, test_labels)

LinearRegression()

In [73]:
dayreturn_predictions = lin_reg.predict(trans_test)
lin_mse = mean_squared_error(test_labels, dayreturn_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.013383738578722607

In [143]:
from sklearn.metrics import mean_absolute_error

lin_mae = mean_absolute_error(test_labels, dayreturn_predictions)
lin_mae

0.00869649401118346