Install and Import Required Packages

In [None]:
!pip install snscrape
!pip install yfinance
!pip install vaderSentiment

Collecting snscrape
  Downloading snscrape-0.3.4-py3-none-any.whl (35 kB)
Installing collected packages: snscrape
Successfully installed snscrape-0.3.4
Collecting yfinance
  Downloading yfinance-0.1.70-py2.py3-none-any.whl (26 kB)
Collecting requests>=2.26
  Downloading requests-2.27.1-py2.py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 1.3 MB/s 
Collecting lxml>=4.5.1
  Downloading lxml-4.8.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (6.4 MB)
[K     |████████████████████████████████| 6.4 MB 8.2 MB/s 
Installing collected packages: requests, lxml, yfinance
  Attempting uninstall: requests
    Found existing installation: requests 2.23.0
    Uninstalling requests-2.23.0:
      Successfully uninstalled requests-2.23.0
  Attempting uninstall: lxml
    Found existing installation: lxml 4.2.6
    Uninstalling lxml-4.2.6:
      Successfully uninstalled lxml-4.2.6
[31mERROR: pip's dependency resolver does not currently take int

In [None]:
import pandas as pd
import numpy as np

# Utilities
import re
import string
import datetime

# Twitter Data
import snscrape.modules.twitter as sntwitter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#Yahoo Finance
import yfinance as yf

#Text Pre-processing
import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer 

# Data Model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Fetch Twitter Data

In [None]:
def get_tweets(about,since,until):
  """Scrape tweets for one search term and return them as a DataFrame.

  Args:
    about: Search term placed at the start of the query string.
    since: Value inserted after ``since:`` in the query.
    until: Value inserted after ``until:`` in the query.

  Returns:
    DataFrame with columns 'Datetime', 'Tweet Id' and 'Text', one row per
    item yielded by the scraper.
  """
  query = '{0} + since:{1} until:{2} --en -filter:links -filter:replies'.format(about,since,until)
  scraper = sntwitter.TwitterSearchScraper(query)

  # One [date, id, content] record per scraped tweet.
  records = [[item.date, item.id, item.content] for item in scraper.get_items()]

  return pd.DataFrame(records, columns=['Datetime', 'Tweet Id', 'Text'])

Fetch Yahoo Finance Data

In [None]:
def get_yfinance_data(company, period='3y'):
  """Download price history for a ticker from Yahoo Finance.

  Args:
    company: Ticker symbol passed to ``yf.Ticker``.
    period: Look-back window passed to ``Ticker.history``. Defaults to
      '3y', the value that was previously hard-coded.

  Returns:
    DataFrame built from whatever ``Ticker.history`` returns.
  """
  history = yf.Ticker(company).history(period=period)

  return pd.DataFrame(history)

Text Cleaning

In [None]:
def remove_punctuation(input_text):
  """Strip punctuation from text while keeping periods inside tokens.

  Every character of ``string.punctuation`` except '.' is deleted wherever
  it occurs. The text is then split on whitespace and any punctuation
  (periods included) is trimmed from both ends of each token, so "3.5"
  survives but "ok." becomes "ok".

  Args:
    input_text: The string to clean.

  Returns:
    The cleaned tokens joined by single spaces. A token consisting only of
    periods becomes an empty string and is still joined.
  """
  # Everything except the period is removed outright.
  deletable = string.punctuation.replace(".", "")
  without_marks = input_text.translate(str.maketrans("", "", deletable))

  # Trim leading/trailing punctuation (now only periods can remain) per token.
  trimmed_tokens = (token.strip(string.punctuation) for token in without_marks.split())
  return " ".join(trimmed_tokens)

def remove_stopwords(input_text):
  """Drop English stop words and a few extra noise tokens from text.

  Args:
    input_text: Whitespace-separated text. Matching is exact, so the text
      is expected to be lower-cased already.

  Returns:
    The remaining tokens joined by single spaces.
  """
  # Sets give constant-time membership tests; the list form was scanned
  # from the start for every token.
  stopword_set = set(stopwords.words('english'))
  inclusion_set = set()                  # stop words to keep anyway (none so far)
  exclusion_set = {'rt','na','none'}     # extra tokens that are always dropped

  kept_words = [
    word for word in input_text.split()
    if (word not in stopword_set or word in inclusion_set)
    and word not in exclusion_set
  ]
  return " ".join(kept_words)

def lemmatize_words(input_text):
  """Tokenize text, drop repeated tokens and lemmatize what is left.

  Args:
    input_text: The text to process.

  Returns:
    The lemmas of the distinct tokens, in order of first appearance,
    joined by single spaces.
  """
  lemmatizer = WordNetLemmatizer()
  tokens = nltk.word_tokenize(input_text)

  # dict.fromkeys removes repeats like set() did, but keeps first-seen
  # order. Iterating a set of strings can yield a different order on every
  # interpreter run, which made the cleaned text non-reproducible.
  distinct_tokens = dict.fromkeys(tokens)
  return ' '.join(lemmatizer.lemmatize(token) for token in distinct_tokens)

def text_cleaning(input_text):
  """Normalise a raw tweet into a space-separated string of cleaned tokens.

  Steps, in order: lower-case, drop @mentions, drop URLs, turn hyphens into
  spaces, remove punctuation, drop non-ASCII characters, remove stop words,
  lemmatize, and finally discard tokens of two characters or fewer.

  Args:
    input_text: The raw tweet text.

  Returns:
    The cleaned text.
  """
  x = input_text.lower()

  # Drop @mentions.
  x = re.sub(r'@\w+', '', x)

  # Drop URLs *before* touching hyphens: replacing '-' first would split a
  # hyphenated URL at the hyphen and leave its tail behind as junk tokens.
  x = re.sub(r'https?://\S+\s?', '', x)

  # Treat hyphenated compounds as separate words.
  x = re.sub(r'-', ' ', x)

  x = remove_punctuation(x)

  # Discard anything outside ASCII (emoji, accented characters, ...).
  x = x.encode("ascii", "ignore").decode()

  x = remove_stopwords(x)

  x = lemmatize_words(x)

  # Keep only tokens longer than two characters.
  return " ".join(word for word in x.split() if len(word) > 2)

_VADER_ANALYZER = None  # created on first use by sentiment_scores()

def sentiment_scores(input_text):
  """Return the sentiment polarity scores for a piece of text.

  The analyzer is built once and reused; previously a new
  ``SentimentIntensityAnalyzer`` was constructed for every single call.

  Args:
    input_text: The text to score.

  Returns:
    Whatever ``SentimentIntensityAnalyzer.polarity_scores`` returns for the
    text. Callers in this file read its 'compound' entry.
  """
  global _VADER_ANALYZER
  if _VADER_ANALYZER is None:
    _VADER_ANALYZER = SentimentIntensityAnalyzer()
  return _VADER_ANALYZER.polarity_scores(input_text)

def sentiment_category(input_dic):
  """Map a polarity-score dict to a sentiment label.

  Args:
    input_dic: Mapping that contains a numeric 'compound' entry.

  Returns:
    'Positive' when compound >= 0.05, 'Negative' when compound <= -0.05,
    otherwise 'Neutral'.
  """
  compound = input_dic['compound']

  if compound >= 0.05:
    return 'Positive'
  if compound <= -0.05:
    return 'Negative'
  return 'Neutral'

Functions to fetch and process data

In [None]:
def fetch_data(keywords, since, till, ticker='VMW'):
  """Fetch tweets for several keywords plus the stock price history.

  Args:
    keywords: Iterable of search terms; tweets are fetched per term.
    since: Start of the tweet window, passed through to ``get_tweets``.
    till: End of the tweet window, passed through to ``get_tweets``.
    ticker: Symbol passed to ``get_yfinance_data``. Defaults to 'VMW',
      the value that was previously hard-coded.

  Returns:
    Tuple ``(twitterData, yfinanceData)``: the concatenated tweet frames,
    and the price frame with 'Close' rounded to 2 decimals and its index
    moved into a column.
  """
  dfs = [get_tweets(keyword, since, till) for keyword in keywords]

  # Number of tweets fetched per keyword, in keyword order.
  print([len(df) for df in dfs])

  # ignore_index gives every row a unique label; without it each per-keyword
  # frame restarts at 0 and the combined index contains duplicates.
  # NOTE(review): a tweet returned for more than one keyword would appear
  # once per keyword -- confirm whether de-duplicating on 'Tweet Id' is wanted.
  twitterData = pd.concat(dfs, ignore_index=True)

  yfinanceData = get_yfinance_data(ticker)
  yfinanceData['Close'] = yfinanceData['Close'].round(2)
  yfinanceData = yfinanceData.reset_index()

  return twitterData, yfinanceData

def _annotate_tweet_sentiment(twitterData):
  """Add day, cleaned text, tokens, score and sentiment columns to the tweet frame."""
  twitterData['Date'] = twitterData.Datetime.values.astype(dtype='datetime64[D]')
  twitterData['cleanText'] = twitterData.Text.apply(text_cleaning)
  twitterData['words'] = twitterData['cleanText'].apply(lambda x: x.split())
  twitterData['score'] = twitterData.cleanText.apply(sentiment_scores)
  twitterData['sentiment'] = twitterData.score.apply(sentiment_category)
  # Explicit 0/1 flag: unlike get_dummies, the column exists even when no
  # tweet is classified as Negative, so the aggregation below cannot KeyError.
  twitterData['sentiment_Negative'] = (twitterData['sentiment'] == 'Negative').astype(int)
  return twitterData


def _build_feature_table(base, yfinanceData, twitterAggData, since):
  """Join prices and daily tweet counts onto the calendar and derive model features."""
  prices = yfinanceData[yfinanceData.Date >= since]
  final_data = pd.merge(base, prices, on='Date', how='left')
  final_data = pd.merge(final_data, twitterAggData, on='Date', how='left')
  final_data = (
    final_data[['Date', 'Close', '# neg tweets', '# tweets']]
    .rename(columns={'Close':'Stock Price'})
  )

  # The original called final_data.fillna(0) and discarded the result.
  # Only the tweet counts are zero-filled: a day without tweets really has
  # 0 tweets, whereas a day without a quote must not get a price of 0.
  count_cols = ['# neg tweets', '# tweets']
  final_data[count_cols] = final_data[count_cols].fillna(0)

  # 0 / 0 on a day without tweets yields NaN; such rows are dropped below.
  final_data['% neg tweets'] = (final_data['# neg tweets'] / final_data['# tweets']).round(2)
  final_data['# neg tweets in LXdays'] = final_data['# neg tweets'].rolling(7).sum().fillna(0)
  final_data['# tweets in LXdays'] = final_data['# tweets'].rolling(7).sum()
  final_data['% neg tweets in LXdays'] = (
    final_data['# neg tweets in LXdays'] / final_data['# tweets in LXdays']
  ).round(2)
  final_data['Change in stock price'] = final_data['Stock Price'] - final_data['Stock Price'].shift(1)
  # closed='left' leaves the current day out of its own trailing average.
  final_data['Last 7 days avg stock price'] = (
    final_data['Stock Price'].rolling(window=7,min_periods=1, closed='left').mean()
  )
  final_data['Change in Last 7 days avg stock price'] = (
    final_data['Last 7 days avg stock price'] - final_data['Last 7 days avg stock price'].shift(1)
  )

  # Rows with any missing value (no quote, no tweets, incomplete window) go.
  return final_data.dropna().copy()


def prepare_data(keywords=('vmware','esx','vmworld','vmc','vcf','vcenter'),
                 since='2020-01-01',
                 till='2022-04-01'):
  """Build the word-cloud table and the modelling table with predictions.

  Fetches tweets and prices, scores tweet sentiment, aggregates tweets per
  day, derives rolling features, fits a linear regression on the first 80%
  of the rows (no shuffling) and stores its predictions for every row.
  Prints the train/test shapes and a completion message.

  Args:
    keywords: Search terms to fetch tweets for.
    since: First day of the window, formatted 'YYYY-MM-DD'.
    till: Day after the last day of the window, formatted 'YYYY-MM-DD'.

  Returns:
    Tuple ``(twitterWordCloud, final_data)``: occurrence counts per
    (Date, sentiment, word), and the per-day feature table including the
    'predicted_value' column.

  NOTE(review): 'Change in stock price' is computed from the same day's
  'Stock Price' that serves as the target -- confirm this is intended.
  """
  start = datetime.datetime.strptime(since,'%Y-%m-%d')
  end = datetime.datetime.strptime(till,'%Y-%m-%d')
  # One calendar row per day in [since, till).
  daterange = [start + datetime.timedelta(days=x) for x in range(0, (end-start).days)]
  base  = pd.DataFrame(daterange, columns=['Date'])

  twitterData, yfinanceData = fetch_data(list(keywords), since, till)
  twitterData = _annotate_tweet_sentiment(twitterData)

  twitterWordCloud = (
    twitterData[['Date','sentiment','words']]
    .explode('words')
    .groupby(['Date','sentiment','words'])
    .size()
    .reset_index(name='# occurence')
  )

  twitterAggData = (
    twitterData.groupby('Date')
    .agg({'sentiment_Negative':'sum', 'Tweet Id':'count'})
    .reset_index()
    .rename(columns={'sentiment_Negative':'# neg tweets', 'Tweet Id':'# tweets'})
  )

  # Filter prices by the requested start date instead of a fixed literal.
  final_data = _build_feature_table(base, yfinanceData, twitterAggData, since)

  X = final_data[['Last 7 days avg stock price', '# tweets', '% neg tweets', '% neg tweets in LXdays', 'Change in stock price', 'Change in Last 7 days avg stock price']]
  y = final_data['Stock Price']

  # shuffle=False keeps chronological order: train on the earlier rows.
  X_train, X_test, y_train, y_test = train_test_split(X,
                                                      y,
                                                      test_size=0.2,
                                                      shuffle=False)
  print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
  model = LinearRegression()
  model.fit(X_train,y_train)
  print('Linear Regression Created')

  # Predictions cover every row, training rows included.
  final_data['predicted_value'] = model.predict(X)

  return twitterWordCloud,final_data

  

In [None]:
def main():
  """Run the pipeline and write both result tables to CSV files.

  Writes 'word_cloud.csv' and 'final_data.csv' (without the index column)
  to the current working directory.
  """
  word_cloud_table, model_table = prepare_data()

  outputs = (
    (word_cloud_table, 'word_cloud.csv'),
    (model_table, 'final_data.csv'),
  )
  for table, csv_path in outputs:
    table.to_csv(csv_path, index=False)

In [None]:
# Entry point: run the whole pipeline when this code executes as the
# top-level module (not when it is imported from elsewhere).
if __name__ == "__main__":
  main()

[1815, 1539, 33, 773, 28053, 30]
(356, 6) (90, 6) (356,) (90,)
Logistic Regression Created
