<a href="https://colab.research.google.com/github/dieko95/AlgoTrading/blob/develop/AlgoTrading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AlgoTrading

## Download Alt Data (Tweets)

https://pypi.org/project/GetOldTweets3/



```
python Exporter.py --query @RedHat --since 2014-01-01 --until 2019-01-15 --maxtweets 5000000
```



In [1]:
# !pip install GetOldTweets3
# !pip install langdetect
# !pip install vaderSentiment



In [0]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np 
from langdetect import detect
import time


In [188]:
# !pip install -U -q PyDrive
 
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

link = 'https://drive.google.com/open?id=1NSuc3X37Z2r29mxKt92JFs0krF63aNNM'
fluff, id = link.split('=')

print(id)


1NSuc3X37Z2r29mxKt92JFs0krF63aNNM


In [0]:
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('Filename.csv')  
tweetsDf = pd.read_csv('Filename.csv')

In [190]:
tweetsDf.shape

(51603, 10)

##  Loading Prototype - snippet

![alt text](https://cdn-images-1.medium.com/max/800/1*G8yV2iaqqfaGfmRPRem2Fw.png)

In [0]:
# from google.colab import files

# uploaded = files.upload()

# for fn in uploaded.keys():
#   print('User uploaded file "{name}" with length {length} bytes'.format(
#       name=fn, length=len(uploaded[fn])))

In [0]:
# tweetsDf = pd.read_csv('redhatprototype.csv')

## First Cleaning Iteration

In [191]:
# Erasing Short Tweets

start_time = time.time()

tweetsDf.text = tweetsDf.text.apply(lambda x: len(str(x).split()) < 3 and np.nan or x) #If the tweet is less than 7 characters create a NA place Holder 


# Erasing Misc Characters 

tweetsDf.text = tweetsDf.text.replace(r'@\w*', '', regex = True) # Erases all usernames (word preceding @)
tweetsDf.text = tweetsDf.text.replace(r'#\w*', '', regex = True) # Erases hashtags
tweetsDf.text = tweetsDf.text.replace(r'https://.+', '', regex = True) # Erases URLS

print("Middle of program--- %s seconds ---" % (time.time() - start_time))

tweetsDf.text = tweetsDf.text.replace(r'http://.+', '', regex = True) # Erases URLS
tweetsDf.text = tweetsDf.text.replace(r'=>', '', regex = True) # Erases =>
tweetsDf.text = tweetsDf.text.replace(r'[$-/:-?{-~"^_`\[\]]', '', regex = True) #Replace weird characters
tweetsDf.text = tweetsDf.text.apply(lambda x: len(str(x).split()) < 3 and np.nan or x) #If the tweet is less than 3 characters create a NA place Holder 



tweetsDf = tweetsDf.dropna(subset=['text'])


# Erasing Non English Tweets 

print("Before detect --- %s seconds ---" % (time.time() - start_time))


tweetsDf.text = tweetsDf.text.apply(lambda x: detect(str(x)) != 'en' and np.nan or x) # Creates NA place holder for text that is not in english 

tweetsDf = tweetsDf.dropna(subset=['text'])



print("--- %s seconds ---" % (time.time() - start_time))


Middle of program--- 0.25501370429992676 seconds ---
--- 276.7730143070221 seconds ---


In [91]:
# Tweets that were deleted - 110 for RHT 
print(tweetsDf.describe(), '\n','\n'
     'Tweets Without Info: ', tweetsDf.shape[0] - tweetsDf.text.dropna().shape[0])

tweetsDf = tweetsDf.dropna(subset=['text'])
tweetsDf.shape

           retweets     favorites
count  51603.000000  51603.000000
mean       2.851714      4.312811
std       18.691312     20.509410
min        0.000000      0.000000
25%        0.000000      0.000000
50%        0.000000      1.000000
75%        3.000000      4.000000
max     2877.000000   2330.000000 
 
Tweets Without Info:  110


In [0]:
start_time = time.time()

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


analyzer = SentimentIntensityAnalyzer()

tweet = []
number_favourites = []
retweets = []
vs_compound = []
vs_pos = []
vs_neu = []
vs_neg = []
timeStamp = []


for i in range(tweetsDf.shape[0]):
    tweet.append(tweetsDf.iloc[i,4])
    number_favourites.append(tweetsDf.iloc[i,3])
    retweets.append(tweetsDf.iloc[i,2])
    timeStamp.append(tweetsDf.iloc[i,1])
    vs_compound.append(analyzer.polarity_scores(tweetsDf.iloc[i,4])['compound'])
    vs_pos.append(analyzer.polarity_scores(tweetsDf.iloc[i,4])['pos'])
    vs_neu.append(analyzer.polarity_scores(tweetsDf.iloc[i,4])['neu'])
    vs_neg.append(analyzer.polarity_scores(tweetsDf.iloc[i,4])['neg'])
    
    
print("--- %s seconds ---" % (time.time() - start_time))

In [194]:

twitterDf = DataFrame({'Tweet': tweet,
                        'Favourites': number_favourites,
                        'retweets' : retweets,
                        'timeStamp' : timeStamp,
                        'Compound': vs_compound,
                        'Positive': vs_pos,
                        'Neutral': vs_neu,
                        'Negative': vs_neg})
twitterDf = twitterDf[['timeStamp','Tweet', 'Favourites', 'retweets','Compound',
                         'Positive', 'Neutral', 'Negative']]

twitterDf.shape

(44801, 8)

In [0]:
from google.colab import files

twitterDf.to_csv('cleanedTwitter_v0.1.csv')
files.download('cleanedTwitter_v0.1.csv')

## Aggregating By Day

In [0]:
aggDf = twitterDf.drop(columns = 'Tweet')
aggDf.timeStamp = pd.to_datetime(aggDf.timeStamp) 
aggDf.set_index(aggDf["timeStamp"],inplace=True)
aggDfDay = aggDf.resample('D').sum()

# aggDfDay.Compound.value_counts()

aggDfDay

In [0]:
from google.colab import files

aggDfDay.to_csv('aggDfDay_v0.1.csv')
files.download('aggDfDay_v0.1.csv')