# Combine price data with Twitter data
## Imports

In [2]:
import pandas as pd
from datetime import datetime
import os

## Read raw Twitter data and format

In [14]:
# Directory that holds the raw CSV inputs. It can be overridden with the
# DIGITALASSET_DATA_DIR environment variable so the notebook is not tied to
# one machine; when the variable is unset the original location is used.
DATA_DIR = os.environ.get(
    'DIGITALASSET_DATA_DIR',
    '/Users/stevelee/Documents/Research/DigitalAsset_Analysis/data',
)
# The working directory is still switched because the remaining cells read
# and write their files by bare name.
os.chdir(DATA_DIR)

df_twitter = pd.read_csv('twitter_crypto.csv')
# errors='coerce': any Date value that does not match the format becomes NaT
# instead of raising, so malformed rows do not abort the load.
df_twitter['Date'] = pd.to_datetime(
    df_twitter['Date'], format='%Y-%m-%d %H:%M:%S', errors='coerce'
)
# Index by tweet timestamp and discard the 'Unnamed: 0' column
# (presumably a row counter left over from an earlier export -- TODO confirm).
df_twitter = df_twitter.set_index('Date').drop('Unnamed: 0', axis=1)
df_twitter.head()

Unnamed: 0_level_0,TweetId,Text,Retweets,Favorites
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-08-15 23:59:28,8.976087e+17,Ok,0.0,0.0
2017-08-15 23:59:24,8.976087e+17,Brendan busy spreading the crypto gospel to @f...,0.0,0.0
2017-08-15 23:59:10,8.976086e+17,I invited Steemit user silverbit to my Crypto ...,0.0,0.0
2017-08-15 23:59:10,8.976086e+17,Take your financial freedom back and buy Crypt...,3.0,6.0
2017-08-15 23:58:52,8.976086e+17,Accept Bitcoin and other crypto in your online...,1.0,0.0


## Read price data and format

In [18]:
def _zero_seconds(stamp):
    """Return `stamp` with its seconds field set to 0 (other fields untouched)."""
    return stamp.replace(second=0)


# Load the hourly price table, convert the epoch-seconds 'Time' column to
# timestamps, index by it and discard the 'Unnamed: 0' column.
df_price = (
    pd.read_csv('coinData_hourly(Aug17-April18).csv')
    .assign(Time=lambda frame: pd.to_datetime(frame['Time'], unit='s'))
    .set_index('Time')
    .drop('Unnamed: 0', axis=1)
)
# Clear the seconds on every index entry -- presumably so the stamps line up
# with the hourly Twitter bins when the two frames are merged.
df_price.index = df_price.index.map(_zero_seconds)
df_price.head()

Unnamed: 0_level_0,BTC,ETH,LTC,Volume_BTC,Volume_Eth,Volume_LTC,Volume_XRP,XRP
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-08-17 14:00:00,4440.544599,309.515187,44.228847,0.076872,17.338063,17.463,618.44108,0.157946
2017-08-17 15:00:00,4456.327055,306.33331,44.273498,0.709464,2.999052,1.10825,4431.120239,0.158022
2017-08-17 16:00:00,4380.102847,304.43885,44.102044,0.014249,0.157371,17.64,283.563,0.156558
2017-08-17 17:00:00,4307.458884,300.147892,43.651379,0.279529,0.025056,12.3563,546.63675,0.155399
2017-08-17 18:00:00,4301.53803,302.484477,43.773482,3.0,2.0,22.88,1202.999924,0.15678


## Resample Twitter data

In [19]:
# Bucket the tweets into hourly bins once and derive every summary column
# from the same resampler.
hourly_bins = df_twitter.resample('H')
df_twitterSummary = pd.DataFrame({
    # Total retweets and favorites received by tweets posted in each hour.
    'Retweets': hourly_bins['Retweets'].sum(),
    'Favorites': hourly_bins['Favorites'].sum(),
    # Number of tweets per hour, measured as the count of non-null
    # 'Favorites' values in the bin.
    'NumTweets': hourly_bins['Favorites'].count(),
})
df_twitterSummary.head()

Unnamed: 0_level_0,Retweets,Favorites,NumTweets
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-08-15 00:00:00,1002.0,1505.0,304
2017-08-15 01:00:00,120.0,246.0,277
2017-08-15 02:00:00,293.0,272.0,218
2017-08-15 03:00:00,64.0,159.0,237
2017-08-15 04:00:00,214.0,448.0,229


## Merge on index

In [21]:
# Align prices and hourly Twitter aggregates on their timestamp indexes.
# The outer join keeps every timestamp from either side ...
merged_outer = pd.merge(
    df_price,
    df_twitterSummary,
    how='outer',
    left_index=True,
    right_index=True,
)
# ... and dropna() then discards any row with a missing value, leaving only
# timestamps for which every price and Twitter column is populated.
df_all = merged_outer.dropna()
df_all

Unnamed: 0,BTC,ETH,LTC,Volume_BTC,Volume_Eth,Volume_LTC,Volume_XRP,XRP,Retweets,Favorites,NumTweets
2017-08-17 14:00:00,4440.544599,309.515187,44.228847,0.076872,17.338063,17.463000,618.441080,0.157946,414.0,667.0,335.0
2017-08-17 15:00:00,4456.327055,306.333310,44.273498,0.709464,2.999052,1.108250,4431.120239,0.158022,340.0,720.0,370.0
2017-08-17 16:00:00,4380.102847,304.438850,44.102044,0.014249,0.157371,17.640000,283.563000,0.156558,616.0,1100.0,372.0
2017-08-17 17:00:00,4307.458884,300.147892,43.651379,0.279529,0.025056,12.356300,546.636750,0.155399,2029.0,928.0,426.0
2017-08-17 18:00:00,4301.538030,302.484477,43.773482,3.000000,2.000000,22.880000,1202.999924,0.156780,265.0,652.0,413.0
2017-08-17 19:00:00,4242.650309,300.186109,43.511223,0.004704,0.999979,15.115076,1922.000000,0.155655,563.0,954.0,418.0
2017-08-17 20:00:00,4258.445033,299.486989,43.285715,0.052984,1.190000,4.100000,36.687501,0.154807,6202.0,32430.0,334.0
2017-08-17 21:00:00,4284.504483,301.463265,43.641581,0.025617,2.332000,9.440000,4066.339999,0.155901,184.0,563.0,382.0
2017-08-17 22:00:00,4335.512596,303.608641,43.726677,0.011497,15.701980,380.849727,705.171213,0.157892,105.0,400.0,303.0
2017-08-17 23:00:00,4306.875248,303.864289,44.089693,0.648049,2.000000,3.591303,10000.000000,0.158582,411.0,657.0,268.0


## Write to CSV

In [22]:
# Persist the merged price/Twitter table (index included) to the current
# working directory.
output_csv = 'sampleTwitterData.csv'
df_all.to_csv(output_csv)