In [4]:
import pandas as pd
import numpy as np
import ast

# Read the two source tables that get merged further down.
news = pd.read_csv('news_headlines.csv')
twitter = pd.read_csv('twitter_data.csv')


# The twitter CSV stores each list as its string representation; parse those
# strings back into real Python lists. ast.literal_eval only evaluates
# literals, so nothing embedded in the file is executed as code.
stringified_list_columns = ('TweetList', 'TimeList')
for column_name in stringified_list_columns:
    twitter[column_name] = twitter[column_name].apply(ast.literal_eval)

# Sanity check on the first row: both cells should now be list objects.
first_twitter_row = twitter.iloc[0]
for column_name in stringified_list_columns:
    print("Should be a list: ", type(first_twitter_row[column_name]))


# Reshape the news frame to mirror the twitter frame: one row per unique
# (ticker, date) pair, with each per-article column collected into a list
# for that day.
#
# A single groupby pass replaces re-filtering the whole frame once for every
# unique pair. sort=False yields the groups in order of first appearance and
# rows keep their original order inside each group, so both the row order and
# the list contents match what the per-pair filtering produced.
# Rows whose ticker or date is missing are skipped by groupby, instead of
# producing a row of empty lists (a NaN key never matches an == filter).
daily_rows = [
    [
        day,
        symbol,
        articles['headline'].tolist(),
        articles['source'].tolist(),
        articles['summary'].tolist(),
        articles['url'].tolist(),
        articles['headline_count'].tolist(),
    ]
    for (symbol, day), articles in news.groupby(['ticker', 'date'], sort=False)
]

# Re-create the news dataframe. The key columns are renamed to 'Date' and
# 'Ticker' so they line up with the keys used in the merge with twitter.
news = pd.DataFrame(
    daily_rows,
    columns=['Date', 'Ticker', 'NewsHeadlineList', 'NewsSourceList', 'NewsSummaryList',
             'NewsUrlList', 'NewsHeadlineCountList'],
)

# Restrict the news dataframe to the time frame covered by the twitter data
# (both end dates inclusive).
first_twitter_date = twitter['Date'].min()
last_twitter_date = twitter['Date'].max()
news = news[news['Date'].between(first_twitter_date, last_twitter_date)]

# Check that every date present in the twitter data also appears in news
# (a count of 0 means news covers all twitter dates).
# isin does one set-style lookup instead of rebuilding the news date list and
# scanning it linearly for every twitter row.
# NOTE: this compares dates only, not (Ticker, Date) pairs; a twitter row whose
# pair is absent from news would still be dropped by the left join below.
missing_date_count = int((~twitter['Date'].isin(news['Date'])).sum())
print ("# of dates in twitter that are not in news: ", missing_date_count)

# Merge the 2 datasets. The left join keeps every news row; (Ticker, Date)
# pairs with no twitter row get NaN in the columns that come from twitter.
data = news.merge(twitter, how='left', on=['Ticker', 'Date'])

# Rename column so it is clear it holds tweet timestamps.
data = data.rename(columns={'TimeList': 'TweetTimeList'})

# Persist the merged frame with pickle (not CSV) so the list-valued cells
# survive the round trip as real lists.
# NOTE: the file name ends in .csv but the content is a pickle; it has to be
# loaded with pd.read_pickle, not pd.read_csv.
output_path = 'stock_prediction_data.csv'
data.to_pickle(output_path)

# Load the file straight back into a dataframe.
data = pd.read_pickle(output_path)

# Confirm on the first row that the list columns are still lists.
reloaded_first_row = data.iloc[0]
print ("Headline types: ", type(reloaded_first_row['NewsHeadlineList']))
print ("Tweet types: ", type(reloaded_first_row['TweetList']))

Should be a list:  <class 'list'>
Should be a list:  <class 'list'>
# of dates in twitter that are not in news:  0
Headline types:  <class 'list'>
Tweet types:  <class 'list'>


In [5]:
#final dataset: news includes weekends / holidays while twitter is only days when market is open,
#so rows that have news but no matching twitter row show nulls in the tweet and price columns
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 428 entries, 0 to 427
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Date                   428 non-null    object 
 1   Ticker                 428 non-null    object 
 2   NewsHeadlineList       428 non-null    object 
 3   NewsSourceList         428 non-null    object 
 4   NewsSummaryList        428 non-null    object 
 5   NewsUrlList            428 non-null    object 
 6   NewsHeadlineCountList  428 non-null    object 
 7   TweetList              325 non-null    object 
 8   TweetTimeList          325 non-null    object 
 9   Open                   325 non-null    float64
 10  High                   325 non-null    float64
 11  Low                    325 non-null    float64
 12  Close                  325 non-null    float64
 13  AdjClose               325 non-null    float64
 14  Volume                 325 non-null    float64
dtypes: flo

In [6]:
#display the merged dataframe (the notebook shows a truncated view of the rows)
data

Unnamed: 0,Date,Ticker,NewsHeadlineList,NewsSourceList,NewsSummaryList,NewsUrlList,NewsHeadlineCountList,TweetList,TweetTimeList,Open,High,Low,Close,AdjClose,Volume
0,2021-01-04,TSLA,"[2 Tesla Analysts Break Down Fundamentals, Val...","[benzinga, Nasdaq, MarketWatch, MarketWatch, M...",[Tesla Inc (NASDAQ: TSLA) nearly met what many...,[https://finnhub.io/api/news?id=f603940805be2c...,"[2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, ...",[@TeslaArmy @elonmusk Congrats!!! $TSLA def h...,"[2021-01-05T04:59:58.000Z, 2021-01-05T04:59:54...",719.460022,744.489990,717.190002,729.770020,729.770020,48638200.0
1,2021-01-05,TSLA,[Tesla Gets Regulatory Nod To Begin Sales In I...,"[benzinga, benzinga, benzinga, benzinga, Nasda...",[Tesla Inc (NASDAQ: TSLA) is expected to begin...,[https://finnhub.io/api/news?id=57fd121e395825...,"[1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",[RT @garyblack00: The U.S. is nearing a point ...,"[2021-01-06T04:59:56.000Z, 2021-01-06T04:59:52...",723.659973,740.840027,719.200012,735.109985,735.109985,32245200.0
2,2021-01-06,TSLA,[Tesla Option Traders Are Dumping Massive Amou...,"[benzinga, benzinga, benzinga, benzinga, benzi...",[Tesla Inc (NASDAQ: TSLA) gained another 2.8% ...,[https://finnhub.io/api/news?id=1fd0defa5d386b...,"[2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, ...",[@capital_sb Interesting.... Cathie been prett...,"[2021-01-07T04:59:29.000Z, 2021-01-07T04:58:56...",758.489990,774.000000,749.099976,755.979980,755.979980,44700000.0
3,2021-01-07,TSLA,"[Elon Musk, Jeff Bezos Battling It Out For The...","[benzinga, businesswire, benzinga, benzinga, b...",[Tesla Inc (NASDAQ: TSLA) CEO Elon Musk woke u...,[https://finnhub.io/api/news?id=f6403287dd23a1...,"[1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",[RT @jasondebolt: Today I’m retiring from the ...,"[2021-01-08T04:59:51.000Z, 2021-01-08T04:59:32...",777.630005,816.989990,775.200012,816.039978,816.039978,51498900.0
4,2021-01-08,TSLA,"[Tesla To Launch $25,000 EV Sedan In China By ...","[benzinga, benzinga, benzinga, benzinga, benzi...",[Tesla Inc (NASDAQ: TSLA) is making its vehicl...,[https://finnhub.io/api/news?id=15e92cf65a4b9f...,"[2, 2, 2, 1, 2, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, ...",[RT @Teslaconomics: I BOUGHT THE ROADSTER! 🔥🔥🔥...,"[2021-01-09T04:59:57.000Z, 2021-01-09T04:59:31...",856.000000,884.489990,838.390015,880.020020,880.020020,75055500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423,2021-04-26,AMD,[Chipmaker stocks are going to be ‘the key’ to...,"[Yahoo, Yahoo, Yahoo, Yahoo, CNBC, SeekingAlph...",[Semiconductor stock performance could help in...,[https://finnhub.io/api/news?id=ab9036cbda0d18...,"[1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, ...",[RT @eWhispers: #earnings for the week \n\nhtt...,"[2021-04-27T03:59:27.000Z, 2021-04-27T03:57:40...",83.349998,85.900002,82.699997,85.410004,85.410004,57594500.0
424,2021-04-27,AMD,"[Dow Jones Futures: Microsoft, Google, AMD Lea...","[Yahoo, Reuters, Nasdaq, Yahoo, Yahoo, Yahoo, ...",[The major indexes held steady Tuesday while M...,[https://finnhub.io/api/news?id=656ff29cac9964...,"[1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, ...",[RT @johnscharts: $AMD AH on earnings https://...,"[2021-04-28T03:58:40.000Z, 2021-04-28T03:58:29...",85.669998,87.150002,85.129997,85.209999,85.209999,61909900.0
425,2021-04-28,AMD,"[U.S. stocks end lower, despite Powell vow to ...","[Yahoo, Yahoo, CNBC, Yahoo, DowJones, MarketWa...",[Stocks finished lower Wednesday after Federal...,[https://finnhub.io/api/news?id=8f2b14778cbd01...,"[1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, ...",[$AMD great earnings but chip shortage bit the...,"[2021-04-29T03:57:46.000Z, 2021-04-29T03:56:40...",88.849998,89.199997,83.919998,84.019997,84.019997,108920300.0
426,2021-04-29,AMD,"[Top Research Reports for Amazon, United Parce...","[Yahoo, SeekingAlpha, Yahoo, Yahoo, MarketWatc...","[Top Research Reports for Amazon, United Parce...",[https://finnhub.io/api/news?id=a03f5d42e7f344...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",[learn trading and get instant help click here...,"[2021-04-30T03:58:46.000Z, 2021-04-30T03:43:03...",84.699997,85.269997,82.629997,83.910004,83.910004,51294500.0
