In [1]:
import pandas as pd

# Load the four input CSVs; the tuple order below decides which file feeds which frame.
aapl_news, amzn_news, aapl_tweets, amzn_tweets = (
    pd.read_csv(csv_path)
    for csv_path in (
        'aapl_news_stocks.csv',
        'amzn_news_stocks.csv',
        'aapl_tweets_stocks.csv',
        'amzn_tweets_stocks.csv',
    )
)

In [2]:
# Drop the 'Unnamed: 0' column from every frame.
# NOTE(review): presumably the index column written by an earlier to_csv — confirm.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
aapl_news = aapl_news.drop(columns=['Unnamed: 0'], errors='ignore')
amzn_news = amzn_news.drop(columns=['Unnamed: 0'], errors='ignore')
aapl_tweets = aapl_tweets.drop(columns=['Unnamed: 0'], errors='ignore')
amzn_tweets = amzn_tweets.drop(columns=['Unnamed: 0'], errors='ignore')

In [3]:
# Downsample the tweet frames to fixed sizes.
# NOTE(review): the sizes presumably mirror the news row counts — confirm.
# A fixed random_state makes the subsample (and every CSV derived from it)
# reproducible across kernel restarts; without it each run draws different rows.
SAMPLE_SEED = 42
aapl_tweets = aapl_tweets.sample(n=65772, random_state=SAMPLE_SEED)
amzn_tweets = amzn_tweets.sample(n=18782, random_state=SAMPLE_SEED)

### Preprocessing

In [4]:
import emoji
import re
import contractions

def bert_preprocess(df,column,stopword_remove):
    """Add a cleaned 'preprocess' text column to ``df``, keeping punctuation.

    Steps, in order: stringify and lowercase, strip emojis, turn newlines into
    spaces, replace URLs / #hashtags / @mentions / $cashtags with placeholder
    tokens, expand contractions, collapse repeated spaces, strip, and
    optionally remove stop words.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw text. It is modified in place: the 'preprocess'
        column is created or overwritten.
    column : str
        Name of the column with the raw text. Values go through ``str()``, so
        a missing value such as NaN becomes the literal text 'nan'.
    stopword_remove : bool
        When true, tokenize with ``word_tokenize`` and drop every token found
        in ``stop_words``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, not a copy.
    """
    # emoji 2.0 removed get_emoji_regexp(); prefer replace_emoji() and fall back
    # to the old regexp only on releases that do not provide it.
    if hasattr(emoji, 'replace_emoji'):
        def strip_emoji(text):
            return emoji.replace_emoji(text, replace='')
    else:
        emoji_pattern = emoji.get_emoji_regexp()  # fetched once, not once per row

        def strip_emoji(text):
            return emoji_pattern.sub(u'', text)

    # Raw strings: '\S', '\B', '\w' are invalid escapes in a plain literal and
    # trigger a warning on current Python versions. Order matters: links are
    # masked first so a '#fragment' inside a URL is not treated as a hashtag.
    # The 'AMPERSAND' token (used for '@') is kept as-is so the output stays
    # compatible with whatever consumes these files.
    substitutions = (
        (re.compile('\n'), ' '),
        (re.compile(r'http\S+'), 'LINK'),
        (re.compile(r'\B\#\w+'), 'HASHTAG'),
        (re.compile(r'\B\@\w+'), 'AMPERSAND'),
        (re.compile(r'\B\$\w+'), 'DOLLARSIGN'),
    )

    def mask_tokens(raw):
        text = strip_emoji(str(raw).lower())
        for pattern, token in substitutions:
            text = pattern.sub(token, text)
        return text

    df['preprocess'] = df[column].apply(mask_tokens)
    # expand_contractions returns per-row word lists under a fresh 0..n-1 index,
    # so the result is assigned positionally (as a list) rather than by label.
    df['preprocess'] = [' '.join(words) for words in expand_contractions(df,'preprocess')]
    # collapse runs of spaces and trim the ends
    df['preprocess'] = df['preprocess'].apply(lambda text: re.sub(' +',' ',text).strip())
    if stopword_remove:
        # NOTE(review): word_tokenize and stop_words are not defined in the cells
        # visible here; they must exist in the kernel before this branch is used.
        df['preprocess'] = df['preprocess'].apply(
            lambda text: ' '.join(tok for tok in word_tokenize(text) if tok not in stop_words)
        )
    return df

def w2v_preprocess(df,column,stopword_remove):
    """Add a cleaned 'preprocess' text column to ``df``, with punctuation removed.

    Steps, in order: stringify and lowercase, strip emojis, turn newlines into
    spaces, replace URLs / #hashtags / @mentions / $cashtags with placeholder
    tokens, expand contractions, replace every character that is neither a
    word character nor whitespace with a space, collapse repeated spaces,
    strip, and optionally remove stop words.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw text. It is modified in place: the 'preprocess'
        column is created or overwritten.
    column : str
        Name of the column with the raw text. Values go through ``str()``, so
        a missing value such as NaN becomes the literal text 'nan'.
    stopword_remove : bool
        When true, tokenize with ``word_tokenize`` and drop every token found
        in ``stop_words``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, not a copy.
    """
    # emoji 2.0 removed get_emoji_regexp(); prefer replace_emoji() and fall back
    # to the old regexp only on releases that do not provide it.
    if hasattr(emoji, 'replace_emoji'):
        def strip_emoji(text):
            return emoji.replace_emoji(text, replace='')
    else:
        emoji_pattern = emoji.get_emoji_regexp()  # fetched once, not once per row

        def strip_emoji(text):
            return emoji_pattern.sub(u'', text)

    # Raw strings: '\S', '\B', '\w' are invalid escapes in a plain literal and
    # trigger a warning on current Python versions. Order matters: links are
    # masked first so a '#fragment' inside a URL is not treated as a hashtag.
    # The 'AMPERSAND' token (used for '@') is kept as-is so the output stays
    # compatible with whatever consumes these files.
    substitutions = (
        (re.compile('\n'), ' '),
        (re.compile(r'http\S+'), 'LINK'),
        (re.compile(r'\B\#\w+'), 'HASHTAG'),
        (re.compile(r'\B\@\w+'), 'AMPERSAND'),
        (re.compile(r'\B\$\w+'), 'DOLLARSIGN'),
    )
    punctuation = re.compile(r'[^\w\s]')

    def mask_tokens(raw):
        text = strip_emoji(str(raw).lower())
        for pattern, token in substitutions:
            text = pattern.sub(token, text)
        return text

    def tidy(text):
        # punctuation becomes a space first, so the collapse below also
        # removes the gaps it leaves behind
        text = punctuation.sub(' ', text)
        return re.sub(' +',' ',text).strip()

    df['preprocess'] = df[column].apply(mask_tokens)
    # expand_contractions returns per-row word lists under a fresh 0..n-1 index,
    # so the result is assigned positionally (as a list) rather than by label.
    df['preprocess'] = [' '.join(words) for words in expand_contractions(df,'preprocess')]
    df['preprocess'] = df['preprocess'].apply(tidy)
    if stopword_remove:
        # NOTE(review): word_tokenize and stop_words are not defined in the cells
        # visible here; they must exist in the kernel before this branch is used.
        df['preprocess'] = df['preprocess'].apply(
            lambda text: ' '.join(tok for tok in word_tokenize(text) if tok not in stop_words)
        )
    return df

def expand_contractions(df,column):
    """Expand contractions word by word for every row of ``df[column]``.

    Each text is split on whitespace and every resulting word is passed to
    ``contractions.fix``.

    Returns a new ``pandas.Series`` holding one list of (expanded) words per
    row. The Series carries a default index, not ``df``'s index, so callers
    should assign it back positionally.
    """
    expanded_rows = [
        [contractions.fix(token) for token in text.split()]
        for text in df[column]
    ]
    return pd.Series(expanded_rows)

In [5]:
# w2v_preprocess / bert_preprocess write the 'preprocess' column into the frame
# they receive and return that same object. Passing a copy to every call keeps
# the *_w2v and *_bert results independent; without it each pair would alias one
# frame whose 'preprocess' column holds whichever preprocessing ran last.
# NOTE: the CSVs written here are overwritten by the later text/label export cell.
aapl_tweets_w2v = w2v_preprocess(aapl_tweets.copy(),'body',False)
amzn_tweets_w2v = w2v_preprocess(amzn_tweets.copy(),'body',False)
w2v_tweets = pd.concat([aapl_tweets_w2v,amzn_tweets_w2v])
w2v_tweets.to_csv('w2v_tweets.csv')

aapl_tweets_bert = bert_preprocess(aapl_tweets.copy(),'body',False)
amzn_tweets_bert = bert_preprocess(amzn_tweets.copy(),'body',False)
bert_tweets = pd.concat([aapl_tweets_bert,amzn_tweets_bert])
bert_tweets.to_csv('bert_tweets.csv')

aapl_news_w2v = w2v_preprocess(aapl_news.copy(),'title',False)
amzn_news_w2v = w2v_preprocess(amzn_news.copy(),'title',False)
w2v_news = pd.concat([aapl_news_w2v,amzn_news_w2v])
w2v_news.to_csv('w2v_news.csv')

aapl_news_bert = bert_preprocess(aapl_news.copy(),'title',False)
amzn_news_bert = bert_preprocess(amzn_news.copy(),'title',False)
bert_news = pd.concat([aapl_news_bert,amzn_news_bert])
bert_news.to_csv('bert_news.csv')

In [12]:
# Binary target per row: 0 when 'Next Close' is strictly below 'Close', 1 otherwise.
# Ties get 1, and so do NaN differences, because NaN fails the `< 0` test.
for frame in (w2v_tweets, bert_tweets, w2v_news, bert_news):
    close_delta = frame['Next Close'] - frame['Close']
    # assigned as a plain array so the values land positionally
    frame['label'] = (~(close_delta < 0)).astype('int64').to_numpy()

In [13]:
# Write the model-ready files: only the cleaned text (renamed to 'text') and the label.
exports = (
    ('w2v_tweets.csv', w2v_tweets),
    ('bert_tweets.csv', bert_tweets),
    ('w2v_news.csv', w2v_news),
    ('bert_news.csv', bert_news),
)
for csv_path, frame in exports:
    text_and_label = frame[['preprocess','label']]
    text_and_label.rename(columns={'preprocess':'text'}).to_csv(csv_path)

In [16]:
# Preview the exported view of the news frame (pandas truncates the display).
bert_news.loc[:, ['preprocess', 'label']].rename(columns=dict(preprocess='text'))

Unnamed: 0,text,label
0,whatsapp limits text forwards to five recipien...,1
1,exclusive: tesla holds battery supply talks wi...,1
2,apple is holding a global iphone photography c...,1
3,the pros and cons of buying apple stock ahead ...,1
4,hoosier companies among 'most admired',1
...,...,...
18777,"during the last few days of 2016, the talk on ...",0
18778,"copper rises, bitcoin falls, tim cook's pay so...",0
18779,stock market today: stocks mixed as fangs adva...,0
18780,"during the last few days of 2016, the talk on ...",0


In [20]:
def _to_text_label(frame):
    """Return ``frame`` reduced to the columns 'text' and 'label'.

    The first call selects 'preprocess' and renames it to 'text'. Once a frame
    has been converted it no longer has 'preprocess', so later calls just
    re-select the two columns — re-running this cell no longer raises KeyError.
    """
    if 'preprocess' not in frame.columns:
        return frame[['text','label']]
    return frame[['preprocess','label']].rename(columns={'preprocess':'text'})


w2v_tweets = _to_text_label(w2v_tweets)
bert_tweets = _to_text_label(bert_tweets)
w2v_news = _to_text_label(w2v_news)
bert_news = _to_text_label(bert_news)

In [22]:
from deep_translator import GoogleTranslator

import time

# One translator for the whole column instead of constructing a new one per row.
translator = GoogleTranslator(source='auto',target='english')


def _translate_with_retry(text, attempts=3, backoff_seconds=2.0):
    """Translate ``text``, retrying a dropped connection a bounded number of times.

    Waits ``backoff_seconds * attempt`` between tries and re-raises the last
    error once ``attempts`` is exhausted, so a persistent outage still surfaces.
    """
    for attempt in range(1, attempts + 1):
        try:
            return translator.translate(text)
        except OSError:
            # The ConnectionError this cell hit is an OSError subclass, whether
            # it is the builtin one or the one raised by `requests`.
            if attempt == attempts:
                raise
            time.sleep(backoff_seconds * attempt)


# One network round-trip per row; the translated Series is displayed, not stored.
w2v_tweets['text'].apply(_translate_with_retry)

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))