In [1]:
import pandas as pd
import numpy as np

news = pd.read_csv('w2v_news.csv').drop(columns=['Unnamed: 0'])
news = news.dropna()
news.head()

Unnamed: 0,text,label
0,whatsapp limits text forwards to five recipien...,1
1,exclusive tesla holds battery supply talks wit...,1
2,apple is holding a global iphone photography c...,1
3,the pros and cons of buying apple stock ahead ...,1
4,hoosier companies among most admired,1


In [2]:
tweets = pd.read_csv('w2v_tweets.csv').drop(columns=['Unnamed: 0'])
tweets = tweets.dropna()
len(tweets)

84554

In [3]:
tweets.head()

Unnamed: 0,text,label
0,DOLLARSIGN they def set up the open to sell a ...,1
1,DOLLARSIGN DOLLARSIGN DOLLARSIGN DOLLARSIGN go...,0
2,latest apple pay deal offers DOLLARSIGN tacos ...,0
3,DOLLARSIGN DOLLARSIGN easymoneylucy DOLLARSIGN...,1
4,HASHTAG airpods price durability affecting sal...,1


## Method 3: Topic Modeling

In [4]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))

flagged = news['text'].apply(lambda x: word_tokenize(str(x)))
news['remove_stopwords'] = flagged.apply(lambda x: ' '.join([x for x in x if x not in stop_words]))

In [5]:
import gensim
import gensim.corpora as corpora

words = news['remove_stopwords'].apply(lambda x: str(x).split(' ')).tolist()
id2word = corpora.Dictionary(words)
corpus = [id2word.doc2bow(text) for text in words]

num_topics = 70

lda_model = gensim.models.LdaMulticore(corpus=corpus, id2word=id2word,num_topics=num_topics)

def format_topics_sentences(ldamodel=None, corpus=corpus, texts=words):
    sent_topics_df = pd.DataFrame()
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=words)

# Format
lda_df = df_topic_sents_keywords.reset_index()
lda_df.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
lda_df.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,28.0,0.3409,"target, price, aapl, apple, DOLLARSIGN, 00, gi...","[whatsapp, limits, text, forwards, five, recip..."
1,1,13.0,0.2282,"faang, netflix, nflx, stock, apple, hold, bull...","[exclusive, tesla, holds, battery, supply, tal..."
2,2,43.0,0.7521,"alphabet, apple, chief, google, aapl, finally,...","[apple, holding, global, iphone, photography, ..."
3,3,38.0,0.8768,"surges, apple, stock, market, decision, amazon...","[pros, cons, buying, apple, stock, ahead, earn..."
4,4,44.0,0.4812,"2018, technology, sector, update, 12, stock, f...","[hoosier, companies, among, admired]"
5,5,1.0,0.3024,"apple, faangs, na, rates, 29, impact, software...","[weekly, qualcomm, incorporated, nasdaq, qcom,..."
6,6,24.0,0.4335,"tesla, apple, amazon, go, stock, alibaba, vs, ...","[tesla, talks, china, lishen, shanghai, batter..."
7,7,28.0,0.5656,"target, price, aapl, apple, DOLLARSIGN, 00, gi...","[facebook, whatsapp, limits, users, five, text..."
8,8,36.0,0.2171,"stocks, trade, us, january, wall, china, fears...","[stocks, sink, economic, concerns, hit, davos,..."
9,9,13.0,0.6267,"faang, netflix, nflx, stock, apple, hold, bull...","[intel, intc, q4, earnings, preview, client, c..."


In [6]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

def bigrams(words, bi_min=15, tri_min=10):
    bigram = gensim.models.Phrases(words, min_count = bi_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return bigram_mod
    
def get_corpus(x):
    x = [i.strip() for i in x]
    words = list(sent_to_words(x))
    bigram_mod = bigrams(words)
    bigram = [bigram_mod[review] for review in words]
    id2word = gensim.corpora.Dictionary(bigram)
    id2word.filter_extremes(no_below=10, no_above=0.35)
    id2word.compactify()
    corpus = [id2word.doc2bow(text) for text in bigram]
    return corpus, id2word, bigram

In [7]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(news['remove_stopwords'].tolist(), news['label'].tolist(), test_size=0.2, random_state=42)

train_corpus, train_id2word, bigram_train = get_corpus(X_train)

test_corpus, test_id2word, bigram_test = get_corpus(X_test)

In [None]:
import nltk
#nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
X_train_sentiment = []
for sample in X_train:
    X_train_sentiment.append(sid.polarity_scores(sample)['compound'])
    
X_test_sentiment = []
for sample in X_test:
    X_test_sentiment.append(sid.polarity_scores(sample)['compound'])
    

### Training

In [None]:
train_vecs = []

for i in range(len(train_corpus)):
    top_topics = (
        lda_model.get_document_topics(train_corpus[i],
                                      minimum_probability=0.0)
    )
    topic_vec = [top_topics[i][1] for i in range(70)]
    topic_vec.append(X_train_sentiment[i])
    train_vecs.append(topic_vec)

X = np.array(train_vecs)
y = np.array(y_train)

kf = KFold(5, shuffle=True, random_state=42)
cv_lr_f1, cv_lr_accuracy  = [], []

for train_ind, val_ind in kf.split(X, y):
    # Assign CV IDX
    X_train_kf, y_train_kf = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind]
    
    # Scale Data
    scaler = StandardScaler()
    X_train_scale = scaler.fit_transform(X_train_kf)
    X_val_scale = scaler.transform(X_val)

    # Logisitic Regression
    lr_lda = LogisticRegression(
        class_weight= 'balanced',
        solver='newton-cg',
        fit_intercept=True
    ).fit(X_train_scale, y_train_kf)

    y_pred = lr_lda.predict(X_val_scale)
    cv_lr_f1.append(f1_score(y_val, y_pred, average='binary'))
    cv_lr_accuracy.append(accuracy_score(y_val, y_pred))


print(f'Logistic Regression f1 score: {np.mean(cv_lr_f1):.3f} +- {np.std(cv_lr_f1):.3f}')
print(f'Logistic Regression accuracy score: {np.mean(cv_lr_accuracy):.3f} +- {np.std(cv_lr_accuracy):.3f}')

### Test

In [18]:
test_vecs = []
for i in range(len(test_corpus)):
    top_topics = (
        lda_model.get_document_topics(test_corpus[i],
                                      minimum_probability=0.0)
    )
    topic_vec = [top_topics[i][1] for i in range(70)]
    topic_vec.append(X_test_sentiment[i])
    test_vecs.append(topic_vec)

X = np.array(test_vecs)
y = np.array(y_test)

scaler = StandardScaler()
X_test_scale = scaler.fit_transform(X)

y_pred = lr_lda.predict(X_test_scale)

print('Logistic Regression f1 score: ', f1_score(y, y_pred, average='binary'))
print("Logistic Regression accuracy score: ", accuracy_score(y,y_pred))

Logistic Regression f1 score:  0.5221354908552218
Logistic Regression accuracy score:  0.5040506179409852


In [None]:
import pyLDAvis
import pyLDAvis.gensim_models 
import matplotlib.pyplot as plt

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

### Method 1: Word2Vec

In [10]:
news['tokens'] = news['text'].apply(lambda x: x.split())
news

Unnamed: 0,text,label,remove_stopwords,tokens
0,whatsapp limits text forwards to five recipien...,1,whatsapp limits text forwards five recipients ...,"[whatsapp, limits, text, forwards, to, five, r..."
1,exclusive tesla holds battery supply talks wit...,1,exclusive tesla holds battery supply talks chi...,"[exclusive, tesla, holds, battery, supply, tal..."
2,apple is holding a global iphone photography c...,1,apple holding global iphone photography contes...,"[apple, is, holding, a, global, iphone, photog..."
3,the pros and cons of buying apple stock ahead ...,1,pros cons buying apple stock ahead earnings,"[the, pros, and, cons, of, buying, apple, stoc..."
4,hoosier companies among most admired,1,hoosier companies among admired,"[hoosier, companies, among, most, admired]"
...,...,...,...,...
84549,during the last few days of 2016 the talk on w...,0,last days 2016 talk wall street ce,"[during, the, last, few, days, of, 2016, the, ..."
84550,copper rises bitcoin falls tim cook s pay soar...,0,copper rises bitcoin falls tim cook pay soars ...,"[copper, rises, bitcoin, falls, tim, cook, s, ..."
84551,stock market today stocks mixed as fangs advan...,0,stock market today stocks mixed fangs advance ...,"[stock, market, today, stocks, mixed, as, fang..."
84552,during the last few days of 2016 the talk on w...,0,last days 2016 talk wall street ce,"[during, the, last, few, days, of, 2016, the, ..."


In [11]:
from gensim.models import Word2Vec, word2vec
X_train, X_test, y_train, y_test = train_test_split(news['tokens'], news['label'],test_size=0.2)
w2v_model = gensim.models.Word2Vec(X_train,vector_size=100,window=5,min_count=2)
w2v_model.wv.index_to_key

['apple',
 's',
 'the',
 'to',
 'in',
 'aapl',
 'stock',
 'market',
 'as',
 'DOLLARSIGN',
 'by',
 'for',
 'stocks',
 'is',
 'a',
 'of',
 'on',
 'and',
 'inc',
 'has',
 'its',
 'you',
 'tech',
 'dow',
 'management',
 'nasdaq',
 'position',
 'stake',
 'earnings',
 'amazon',
 'million',
 'new',
 '2018',
 'holding',
 'iphone',
 'trade',
 'shares',
 'capital',
 'update',
 'declined',
 'with',
 'are',
 'com',
 'after',
 'at',
 'snapshot',
 'buy',
 'wall',
 'marketwatch',
 'street',
 'china',
 'it',
 'value',
 'this',
 'not',
 'price',
 'from',
 'why',
 'rose',
 'will',
 '3',
 'facebook',
 'news',
 'what',
 'investment',
 'more',
 'investors',
 'p',
 'be',
 'here',
 'report',
 'but',
 'up',
 'cut',
 'day',
 'company',
 'business',
 '5',
 'how',
 '500',
 'co',
 'could',
 'that',
 'top',
 'netflix',
 'us',
 'about',
 'group',
 'now',
 'says',
 '1',
 'microsoft',
 'google',
 'points',
 'lowered',
 'trump',
 'big',
 'share',
 'markets',
 'futures',
 '10',
 'may',
 'valuation',
 '2019',
 'week',
 

In [12]:
words_set = set(w2v_model.wv.index_to_key)
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words_set])
                         for ls in X_train])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words_set])
                         for ls in X_test])

  X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words_set])
  X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words_set])


In [13]:
X_train_vect_avg = []
i = 0
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(np.append(v.mean(axis=0),X_train_sentiment[i]))
    else:
        X_train_vect_avg.append(np.append(np.zeros(100, dtype=float),X_train_sentiment[i]))
    i=i+1
        
X_test_vect_avg = []
i = 0
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(np.append(v.mean(axis=0),X_test_sentiment[i]))
    else:
        X_test_vect_avg.append(np.append(np.zeros(100, dtype=float),X_test_sentiment[i]))
    i=i+1

In [17]:
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train_vect_avg)
X_test_scale = scaler.fit_transform(X_test_vect_avg)
lr = LogisticRegression(
        class_weight= 'balanced',
        solver='newton-cg',
        fit_intercept=True
    ).fit(X_train_scale, y_train)

y_pred = lr.predict(X_test_scale)

print('Logistic Regression f1 score: ', f1_score(y_test, y_pred, average='binary'))
print("Logistic Regression accuracy score: ", accuracy_score(y_test,y_pred))

Logistic Regression f1 score:  0.5321260665157582
Logistic Regression accuracy score:  0.5233280113535569


In [19]:
import pickle

pickle.dump(lda_model,open('topic_model.sav','wb'))
pickle.dump(lr_lda,open('lr_topic_model.sav','wb'))
pickle.dump(lr,open('lr.sav','wb'))