#### Measuring the interaction of news and stock price

In [1]:
import os
import re
import glob
from pathlib import Path
import pandas as pd
import numpy as np
import yfinance as yf
from useful.eda import basic_info

# A row of 50 '=' characters (presumably a visual separator for printed output; usage is not shown here).
FAT_BAR = "=" * 50

In [2]:
# Data directory: `data/` inside the parent of the current working directory.
# The trailing '' component keeps a trailing path separator, so plain string
# concatenation (`DATA_DIR + filename`) keeps working.
DATA_DIR = os.path.join(str(Path.cwd().parent), 'data', '')

# os.listdir returns entries in arbitrary order; sort so that files[0] refers to
# the same file on every run and on every machine.
files = sorted(name for name in os.listdir(DATA_DIR) if name.endswith('.json'))

# for f in [f for f in os.listdir(DATA_DIR) if f.endswith('.zip')]:
#     ! cd .. && unzip data/{f} -d data/ && cd notebooks

In [6]:
# Load the first JSON file in DATA_DIR; lines=True reads one JSON record per line.
df = pd.read_json(os.path.join(DATA_DIR, files[0]), lines=True)
df.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


##### Clean up the dataset

In [7]:
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk import bigrams

STEMMER = SnowballStemmer("english", ignore_stopwords=True)


def preprocess(text: pd.Series, *args):
    """Convert a Series of raw strings into a Series of token lists.

    For every document:
      1. tokenize with gensim's ``simple_preprocess`` using ``min_len=3``;
      2. drop tokens found in NLTK's English stopword list;
      3. stem the remaining tokens with the module-level ``STEMMER``;
      4. return the underscore-joined bigrams of the stems followed by the
         stems themselves.

    Any extra positional arguments are accepted and ignored.
    """
    english_stopwords = set(stopwords.words('english'))

    def _to_features(raw):
        tokens = gensim.utils.simple_preprocess(raw, min_len=3)
        stems = [STEMMER.stem(tok) for tok in tokens if tok not in english_stopwords]
        pairs = ['_'.join(pair) for pair in nltk.bigrams(stems)]
        return pairs + stems

    return text.apply(_to_features)

In [8]:
# Keep only the label column and the headline text.
keep_cols = ['category', 'headline']
df = df[keep_cols]
df.head()

Unnamed: 0,category,headline
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...


In [10]:
# Number of headlines per category.
category_counts = df['category'].value_counts()
category_counts

POLITICS          32739
WELLNESS          17827
ENTERTAINMENT     16058
TRAVEL             9887
STYLE & BEAUTY     9649
PARENTING          8677
HEALTHY LIVING     6694
QUEER VOICES       6314
FOOD & DRINK       6226
BUSINESS           5937
COMEDY             5175
SPORTS             4884
BLACK VOICES       4528
HOME & LIVING      4195
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3651
WOMEN              3490
IMPACT             3459
DIVORCE            3426
CRIME              3405
MEDIA              2815
WEIRD NEWS         2670
GREEN              2622
WORLDPOST          2579
RELIGION           2556
STYLE              2254
SCIENCE            2178
WORLD NEWS         2177
TASTE              2096
TECH               2082
MONEY              1707
ARTS               1509
FIFTY              1401
GOOD NEWS          1398
ARTS & CULTURE     1339
ENVIRONMENT        1323
COLLEGE            1144
LATINO VOICES      1129
CULTURE & ARTS     1030
EDUCATION          1004
Name: category, 

In [13]:
# Restrict the data to the four categories being classified.
# `.copy()` makes the result an independent frame, so assigning columns to it
# later does not trigger pandas' SettingWithCopyWarning.
df = df[df['category'].isin(['POLITICS', 'BUSINESS', 'ENTERTAINMENT', 'SCIENCE'])].copy()
df.head()

Unnamed: 0,category,headline
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...
5,ENTERTAINMENT,Morgan Freeman 'Devastated' That Sexual Harass...


In [15]:
# Replace the raw headlines with their token lists.
# `assign` builds a new frame rather than writing into a filtered slice, which
# avoids pandas' SettingWithCopyWarning.
# NOTE(review): `preprocess` is written for raw strings, so re-running this cell
# on already-tokenized headlines is not expected to work.
df = df.assign(headline=preprocess(df['headline']))
df.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,category,headline
1,ENTERTAINMENT,"[smith_join, join_diplo, diplo_nicki, nicki_ja..."


##### BOW model

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer with the vocabulary capped at 5000 features.
# The remaining options are spelled out explicitly.
bow = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

##### Tf-idf

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_df=0.5,       # ignore terms that occur in more than half of the documents
    min_df=2,         # ignore terms that occur in fewer than two documents
    use_idf=True,     # weight terms by inverse document frequency
    norm="l2",        # scale each document vector to unit Euclidean length
    smooth_idf=True,  # add 1 to document frequencies, as if one extra document held every term once; prevents divide-by-zero
)

In [43]:
# The vectorizers expect one string per document, so join each token list back
# into a single string. Build it once and reuse it for both vectorizers.
headline_docs = df['headline'].apply(lambda tokens: ', '.join(map(str, tokens)))

# NOTE(review): both vectorizers are fit on the full corpus before the
# train/test split further down, so vocabulary and idf weights are learned
# from headlines that later end up in the test set.
x_bow = bow.fit_transform(headline_docs)
x_tf_idf = tfidf.fit_transform(headline_docs)

##### Create target set

In [55]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()  # instantiated but not used in the cells shown here

col = ['POLITICS', 'BUSINESS', 'ENTERTAINMENT', 'SCIENCE']

# Encode each category as an integer 1..4, following the order of `col`.
# An explicit mapping is used instead of `replace(list, list)`, which only yields
# integer labels if pandas implicitly downcasts the object column.
# Any category not listed in `col` maps to NaN; `df` is expected to contain
# only these four categories at this point.
category_to_id = {name: idx for idx, name in enumerate(col, start=1)}
Y = df['category'].map(category_to_id)

##### Naive Bayes: bag-of-words vs. tf-idf

In [56]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# One MultinomialNB instance with default settings; the cells below refit it for each feature set.
clf = MultinomialNB()

In [59]:
# Identical split settings for both feature matrices: 20% held out, fixed seed.
split_opts = dict(test_size=0.2, random_state=42)
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(x_bow, Y, **split_opts)
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(x_tf_idf, Y, **split_opts)



In [61]:
# Naive Bayes on the bag-of-words features
y_pred_bow = clf.fit(X_train_bow, y_train_bow).predict(X_test_bow)

print(classification_report(y_test_bow, y_pred_bow))

              precision    recall  f1-score   support

           1       0.91      0.91      0.91      6581
           2       0.66      0.71      0.68      1172
           3       0.89      0.88      0.89      3171
           4       0.73      0.67      0.70       459

    accuracy                           0.87     11383
   macro avg       0.80      0.79      0.80     11383
weighted avg       0.87      0.87      0.87     11383



In [62]:
# Naive Bayes on the tf-idf features (refits the same `clf` object)
y_pred_tfidf = clf.fit(X_train_tfidf, y_train_tfidf).predict(X_test_tfidf)

print(classification_report(y_test_tfidf, y_pred_tfidf))

              precision    recall  f1-score   support

           1       0.80      0.99      0.89      6581
           2       0.96      0.18      0.30      1172
           3       0.91      0.87      0.89      3171
           4       1.00      0.14      0.24       459

    accuracy                           0.84     11383
   macro avg       0.92      0.54      0.58     11383
weighted avg       0.86      0.84      0.80     11383



##### SVM

In [63]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Support vector classifier with a linear kernel and balanced class weights.
svc = SVC(
    C=1.0,
    kernel='linear',
    class_weight='balanced',
)

In [65]:
# SVM on the bag-of-words features
y_pred_bow = svc.fit(X_train_bow, y_train_bow).predict(X_test_bow)
print(classification_report(y_test_bow, y_pred_bow))

# SVM on the tf-idf features (refits the same `svc` object)
y_pred_tfidf = svc.fit(X_train_tfidf, y_train_tfidf).predict(X_test_tfidf)
print(classification_report(y_test_tfidf, y_pred_tfidf))

              precision    recall  f1-score   support

           1       0.94      0.87      0.90      6581
           2       0.54      0.78      0.64      1172
           3       0.89      0.86      0.87      3171
           4       0.59      0.68      0.63       459

    accuracy                           0.85     11383
   macro avg       0.74      0.80      0.76     11383
weighted avg       0.87      0.85      0.86     11383

              precision    recall  f1-score   support

           1       0.94      0.90      0.92      6581
           2       0.64      0.79      0.71      1172
           3       0.90      0.91      0.90      3171
           4       0.76      0.72      0.74       459

    accuracy                           0.88     11383
   macro avg       0.81      0.83      0.82     11383
weighted avg       0.89      0.88      0.89     11383

