# Baseline model
1. Firstly, we transform the headline through CountVectorizer with 1-ngram, 2-ngram, and 3-ngram model
2. Fit the n-gram embedding into Linear Support Vector Machine to predict the stock trend whether it would rise or drop

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.svm import SVC
import os
import quandl
import pandas as pd
import string
from nltk.corpus import stopwords
stop_wrd = set(stopwords.words('english'))

def filter_sec(df):
    filter_cate = ['N.Y. / Region', 'Today’s Paper', 'Multimedia/Photos', 'U.S.', 'Reader Center', 'Magazine', 'Technology',
    'Opinion', 'International Home', 'Job Market', 'Briefing', 'Education', 'World', 'Business', 'Your Money',
    'NYT Now', 'Podcasts', 'New York', 'Real Estate', 'Sunday Review', 'Business Day', 'Neediest Cases''T Magazine', 'Times Insider',
     'Public Editor', 'Universal'] 
    df_1 = df[df.section.isin(filter_cate)]
    return df_1

def index_fluc(value):
    if value >0:
        return 1
    elif value <0:
        return -1
    else:
        return 0
    
def read_news_data():
    path = os.path.join('../../Data', 'processed_data/')
    # read news data from 2016 to 2018
    NYT_2016 = pd.read_json(path+'NYT/NYT_headline_2016.json')
    NYT_2017 = pd.read_json(path+'NYT/NYT_headline_2017.json')
    NYT_2018 = pd.read_json(path+'NYT/NYT_headline_2018.json')
    Guard_2016 = pd.read_json(path+'Guardian/Guard_headline_2016.json')
    Guard_2017 = pd.read_json(path+'Guardian/Guard_headline_2017.json')
    Guard_2018 = pd.read_json(path+'Guardian/Guard_headline_2018.json')
    ttl = [NYT_2016, NYT_2017, NYT_2018, Guard_2016, Guard_2017, Guard_2018]
    ttl_1 = pd.concat(ttl, sort=False)
    return ttl_1

def stocks(start_date, end_date):
# use quandl to acquire nasdaq composite
    ndq = quandl.get("NASDAQOMX/COMP-NASDAQ",
                        start_date = start_date, 
                        end_date = end_date)
    ndq_df = ndq.reset_index()
    return ndq_df

def process(news_df, stocks_df):
    news_df1 = news_df[['date', 'news_header']]
    news_df2 = news_df1.groupby('date').cumcount() + 1
    result_3 = result_1.set_index(['date', result_2]).unstack().sort_index(1, level=1)
    result_3.columns = ['_'.join(map(str,i)) for i in result_3.columns]
    result_3 = result_3.reset_index()
    result_3['date'] = pd.to_datetime(result_3['date'])
    
def process_hd(df):
    # Removing punctuations
    headline_df= df.iloc[:,1:-3]
    headline_df.replace(to_replace="[^a-zA-Z]+", value=" ", regex=True, inplace=True)
    
    # Renaming column names for ease of access
    list1= [i for i in range(25)]
    new_Index=[str(i) for i in list1]
    headline_df.columns= new_Index

    # Convertng headlines to lower case and remove stop word
    for index in new_Index:
        headline_df[index]=headline_df[index].apply(lambda x:" ".join(x for x in str(x).split() if not x in stop_wrd))
        headline_df[index]=headline_df[index].str.lower()
    # collect headlines into list
    headlines = []
    for row in range(0,len(headline_df.index)):
        headlines.append(' '.join(str(x) for x in headline_df.iloc[row,0:25]))

    return headlines

def concat_headline(df):
    result_1 = df[['date', 'news_header']]
    result_2 = result_1.groupby('date').cumcount() + 1
    result_3 = result_1.set_index(['date', result_2]).unstack().sort_index(1, level=1)
    result_3.columns = ['_'.join(map(str,i)) for i in result_3.columns]
    result_3 = result_3.reset_index()
    result_3['date'] = pd.to_datetime(result_3['date'])
    return result_3

df = read_news_data()
# filter news section
df_1 = filter_sec(df)
stocks_df1 = stocks('2016-01-01', '2018-12-31')
stocks_df2 = stocks_df1[['Trade Date', 'Index Value']]
df_2 = concat_headline(df_1)
df_3 = df_2.merge(stocks_df2, left_on='date', right_on='Trade Date')
# get the difference of Nasdaq index with previous date
df_3['diff'] = df_3['Index Value'].diff()
df_3['diff'] = df_3['diff'].fillna(0.0)
df_3['diff'] = df_3['diff'].shift(-1)
# # column reorder and get 25 news headline
cols = df_3.columns.tolist()
cols = cols[:26] + cols[-2:]
df_5 = df_3[cols]
# create a label col to indicate '1' as the index rise and '-1' as index drop
df_5['Label'] = df_5['diff'].apply(index_fluc)
df_6 = df_5.loc[df_5['Label'] !=0]
# divide train and test dataset
train = df_6[df_6['date'] < '20180531']
test = df_6[df_6['date'] >= '20180601']

# train.head()
train_headlines = process_hd(train)
test_headlines = process_hd(test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [11]:
# Linear Support Vector Classification
basicvectorizer = CountVectorizer(ngram_range=(1,1))
basictrain = basicvectorizer.fit_transform(train_headlines)
basicmodel = svm.LinearSVC(C=0.1, class_weight='balanced')
basicmodel = basicmodel.fit(basictrain, train["Label"])
basictest = basicvectorizer.transform(test_headlines)
predictions1_1 = basicmodel.predict(basictest)

In [12]:
# print(basictrain.shape)

from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score 
from sklearn.metrics import confusion_matrix

print (classification_report(test["Label"], predictions1_1))
print (accuracy_score(test["Label"], predictions1_1))

             precision    recall  f1-score   support

         -1       0.56      0.50      0.53        68
          1       0.60      0.65      0.62        77

avg / total       0.58      0.58      0.58       145

0.5793103448275863


In [13]:
# Linear Support Vector Classification
basicvectorizer = CountVectorizer(ngram_range=(2,2))
basictrain1 = basicvectorizer.fit_transform(train_headlines)
basicmodel_2 = svm.LinearSVC(C=0.1, class_weight='balanced')
basicmodel_2 = basicmodel_2.fit(basictrain1, train["Label"])
basictest_2 = basicvectorizer.transform(test_headlines)
predictions2 = basicmodel_2.predict(basictest_2)

In [14]:
print (classification_report(test["Label"], predictions2))
print (accuracy_score(test["Label"], predictions2))

             precision    recall  f1-score   support

         -1       0.51      0.28      0.36        68
          1       0.55      0.77      0.64        77

avg / total       0.53      0.54      0.51       145

0.5379310344827586


In [15]:
# Linear Support Vector Classification
basicvectorizer = CountVectorizer(ngram_range=(3,3))
basictrain3 = basicvectorizer.fit_transform(train_headlines)
basicmodel_3 = svm.LinearSVC(C=0.1, class_weight='balanced')
basicmodel_3 = basicmodel_3.fit(basictrain3, train["Label"])
basictest_3 = basicvectorizer.transform(test_headlines)
predictions3 = basicmodel_3.predict(basictest_3)

In [16]:
print (classification_report(test["Label"], predictions3))
print (accuracy_score(test["Label"], predictions3))

             precision    recall  f1-score   support

         -1       1.00      0.01      0.03        68
          1       0.53      1.00      0.70        77

avg / total       0.75      0.54      0.38       145

0.5379310344827586
