In [80]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas as pd, xgboost, numpy as np, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

In [151]:
news_scraping_results = pd.read_csv("./news_scraping_results_and_label.csv")

In [152]:
news_scraping_results = news_scraping_results.dropna(subset=['sentiment'])

In [153]:
tickers_list = pd.read_csv("./tickers_list.csv")

In [154]:
suspended_ticker_news = news_scraping_results[news_scraping_results['SYMBOL'].isin(tickers_list['Symbol'].values)]

In [155]:
active_ticker_news = news_scraping_results[~news_scraping_results['SYMBOL'].isin(tickers_list['Symbol'].values)]

In [145]:
active_ticker_news.index.values

2018

In [150]:
active_ticker_news[active_ticker_news['sentiment']=='negative']

Unnamed: 0,SYMBOL,SITE_NAME,URL,PUBLISH_TIME,SCRAPED_TIME,TITLE,CONTENT,sentiment
14,BIO,GlobeNewswire,http://www.globenewswire.com/news-release/2019...,2019-10-09 07:49:00+00:00,2019-10-09-16-24,Global Apoptosis Industry - GlobeNewswire,new york oct 09 2019 globe newswire reportlink...,negative
54,ADNT,Yahoo Finance,https://finance.yahoo.com/news/adient-aided-tu...,2019-10-09 14:04:00+00:00,2019-10-09-16-24,Adient Aided by Turnaround Plans Amid Industry...,adient plcs adnt share have rally 31 so far th...,negative
56,ADNT,Yahoo Finance,https://finance.yahoo.com/news/adient-discuss-...,2019-10-04 15:00:00+00:00,2019-10-09-16-24,Adient to discuss Q4 fiscal 2019 financial res...,plymouth mich oct 4 2019 prnewswire adient adn...,negative
67,AOSL,Yahoo Finance,https://finance.yahoo.com/news/alpha-omega-sem...,2019-08-07 07:00:00+00:00,2019-10-09-16-24,Alpha and Omega Semiconductor (AOSL) Tops Q4 E...,alpha and omega semiconductor aosl come out wi...,negative
70,AOSL,Yahoo Finance,https://finance.yahoo.com/news/alpha-omega-sem...,2019-05-20 07:00:00+00:00,2019-10-09-16-24,Alpha and Omega Semiconductor to Present at th...,sunnyvale calif business wire alpha and omega ...,negative
...,...,...,...,...,...,...,...,...
2412,MLCO,Reuters,https://www.reuters.com/article/us-crown-resor...,2019-08-08 07:00:00+00:00,2019-10-09-16-24,Australian gaming watchdog to review Melco's $...,file photo a logo on crown towers as part of c...,negative
2413,CCL,Reuters,https://www.reuters.com/article/us-cuba-usa-tr...,2018-12-11 08:00:00+00:00,2019-10-09-16-24,Americans venture back to Cuba as hurricane me...,havana reuters u s travel to cuba be bounce ba...,negative
2458,BYND,Reuters,https://www.reuters.com/article/us-mcdonald-s-...,2019-09-26 07:00:00+00:00,2019-10-09-16-24,McDonald's joins Beyond Meat bandwagon with Ca...,reuters mcdonalds corp mcd n will test a new p...,negative
2472,NRG,Reuters,https://www.reuters.com/article/us-new-york-co...,2019-05-10 07:00:00+00:00,2019-10-09-16-24,New York adopts rules to phase out coal power ...,new york reuters new york environmental regula...,negative


In [156]:
balanced_news = active_ticker_news[active_ticker_news['sentiment'] == 'positive'].reset_index()
balanced_news = balanced_news.append(active_ticker_news[active_ticker_news['sentiment'] == 'negative'].reset_index()[:237], ignore_index=True)
balanced_news = balanced_news.append(active_ticker_news[active_ticker_news['sentiment'] == 'neutral'].reset_index()[:237], ignore_index=True)

In [160]:
suspended_balanced_news = balanced_news.append(suspended_ticker_news.reset_index(), ignore_index=True)

In [88]:
# whole dataset with suspended and balanced active stock
X, y = suspended_balanced_news[['CONTENT']].values.flatten().astype(str), suspended_balanced_news[['sentiment']].values.flatten().astype(str)

In [162]:
#dataset without suspended stock 
Xb, yb = balanced_news[['CONTENT']].values.flatten().astype(str), balanced_news[['sentiment']].values.flatten().astype(str)

In [179]:
Xs, ys = suspended_ticker_news[['CONTENT']].values.flatten().astype(str), suspended_ticker_news[['sentiment']].values.flatten().astype(str)

In [163]:
# split the dataset into training and validation datasets 
train_X, test_X, train_y, test_y = model_selection.train_test_split(Xb, yb, test_size=0.2, random_state=42)

In [164]:
# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

In [180]:
test_ys = encoder.fit_transform(ys)

In [165]:
encode_y = encoder.fit_transform(y)

In [92]:
#encode_y.classes_

In [93]:
#pd.DataFrame(encode_y, columns=['sentiment']).to_csv('balanced_encoded_labels.csv')

In [166]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(X)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='\\w{1,}', tokenizer=None,
                vocabulary=None)

In [95]:
#count_vect.vocabulary_

In [167]:
vector = count_vect.transform(X)
# summarize encoded vector
#print(vector.shape)


In [70]:
#pd.DataFrame(vector.toarray()).to_csv('balanced_count_vectorized_CONTENT.csv')

In [168]:
# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_X)
xtest_count =  count_vect.transform(test_X)

In [181]:
xstest_count =  count_vect.transform(Xs)

In [169]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=None)
tfidf_vect.fit(X)
xtrain_tfidf =  tfidf_vect.transform(train_X)
xtest_tfidf =  tfidf_vect.transform(test_X)

In [182]:
xstest_tfidf =  tfidf_vect.transform(Xs)

In [99]:
#tfidf_vect.vocabulary_

In [100]:
#tfidf_vect.idf_

In [170]:
# encode document
vector = tfidf_vect.transform(X)
# summarize encoded vector
#print(vector.shape)
#print(vector.toarray())

In [102]:
#pd.DataFrame(vector.toarray()).to_csv('balanced_tfidf_vectorized_CONTENT.csv')

In [183]:
def train_model(classifier, feature_vector_train, label, feature_vector_test, feature_vector_suspended, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_test)
    
    # predict the labels on the suspension stock dataset
    suspended_prediction = classifier.predict(feature_vector_suspended)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    accuracy = metrics.accuracy_score(test_y, predictions)
    precision = metrics.precision_score(test_y, predictions, average="macro")
    recall = metrics.recall_score(test_y, predictions, average="macro")
    f1 = metrics.f1_score(test_y, predictions, average="macro")
    
    suspended_accuracy = metrics.accuracy_score(test_ys, suspended_prediction)
    suspended_precision = metrics.precision_score(test_ys, suspended_prediction, average="macro")
    suspended_recall = metrics.recall_score(test_ys, suspended_prediction, average="macro")
    suspended_f1 = metrics.f1_score(test_ys, suspended_prediction, average="macro")
    
    return accuracy,precision,recall,f1, suspended_accuracy, suspended_precision, suspended_recall, suspended_f1

In [104]:
#evaluation_df = pd.DataFrame(columns=['classifier', 'feature vector', 'accuracy', 'precision', 'recall', 'f1'])

In [187]:
# Naive Bayes on Count Vectors
accuracy,precision,recall,f1,
suspended_accuracy, 
suspended_precision, 
suspended_recall,
suspended_f1 = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xtest_count, xstest_count)
#metrics = pd.DataFrame([['Naive Bayes', 'Count', accuracy, precision, recall, f1]], columns=['classifier', 'feature vector', 'accuracy', 'precision', 'recall', 'f1'])
#evaluation_df = evaluation_df.append(metrics)
#print("Naive Bayes, Count Vectors: \n accuracy: %s\n precision: %s\n recall: %s\n f1:%s"%(accuracy,precision,recall,f1))

In [173]:
print("Naive Bayes, Count Vectors: \n accuracy: %s\n precision: %s\n recall: %s\n f1:%s"%(accuracy,precision,recall,f1))

Naive Bayes, Count Vectors: 
 accuracy: 0.5524475524475524
 precision: 0.5510528923572401
 recall: 0.555023923444976
 f1:0.5523099816045086


In [191]:
print("SUSPENDED Naive Bayes, Count Vectors: \n accuracy: %s\n precision: %s\n recall: %s\n f1:%s"%(suspended_accuracy, suspended_precision, suspended_recall, suspended_f1))

SUSPENDED Naive Bayes, Count Vectors: 
 accuracy: 0.27837837837837837
 precision: 0.3681610291779784
 recall: 0.37493574641250804
 f1:(0.5454545454545454, 0.593192195931922, 0.574884180147338, 0.5318657127132637, 0.3891891891891892, 0.38559264258736586, 0.38795423716713073, 0.3089963515903971)


In [206]:
#metrics = pd.DataFrame([['Naive Bayes', 'Count', accuracy, precision, recall, f1]], columns=['classifier', 'feature vector', 'accuracy', 'precision', 'recall', 'f1'])
#evaluation_df = evaluation_df.append(metrics)
nb_count = np.array(['Naive Bayes', 'Count', accuracy, precision, recall, f1, suspended_accuracy, suspended_precision, suspended_recall, suspended_f1])

In [207]:
# Naive Bayes on Word Level TF IDF Vectors
accuracy,precision,recall,f1,
suspended_accuracy, 
suspended_precision, 
suspended_recall,
suspended_f1 = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xtest_tfidf, xstest_tfidf)
#metrics = pd.DataFrame([['Naive Bayes', 'TF-IDF', accuracy, precision, recall, f1]], columns=['classifier', 'feature vector', 'accuracy', 'precision', 'recall', 'f1'])
#evaluation_df = evaluation_df.append(metrics)
#print("Naive Bayes, TF-IDF: \n accuracy: %s\n precision: %s\n recall: %s\n f1:%s"%(accuracy,precision,recall,f1))

In [208]:
#metrics = pd.DataFrame([['Naive Bayes', 'TF-IDF', accuracy, precision, recall, f1]], columns=['classifier', 'feature vector', 'accuracy', 'precision', 'recall', 'f1'])
#evaluation_df = evaluation_df.append(metrics)
nb_tfidf = np.array(['Naive Bayes', 'TF-IDF', accuracy, precision, recall, f1, suspended_accuracy, suspended_precision, suspended_recall, suspended_f1])

In [209]:
# Linear Classifier on Count Vectors
accuracy,precision,recall,f1,
suspended_accuracy, 
suspended_precision, 
suspended_recall,
suspended_f1 = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xtest_count, xstest_count)
#metrics = pd.DataFrame([['Logisitic Regression', 'Count', accuracy, precision, recall, f1]], columns=['classifier', 'feature vector', 'accuracy', 'precision', 'recall', 'f1'])
#evaluation_df = evaluation_df.append(metrics)
#print("linear Classifier, Count Vectors: \n accuracy: %s\n precision: %s\n recall: %s\n f1:%s"%(accuracy,precision,recall,f1))



In [210]:
lr_count = np.array(['Logistic Regression', 'Count', accuracy, precision, recall, f1, suspended_accuracy, suspended_precision, suspended_recall, suspended_f1])

In [211]:
# Linear Classifier on Word Level TF IDF Vectors
accuracy,precision,recall,f1,
suspended_accuracy, 
suspended_precision, 
suspended_recall,
suspended_f1 = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xtest_tfidf, xstest_tfidf)
#metrics = pd.DataFrame([['Logisitic Regression', 'TF-IDF', accuracy, precision, recall, f1]], columns=['classifier', 'feature vector', 'accuracy', 'precision', 'recall', 'f1'])
#evaluation_df = evaluation_df.append(metrics)
#print("linear Classifier, TF-IDF: \n accuracy: %s\n precision: %s\n recall: %s\n f1:%s"%(accuracy,precision,recall,f1))

In [212]:
lr_tfidf = np.array(['Logistic Regression', 'TF-IDF', accuracy, precision, recall, f1, suspended_accuracy, suspended_precision, suspended_recall, suspended_f1])

In [198]:
# Random Forest on Count Vectors
accuracy,precision,recall,f1, 
suspended_accuracy, 
suspended_precision, 
suspended_recall,
suspended_f1 = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xtest_count, xstest_count)
#metrics = pd.DataFrame([['Random Forest', 'Count', accuracy, precision, recall, f1]], columns=['classifier', 'feature vector', 'accuracy', 'precision', 'recall', 'f1'])
#evaluation_df = evaluation_df.append(metrics)
#print("Random Forest, Count Vectors: \n accuracy: %s\n precision: %s\n recall: %s\n f1:%s"%(accuracy,precision,recall,f1))



In [199]:
rf_count = np.array(['Random Forest', 'Count', accuracy, precision, recall, f1, suspended_accuracy, suspended_precision, suspended_recall, suspended_f1])

In [200]:
# RF on Word Level TF IDF Vectors
accuracy,precision,recall,f1,
suspended_accuracy, 
suspended_precision, 
suspended_recall,
suspended_f1 = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xtest_tfidf, xstest_tfidf)
#metrics = pd.DataFrame([['Random Forest', 'TF-IDF', accuracy, precision, recall, f1]], columns=['classifier', 'feature vector', 'accuracy', 'precision', 'recall', 'f1'])
#evaluation_df = evaluation_df.append(metrics)
#print("Random Forest, WordLevel TF-IDF: \n accuracy: %s\n precision: %s\n recall: %s\n f1:%s"%(accuracy,precision,recall,f1))




In [201]:
rf_tfidf = np.array(['Random Forest', 'TF-IDF', accuracy, precision, recall, f1, suspended_accuracy, suspended_precision, suspended_recall, suspended_f1])

In [202]:
# Extereme Gradient Boosting on Count Vectors
accuracy,precision,recall,f1,
suspended_accuracy, 
suspended_precision, 
suspended_recall,
suspended_f1 = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xtest_count.tocsc(), xstest_count.tocsc())
#metrics = pd.DataFrame([['Extreme Gradient Boosting', 'Count', accuracy, precision, recall, f1]], columns=['classifier', 'feature vector', 'accuracy', 'precision', 'recall', 'f1'])
#evaluation_df = evaluation_df.append(metrics)
#print("Xgb, Count Vectors: \n accuracy: %s\n precision: %s\n recall: %s\n f1:%s"%(accuracy,precision,recall,f1))

In [203]:
xgb_count = np.array(['Extreme Gradient Boosting', 'Count', accuracy, precision, recall, f1, suspended_accuracy, suspended_precision, suspended_recall, suspended_f1])

In [204]:
# Extereme Gradient Boosting on Word Level TF IDF Vectors
accuracy,precision,recall,f1,
suspended_accuracy, 
suspended_precision, 
suspended_recall,
suspended_f1 = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xtest_tfidf.tocsc(), xstest_tfidf.tocsc())
#metrics = pd.DataFrame([['Extreme Gradient Boosting', 'TF-IDF', accuracy, precision, recall, f1]], columns=['classifier', 'feature vector', 'accuracy', 'precision', 'recall', 'f1'])
#evaluation_df = evaluation_df.append(metrics)
#print("Xgb, WordLevel TF-IDF: \n accuracy: %s\n precision: %s\n recall: %s\n f1:%s"%(accuracy,precision,recall,f1))

In [205]:
xgb_tfidf = np.array(['Extreme Gradient Boosting', 'TF-IDF', accuracy, precision, recall, f1, suspended_accuracy, suspended_precision, suspended_recall, suspended_f1])

In [213]:
metrics_all = np.array([nb_count, nb_tfidf, lr_count, lr_tfidf, rf_count, rf_tfidf, xgb_count, xgb_tfidf])

In [214]:
metrics_df = pd.DataFrame(metrics_all, columns=['classifier', 'feature vector', 'accuracy', 'precision', 'recall', 'f1', 'suspended accuracy', 'suspended precision', 'suspended recall', 'suspended f1'])

In [126]:
#calculate precision and recall 
#remove sus stocks from list, then do senti anaysis on this 80 20 
#the second accuracy test 

In [129]:
pd.DataFrame(metrics_all, columns=['classifier', 'feature vector', 'accuracy', 'precision', 'recall', 'f1'])

Unnamed: 0,classifier,feature vector,accuracy,precision,recall,f1
0,Naive Bayes,Count,0.4965034965034965,0.4946524064171123,0.4998860788334472,0.4959595959595959
1,Naive Bayes,TF-IDF,0.4965034965034965,0.4946524064171123,0.4998860788334472,0.4959595959595959
2,Logistic Regression,Count,0.4965034965034965,0.4946524064171123,0.4998860788334472,0.4959595959595959
3,Logistic Regression,TF-IDF,0.4965034965034965,0.4946524064171123,0.4998860788334472,0.4959595959595959
4,Random Forest,Count,0.4965034965034965,0.4946524064171123,0.4998860788334472,0.4959595959595959
5,Random Forest,TF-IDF,0.4965034965034965,0.4946524064171123,0.4998860788334472,0.4959595959595959
6,Extreme Gradient Boosting,Count,0.4965034965034965,0.4946524064171123,0.4998860788334472,0.4959595959595959
7,Extreme Gradient Boosting,TF-IDF,0.4965034965034965,0.4946524064171123,0.4998860788334472,0.4959595959595959


In [None]:
metrics_df.to_csv('metrics_scores.csv')

In [215]:
metrics_df

Unnamed: 0,classifier,feature vector,accuracy,precision,recall,f1,suspended accuracy,suspended precision,suspended recall,suspended f1
0,Naive Bayes,Count,0.552448,0.551053,0.555024,0.55231,0.278378,0.368161,0.374936,"(0.5384615384615384, 0.5411802232854864, 0.557..."
1,Naive Bayes,TF-IDF,0.552448,0.551053,0.555024,0.55231,0.278378,0.368161,0.374936,"(0.5454545454545454, 0.593192195931922, 0.5748..."
2,Logistic Regression,Count,0.552448,0.551053,0.555024,0.55231,0.278378,0.368161,0.374936,"(0.4965034965034965, 0.4946524064171123, 0.499..."
3,Logistic Regression,TF-IDF,0.552448,0.551053,0.555024,0.55231,0.278378,0.368161,0.374936,"(0.5594405594405595, 0.5727339181286549, 0.573..."
4,Random Forest,Count,0.552448,0.551053,0.555024,0.55231,0.278378,0.368161,0.374936,"(0.5104895104895105, 0.5157828282828283, 0.527..."
5,Random Forest,TF-IDF,0.552448,0.551053,0.555024,0.55231,0.278378,0.368161,0.374936,"(0.48951048951048953, 0.5198160535117057, 0.50..."
6,Extreme Gradient Boosting,Count,0.552448,0.551053,0.555024,0.55231,0.278378,0.368161,0.374936,"(0.5664335664335665, 0.573721340388007, 0.5875..."
7,Extreme Gradient Boosting,TF-IDF,0.552448,0.551053,0.555024,0.55231,0.278378,0.368161,0.374936,"(0.5384615384615384, 0.5411802232854864, 0.557..."
