In [1]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas as pd, xgboost, numpy as np, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

Using TensorFlow backend.


In [2]:
news_scraping_results = pd.read_csv("./news_scraping_results_and_label.csv")

In [3]:
news_scraping_results = news_scraping_results.dropna(subset=['sentiment'])

In [4]:
tickers_list = pd.read_csv("./tickers_list.csv")

In [5]:
suspended_ticker_news = news_scraping_results[news_scraping_results['SYMBOL'].isin(tickers_list['Symbol'].values)]

In [6]:
active_ticker_news = news_scraping_results[~news_scraping_results['SYMBOL'].isin(tickers_list['Symbol'].values)]

In [7]:
active_ticker_news.index.values

array([   0,    1,    2, ..., 2571, 2572, 2573])

In [8]:
balanced_news = active_ticker_news[active_ticker_news['sentiment'] == 'positive'].reset_index()
balanced_news = balanced_news.append(active_ticker_news[active_ticker_news['sentiment'] == 'negative'].reset_index()[:237], ignore_index=True)
balanced_news = balanced_news.append(active_ticker_news[active_ticker_news['sentiment'] == 'neutral'].reset_index()[:237], ignore_index=True)

In [9]:
suspended_balanced_news = balanced_news.append(suspended_ticker_news.reset_index(), ignore_index=True)

In [10]:
# whole dataset with suspended and balanced active stock
X, y = suspended_balanced_news[['CONTENT']].values.flatten().astype(str), suspended_balanced_news[['sentiment']].values.flatten().astype(str)

In [11]:
# dataset without suspended stock 
Xb, yb = balanced_news[['CONTENT']].values.flatten().astype(str), balanced_news[['sentiment']].values.flatten().astype(str)

In [12]:
Xs, ys = suspended_ticker_news[['CONTENT']].values.flatten().astype(str), suspended_ticker_news[['sentiment']].values.flatten().astype(str)

In [13]:
# split the dataset into training and validation datasets 
train_X, test_X, train_y, test_y = model_selection.train_test_split(Xb, yb, test_size=0.2, random_state=42)

In [14]:
# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

In [15]:
test_ys = encoder.fit_transform(ys)

In [50]:
encode_y = encoder.fit_transform(y)

In [92]:
#encode_y.classes_

In [93]:
#pd.DataFrame(encode_y, columns=['sentiment']).to_csv('balanced_encoded_labels.csv')

In [16]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(X)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='\\w{1,}', tokenizer=None,
                vocabulary=None)

In [52]:
#count_vect.vocabulary_

In [53]:
vector = count_vect.transform(X)
# summarize encoded vector
#print(vector.shape)


In [54]:
#pd.DataFrame(vector.toarray()).to_csv('balanced_count_vectorized_CONTENT.csv')

In [17]:
# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_X)
xtest_count =  count_vect.transform(test_X)

In [18]:
xstest_count =  count_vect.transform(Xs)

In [19]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=None)
tfidf_vect.fit(X)
xtrain_tfidf =  tfidf_vect.transform(train_X)
xtest_tfidf =  tfidf_vect.transform(test_X)

In [20]:
xstest_tfidf =  tfidf_vect.transform(Xs)

In [99]:
#tfidf_vect.vocabulary_

In [100]:
#tfidf_vect.idf_

In [71]:
# encode document
vector = tfidf_vect.transform(X)
# summarize encoded vector
#print(vector.shape)
#print(vector.toarray())

In [102]:
#pd.DataFrame(vector.toarray()).to_csv('balanced_tfidf_vectorized_CONTENT.csv')

In [76]:
def train_model(classifier, feature_vector_train, label, feature_vector_test, feature_vector_suspended, is_neural_net=False, classifier_name=None, feature_vector_name=None):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_test)
    
    # predict the labels on the suspension stock dataset
    suspended_prediction = classifier.predict(feature_vector_suspended)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    accuracy = metrics.accuracy_score(test_y, predictions)
    precision = metrics.precision_score(test_y, predictions, average="macro")
    recall = metrics.recall_score(test_y, predictions, average="macro")
    f1 = metrics.f1_score(test_y, predictions, average="macro")
    
    
    suspended_accuracy = metrics.accuracy_score(test_ys, suspended_prediction)
    suspended_precision = metrics.precision_score(test_ys, suspended_prediction, average="macro")
    suspended_recall = metrics.recall_score(test_ys, suspended_prediction, average="macro")
    suspended_f1 = metrics.f1_score(test_ys, suspended_prediction, average="macro")
    
    df = pd.DataFrame([[classifier_name, feature_vector_name, accuracy,precision,recall,f1, suspended_accuracy, suspended_precision, suspended_recall, suspended_f1]], columns=['classifier','feature vector','accuracy','precision','recall','f1', 'suspended_accuracy', 'suspended_precision', 'suspended_recall', 'suspended_f1'])
    
    return df

In [109]:
metrics_df = pd.DataFrame(columns=['classifier', 'feature vector', 'accuracy', 'precision', 'recall', 'f1', 'suspended_accuracy', 'suspended_precision', 'suspended_recall', 'suspended_f1'])

In [110]:
# Naive Bayes on Count Vectors
scores = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xtest_count, xstest_count, classifier_name='Naive Bayes', feature_vector_name='Count')
metrics_df = metrics_df.append(scores, ignore_index=True)

In [112]:
# Naive Bayes on TF-IDF Vectors
scores = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xtest_tfidf, xstest_tfidf, classifier_name='Naive Bayes', feature_vector_name='TF-IDF')
metrics_df = metrics_df.append(scores, ignore_index=True)

In [114]:
# Logistic Regression on Count Vectors
scores = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xtest_count, xstest_count, classifier_name='Logistic Regression', feature_vector_name='Count')
metrics_df = metrics_df.append(scores, ignore_index=True)



In [115]:
# Logistic Regression on TF-IDF Vectors
scores = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xtest_tfidf, xstest_tfidf, classifier_name='Logistic Regression', feature_vector_name='TF-IDF')
metrics_df = metrics_df.append(scores, ignore_index=True)

In [116]:
# Random Forest on Count Vectors
scores = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xtest_count, xstest_count, classifier_name='Random Forest', feature_vector_name='Count')
metrics_df = metrics_df.append(scores, ignore_index=True)



In [117]:
# Random Forest on TF-IDF Vectors
scores = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xtest_tfidf, xstest_tfidf, classifier_name='Random Forest', feature_vector_name='TF-IDF')
metrics_df = metrics_df.append(scores, ignore_index=True)



In [118]:
# Extreme Boosting Gradient on Count Vectors
scores = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xtest_count.tocsc(), xstest_count.tocsc(), classifier_name='Extreme Boosting Gradient', feature_vector_name='Count')
metrics_df = metrics_df.append(scores, ignore_index=True)

In [119]:
# Extreme Boosting Gradient on TF-IDF Vectors
scores = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xtest_tfidf.tocsc(), xstest_tfidf.tocsc(), classifier_name='Extreme Gradient Boosting', feature_vector_name='TF-IDF')
metrics_df = metrics_df.append(scores, ignore_index=True)

In [124]:
metrics_df

Unnamed: 0,classifier,feature vector,accuracy,precision,recall,f1,suspended_accuracy,suspended_precision,suspended_recall,suspended_f1
0,Naive Bayes,Count,0.552448,0.5522,0.554663,0.552751,0.275676,0.366836,0.372062,0.264946
1,Naive Bayes,TF-IDF,0.573427,0.625708,0.604181,0.559187,0.372973,0.354581,0.367763,0.2951
2,Logistic Regression,Count,0.496503,0.494652,0.499886,0.49596,0.327027,0.363528,0.364584,0.307035
3,Logistic Regression,TF-IDF,0.552448,0.563874,0.565751,0.555093,0.281081,0.322197,0.324936,0.247383
4,Random Forest,Count,0.48951,0.500369,0.515208,0.484711,0.332432,0.350679,0.363959,0.318469
5,Random Forest,TF-IDF,0.482517,0.504972,0.498196,0.485387,0.345946,0.370787,0.359918,0.328673
6,Extreme Boosting Gradient,Count,0.566434,0.573721,0.587529,0.565458,0.359459,0.383909,0.390571,0.337581
7,Extreme Gradient Boosting,TF-IDF,0.545455,0.549441,0.565448,0.544438,0.375676,0.411842,0.397664,0.350537


In [125]:
metrics_df.to_csv('metrics_scores.csv')