In [4]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas as pd, xgboost, numpy as np, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

In [148]:
news_scraping_results = pd.read_csv("./news_scraping_results_and_label.csv")

In [149]:
news_scraping_results = news_scraping_results.dropna()

In [150]:
balanced_news = news_scraping_results[news_scraping_results['sentiment'] == 'positive'].reset_index()
balanced_news = balanced_news.append(news_scraping_results[news_scraping_results['sentiment'] == 'negative'].reset_index()[:218], ignore_index=True)
balanced_news = balanced_news.append(news_scraping_results[news_scraping_results['sentiment'] == 'neutral'].reset_index()[:218], ignore_index=True)

In [151]:
X, y = balanced_news[['CONTENT']].values.flatten().astype(str), balanced_news[['sentiment']].values.flatten().astype(str)

In [152]:
# split the dataset into training and validation datasets 
train_X, test_X, train_y, test_y = model_selection.train_test_split(X, y, test_size=0.1, random_state=42)

In [153]:
# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

In [154]:
encode_y = encoder.fit_transform(y)

In [102]:
#encode_y.classes_

array(['negative', 'neutral', 'positive'], dtype='<U8')

In [131]:
#pd.DataFrame(encode_y, columns=['sentiment']).to_csv('encoded_labels.csv')

In [155]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(X)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='\\w{1,}', tokenizer=None,
                vocabulary=None)

In [156]:
#count_vect.vocabulary_

In [157]:
vector = count_vect.transform(X)
# summarize encoded vector
#print(vector.shape)


In [135]:
#vector.toarray()

array([[2, 0, 0, ..., 0, 0, 0],
       [3, 0, 5, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [136]:
#pd.DataFrame(vector.toarray()).to_csv('count_vectorized_CONTENT.csv')

In [158]:
# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_X)
xtest_count =  count_vect.transform(test_X)

In [159]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=None)
tfidf_vect.fit(X)
xtrain_tfidf =  tfidf_vect.transform(train_X)
xtest_tfidf =  tfidf_vect.transform(test_X)

In [139]:
#tfidf_vect.vocabulary_

{'three': 15696,
 'value': 16471,
 'invest': 9037,
 'etf': 6670,
 'worth': 17087,
 'consider': 4901,
 'now': 11417,
 'offer': 11533,
 'opportunity': 11656,
 'for': 7280,
 'exposure': 6855,
 'to': 15798,
 'small': 14526,
 'cap': 4032,
 'stock': 14977,
 'that': 15622,
 'have': 8086,
 'the': 15625,
 'potential': 12531,
 'outperform': 11786,
 'large': 9661,
 'and': 2526,
 'mid': 10632,
 'fund': 7483,
 'give': 7701,
 'investor': 9049,
 'way': 16819,
 'zero': 17286,
 'in': 8646,
 'on': 11596,
 'book': 3635,
 'price': 12660,
 'ratio': 13097,
 'earning': 6201,
 'sale': 13885,
 'first': 7144,
 'momentum': 10837,
 'with': 17027,
 'second': 14075,
 'one': 11609,
 'dividend': 5964,
 'third': 15673,
 'typically': 16131,
 'grow': 7903,
 'more': 10883,
 'slowly': 14518,
 'than': 15616,
 'average': 3036,
 'company': 4737,
 'but': 3901,
 'pron': 12789,
 'still': 14964,
 'can': 3999,
 'achieve': 2003,
 'strong': 15049,
 'return': 13578,
 'reduce': 13241,
 'risk': 13662,
 'compare': 4745,
 'similar': 144

In [140]:
#tfidf_vect.idf_

array([2.23236181, 3.19417579, 2.50102861, ..., 6.79148806, 6.79148806,
       6.79148806])

In [141]:
# encode document
vector = tfidf_vect.transform(X)
# summarize encoded vector
#print(vector.shape)
#print(vector.toarray())

(654, 17331)
[[0.03471316 0.         0.         ... 0.         0.         0.        ]
 [0.04646154 0.         0.08675538 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.01591054 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [142]:
#pd.DataFrame(vector.toarray()).to_csv('tfidf_vectorized_CONTENT.csv')

In [160]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, test_y)

In [161]:
# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xtest_count)
accuracy

0.5909090909090909

In [162]:
# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xtest_tfidf)
accuracy

0.5

In [163]:
# Linear Classifier on Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xtest_count)
accuracy



0.6363636363636364

In [164]:
# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xtest_tfidf)
accuracy

0.5757575757575758

In [166]:
# Random Forest on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xtest_count)
print("Random Forest, Count Vectors: ", accuracy)

# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xtest_tfidf)
print("Random Forest, WordLevel TF-IDF: ", accuracy)


RF, Count Vectors:  0.48484848484848486
RF, WordLevel TF-IDF:  0.45454545454545453




In [167]:
# Extereme Gradient Boosting on Count Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xtest_count.tocsc())
print("Xgb, Count Vectors: ", accuracy)

# Extereme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xtest_tfidf.tocsc())
print("Xgb, WordLevel TF-IDF: ", accuracy)

Xgb, Count Vectors:  0.5
Xgb, WordLevel TF-IDF:  0.5454545454545454
