In [22]:
import pandas as pd
import numpy as np

import bs4 as bs
import pickle
import requests
import time
import lxml
from datetime import datetime, timedelta

import yfinance as yf
import stockstats

import unicodedata
import json
from textblob import TextBlob

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, IsolationForest,VotingClassifier
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.tree import export_graphviz, DecisionTreeClassifier, tree
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.metrics import classification_report, accuracy_score

# Global Variables

In [4]:
stock_date_start = "2009-12-30"
stock_date_end = "2019-12-31"
target_stock = "AAPL"

# Read Data

In [15]:
stock_without_absolute = pd.read_pickle('./data/stock_without_absolute.pkl')
stock_with_absolute = pd.read_pickle('./data/stock_with_absolute.pkl')

label_abs_1d = pd.read_pickle('./data/label_abs_1d.pkl')
label_abs_7d = pd.read_pickle('./data/label_abs_7d.pkl')
label_abs_30d = pd.read_pickle('./data/label_abs_30d.pkl')

label_value_1d = pd.read_pickle('./data/label_value_1d.pkl')
label_value_7d = pd.read_pickle('./data/label_value_7d.pkl')
label_value_30d = pd.read_pickle('./data/label_value_30d.pkl')


# XGBClassifier

In [9]:
# stock_with_absolute, predict 1 day trend 
best_parameters = {'gamma': 0.8, 'max_depth': 2, 'min_child_weight': 7, 'n_estimators': 100, "n_jobs": -1}
rfc = XGBClassifier(**best_parameters)

# train data and label
train_data = stock_with_absolute
label = label_abs_1d

X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size=0.1, random_state=42)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

rfc.fit(X_train, y_train)

predicted_test = rfc.predict(X_test)
predicted_train = rfc.predict(X_train)
decision_function = rfc.predict_proba(X_test)

acc_train = accuracy_score(y_train, predicted_train)
acc_test = accuracy_score(y_test, predicted_test)
print("The Train Accuracy  %0.3f" % (acc_train))
print("The Test Accuracy   %0.3f" % (acc_test ))
pickle.dump(rfc, open('./backend/AAPL/XGBoost/XGB_1d.pkl','wb'))

The Train Accuracy  0.733
The Test Accuracy   0.551


In [10]:
# stock_with_absolute, predict 7 day trend 
best_parameters = {'gamma': 0.8, 'max_depth': 20, 'min_child_weight': 3, 'n_estimators': 100, "n_jobs": -1}
rfc = XGBClassifier(**best_parameters)

# train data and label
train_data = stock_with_absolute
label = label_abs_7d

X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size=0.1, random_state=42)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

rfc.fit(X_train, y_train)

predicted_test = rfc.predict(X_test)
predicted_train = rfc.predict(X_train)
decision_function = rfc.predict_proba(X_test)

acc_train = accuracy_score(y_train, predicted_train)
acc_test = accuracy_score(y_test, predicted_test)
print("The Train Accuracy  %0.3f" % (acc_train))
print("The Test Accuracy   %0.3f" % (acc_test ))
pickle.dump(rfc, open('./backend/AAPL/XGBoost/XGB_7d.pkl','wb'))

The Train Accuracy  1.000
The Test Accuracy   0.765


In [11]:
# stock_with_absolute, predict 30 day trend 
best_parameters = {'gamma': 0.8, 'max_depth': 20, 'min_child_weight': 3, 'n_estimators': 100}
rfc = XGBClassifier(**best_parameters)

# train data and label
train_data = stock_with_absolute
label = label_abs_30d

X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size=0.1, random_state=42)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

rfc.fit(X_train, y_train)

predicted_test = rfc.predict(X_test)
predicted_train = rfc.predict(X_train)
decision_function = rfc.predict_proba(X_test)

acc_train = accuracy_score(y_train, predicted_train)
acc_test = accuracy_score(y_test, predicted_test)
print("The Train Accuracy  %0.3f" % (acc_train))
print("The Test Accuracy   %0.3f" % (acc_test ))
pickle.dump(rfc, open('./backend/AAPL/XGBoost/XGB_30d.pkl','wb'))

The Train Accuracy  1.000
The Test Accuracy   0.927


# RandomForest

In [17]:
# stock_with_absolute, predict 1 day trend 
best_parameters = {'max_depth': 4, 'max_features': 'sqrt', 'n_estimators': 100}
rfc = RandomForestClassifier(**best_parameters)

# train data and label
train_data = stock_with_absolute
label = label_abs_1d

X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size=0.1, random_state=42)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

rfc.fit(X_train, y_train)

predicted_test = rfc.predict(X_test)
predicted_train = rfc.predict(X_train)
decision_function = rfc.predict_proba(X_test)

acc_train = accuracy_score(y_train, predicted_train)
acc_test = accuracy_score(y_test, predicted_test)
print("The Train Accuracy  %0.3f" % (acc_train))
print("The Test Accuracy   %0.3f" % (acc_test ))
pickle.dump(rfc, open('./backend/AAPL/RandomForest/RFC_1d.pkl','wb'))

The Train Accuracy  0.708
The Test Accuracy   0.575


In [18]:
# stock_with_absolute, predict 7 day trend 
best_parameters = {'max_depth': 20, 'max_features': 'sqrt', 'n_estimators': 1000}
rfc = RandomForestClassifier(**best_parameters)

# train data and label
train_data = stock_with_absolute
label = label_abs_7d

X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size=0.1, random_state=42)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

rfc.fit(X_train, y_train)

predicted_test = rfc.predict(X_test)
predicted_train = rfc.predict(X_train)
decision_function = rfc.predict_proba(X_test)

acc_train = accuracy_score(y_train, predicted_train)
acc_test = accuracy_score(y_test, predicted_test)
print("The Train Accuracy  %0.3f" % (acc_train))
print("The Test Accuracy   %0.3f" % (acc_test ))
pickle.dump(rfc, open('./backend/AAPL/RandomForest/RFC_7d.pkl','wb'))

The Train Accuracy  1.000
The Test Accuracy   0.789


In [19]:
# stock_with_absolute, predict 30 day trend 
best_parameters = {'max_depth': 20, 'max_features': 'sqrt', 'n_estimators': 100}
rfc = RandomForestClassifier(**best_parameters)

# train data and label
train_data = stock_with_absolute
label = label_abs_30d

X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size=0.1, random_state=42)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

rfc.fit(X_train, y_train)

predicted_test = rfc.predict(X_test)
predicted_train = rfc.predict(X_train)
decision_function = rfc.predict_proba(X_test)

acc_train = accuracy_score(y_train, predicted_train)
acc_test = accuracy_score(y_test, predicted_test)
print("The Train Accuracy  %0.3f" % (acc_train))
print("The Test Accuracy   %0.3f" % (acc_test ))
pickle.dump(rfc, open('./backend/AAPL/RandomForest/RFC_30d.pkl','wb'))

The Train Accuracy  1.000
The Test Accuracy   0.935


# DecisionTree

In [23]:
# stock_with_absolute, predict 1 day trend 
best_parameters = {'max_depth': 17, 'max_features': 2, 'min_samples_split': 30}
rfc = DecisionTreeClassifier(**best_parameters)

# train data and label
train_data = stock_with_absolute
label = label_abs_1d

X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size=0.1, random_state=42)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

rfc.fit(X_train, y_train)

predicted_test = rfc.predict(X_test)
predicted_train = rfc.predict(X_train)
decision_function = rfc.predict_proba(X_test)

acc_train = accuracy_score(y_train, predicted_train)
acc_test = accuracy_score(y_test, predicted_test)
print("The Train Accuracy  %0.3f" % (acc_train))
print("The Test Accuracy   %0.3f" % (acc_test ))
pickle.dump(rfc, open('./backend/AAPL/DecisionTree/DT_1d.pkl','wb'))

The Train Accuracy  0.753
The Test Accuracy   0.534


In [24]:
# stock_with_absolute, predict 7 day trend 
best_parameters = {'max_depth': 17, 'max_features': None, 'min_samples_split': 10}
rfc = DecisionTreeClassifier(**best_parameters)

# train data and label
train_data = stock_with_absolute
label = label_abs_7d

X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size=0.1, random_state=42)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

rfc.fit(X_train, y_train)

predicted_test = rfc.predict(X_test)
predicted_train = rfc.predict(X_train)
decision_function = rfc.predict_proba(X_test)

acc_train = accuracy_score(y_train, predicted_train)
acc_test = accuracy_score(y_test, predicted_test)
print("The Train Accuracy  %0.3f" % (acc_train))
print("The Test Accuracy   %0.3f" % (acc_test ))
pickle.dump(rfc, open('./backend/AAPL/DecisionTree/DT_7d.pkl','wb'))

The Train Accuracy  0.963
The Test Accuracy   0.692


In [25]:
# stock_with_absolute, predict 30 day trend 
best_parameters = {'max_depth': 11, 'max_features': None, 'min_samples_split': 20}
rfc = DecisionTreeClassifier(**best_parameters)

# train data and label
train_data = stock_with_absolute
label = label_abs_30d

X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size=0.1, random_state=42)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

rfc.fit(X_train, y_train)

predicted_test = rfc.predict(X_test)
predicted_train = rfc.predict(X_train)
decision_function = rfc.predict_proba(X_test)

acc_train = accuracy_score(y_train, predicted_train)
acc_test = accuracy_score(y_test, predicted_test)
print("The Train Accuracy  %0.3f" % (acc_train))
print("The Test Accuracy   %0.3f" % (acc_test ))
pickle.dump(rfc, open('./backend/AAPL/DecisionTree/DT_30d.pkl','wb'))

The Train Accuracy  0.950
The Test Accuracy   0.818


# LogisticRegression

In [26]:
# stock_with_absolute, predict 1 day trend 
best_parameters = {'solver': 'lbfgs'}
rfc = LogisticRegression(**best_parameters)

# train data and label
train_data = stock_with_absolute
label = label_abs_1d

X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size=0.1, random_state=42)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

rfc.fit(X_train, y_train)

predicted_test = rfc.predict(X_test)
predicted_train = rfc.predict(X_train)
decision_function = rfc.predict_proba(X_test)

acc_train = accuracy_score(y_train, predicted_train)
acc_test = accuracy_score(y_test, predicted_test)
print("The Train Accuracy  %0.3f" % (acc_train))
print("The Test Accuracy   %0.3f" % (acc_test ))
pickle.dump(rfc, open('./backend/AAPL/LogisticRegression/LR_1d.pkl','wb'))

The Train Accuracy  0.573
The Test Accuracy   0.555


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [27]:
# stock_with_absolute, predict 7 day trend 
best_parameters = {'solver': 'newton-cg'}
rfc = LogisticRegression(**best_parameters)

# train data and label
train_data = stock_with_absolute
label = label_abs_7d

X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size=0.1, random_state=42)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

rfc.fit(X_train, y_train)

predicted_test = rfc.predict(X_test)
predicted_train = rfc.predict(X_train)
decision_function = rfc.predict_proba(X_test)

acc_train = accuracy_score(y_train, predicted_train)
acc_test = accuracy_score(y_test, predicted_test)
print("The Train Accuracy  %0.3f" % (acc_train))
print("The Test Accuracy   %0.3f" % (acc_test ))
pickle.dump(rfc, open('./backend/AAPL/LogisticRegression/LR_7d.pkl','wb'))

The Train Accuracy  0.615
The Test Accuracy   0.583


In [28]:
# stock_with_absolute, predict 30 day trend 
best_parameters = {'solver': 'liblinear'}
rfc = LogisticRegression(**best_parameters)

# train data and label
train_data = stock_with_absolute
label = label_abs_30d

X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size=0.1, random_state=42)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

rfc.fit(X_train, y_train)

predicted_test = rfc.predict(X_test)
predicted_train = rfc.predict(X_train)
decision_function = rfc.predict_proba(X_test)

acc_train = accuracy_score(y_train, predicted_train)
acc_test = accuracy_score(y_test, predicted_test)
print("The Train Accuracy  %0.3f" % (acc_train))
print("The Test Accuracy   %0.3f" % (acc_test ))
pickle.dump(rfc, open('./backend/AAPL/LogisticRegression/LR_30d.pkl','wb'))

The Train Accuracy  0.670
The Test Accuracy   0.729


# Load and Process Data

In [29]:
import pandas as pd
import numpy as np

import bs4 as bs
import pickle
import requests
import time
import lxml
from datetime import datetime, timedelta

import yfinance as yf
import stockstats

import unicodedata
import json
from textblob import TextBlob

In [102]:
stock_date_start = "2020-03-01"
stock_raw = yf.download(target_stock, start=stock_date_start)
stock_raw_data = stock_raw
stock_raw_data['high_low_diff'] = (stock_raw_data['High'] - stock_raw_data['Low'])
stock_raw_data['open_close_diff'] = (stock_raw_data['Open'] - stock_raw_data['Close'])
stock_raw_data['high_low_diff_ratio'] = (stock_raw_data['High'] - stock_raw_data['Low']) / stock_raw_data['Close']
stock_raw_data['open_close_diff_ratio'] = (stock_raw_data['Open'] - stock_raw_data['Close']) / stock_raw_data['Close']
stock_stats_data = stockstats.StockDataFrame.retype(stock_raw_data)
stock_stats_data[['change', 'open_delta','close_delta','volume_delta', 'close_-2_r','close_-6_r', 'boll', 'boll_ub', 
                  'boll_lb', 'boll_-1_d', 'boll_ub_-1_d', 'boll_lb_-1_d' , 'kdjk','kdjd','kdjj', 'macd','macds',
                  'macdh', 'rsi_6', 'rsi_12', 'wr_6', 'wr_12', 'cci', 'atr', 'dma', 'vr']]
stock_data = pd.DataFrame(stock_stats_data)
stock_data = stock_data.dropna()
stock_data['boll_k_diff'] = stock_data['boll'] - stock_data['close']

stock_with_absolute = stock_data[['change', 'open_delta','close_delta','volume_delta', 'high_low_diff_ratio', 
                                'open_close_diff_ratio','close_-2_r','close_-6_r','kdjk','kdjd','kdjj', 'macd',
                                'macds', 'macdh', 'rsi_6', 'rsi_12', 'wr_6', 'wr_12', 'cci', 'atr', 'dma', 'vr', 
                                'boll_-1_d','boll_ub_-1_d', 'boll_lb_-1_d', 'boll_k_diff', 'high_low_diff', 
                                'open_close_diff', 'open', 'high', 'low', 'close', 'adj close', 'volume', ]]

print("Process data from: ", stock_with_absolute.index[0], " to ", stock_with_absolute.index[-1])
final_index = stock_with_absolute

SP500_raw = yf.download("^GSPC", start=stock_date_start)
SP500_raw_data = SP500_raw
SP500_raw_data['high_low_diff'] = (SP500_raw['High'] - SP500_raw['Low'])
SP500_raw_data['open_close_diff'] = (SP500_raw['Open'] - SP500_raw['Close'])
SP500_raw_data['high_low_diff_ratio'] = (SP500_raw['High'] - SP500_raw['Low']) / SP500_raw['Close']
SP500_raw_data['open_close_diff_ratio'] = (SP500_raw['Open'] - SP500_raw['Close']) / SP500_raw['Close']
SP500_stats_data = stockstats.StockDataFrame.retype(SP500_raw_data)
SP500_stats_data[['change','close_delta','volume_delta', 'close_-2_r','close_-6_r']]

SP500_stock = pd.DataFrame(SP500_stats_data).add_prefix('sp500_')
stock_with_absolute = stock_with_absolute.join(SP500_stock.loc[stock_with_absolute.index])
stock_with_absolute

gold_raw = yf.download("GLD", start=stock_date_start)
gold_raw_data = gold_raw
# Add high_low_diff, open_close_diff, high_low_diff_ratio, open_close_diff_ratio
gold_raw_data['high_low_diff'] = (gold_raw['High'] - gold_raw['Low'])
gold_raw_data['open_close_diff'] = (gold_raw['Open'] - gold_raw['Close'])
gold_raw_data['high_low_diff_ratio'] = (gold_raw['High'] - gold_raw['Low']) / gold_raw['Close']
gold_raw_data['open_close_diff_ratio'] = (gold_raw['Open'] - gold_raw['Close']) / gold_raw['Close']

# Add financial indicators
gold_stats_data = stockstats.StockDataFrame.retype(gold_raw_data)
gold_stats_data[['change','close_delta','volume_delta', 'close_-2_r','close_-6_r']]

gold_stock = pd.DataFrame(gold_stats_data).add_prefix('gold_')
stock_with_absolute = stock_with_absolute.join(gold_stock.loc[stock_with_absolute.index])

# 5 year bonds
y5bond_raw = yf.download("^FVX", start=stock_date_start)
y5bond_raw_data = y5bond_raw
y5bond_raw_data['high_low_diff'] = (y5bond_raw['High'] - y5bond_raw['Low'])
y5bond_raw_data['open_close_diff'] = (y5bond_raw['Open'] - y5bond_raw['Close'])
y5bond_raw_data['high_low_diff_ratio'] = (y5bond_raw['High'] - y5bond_raw['Low']) / y5bond_raw['Close']
y5bond_raw_data['open_close_diff_ratio'] = (y5bond_raw['Open'] - y5bond_raw['Close']) / y5bond_raw['Close']
y5bond_stats_data = stockstats.StockDataFrame.retype(y5bond_raw_data)
y5bond_stats_data[['change','close_delta', 'close_-2_r','close_-6_r']] ### no volume

y5bond_stock = pd.DataFrame(y5bond_stats_data).drop(columns='volume').add_prefix('y5bond_')

# 10 year bonds
y10bond_raw = yf.download("^TNX", start=stock_date_start)
y10bond_raw_data = y10bond_raw
y10bond_raw_data['high_low_diff'] = (y10bond_raw['High'] - y10bond_raw['Low'])
y10bond_raw_data['open_close_diff'] = (y10bond_raw['Open'] - y10bond_raw['Close'])
y10bond_raw_data['high_low_diff_ratio'] = (y10bond_raw['High'] - y10bond_raw['Low']) / y10bond_raw['Close']
y10bond_raw_data['open_close_diff_ratio'] = (y10bond_raw['Open'] - y10bond_raw['Close']) / y10bond_raw['Close']
y10bond_stats_data = stockstats.StockDataFrame.retype(y10bond_raw_data)
y10bond_stats_data[['change','close_delta', 'close_-2_r','close_-6_r']] ### no volume

y10bond_stock = pd.DataFrame(y10bond_stats_data).drop(columns='volume').add_prefix('y10bond_')
stock_with_absolute = pd.merge(stock_with_absolute, y10bond_stock, how="inner", left_index=True, right_index=True)
stock_with_absolute = pd.merge(stock_with_absolute, y5bond_stock, how="inner", left_index=True, right_index=True)
target = stock_with_absolute.tail(2)

[*********************100%***********************]  1 of 1 completed

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
NOTE: Behavior of MACDH calculation has changed as of July 2017 - it is now 1/2 of previous calculated values



Process data from:  2020-03-10 00:00:00  to  2020-04-27 00:00:00
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [104]:
from pytz import timezone
tz = timezone('EST')
EST_Timezone = datetime.now(tz)
if(EST_Timezone.hour > 16 & EST_Timezone.minute > 30):
    target = target.tail(1)
else:
    target = target.head(1)
target










Unnamed: 0_level_0,change,open_delta,close_delta,volume_delta,high_low_diff_ratio,open_close_diff_ratio,close_-2_r,close_-6_r,kdjk,kdjd,...,y5bond_close,y5bond_adj close,y5bond_high_low_diff,y5bond_open_close_diff,y5bond_high_low_diff_ratio,y5bond_open_close_diff_ratio,y5bond_change,y5bond_close_delta,y5bond_close_-2_r,y5bond_close_-6_r
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-04-24,2.886959,1.330017,7.940002,336700.0,0.021239,-0.020391,2.488227,-1.297569,60.684133,63.548061,...,0.364,0.364,0.023,0.005,0.063187,0.013736,-1.355012,-0.005,0.0,6.744869


Unnamed: 0_level_0,change,open_delta,close_delta,volume_delta,high_low_diff_ratio,open_close_diff_ratio,close_-2_r,close_-6_r,kdjk,kdjd,...,y5bond_close,y5bond_adj close,y5bond_high_low_diff,y5bond_open_close_diff,y5bond_high_low_diff_ratio,y5bond_open_close_diff_ratio,y5bond_change,y5bond_close_delta,y5bond_close_-2_r,y5bond_close_-6_r
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-04-27,-0.565433,4.599976,-1.600006,-27187635.0,0.008707,0.001528,2.305202,-0.505655,63.79087,63.628997,...,0.386,0.386,0.008,-0.003,0.020725,-0.007772,6.043952,0.022,4.607043,6.043952


In [72]:
from bs4 import BeautifulSoup as bs
def get_news(symbol, time):
    USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
    headers={"user-agent" : USER_AGENT}
    target='AAPL'
    keyword = quote(symbol.encode('utf8'))
    total_search = {}
    target_param = {
        "tbm": "nws",
        "hl": "en",
        "lr": "lang_en",
        "q": keyword,
        "oq": keyword,
        "dcr": "0",
        "source": "lnt",
        "num": 5,
        "tbs": "cdr:1,cd_min:"+time+",cd_max:"+time,
    }
    url = "https://www.google.com.tw/search?" + urlencode(target_param)
    res = requests.get(url, headers=headers)
    if res.status_code == 200:
        content = res.content
        soup = bs(content, "html.parser")
        search_list = []
        items = soup.findAll("div", {"class": "g"})
        if items:
            for index, item in enumerate(items):
                    # title
                    news_title = item.find("h3", {"class": "r"}).find("a").text
                    # url
                    href = item.find("h3", {"class": "r"}).find("a").get("href")
                    news_link = href


                    # content
                    news_text = item.find("div", {"class": "st"}).text

                    # source
                    news_source = item.find("h3", {"class": "r"}).findNext('div').text.split('-')
                    news_from = news_source[0]
                    time_created = str(news_source[1])

                    # add item into json object
                    search_list.append({
                        "news_title": news_title,
                        "news_link": news_link,
                        "news_text": news_text,
                        "news_from": news_from,
                        "time_created": time_created
                    })
            total_search[target_date] = search_list
    else:
        print('error at '+str(i))
    return total_search

In [73]:
temp = get_news("AAPL")

In [74]:
temp

{'04/27/2020': [{'news_title': 'Apple Inc. (AAPL) future in Consumer Goods Sector',
   'news_link': 'https://newsheater.com/2020/04/27/apple-inc-aapl-future-in-consumer-goods-sector/',
   'news_text': '(NASDAQ: AAPL) scored price to earnings ratio above its average ratio, recording 22.35 times of increase in earnings at the present. AAPL Market Performance.',
   'news_from': 'The News Heater',
   'time_created': '1 hour ago'},
  {'news_title': 'Analysts Offer Insights on Technology Companies: Mitek ...',
   'news_link': 'https://www.smarteranalyst.com/new-blurbs/analysts-offer-insights-on-technology-companies-mitek-systems-mitk-and-apple-aapl/',
   'news_text': "Apple (AAPL). Monness analyst Brian White maintained a Buy rating on Apple today and set a price target of $370.00. The company's shares closed last Friday\xa0...",
   'news_from': 'Smarter Analyst',
   'time_created': '53 minutes ago'},
  {'news_title': 'Apple delays 5G iPhone production - WSJ',
   'news_link': 'https://seekin

In [None]:
# -*- coding: utf-8 -*-
time_index = list(financial_data.index)
score = {}
des_score = {}
for time in time_index:
    yesterday = (time - timedelta(days=1)).strftime("%m/%d/%Y")
    if(yesterday in data):
        num_news = len(data[yesterday])
        sentiment = 0
        des_sentiment = 0
        #print(yesterday, ', ', num_news)
        for news in data[yesterday]:
            news_title = news['news_title'].replace('...', '')
            news_des = news['news_text'].encode("ascii", "ignore").decode("ascii").replace('...', '')
            blob = TextBlob(news_title)
            des_blob = TextBlob(news_des)
            sentiment += blob.sentiment.polarity
            des_sentiment += des_blob.sentiment.polarity
            #print('Title: ', news_title, ', ',blob.sentiment.polarity)
            #print('Description: ', news_des, ', ', des_blob.sentiment.polarity)
        score[time] = sentiment / num_news
        des_score[time] = des_sentiment / num_news
        #print()
    else:
        score[time] = 0.0
        des_score[time] = 0.0

In [61]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from urllib.parse import quote, parse_qs, urlparse, urlencode
import time

USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
headers={"user-agent" : USER_AGENT}
target='AAPL'
keyword = quote(target.encode('utf8'))
total_search = {}
start_time = time.time()

i = datetime.now()
time.sleep(0.5)
target_date = i.strftime("%m/%d/%Y")
target_param = {
    "tbm": "nws",
    "hl": "en",
    "lr": "lang_en",
    "q": keyword,
    "oq": keyword,
    "dcr": "0",
    "source": "lnt",
    "num": 5,
    "tbs": "cdr:1,cd_min:"+target_date+",cd_max:"+target_date,
}
url = "https://www.google.com.tw/search?" + urlencode(target_param)
res = requests.get(url, headers=headers)
if res.status_code == 200:
    content = res.content
    soup = BeautifulSoup(content, "html.parser")
    search_list = []
    items = soup.findAll("div", {"class": "g"})
    if items:
        for index, item in enumerate(items):
                # title
                news_title = item.find("h3", {"class": "r"}).find("a").text
                # url
                href = item.find("h3", {"class": "r"}).find("a").get("href")
                news_link = href


                # content
                news_text = item.find("div", {"class": "st"}).text

                # source
                news_source = item.find("h3", {"class": "r"}).findNext('div').text.split('-')
                news_from = news_source[0]
                time_created = str(news_source[1])

                # add item into json object
                search_list.append({
                    "news_title": news_title,
                    "news_link": news_link,
                    "news_text": news_text,
                    "news_from": news_from,
                    "time_created": time_created
                })
        total_search[target_date] = search_list
else:
    print('error at '+str(i))

In [62]:
total_search

{'04/27/2020': [{'news_title': 'Apple Inc. (AAPL) future in Consumer Goods Sector',
   'news_link': 'https://newsheater.com/2020/04/27/apple-inc-aapl-future-in-consumer-goods-sector/',
   'news_text': '(NASDAQ: AAPL) scored price to earnings ratio above its average ratio, recording 22.35 times of increase in earnings at the present. AAPL Market Performance.',
   'news_from': 'The News Heater',
   'time_created': '1 hour ago'},
  {'news_title': 'Analysts Offer Insights on Technology Companies: Mitek ...',
   'news_link': 'https://www.smarteranalyst.com/new-blurbs/analysts-offer-insights-on-technology-companies-mitek-systems-mitk-and-apple-aapl/',
   'news_text': "Apple (AAPL). Monness analyst Brian White maintained a Buy rating on Apple today and set a price target of $370.00. The company's shares closed last Friday\xa0...",
   'news_from': 'Smarter Analyst',
   'time_created': '34 minutes ago'},
  {'news_title': 'Apple delays 5G iPhone production - WSJ',
   'news_link': 'https://seekin