In [1]:
%load_ext autoreload
%autoreload 2
import sys

# Append the directories listed below to the import search path, skipping any
# that are already present — presumably so the `ams` imports that follow resolve.
# NOTE(review): these are absolute, machine-specific paths; confirm they match
# the environment the notebook is executed in.
paths_to_add = ['/home/jovyan/work', '/home/jupyter/alpha_media_signal']

for p in paths_to_add:
    if p not in sys.path:
        sys.path.append(p)
        
from datetime import datetime
import gc
import pandas as pd
from pathlib import Path
from ams.config import constants, logger_factory
from ams.services import twitter_service
from ams.services import ticker_service
from ams.notebooks.twitter.twitter_ml_utils import  get_data_for_predictions
from ams.notebooks.twitter import twitter_ml_utils
from ams.utils import date_utils
import xgboost as xgb
from ams.machine_learning.twitter import pred_perf_testing
from datetime import timedelta
from typing import Dict

%matplotlib inline

# Raise pandas' display limits (rows, columns, line width) for this notebook.
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Notebook-wide logger, created through the project's logger factory.
logger = logger_factory.create(__name__)

# Run a garbage-collection pass; being the last expression in the cell, its
# integer return value is echoed as the cell output.
gc.collect()

4

In [2]:
def is_good_date(dt):
    """Tell whether ``dt`` can be used as a prediction date.

    :param dt: the date to check; passed unchanged to
        ``date_utils.is_stock_market_closed``.
    :return: ``True`` when the market is not reported closed for ``dt``;
        otherwise an info message is logged and ``False`` is returned.
    """
    if not date_utils.is_stock_market_closed(dt):
        return True

    logger.info("No can do. Market closed.")
    return False

def get_stock_matchable(df):
    """Keep only the rows whose ticker has stock data available.

    Every distinct value of the ``f22_ticker`` column is checked once with
    ``ticker_service.does_ticker_data_exist``.

    :param df: frame with an ``f22_ticker`` column.
    :return: the rows of ``df`` whose ``f22_ticker`` passed the check.
    """
    distinct_tickers = set(df["f22_ticker"].to_list())
    tickers_with_data = [
        symbol
        for symbol in distinct_tickers
        if ticker_service.does_ticker_data_exist(ticker=symbol)
    ]

    has_stock_data = df["f22_ticker"].isin(tickers_with_data)
    return df[has_stock_data]

In [3]:
%%time
# Notebook-level state, reset whenever this cell is re-run.
# (The former `overall_roi = []` was a dead store: it was overwritten by the
# `None` assignment below before anything could read it.)
cat_uniques = None
overall_roi = None
# start_date = date_utils.get_standard_ymd_format(datetime.now())

# Load the raw tweet frame from the "main" sub-directory of the great-reduction
# output, then keep only the tickers for which stock data exists.
learning_prep_dir = Path(constants.TWITTER_GREAT_REDUCTION_DIR, "main")
df_tweets = twitter_ml_utils.load_twitter_raw(learning_prep_dir=learning_prep_dir)
df_tweets_joinable = get_stock_matchable(df=df_tweets)

Wall time: 1.06 s


In [4]:
%%time
# twitter_ml_utils.show_distribution(df=df_tweets_joinable)

Wall time: 0 ns


In [5]:
%%time
def convert_columns(df):
    """Run the tweet frame through the two project conversion steps.

    ``twitter_service.convert_to_bool`` is applied first and its result is fed
    to ``twitter_ml_utils.convert_twitter_to_numeric``.

    :param df: raw tweet frame.
    :return: whatever ``convert_twitter_to_numeric`` returns for the converted frame.
    """
    return twitter_ml_utils.convert_twitter_to_numeric(
        df=twitter_service.convert_to_bool(df=df)
    )

# df_twitter = convert_columns(df=df_tweets_joinable)

Wall time: 0 ns


In [6]:
%%time
def get_stocks_based_on_tweets(df, prediction_date_str, num_hold_days):
    """Build the stock frame for training dates plus stub rows for the prediction date.

    Tweets dated strictly before ``prediction_date_str`` are handed to
    ``twitter_ml_utils.get_twitter_stock_data_2``. For every ticker tweeted
    about on ``prediction_date_str`` one extra row is appended carrying the
    ticker, the prediction date, the sale date and the latest known volume
    and close.

    The body previously read a notebook-level ``predict_date_str`` instead of
    this function's own ``prediction_date_str`` argument; it now uses the
    argument, so the function no longer depends on hidden global state.

    :param df: tweet frame with ``date`` and ``f22_ticker`` columns.
    :param prediction_date_str: the date to predict for, as a date string
        comparable with the ``date`` column.
    :param num_hold_days: holding period forwarded to the helper functions.
    :return: the historical stock rows concatenated with the prediction-date rows.
    """
    df_stock_tweets = df[df["date"] < prediction_date_str]
    df_stock_data = twitter_ml_utils.get_twitter_stock_data_2(df_tweets=df_stock_tweets,
                                                              num_hold_days=num_hold_days)

    prediction_tickers = list(df[df["date"] == prediction_date_str]["f22_ticker"].unique())

    future_date = twitter_ml_utils.get_next_market_date(prediction_date_str, num_hold_days)

    # NOTE(review): the lookup below is not given a date, so it presumably
    # returns the newest values on record rather than those preceding
    # prediction_date_str — confirm this is intended when back-testing.
    attributes = ("volume", "close")
    rows = []
    for t in prediction_tickers:
        prev_volume, prev_close = ticker_service.get_most_recent_stock_values(ticker=t, attributes=attributes)
        rows.append({"ticker": t, "date": prediction_date_str, "future_date": future_date,
                     "prev_volume": prev_volume, "prev_close": prev_close
                     })

    df_stock_predict_data = pd.DataFrame(rows)

    return pd.concat([df_stock_data, df_stock_predict_data], axis=0)

# df_sd_futured = get_stocks_based_on_tweets(df=df_twitter, prediction_date_str=predict_date_str, num_hold_days=num_hold_days)

Wall time: 0 ns


In [7]:
%%time
def get_quarterly_data():
    """Return a copy of the quarterly data, loading it at most once.

    The loaded frame is cached in the notebook-level variable
    ``df_rec_quart_drop``. The cache variable does not have to exist
    beforehand: a missing variable is treated the same as ``None``, so the
    function also works on a fresh kernel before the driver cell has
    initialised it.

    :return: a copy of the cached frame, so callers cannot alter the cache.
    """
    global df_rec_quart_drop

    try:
        cached = df_rec_quart_drop
    except NameError:
        # The cache slot has not been created yet (e.g. fresh kernel).
        cached = None

    if cached is None:
        cached = twitter_service.get_all_quarterly_data_for_twitter()
        df_rec_quart_drop = cached

    return cached.copy()

def combine_with_quarterly_stock_data(df):
    """Attach quarterly fundamentals to the stock rows, one quarter per (ticker, date).

    The merge itself is delegated to ``twitter_ml_utils.merge_fundies_with_stock``;
    the quarterly frame from ``get_quarterly_data`` is consulted here only for
    its column names. Of the merged rows, those without a ``date`` are
    discarded, only quarters whose ``calendardate`` is strictly earlier than
    ``date`` are kept, and for each (ticker, date) the row with the greatest
    ``calendardate`` wins.

    :param df: stock frame passed to the merge helper.
    :return: tuple ``(df_stock_and_quarter, columns_fundy)`` — the de-duplicated
        frame and the list of quarterly-data column names.
    """
    columns_fundy = list(get_quarterly_data().columns)

    merged = twitter_ml_utils.merge_fundies_with_stock(df_stock_data=df)
    dated = merged.dropna(subset=["date"]).drop(columns="lastupdated_eq_fun")

    quarter_precedes_date = dated["date"] > dated["calendardate"]
    newest_quarter_first = dated[quarter_precedes_date].sort_values(
        by=["ticker", "date", "calendardate"], ascending=False
    )
    df_stock_and_quarter = newest_quarter_first.drop_duplicates(
        subset=["ticker", "date"], keep="first"
    ).copy()

    logger.info("Finished merging in quarterly stock data.")

    return df_stock_and_quarter, columns_fundy

# df_stock_and_quarter, columns_fundy = combine_with_quarterly_stock_data(df=df_sd_futured)

Wall time: 0 ns


In [8]:
%%time
def merge_tweets_with_stock_data(df_twitter, df_stock_and_quarter, cat_uniques):
    """Join tweets with stock/quarterly rows and NASDAQ ticker info, then add tip ranks.

    Steps: inner-join the stock rows with the frame returned by
    ``ticker_service.get_nasdaq_tickers`` (on ``ticker`` == ``ticker_drop``),
    rename ``ticker`` to ``f22_ticker``, drop a column literally named
    ``'None'`` if one is present, inner-join with the tweets on
    (``f22_ticker``, ``date``) and finally pass the result through
    ``twitter_ml_utils.add_tip_ranks``.

    :param df_twitter: tweet frame with ``f22_ticker`` and ``date`` columns.
    :param df_stock_and_quarter: stock frame with ``ticker`` and ``date`` columns.
    :param cat_uniques: forwarded to ``get_nasdaq_tickers``; the value that call
        returns replaces it in the result.
    :return: tuple ``(df_ranked, cat_uniques)``.
    """
    nasdaq_key = "ticker_drop"
    tweet_keys = ["f22_ticker", "date"]

    df_nas_tickers_info, cat_uniques = ticker_service.get_nasdaq_tickers(cat_uniques=cat_uniques)

    df_stock_renamed = (
        pd.merge(df_stock_and_quarter, df_nas_tickers_info, how='inner', left_on=["ticker"], right_on=[nasdaq_key])
        .drop(columns=[nasdaq_key])
        .rename(columns={"ticker": "f22_ticker"})
    )
    if 'None' in df_stock_renamed.columns:
        df_stock_renamed = df_stock_renamed.drop(columns=['None'])

    df_merged = pd.merge(df_twitter, df_stock_renamed, how='inner', left_on=tweet_keys, right_on=tweet_keys)

    # An empty join is only reported; processing continues regardless.
    if df_merged.shape[0] == 0:
        logger.info("Not enough data after merge.")

    df_ranked = twitter_ml_utils.add_tip_ranks(df=df_merged, tr_file_path=constants.TIP_RANKED_DATA_PATH)

    return df_ranked, cat_uniques

# df_merged, cat_uniques = merge_tweets_with_stock_data(df_twitter=df_twitter, df_stock_and_quarter=df_stock_and_quarter, cat_uniques=cat_uniques)

Wall time: 0 ns


In [9]:
%%time

def supplement_predict(df, predict_date_str: str):
    """Fill the prediction-date rows' price columns from the previous trading day.

    Rows dated ``predict_date_str`` are enriched by
    ``ticker_service.get_equity_on_prev_trading_day`` and their ``open``,
    ``low``, ``high``, ``close`` and ``original_close_price`` columns are then
    overwritten with the corresponding ``prev_*`` values. All other rows are
    passed through untouched.

    :param df: frame with a ``date`` column.
    :param predict_date_str: the date whose rows are to be supplemented.
    :return: the untouched rows followed by the supplemented rows, re-indexed from 0.
    """
    # (target column, source column), applied in this order.
    price_sources = (
        ("open", "prev_open"),
        ("low", "prev_low"),
        ("high", "prev_high"),
        ("close", "prev_close"),
        ("original_close_price", "prev_close"),
    )

    other_days = df[df["date"] != predict_date_str].copy()
    predict_day_rows = df[df["date"] == predict_date_str].copy()

    predict_day_rows = ticker_service.get_equity_on_prev_trading_day(df=predict_day_rows, date_str=predict_date_str)
    for target_col, source_col in price_sources:
        predict_day_rows.loc[:, target_col] = predict_day_rows[source_col]

    return pd.concat([other_days, predict_day_rows], axis=0).reset_index(drop=True)

# df_supple = supplement_predict(df=df_merged, predict_date_str=predict_date_str)

Wall time: 0 ns


In [10]:
%%time
def add_calendar_info(df, predict_date_str, columns_fundy, num_hold_days=None):
    """Add calendar, NASDAQ-ROI, SMA and sale-date features to the merged frame.

    The frame is passed, in order, through
    ``twitter_ml_utils.add_days_since_quarter_results``,
    ``twitter_ml_utils.add_calendar_days``,
    ``twitter_ml_utils.add_nasdaq_roi_new``,
    ``twitter_service.fill_null_numeric``,
    ``twitter_ml_utils.add_sma_stuff`` and
    ``ticker_service.add_days_until_sale``. Along the way
    ``original_close_price`` is set from ``close`` and ``purchase_date`` from
    ``date``.

    :param df: merged tweet/stock frame.
    :param predict_date_str: prediction date, forwarded to ``add_sma_stuff``.
    :param columns_fundy: quarterly-data column names; all of them except
        ``ticker``, ``calendardate``, ``datekey`` and ``reportperiod`` are
        handed to ``fill_null_numeric``.
    :param num_hold_days: holding period forwarded to ``add_nasdaq_roi_new``.
        When omitted, the notebook-level ``num_hold_days`` variable is used,
        which is what this function always did before the parameter existed.
    :raises NameError: if ``num_hold_days`` is omitted and no notebook-level
        variable of that name exists.
    :return: the frame returned by ``add_days_until_sale``.
    """
    if num_hold_days is None:
        # Backward compatibility for callers that rely on the notebook global.
        try:
            num_hold_days = globals()["num_hold_days"]
        except KeyError:
            raise NameError(
                "name 'num_hold_days' is not defined; pass num_hold_days explicitly"
            ) from None

    cols_fundy_numeric = list(set(columns_fundy) - {"ticker", 'calendardate', 'datekey', 'reportperiod'})

    df_days = twitter_ml_utils.add_days_since_quarter_results(df=df)

    df_days_of = twitter_ml_utils.add_calendar_days(df=df_days)

    df_dd = twitter_ml_utils.add_nasdaq_roi_new(df=df_days_of, num_hold_days=num_hold_days)

    # FIXME: 2021-01-15: chris.flesche: "close" should be approximated for when predicting (?)
    df_dd.loc[:, "original_close_price"] = df_dd["close"]

    # # NOTE: 2021-01-03: chris.flesche: For NLP
    # # save_twitter_stock_join(df=df_thin_rabbit)

    df_no_z = twitter_service.fill_null_numeric(df=df_dd, cols_fundy_numeric=cols_fundy_numeric)

    df_since_sma = twitter_ml_utils.add_sma_stuff(df=df_no_z, predict_date_str=predict_date_str)

    df_since_sma.loc[:, "purchase_date"] = df_since_sma["date"]

    df_days_until = ticker_service.add_days_until_sale(df=df_since_sma)

    return df_days_until

# df_days_until = add_calendar_info(df=df_supple, predict_date_str=predict_date_str, columns_fundy=columns_fundy)

Wall time: 0 ns


In [11]:
%%time
def refine_and_drop_cols(df):
    """Refine the row pool and strip columns that are not used downstream.

    Calls ``twitter_service.refine_pool`` with no volume or price limits, then
    ``twitter_service.omit_columns``, and finally drops the
    ``calendardate``, ``reportperiod``, ``dimension`` and ``datekey`` columns.

    :param df: feature frame containing the four columns listed above.
    :return: the refined frame without those columns.
    """
    quarterly_bookkeeping_cols = ["calendardate", "reportperiod", "dimension", "datekey"]

    pooled = twitter_service.refine_pool(df=df, min_volume=None, min_price=None, max_price=None)
    trimmed = twitter_service.omit_columns(df=pooled)

    return trimmed.drop(columns=quarterly_bookkeeping_cols)

# df_refined = refine_and_drop_cols(df_days_until)

Wall time: 0 ns


In [12]:
# NOTE: 2021-01-03: chris.flesche:
# df_winnowed = twitter_ml_utils.truncate_avail_columns(df=df_tweet_counted)
def one_hot(df, cat_uniques):
    """One-hot encode the ticker column and record the tickers that were seen.

    The encoding is done by ``ticker_service.make_f22_ticker_one_hotted``; the
    list of tickers it returns is stored in ``cat_uniques`` under the key
    ``"f22_ticker"`` (the mapping is modified in place).

    :param df: refined feature frame.
    :param cat_uniques: mapping that receives the ``"f22_ticker"`` entry.
    :return: tuple ``(encoded frame, list of its column names, cat_uniques)``.
    """
    encoded, ticker_levels = ticker_service.make_f22_ticker_one_hotted(df_ranked=df, cat_uniques=cat_uniques)
    cat_uniques["f22_ticker"] = ticker_levels

    encoded_columns = list(encoded.columns)
    row_count = encoded.shape[0]

    print(f"Number of train_hotted {row_count}.")

    return encoded, encoded_columns, cat_uniques

# df_ticker_hotted, narrow_cols, cat_uniques = one_hot(df=df_refined, cat_uniques=cat_uniques)

In [13]:
%%time
def train_predict(df_train, df_predict, narrow_cols):
    """Fit an XGBoost classifier on ``df_train`` and label the rows of ``df_predict``.

    The training matrix, labels and scaler come from
    ``twitter_ml_utils.transform_to_numpy``; the prediction matrix is produced
    by ``get_data_for_predictions`` using that same scaler.

    The predictions are attached to a new frame instead of being written into
    ``df_predict`` itself: the argument is usually a slice of a larger frame,
    and assigning a column to it in place is the pattern pandas flags with
    ``SettingWithCopyWarning``. The caller's frame is therefore left unchanged.

    :param df_train: rows used for fitting.
    :param df_predict: rows to predict for.
    :param narrow_cols: column names forwarded to both transformation helpers.
    :return: a copy of ``df_predict`` with an added ``prediction`` column.
    """
    import warnings

    with warnings.catch_warnings():
        # Silence the divide warning for the transformation and fitting steps only.
        warnings.filterwarnings("ignore", message="invalid value encountered in true_divide")
        X_train, y_train, standard_scaler = twitter_ml_utils.transform_to_numpy(df=df_train, narrow_cols=narrow_cols)

        model = xgb.XGBClassifier(max_depth=4)
        model.fit(X_train, y_train)

    X_predict = get_data_for_predictions(df=df_predict, narrow_cols=narrow_cols, standard_scaler=standard_scaler)

    logger.info("Invoking model prediction ...")
    prediction = model.predict(X_predict)

    return df_predict.assign(prediction=prediction)

# df_predict = train_predict(df_train=df_train, df_predict=df_predict)
# df_null = df_train.isnull().sum().to_frame('nulls')
# df_null[df_null["nulls"] > 0].head(5000)

# for nc in narrow_cols:
#     if not nc.startswith("industry_") \
#     and not nc.startswith("famaindustry_") \
#     and not nc.startswith("industry_") \
#     and not nc.startswith("sector_") \
#     and not nc.startswith("sicsector_") \
#     and not nc.startswith("currency_") \
#     and not nc.startswith("f22_ticker_") \
#     and not nc.startswith("scalerevenue_") \
#     and not nc.startswith("scalemarketcap_") \
#     and not nc.startswith("location_") \
#     and "date" in nc:
#         print(nc)
    

Wall time: 0 ns


In [14]:
%%time

def persist_predictions(df_buy, predict_date_str: str, num_hold_days: int, overwrite_file: bool = False):
    """Write the buy predictions to the CSV at ``constants.TWITTER_PREDICTIONS_PATH``.

    By default the existing file is kept, except that rows previously stored
    for the same ``purchase_date`` / ``num_hold_days`` pair are removed first,
    so re-running a day replaces its earlier predictions rather than
    duplicating them.

    Compared with the earlier version, ``overwrite_file`` is a parameter rather
    than a hard-coded local, the existing file is not read when it is about to
    be overwritten, and a missing file is treated as "no earlier predictions"
    instead of raising.

    :param df_buy: new prediction rows to store.
    :param predict_date_str: purchase date the rows belong to.
    :param num_hold_days: holding period the rows belong to.
    :param overwrite_file: when ``True``, the file is replaced by ``df_buy`` alone.
    """
    output_path = Path(constants.TWITTER_PREDICTIONS_PATH)

    if overwrite_file or not output_path.exists():
        df_combined = df_buy
    else:
        df_preds = pd.read_csv(output_path)
        is_same_run = (df_preds["purchase_date"] == predict_date_str) & (df_preds["num_hold_days"] == num_hold_days)
        df_preds = df_preds[~is_same_run]

        # Number of stored rows that are kept, i.e. after removing this run's earlier rows.
        logger.info(f"Old rows found: {df_preds.shape[0]}")

        df_combined = pd.concat([df_preds, df_buy], axis=0)

    logger.info("Writing predictions to output ...")
    df_combined.to_csv(output_path, index=False)

def show_prediction_results(df_predict, predict_date_str, num_hold_days):
    """Persist the buy predictions and look up their 1-day and 5-day ROI.

    Rows with ``prediction == 1`` are reduced to ticker, purchase date and
    sale date, stamped with the holding period and the current timestamp,
    stored through ``persist_predictions`` and then evaluated with
    ``pred_perf_testing.get_days_roi_from_prediction_table``.

    The selection is materialised with an explicit ``.copy()``: adding columns
    to the result of chained indexing is what triggers pandas'
    ``SettingWithCopyWarning``.

    :param df_predict: frame with ``prediction``, ``f22_ticker``,
        ``purchase_date`` and ``future_date`` columns.
    :param predict_date_str: the prediction (purchase) date.
    :param num_hold_days: holding period recorded with the stored rows. The two
        ROI look-ups always use 1 and 5 days, independent of this value.
    :return: tuple ``(days_roi_1, days_roi_5)``.
    """
    is_buy = df_predict["prediction"] == 1
    df_buy = df_predict.loc[is_buy, ["f22_ticker", "purchase_date", "future_date"]].copy()
    df_buy["num_hold_days"] = num_hold_days
    df_buy["run_timestamp"] = datetime.now().timestamp()

    persist_predictions(df_buy=df_buy, predict_date_str=predict_date_str, num_hold_days=num_hold_days)

    days_roi_1 = pred_perf_testing.get_days_roi_from_prediction_table(df_preds=df_buy, date_str=predict_date_str, num_hold_days=1)

    days_roi_5 = pred_perf_testing.get_days_roi_from_prediction_table(df_preds=df_buy, date_str=predict_date_str, num_hold_days=5)

    return days_roi_1, days_roi_5
 
# roi_1_day, roi_5_days = show_prediction_results(df_predict, predict_date_str)

Wall time: 0 ns


In [15]:
def predict_day(df, predict_date_str: str, cat_uniques: Dict, num_hold_days: int):
    """Run the whole feature / train / predict pipeline for one prediction date.

    The tweets are converted, joined with stock, quarterly and NASDAQ data,
    enriched with calendar features, refined, one-hot encoded, supplemented
    with previous-day prices and split into training and prediction rows. A
    model is then trained, the predictions are stored, and the 1-day and 5-day
    ROI are printed.

    :param df: tweet frame for this run.
    :param predict_date_str: the date to predict for.
    :param cat_uniques: category bookkeeping passed through the merge and
        one-hot steps.
    :param num_hold_days: holding period in days.
    :return: the 1-day ROI, or ``None`` when there is not enough data to
        train or to predict.
    """
    def _has_no_rows(frame):
        # True for a missing frame as well as for an empty one.
        return frame is None or frame.shape[0] == 0

    tweets_numeric = convert_columns(df=df)

    stock_rows = get_stocks_based_on_tweets(df=tweets_numeric, prediction_date_str=predict_date_str, num_hold_days=num_hold_days)
    stock_with_quarterly, columns_fundy = combine_with_quarterly_stock_data(df=stock_rows)
    merged, cat_uniques = merge_tweets_with_stock_data(df_twitter=tweets_numeric, df_stock_and_quarter=stock_with_quarterly, cat_uniques=cat_uniques)

    with_calendar = add_calendar_info(df=merged, predict_date_str=predict_date_str, columns_fundy=columns_fundy)
    refined = refine_and_drop_cols(with_calendar)
    encoded, narrow_cols, cat_uniques = one_hot(df=refined, cat_uniques=cat_uniques)

    if _has_no_rows(encoded):
        logger.info(f"Not enough data on {predict_date_str}")
        return None

    supplemented = supplement_predict(df=encoded, predict_date_str=predict_date_str)
    df_train, df_predict = twitter_ml_utils.split_train_predict(df=supplemented, predict_date_str=predict_date_str)

    if _has_no_rows(df_train) or _has_no_rows(df_predict):
        logger.info(f"Not enough data on {predict_date_str}")
        return None

    df_predict = train_predict(df_train=df_train, df_predict=df_predict, narrow_cols=narrow_cols)

    roi_1_day, roi_5_days = show_prediction_results(df_predict, predict_date_str, num_hold_days)
    print(f"Roi 1: {roi_1_day}: 5: {roi_5_days}")

    return roi_1_day

In [19]:
%%time
from statistics import mean

# df_tweets_joinable = df_tweets_joinable.sample(frac=.15)

# --- Run configuration ---
start_date = "2020-08-27"
min_date_str = "2020-08-10"
max_date_str = "2021-01-18"
num_hold_days = 1

# Cache slot used by get_quarterly_data(); clearing it forces a reload for this run.
df_rec_quart_drop = None

dt_start = date_utils.parse_std_datestring(start_date)
predict_date_str = date_utils.get_standard_ymd_format(dt_start)

all_roi = []
# predict_date_str is advanced together with dt_start at the bottom of the
# loop, so the range check always applies to the date that is about to be
# processed. Previously the check used the last processed date, which let one
# trading day beyond max_date_str slip through.
while min_date_str <= predict_date_str <= max_date_str:
    if is_good_date(dt=dt_start):
        # Everything up to and including the prediction date is handed to the pipeline.
        df_tweets_for_day = df_tweets_joinable[df_tweets_joinable["date"] <= predict_date_str]

        df_predict = df_tweets_joinable[df_tweets_joinable["date"] == predict_date_str]

        if df_predict.shape[0] > 0:
            cat_uniques = None
            roi_day = predict_day(df=df_tweets_for_day, predict_date_str=predict_date_str, cat_uniques=cat_uniques, num_hold_days=num_hold_days)
            # predict_day returns None when it has too little data; such days
            # must not be averaged (mean() cannot handle None).
            if roi_day is not None:
                all_roi.append(roi_day)

    dt_start = dt_start + timedelta(days=1)
    predict_date_str = date_utils.get_standard_ymd_format(dt_start)

# mean() raises on an empty sequence, so report that case explicitly.
if all_roi:
    print(f"Overall roi: {mean(all_roi)}")
else:
    print("Overall roi: n/a (no predictions were produced)")

2021-01-19 07:49:39,265 - __main__ - INFO - Finished merging in quarterly stock data.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Num rows in play: 32676
cat_uniques is not used.
Number of train_hotted 16334.
16334
16334
2021-01-19 07:50:57,204 - ams.notebooks.twitter.twitter_ml_utils - INFO - Num rows of prepared data: 16308
2021-01-19 07:50:57,208 - ams.notebooks.twitter.twitter_ml_utils - INFO - Oldest date of prepared data (future_date): 2020-08-27


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


16308
26


  result = op(x, *args, **kwargs)


2021-01-19 07:51:09,672 - __main__ - INFO - Invoking model prediction ...
2021-01-19 07:51:09,793 - __main__ - INFO - Old rows found: 62039
2021-01-19 07:51:09,799 - __main__ - INFO - Writing predictions to output ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


2020-08-27: roi: 0.012654213764949795
2020-08-27: roi: -0.05346473370521651
Roi 1: 0.012654213764949795: 5: -0.05346473370521651
2021-01-19 07:51:49,209 - __main__ - INFO - Finished merging in quarterly stock data.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Num rows in play: 32738
cat_uniques is not used.
Number of train_hotted 16364.
16364
16364
2021-01-19 07:53:07,300 - ams.notebooks.twitter.twitter_ml_utils - INFO - Num rows of prepared data: 16334
2021-01-19 07:53:07,305 - ams.notebooks.twitter.twitter_ml_utils - INFO - Oldest date of prepared data (future_date): 2020-08-28


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


16334
30


  result = op(x, *args, **kwargs)


2021-01-19 07:53:19,856 - __main__ - INFO - Invoking model prediction ...
2021-01-19 07:53:19,985 - __main__ - INFO - Old rows found: 62025
2021-01-19 07:53:19,991 - __main__ - INFO - Writing predictions to output ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


2020-08-28: roi: -0.010981050488607189
2020-08-28: roi: -0.006412060441801362
Roi 1: -0.010981050488607189: 5: -0.006412060441801362
2021-01-19 07:53:20,487 - __main__ - INFO - No can do. Market closed.
2021-01-19 07:53:20,488 - __main__ - INFO - No can do. Market closed.


KeyboardInterrupt: 

In [17]:
# Add overall ROI ongoing.(?)
# Add EOD (open, close, high, low) to train and predict. But for predict use previous day. Done
# Fix SMA: prev day for open for predict, regular for all other dates
# Add num days from start
# Add lookup to previous day's prediction roi - seems to follow on-off-on-off pattern.
# Change to WorldTradingDaily real time quotes (12hr) to substitute for open, low, high, and estimate close. (or just take current)
# WTD not necessary when using historical.
# Test yesterday EOD with 4 day estimate.
# Test with historical purchase day eod data (open, close, high, low)
# Change SMA back to use the purchase-day-based SMA.
# Reprocess all data using sums in the Great Reduction step rather than means.