In [None]:
%load_ext autoreload
%autoreload 2
import gc
import sys
from datetime import datetime

# Run a full garbage-collection pass; mostly useful when this cell is re-run
# in a long-lived kernel.
gc.collect()

# Directories appended to the import path. Presumably these are where the
# `ams` package lives in the environments this notebook runs in -- confirm
# before changing.
paths_to_add = ['/home/jovyan/work', '/home/jupyter/alpha_media_signal']

for candidate_path in paths_to_add:
    if candidate_path in sys.path:
        continue  # already registered; avoids duplicate entries on re-run
    sys.path.append(candidate_path)

import pandas as pd

%matplotlib inline

from pathlib import Path

from ams.config import constants, logger_factory
from ams.services import twitter_service
from ams.services import ticker_service

from ams.notebooks.twitter.twitter_ml_utils import  get_data_for_predictions
from ams.notebooks.twitter import twitter_ml_utils
from ams.utils import date_utils

# Widen pandas' display limits so large frames are not truncated in cell output.
pd.options.display.max_rows = 5000
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# Notebook-wide logger used by every cell below.
logger = logger_factory.create(__name__)

In [None]:
overall_roi = None

# Module-level cache for the quarterly data; populated on first use by
# get_quarterly_data() so the load is only performed once per kernel.
df_rec_quart_drop = None


def get_quarterly_data():
    """Return a copy of the quarterly data, loading it on first call.

    The frame returned by ``twitter_service.get_all_quarterly_data_for_twitter()``
    is stored in the notebook-level ``df_rec_quart_drop`` and reused on later
    calls. Callers always receive a ``.copy()``, so they cannot modify the
    cached frame.
    """
    global df_rec_quart_drop

    cache_is_empty = df_rec_quart_drop is None
    if cache_is_empty:
        df_rec_quart_drop = twitter_service.get_all_quarterly_data_for_twitter()

    return df_rec_quart_drop.copy()

# Run configuration and raw tweet loading.
#
# Everything assigned here (df_tweets, predict_date_str, num_hold_days,
# cat_uniques, ...) is notebook-level state that the cells below read.

# NOTE(review): nothing in the visible cells appends to this list.
overall_roi = []
# today_dt_str = date_utils.get_standard_ymd_format(datetime.now())
# Raw tweets are read from the "main" sub-directory of the configured directory.
learning_prep_dir = Path(constants.TWITTER_GREAT_REDUCTION_DIR, "main")
df_tweets = twitter_ml_utils.load_twitter_raw(learning_prep_dir=learning_prep_dir)
# df_tweets = df_tweets.sample(frac=.25)

# The prediction date is derived from start_date and days_after_start via
# get_next_market_date; presumably "N market days after start_date" -- TODO confirm.
start_date = "2021-01-03"
days_after_start = 3
predict_date_str = twitter_ml_utils.get_next_market_date(start_date, days_after_start)
# Holding period handed to the stock-data and ROI helpers in later cells.
num_hold_days = 1

# Starts empty; passed to the ticker_service helpers in later cells.
cat_uniques = None

dt = date_utils.parse_std_datestring(predict_date_str)
# NOTE(review): a closed market is only logged -- execution continues regardless.
if date_utils.is_stock_market_closed(dt):
    logger.info("No can do. Market closed.")

logger.info(f"Filtering twitter data to on or before '{predict_date_str}'.")
# Compares the "date" column against a date *string*; assumes the column holds
# values that order correctly against that string -- TODO confirm.
df_tweets = df_tweets[df_tweets["date"] <= predict_date_str]

In [None]:
%%time
# An empty tweet frame is only reported in the log; the cell keeps going.
if len(df_tweets) == 0:
    logger.info(f"No twitter data on {predict_date_str}")

twitter_ml_utils.show_distribution(df=df_tweets)

In [None]:
%%time
def convert_columns(df):
    """Run the tweet frame through the bool and numeric converters.

    Applies ``twitter_service.convert_to_bool`` first, then feeds its result to
    ``twitter_ml_utils.convert_twitter_to_numeric`` and returns what that yields.
    """
    return twitter_ml_utils.convert_twitter_to_numeric(
        df=twitter_service.convert_to_bool(df=df)
    )


df_twitter = convert_columns(df=df_tweets)

In [None]:
%%time
def get_stocks_based_on_tweets(df, prediction_date_str, num_hold_days):
    """Build the stock frame for training rows plus the rows to be predicted.

    Args:
        df: Tweet frame with at least "date" and "f22_ticker" columns.
        prediction_date_str: Date string of the day to predict. Rows strictly
            before it are historical; rows on it are prediction targets.
        num_hold_days: Holding period passed to the stock-data helper and used
            to compute the future (sale) date.

    Returns:
        The historical stock data from ``get_twitter_stock_data_2`` concatenated
        row-wise with one synthetic row per ticker tweeted about on the
        prediction date (ticker, date, future_date, prev_volume, prev_close).
    """
    # Use the argument, not the notebook-level ``predict_date_str``: the function
    # previously ignored its own parameter and silently depended on global state.
    df_stock_tweets = df[df["date"] < prediction_date_str]
    df_stock_data = twitter_ml_utils.get_twitter_stock_data_2(df_tweets=df_stock_tweets,
                                                              num_hold_days=num_hold_days)

    prediction_tickers = list(df[df["date"] == prediction_date_str]["f22_ticker"].unique())

    future_date = twitter_ml_utils.get_next_market_date(prediction_date_str, num_hold_days)

    rows = []
    attributes = ("volume", "close")
    for t in prediction_tickers:
        # NOTE(review): "most recent" presumably means as of run time, not as of
        # prediction_date_str -- confirm when back-testing historical dates.
        prev_volume, prev_close = ticker_service.get_most_recent_stock_values(ticker=t, attributes=attributes)
        rows.append({"ticker": t, "date": prediction_date_str, "future_date": future_date,
                     "prev_volume": prev_volume, "prev_close": prev_close
                    })

    df_stock_predict_data = pd.DataFrame(rows)

    return pd.concat([df_stock_data, df_stock_predict_data], axis=0)


# ``prediction_date_str`` is not defined until a later cell, so passing it here
# raised NameError on a fresh kernel; ``predict_date_str`` is the value set above.
df_sd_futured = get_stocks_based_on_tweets(df=df_twitter, prediction_date_str=predict_date_str, num_hold_days=num_hold_days)

In [None]:
%%time
def combine_with_quarterly_stock_data(df):
    """Attach quarterly fundamentals to the stock rows.

    Returns a tuple ``(df_stock_and_quarter, columns_fundy)``:

    * ``df_stock_and_quarter`` -- the result of
      ``twitter_ml_utils.merge_fundies_with_stock`` restricted to rows that have
      a "date" and whose "date" is later than "calendardate", keeping for each
      (ticker, date) pair the row that sorts first in descending
      (ticker, date, calendardate) order.
    * ``columns_fundy`` -- the column names of the quarterly data frame.
    """
    columns_fundy = list(get_quarterly_data().columns)

    merged = twitter_ml_utils.merge_fundies_with_stock(df_stock_data=df)
    dated = merged.dropna(subset=["date"]).drop(columns="lastupdated_eq_fun")

    # Only keep quarter rows whose calendar date precedes the stock date.
    is_past_quarter = dated["date"] > dated["calendardate"]
    newest_first = dated[is_past_quarter].sort_values(
        by=["ticker", "date", "calendardate"],
        ascending=False,
    )

    df_stock_and_quarter = newest_first.drop_duplicates(subset=["ticker", "date"], keep="first")
    logger.info("Finished merging in quarterly stock data.")

    return df_stock_and_quarter, columns_fundy


df_stock_and_quarter, columns_fundy = combine_with_quarterly_stock_data(df=df_sd_futured)

In [None]:
%%time
def merge_tweets_with_stock_data(df_twitter, df_stock_and_quarter):
    """Join tweets with stock/quarterly data and NASDAQ ticker info.

    Args:
        df_twitter: Tweet frame keyed by "f22_ticker" and "date".
        df_stock_and_quarter: Stock + fundamentals frame keyed by "ticker" and "date".

    Returns:
        The inner join of ``df_twitter`` with the stock frame (itself inner-joined
        with the NASDAQ ticker info) on ("f22_ticker", "date"). May be empty; that
        case is only logged.

    Side effects:
        Rebinds the notebook-level ``cat_uniques`` to the value returned by
        ``ticker_service.get_nasdaq_tickers``.
    """
    # ``cat_uniques`` is assigned below. Without this declaration Python treats it
    # as a local, so reading it as the call argument raises UnboundLocalError and
    # the notebook-level value is never updated.
    global cat_uniques
    df_nas_tickers_info, cat_uniques = ticker_service.get_nasdaq_tickers(cat_uniques=cat_uniques)

    col_ticker = "ticker_drop"

    df_stock_quart_info = pd.merge(df_stock_and_quarter, df_nas_tickers_info, how='inner', left_on=["ticker"], right_on=[col_ticker])
    df_sqi = df_stock_quart_info.drop(columns=[col_ticker])

    df_stock_renamed = df_sqi.rename(columns={"ticker": "f22_ticker"})

    # Drop a stray column literally named 'None' if an upstream step produced one.
    if 'None' in df_stock_renamed.columns:
        df_stock_renamed = df_stock_renamed.drop(columns=['None'])

    df_merged = pd.merge(df_twitter, df_stock_renamed, how='inner', on=["f22_ticker", "date"])

    if df_merged.shape[0] == 0:
        logger.info("Not enough data after merge.")

    return df_merged


df_merged = merge_tweets_with_stock_data(df_twitter=df_twitter, df_stock_and_quarter=df_stock_and_quarter)

In [None]:
# Derived-feature pipeline: each helper takes the previous step's frame and its
# result is bound to a new notebook-level name that later cells consume.
df_days = twitter_ml_utils.add_days_since_quarter_results(df=df_merged)

df_days_of = twitter_ml_utils.add_calendar_days(df=df_days)

df_dd = twitter_ml_utils.add_nasdaq_roi_new(df=df_days_of, num_hold_days=num_hold_days)

# FIXME: 2021-01-15: chris.flesche: "close" should be approximated for when predicting
# Duplicate "close" under a second name, "original_close_price".
df_dd.loc[:, "original_close_price"] = df_dd["close"]
# NOTE(review): this value is discarded -- it is not the cell's last expression,
# so nothing is displayed.
df_dd["date"].max()
logger.info(f'Num df_dd: {df_dd.shape[0]}')

# # NOTE: 2021-01-03: chris.flesche: For NLP
# # save_twitter_stock_join(df=df_thin_rabbit)

In [None]:
# Fundamentals columns minus the identifier/date keys. Built through a set
# difference, so the resulting list order is not guaranteed.
cols_fundy_numeric = list(set(columns_fundy) - {"ticker", 'calendardate', 'datekey', 'reportperiod'})

df_no_z = twitter_service.fill_null_numeric(df=df_dd, cols_fundy_numeric=cols_fundy_numeric)

df_since_sma = twitter_ml_utils.add_sma_stuff(df=df_no_z)

# Record the row's date under "purchase_date"; the buy list built later selects this column.
df_since_sma["purchase_date"] = df_since_sma["date"]

df_days_until = ticker_service.add_days_until_sale(df=df_since_sma)

# FIXME: 2021-01-14: chris.flesche: Use previous day's close for refine pool. Or remove.
# All thresholds are passed as None here.
df = twitter_service.refine_pool(df=df_days_until, min_volume=None, min_price=None, max_price=None)
df = twitter_service.omit_columns(df=df)
# Add the tweet count, then drop the quarterly key columns.
df_tweet_counted = twitter_service.add_tweet_count(df=df).drop(columns=["calendardate", "reportperiod", "dimension", "datekey"])

In [None]:
# Sanity check: show a few of the rows dated on the prediction date, limited to
# the ticker, price, future-date and "pe" columns.
df_tmp = df_tweet_counted[df_tweet_counted["date"] == predict_date_str]
df_tmp[["f22_ticker", "original_close_price", "future_date", "future_close", "pe"]].head()

In [None]:
# NOTE: 2021-01-03: chris.flesche:
# df_winnowed = twitter_ml_utils.truncate_avail_columns(df=df_tweet_counted)

df_ranked = twitter_ml_utils.add_tip_ranks(df=df_tweet_counted, tr_file_path=constants.TIP_RANKED_DATA_PATH)

df_ticker_hotted, unique_tickers = ticker_service.make_f22_ticker_one_hotted(df_ranked=df_ranked, cat_uniques=cat_uniques)
# ``cat_uniques`` starts out as None at notebook level; item assignment on None
# raises TypeError, so make sure there is a mapping to write into.
if cat_uniques is None:
    cat_uniques = {}
cat_uniques["f22_ticker"] = unique_tickers

# Column list handed to the numpy/scaler transforms in the modelling cell.
narrow_cols = list(df_ticker_hotted.columns)

print(f"Number of train_hotted {df_ticker_hotted.shape[0]}.")

dates = df_ticker_hotted["date"].to_list()
prediction_date_str = predict_date_str #dates[-1]

# Rows strictly before the prediction date are used for training; rows on the
# prediction date are the ones to score.
df_th_train = df_ticker_hotted[df_ticker_hotted["date"] < prediction_date_str]
df_train = twitter_service.add_buy_sell(df=df_th_train)

df_predict = df_ticker_hotted[df_ticker_hotted["date"] == prediction_date_str]

logger.info(f"Num rows of prepared data: {df_train.shape[0]}")
# ``.max()`` yields the most recent date, so the label says so (it used to read "Oldest").
logger.info(f"Most recent date of prepared data (future_date): {df_train['future_date'].max()}")
logger.info(f"Num unique tickers: {len(cat_uniques['f22_ticker'])}")

In [None]:
import xgboost as xgb

# Fit on the training rows; the returned scaler is reused for the prediction
# rows so both sets go through the same transform.
X_train, y_train, standard_scaler = twitter_ml_utils.transform_to_numpy(df=df_train, narrow_cols=narrow_cols)

model = xgb.XGBClassifier(max_depth=4)
model.fit(X_train, y_train)

X_predict = get_data_for_predictions(df=df_predict, narrow_cols=narrow_cols, standard_scaler=standard_scaler)

logger.info("Invoking model prediction ...")
prediction = model.predict(X_predict)

# ``df_predict`` was produced by boolean indexing; writing into it with ``.loc``
# triggers SettingWithCopyWarning. ``assign`` returns a new frame with the
# column added (or replaced when the cell is re-run).
df_predict = df_predict.assign(prediction=prediction)

In [None]:
# Buy list: tickers the model labelled 1, stamped with the holding period and
# the time of this run. Built with ``.loc`` + ``assign`` so the new columns land
# on a fresh frame instead of on a chained slice of ``df_predict``
# (which raised SettingWithCopyWarning).
df_buy = (
    df_predict
    .loc[df_predict["prediction"] == 1, ["f22_ticker", "purchase_date", "future_date"]]
    .assign(num_hold_days=num_hold_days, run_timestamp=datetime.now().timestamp())
)

# Quick size summary: feature count, training rows, prediction rows, buys.
print(len(df_train.columns))
print(df_train.shape[0])
print(df_predict.shape[0])
print(df_buy.shape[0])
df_predict[["f22_ticker", "prediction", "future_date"]].head(5)

In [None]:
from ams.machine_learning.twitter import pred_perf_testing

nhd = 1  #num_hold_days

# Both look-ups use the same predictions and prediction date; only the holding
# period passed as ``num_hold_days`` differs.
roi_query = {"df_preds": df_buy, "date_str": prediction_date_str}

days_roi_1 = pred_perf_testing.get_days_roi_from_prediction_table(num_hold_days=1, **roi_query)
print(f"Roi after 1 day: {days_roi_1}")

days_roi_5 = pred_perf_testing.get_days_roi_from_prediction_table(num_hold_days=5, **roi_query)
print(f"Roi after 5 days: {days_roi_5}")

In [None]:
# Print the name of every prediction-frame column that contains "roi".
roi_columns = (column for column in df_predict.columns if "roi" in column)
for c in roi_columns:
    print(c)