In [None]:
%load_ext autoreload
%autoreload 2
import gc
import sys

# Free whatever a previous run left behind before the heavy imports below.
gc.collect()

# Project roots that must be importable for the `ams` package to resolve.
paths_to_add = ['/home/jovyan/work', '/home/jupyter/alpha_media_signal']

for candidate_path in paths_to_add:
    if candidate_path in sys.path:
        # Already importable; do not add a duplicate entry.
        continue
    sys.path.append(candidate_path)

import pandas as pd

from ams.services.equities.EquityFundaDimension import EquityFundaDimension
%matplotlib inline
from pathlib import Path

from ams.config import constants
from ams.services import twitter_service
from ams.services import ticker_service

from statistics import mean
import numpy as np
from ams.services import file_services
from typing import List
from ams.notebooks.twitter.twitter_ml_utils import WorkflowMode
from ams.notebooks.twitter import twitter_ml_utils
from ams.utils import date_utils

# Raise pandas' display limits so wide/long frames are shown in cell output.
for _display_option, _display_value in (
    ('display.max_rows', 5000),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_display_option, _display_value)

In [None]:
twitter_folder = 'twitter'

# Parquet file passed to twitter_ml_utils.add_tip_ranks further down the notebook.
tr_file_name = "tip_rank_2020-12-14_22-48-27-354.17.parquet"
tr_file_path = Path(constants.TWITTER_OUTPUT_RAW_PATH, "tip_ranked", "main", tr_file_name)

In [None]:
%%time

learning_prep_dir = Path(constants.TWITTER_GREAT_REDUCTION_DIR, "main")
df_twitter_raw = twitter_ml_utils.load_twitter_raw(learning_prep_dir=learning_prep_dir)

cat_uniques = None
model_xgb = None

workflow_mode = WorkflowMode.Prediction
# workflow_mode = WorkflowMode.Training
predict_date_str = "2020-10-26"
num_hold_days = 5

dt = date_utils.parse_std_datestring(predict_date_str)
if date_utils.is_stock_market_closed(dt):
    raise Exception("No can do. Market closed.")

if workflow_mode is WorkflowMode.Training:
    df_twitter_raw = df_twitter_raw[df_twitter_raw["date"] < predict_date_str]
else:
    print("Prediction.")
    #today_dt_str = date_utils.get_standard_ymd_format(datetime.now())
    df_twitter_raw = df_twitter_raw[df_twitter_raw["date"] == predict_date_str]
    model_xgb = twitter_ml_utils.load_model_for_prediction()
    cat_uniques = model_xgb.cat_uniques
    
print(f"Max date: {df_twitter_raw['date'].max()}")
print(f"Num records: {df_twitter_raw.shape[0]:,}")

In [None]:
%%time 

# Hand the date-filtered raw tweet frame to twitter_ml_utils.show_distribution.
# What exactly is shown (stats and/or plots) is defined by that helper.
twitter_ml_utils.show_distribution(df=df_twitter_raw)

In [None]:
%%time

# Run the filtered raw tweets through twitter_service.convert_to_bool; the result
# (df_booled) is the input to the numeric conversion in the following cell.
# NOTE(review): presumably casts flag-like columns to booleans — confirm in twitter_service.
df_booled = twitter_service.convert_to_bool(df=df_twitter_raw)

In [None]:
%%time

# Convert the bool-processed tweet frame with twitter_ml_utils.convert_twitter_to_numeric.
# df_twitter is reused later both to fetch stock data and as the left side of the
# merge on ["f22_ticker", "date"].
df_twitter = twitter_ml_utils.convert_twitter_to_numeric(df=df_booled)
# Preview the first rows as the cell output.
df_twitter.head()

In [None]:
%%time

df_stock_data = twitter_ml_utils.get_twitter_stock_data(df_tweets=df_twitter, 
                                                        num_hold_days=num_hold_days, 
                                                        workflow_mode=workflow_mode)
df_stock_data.head()

In [None]:
# %%time

# Load the quarterly data frame from twitter_service. In this notebook only its
# column names are used: columns_fundy is later reduced to the numeric
# fundamentals columns passed to fill_null_numeric.
df_rec_quart_drop = twitter_service.get_all_quarterly_data_for_twitter()

columns_fundy = list(df_rec_quart_drop.columns)

In [None]:
%%time

# Combine the stock data with fundamentals via twitter_ml_utils.merge_fundies_with_stock.
# The next cell relies on the result having "date", "calendardate", "ticker" and
# "lastupdated_eq_fun" columns.
df_result = twitter_ml_utils.merge_fundies_with_stock(df_stock_data=df_stock_data)
# Preview the first rows as the cell output.
df_result.head()


In [None]:
# Discard rows without a date and the "lastupdated_eq_fun" column.
df_drop_init = df_result.dropna(subset=["date"]).drop(columns="lastupdated_eq_fun")

# Keep only rows whose calendardate lies strictly before the row's date, then
# order everything descending so the latest calendardate comes first within
# each (ticker, date) group.
has_earlier_calendardate = df_drop_init["date"] > df_drop_init["calendardate"]
df_drop_future = df_drop_init.loc[has_earlier_calendardate].sort_values(
    by=["ticker", "date", "calendardate"], ascending=False
)

# One row per (ticker, date): thanks to the descending sort, "first" is the
# most recent calendardate that precedes the date.
df_stock_and_quarter = df_drop_future.drop_duplicates(subset=["ticker", "date"], keep="first")

# Cell output: row count after the initial drop (swap in df_stock_and_quarter
# to inspect the de-duplicated count instead).
len(df_drop_init)

In [None]:
# Fetch the ticker info frame and the cat_uniques object returned alongside it.
# The cat_uniques passed in is None in Training mode and the loaded model's
# cat_uniques in Prediction mode (see the configuration cell); the returned
# value replaces it.
df_nas_tickers_info, cat_uniques = ticker_service.get_nasdaq_tickers(cat_uniques=cat_uniques)

In [None]:
%time

col_ticker = "ticker_drop"

df_stock_quart_info = pd.merge(df_stock_and_quarter, df_nas_tickers_info, how='inner', left_on=["ticker"], right_on=[col_ticker])
df_sqi = df_stock_quart_info.drop(columns=[col_ticker])

df_sqi.shape[0]

In [None]:
%%time

df_stock_renamed = df_sqi.rename(columns={"ticker": "f22_ticker"})

if 'None' in df_stock_renamed.columns:
    df_stock_renamed = df_stock_renamed.drop(columns=['None'])

df_merged = pd.merge(df_twitter, df_stock_renamed, how='inner', left_on=["f22_ticker","date"], right_on=["f22_ticker","date"])

print(f'Num merged: {df_merged.shape[0]}')

In [None]:
# Pass the merged frame through twitter_ml_utils.add_days_since_quarter_results.
# The returned frame is expected to carry a "future_date" column (read in the next cell).
df_days = twitter_ml_utils.add_days_since_quarter_results(df=df_merged)

In [None]:
# Sanity check: display the latest future_date present after the previous step.
df_days["future_date"].max()

In [None]:
%%time

# Pass the frame through twitter_ml_utils.add_calendar_days.
# NOTE(review): presumably adds calendar-derived features — confirm in twitter_ml_utils.
df_days_of = twitter_ml_utils.add_calendar_days(df=df_days)

# Sanity check: display the latest future_date after this step.
df_days_of["future_date"].max()

In [None]:
%%time

# Pass the frame through twitter_ml_utils.add_nasdaq_roi.
# NOTE(review): presumably adds a Nasdaq return-on-investment column — confirm in twitter_ml_utils.
df_dd = twitter_ml_utils.add_nasdaq_roi(df=df_days_of)

In [None]:
%%time

if workflow_mode == WorkflowMode.Training:
    df_thin_rabbit = twitter_service.add_buy_sell(df=df_dd)
else:
    df_thin_rabbit = df_dd

In [None]:
# Preserve the close price under a second name. `assign` returns a new frame
# instead of writing into the existing one: in Prediction mode df_thin_rabbit is
# the very same object as df_dd, so an in-place column write would silently
# modify the upstream frame as well.
df_thin_rabbit = df_thin_rabbit.assign(original_close_price=df_thin_rabbit["close"])

# Report the latest date explicitly; a bare expression in the middle of a cell
# is evaluated but never displayed.
print(f"Max date: {df_thin_rabbit['date'].max()}")
print(f'Num df_thin_rabbit: {df_thin_rabbit.shape[0]}')

In [None]:
# save_twitter_stock_join(df=df_thin_rabbit)

In [None]:
%%time
cols_fundy_numeric = list(set(columns_fundy) - {"ticker", 'calendardate', 'datekey', 'reportperiod'})

df_no_z = twitter_service.fill_null_numeric(df=df_thin_rabbit, cols_fundy_numeric=cols_fundy_numeric)

In [None]:
%%time
    
# Pass the null-filled frame through twitter_ml_utils.add_sma_stuff.
# NOTE(review): "sma" presumably means simple moving average — confirm in twitter_ml_utils.
df_since_sma = twitter_ml_utils.add_sma_stuff(df=df_no_z)

In [None]:
%%time

# Copy the tweet date into a "purchase_date" column. It is selected later,
# together with f22_ticker, when the buy list is written out in Prediction mode.
# Note: this writes the column into df_since_sma in place.
df_since_sma["purchase_date"] = df_since_sma["date"]

In [None]:
%%time

# Pass the frame through ticker_service.add_days_until_sale.
# NOTE(review): presumably adds a days-until-sale feature — confirm in ticker_service.
df_days_until = ticker_service.add_days_until_sale(df=df_since_sma)

In [None]:
%%time
df = twitter_service.refine_pool(df=df_days_until, min_volume=None, min_price=None, max_price=None)
df = twitter_service.omit_columns(df=df)
df_tweet_counted = twitter_service.add_tweet_count(df=df).drop(columns=["calendardate", "reportperiod", "dimension", "datekey"])

In [None]:
# Sanity check: display the latest future_date after pool refinement and tweet counting.
df_tweet_counted["future_date"].max()

In [None]:
# df_winnowed = twitter_ml_utils.truncate_avail_columns(df=df_tweet_counted)

In [None]:
# Row count going into the tip-rank step.
print(df_tweet_counted.shape[0])

# Add tip ranks using the parquet file configured in tr_file_path near the top
# of the notebook.
df_ranked = twitter_ml_utils.add_tip_ranks(df=df_tweet_counted, tr_file_path=tr_file_path)

In [None]:
# Sanity check: display the latest future_date after tip ranks were added.
df_ranked["future_date"].max()

In [None]:
# Row count going into the ticker one-hot step.
print(len(df_ranked))

df_ticker_hotted, unique_tickers = ticker_service.make_f22_ticker_one_hotted(
    df_ranked=df_ranked,
    cat_uniques=cat_uniques,
)

# Record the ticker values next to the other category values and report how
# many there are.
cat_uniques["f22_ticker"] = unique_tickers
print(len(cat_uniques["f22_ticker"]))

# Column list handed to training / prediction feature selection.
narrow_cols = [*df_ticker_hotted.columns]

In [None]:
%%time

df_train = df_ticker_hotted

print(df_train.shape[0])
print(df_train["future_date"].max())
print(len(cat_uniques["f22_ticker"]))

if workflow_mode is WorkflowMode.Training:
    # sac_roi_list = twitter_ml_utils.find_ml_pred_perf(df=df_train)
#     sac_roi_list = twitter_ml_utils.torch_non_linear(df=df_train, narrow_cols=narrow_cols)
    sac_roi_list = twitter_ml_utils.xgb_learning(df=df_train, narrow_cols=narrow_cols, cat_uniques=cat_uniques)    

In [None]:
if workflow_mode is WorkflowMode.Training:
    startup_cash = 1000

    # Compound the starting cash through every ROI value returned by training:
    # each step adds (current balance * roi) to the balance.
    investment = startup_cash
    for period_roi in sac_roi_list:
        investment += investment * period_roi

    print(f"roi amount: {investment}")
    print(sac_roi_list)

In [None]:
from ams.services import csv_service
from ams.services import pickle_service
from datetime import datetime

# When False, existing content of the predictions file is not overwritten
# (exact semantics are defined by csv_service.write_dicts_as_csv).
overwrite_file = False


def get_data_for_predictions(df: pd.DataFrame, narrow_cols: List[str]):
    """Build the feature matrix handed to the model's ``predict``.

    Args:
        df: Frame holding (at least) every feature column.
        narrow_cols: Candidate column names; twitter_service.get_feature_columns
            decides which of them are features.

    Returns:
        A numpy array with one row per row of ``df`` and one column per feature.
    """
    feature_cols = twitter_service.get_feature_columns(narrow_cols)

    return np.array(df[feature_cols])


if workflow_mode is WorkflowMode.Prediction:
    X_predict = get_data_for_predictions(df=df_ticker_hotted, narrow_cols=narrow_cols)

    prediction = model_xgb.model.predict(X_predict)

    # Keep the raw predictions on the full frame for inspection. narrow_cols was
    # captured before this column existed, so re-running the cell does not feed
    # the prediction back in as a feature.
    df_ticker_hotted["prediction"] = prediction

    # Select the buy rows with a single .loc call and take an explicit copy:
    # assigning columns to the result of chained indexing
    # (df[mask][cols]) triggers pandas' SettingWithCopyWarning.
    is_buy = df_ticker_hotted["prediction"] == 1
    df_buy = df_ticker_hotted.loc[is_buy, ["f22_ticker", "purchase_date"]].copy()
    df_buy["num_hold_days"] = num_hold_days
    # Same POSIX timestamp on every row so one run can be identified later.
    df_buy["run_timestamp"] = datetime.now().timestamp()

    print(df_buy.head(10))

    rows = df_buy.to_dict('records')

    csv_service.write_dicts_as_csv(output_file_path=constants.TWITTER_PREDICTIONS_PATH,
                                   overwrite=overwrite_file,
                                   rows=rows)