### 1 Imports
Pandas - Data manipulation.

YFinance - Download historical stock price data.

In [1]:
import pandas as pd
import yfinance as yf

### 2 Constants
`N` = number of days to look back when computing `Delta_N` and `Average_N` (converted to whole weeks when `INTERVAL` is `"weekly"`).

`INTERVAL` = daily or weekly data.

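`OFFSET` = number of rows by which the keyword features are shifted, intended to account for the search data only becoming available after 3 days (currently set to 0 for both intervals).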

`THRESHOLD` = the value that `Delta_N` must exceed to warrant a _buy_.

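`KW_LIST` = a list of all _search terms_ for which data has been collected.

`TARGET` = how the price change is encoded as the target: `"binary"` (down/up) or `"bins"` (12 labelled ranges).

`TYPE` = which keyword feature is computed: `"delta"`, `"dweek"`, or `"rolling"`.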

In [2]:
# Look-back window. Expressed in days here; converted to whole weeks below
# when INTERVAL == "weekly".
N = 21
# Buy threshold for the delta signal (not referenced by the cells shown in
# this notebook section).
THRESHOLD = 0
# Sampling frequency of the keyword data; selects the data/<INTERVAL>/ folder.
INTERVAL = "daily"

# OFFSET is the number of rows the keyword features are shifted by.
# NOTE(review): both intervals use 0, although the description above mentions
# a 3-day availability delay -- confirm that 0 is intended.
if INTERVAL == "daily":
    OFFSET = 0
elif INTERVAL == "weekly":
    OFFSET = 0
    N = N // 7  # days -> whole weeks
else:
    # Fail here instead of with a NameError on OFFSET in a later cell.
    raise ValueError(f"INTERVAL must be 'daily' or 'weekly', got {INTERVAL!r}")

# Target encoding ("binary" or "bins") and feature type ("delta", "dweek" or
# "rolling"); both also end up in the name of the exported CSV.
TARGET = "binary"
TYPE = "delta"

# Search terms; each one maps to data/<INTERVAL>/<term>.csv.
# Every term is listed exactly once so no file is read or processed twice.
KW_LIST = ["debt", "color", "stocks", "restaurant", "portfolio", "inflation", "housing", "dow_jones", "revenue",
           "economics", "credit", "markets", "return", "unemployment", "money", "religion", "cancer", "growth",
           "investment", "hedge", "marriage", "bonds", "derivatives", "headlines", "profit", "society", "leverage",
           "loss", "cash", "office", "fine", "stock_market", "banking", "crisis", "happy", "car", "nasdaq",
           "gains", "finance", "sell", "invest", "fed", "house", "metals", "travel", "returns", "gain",
           "default", "present", "holiday", "water", "rich", "risk", "gold", "success", "oil", "war", "economy",
           "chance", "lifestyle", "greed", "food", "movie", "nyse", "ore", "opportunity", "health", "earnings",
           "arts", "culture", "bubble", "buy", "trader", "tourism", "politics", "energy", "consume", "consumption",
           "freedom", "dividend", "world", "conflict", "kitchen", "forex", "home", "transaction", "garden",
           "fond", "train", "labor", "fun", "environment", "ring"]

In [3]:
# Load the deployment dataset. `df` is reassigned in section 4.1, so this
# frame is only used by the copy written in the next cell.
df = pd.read_csv("deployment/data/data.csv")

In [4]:
# Write the frame currently held in `df` to data1.csv, without the index column.
df.to_csv("deployment/data/data1.csv", index=False)

### 3 Import data

In [5]:
%%capture

kw_df = pd.DataFrame()

# Google Trends data.
date = False
for kw in KW_LIST:
    
    if not date:
        kw_df["Date"] = pd.read_csv(f"data/{INTERVAL}/{kw}.csv")["Date"]
        date = True
        
    kw_df[f"{kw}"] = pd.read_csv(f"data/{INTERVAL}/{kw}.csv")["Adjusted"]

# Historical stock price data.
if INTERVAL == "daily":
    ticker_df = yf.download("DJIA", period="max")
elif INTERVAL == "weekly":
    ticker_df = yf.download("DJIA", period="max", interval="1wk")
    kw_df["Date"] = pd.to_datetime(kw_df["Date"], format="%Y-%m-%d")
    kw_df["Date"] += pd.DateOffset(1)

# Manipulate stock price data.
ticker_df["Change"] = ticker_df["Close"].pct_change()
ticker_df = ticker_df.drop(["Open", "High", "Low", "Adj Close", "Volume", "Close"], axis = 1)
ticker_df = ticker_df.reset_index()

kw_df["Date"] = pd.to_datetime(kw_df["Date"], format="%Y-%m-%d")

### 4 Features

#### 4.1 Time series

In [6]:
# Start the feature table from the keyword dates; the feature columns are
# added to it in section 4.3.
df = pd.DataFrame({"Date": kw_df["Date"]})

#### 4.2 Binary and binning.

In [7]:
# Bin edges and labels for each supported TARGET encoding.
#   "binary": change <= 0 -> 0, change > 0 -> 1
#   "bins":   twelve ranges, 0.5 percentage points wide between -2.5% and
#             +2.5%, open-ended beyond that, labelled -6..-1 and 1..6
target_specs = {
    "binary": (
        [float("-inf"), 0, float("inf")],
        [0, 1],
    ),
    "bins": (
        [float("-inf"), -0.025, -0.02, -0.015, -0.01, -0.005, 0,
         0.005, 0.01, 0.015, 0.02, 0.025, float("inf")],
        [-6, -5, -4, -3, -2, -1, 1, 2, 3, 4, 5, 6],
    ),
}

# Any other TARGET value leaves the frame without a "Target" column.
if TARGET in target_specs:
    cut_edges, cut_labels = target_specs[TARGET]
    ticker_df["Target"] = pd.cut(ticker_df["Change"], bins=cut_edges, labels=cut_labels)

# The raw change is no longer needed once it has been discretised.
ticker_df = ticker_df.drop(columns="Change")

#### 4.3 Delta, delta week, and rolling mean.

In [None]:
%%time

if TYPE == "delta":
    for kw in KW_LIST:
        i = N
        while i < len(df):
            df.loc[i, f"{kw}"] = kw_df.loc[i, f"{kw}"] - kw_df.loc[i-N, f"{kw}"]
            i += 1
        
        df[f"{kw}"] = df[f"{kw}"].shift(OFFSET)

elif TYPE == "dweek":
    for kw in KW_LIST:
        i = N
        while i < len(kw_df):
            df.loc[i, f"{kw}"] = kw_df.loc[i-7:i, f"{kw}"].mean() - kw_df.loc[i-N-7:i-N, f"{kw}"].mean()
            i += 1
        
        df[f"{kw}"] = df[f"{kw}"].shift(OFFSET)
        
elif TYPE == "rolling":
    for kw in KW_LIST:
        i = N
        while i < len(kw_df):
            df[f"{kw}"] = kw_df[f"{kw}"].rolling(N).mean()
            i += 1
        
        df[f"{kw}"] = df[f"{kw}"].shift(OFFSET)


df = df[N+OFFSET:]

ticker_df.insert(1, "lag_1", ticker_df["Target"].shift(1))

df = df.merge(ticker_df, on="Date")

df = df.drop("Date", axis=1)

### 5 Convert to CSV
This has to happen so that the CSV-file can be fed into the model.

In [None]:
# Round to 3 decimals and write the dataset; the file name encodes the
# configuration. The index is written too (to_csv default), so the CSV
# starts with an unnamed index column.
df.round(3).to_csv(f"ml_models/{INTERVAL}_{TYPE}_{TARGET}.csv")