In [None]:
import pandas as pd
import numpy as np
import math
import scipy.stats as stats
import json
import requests

from functools import reduce
import yfinance as yf

import datetime as dt
from datetime import datetime, date, time, timedelta
from dateutil.relativedelta import relativedelta

# Import PlotLy Dependencies
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

In [None]:
# Run the following Jupyter Notebooks first. The cells below use names that are
# not defined in this notebook (e.g. inflation_change_df, gdp_monthly_data,
# jobs_index_change_df, housing_monthly_data, fred_api_function).
# NOTE(review): presumably each of those names is created by one of these
# notebooks — confirm which notebook provides which name.
%run data_source.ipynb
%run inflation.ipynb
%run productivity.ipynb
%run jobs.ipynb
%run housing.ipynb

# Inflation

In [None]:
# View the inflation data; this frame is merged with the other indicators on "Date" further down.
inflation_change_df

# Productivity

In [None]:
# View the productivity data. Notice that the data is lagging; for this reason,
# only the GDP series is used for productivity in the rest of this notebook.
productivity_index_monthly_data

In [None]:
# GDP data only: this is the frame merged into the combined dataset as the productivity measure.
gdp_monthly_data

# Jobs

In [None]:
# View the jobs data; this frame is merged with the other indicators on "Date" further down.
jobs_index_change_df

# Housing

In [None]:
# View the housing data. Notice that the data is lagging. For this reason, the
# next cell appends five extra months, each filled with the mean of the trailing
# 12 index values, so the series can line up with the other DataFrames.
housing_monthly_data

In [None]:
# Extend the lagging housing series with synthetic months so that it can be
# merged with the other indicators. Each synthetic index value is the mean of
# the trailing ROLLING_WINDOW index values (synthetic values included once they
# have been appended); the change (%) and velocity (basis points) columns are
# derived from it the same way for every new month.
MONTHS_TO_EXTEND = 5   # number of synthetic months appended after the last real one
ROLLING_WINDOW = 12    # number of trailing months averaged for each synthetic value

index_col = "Overall Housing Index"
change_col = "Change in Overall Housing Index (%)"
velocity_col = "Housing Change in Velocity (basis points)"

# Work on plain list copies so housing_monthly_data itself is never modified.
dates_list = list(housing_monthly_data["Date"])
index_list = list(housing_monthly_data[index_col])
change_list = list(housing_monthly_data[change_col])
velocity_list = list(housing_monthly_data[velocity_col])

for _ in range(MONTHS_TO_EXTEND):
    # Next calendar month after the current last date
    dates_list.append(dates_list[-1] + pd.DateOffset(months=1))

    # Trailing average; divide by the real window length so a history shorter
    # than ROLLING_WINDOW months is still averaged correctly.
    window = index_list[-ROLLING_WINDOW:]
    avg = sum(window) / len(window)

    # Percentage change of the synthetic value versus the previous month
    change = ((avg - index_list[-1]) / index_list[-1]) * 100

    # Velocity: month-over-month difference of the change, in basis points
    velocity = (change - change_list[-1]) * 100

    index_list.append(avg)
    change_list.append(change)
    velocity_list.append(velocity)

# Build the frame in one step; all four lists have the same length by construction.
updated_housing_monthly_data = pd.DataFrame({
    "Date": dates_list,
    index_col: index_list,
    change_col: change_list,
    velocity_col: velocity_list,
})

updated_housing_monthly_data.tail(12)

# Interest Rate

In [None]:
# GS10 - Market Yield on U.S. Treasury Securities at 10-Year Constant Maturity.
# This will be used to gauge rates. The 'Value' column is renamed to
# '10-Year Treasury Yield', and a second column holds the same series
# multiplied by 100 (labelled as basis points).
interest_rate_df = pd.DataFrame(fred_api_function("GS10")).rename(
    columns={'Value': '10-Year Treasury Yield'}
)
interest_rate_df["10-Yr Yield (basis points)"] = interest_rate_df["10-Year Treasury Yield"].mul(100)

interest_rate_df

In [None]:
# Convert every frame's "Date" column to a datetime dtype ahead of the merges on "Date"
for frame in (
    inflation_change_df,
    gdp_monthly_data,
    jobs_index_change_df,
    updated_housing_monthly_data,
    interest_rate_df,
):
    frame["Date"] = pd.to_datetime(frame["Date"])

In [None]:
# Merge the DataFrames on the 'Date' column, left to right:
# Inflation -> Productivity (GDP) -> Jobs -> Housing -> Interest Rate.
# pd.merge keeps only the dates present in both of its inputs at each step.
frames_to_merge = [
    inflation_change_df,
    gdp_monthly_data,
    jobs_index_change_df,
    updated_housing_monthly_data,
    interest_rate_df,
]
combined_df = reduce(lambda left, right: pd.merge(left, right, on='Date'), frames_to_merge)

combined_df

In [None]:
# Reduce columns of the DataFrame to only the relevant columns:
# the date, the four "change in velocity" indicators, and the 10-year yield.
selected_columns = [
    "Date",
    "Inflation Change in Velocity (basis points)",
    "GDP Change in Velocity (basis points)",
    "Jobs Index Change in Velocity (basis points)",
    "Housing Change in Velocity (basis points)",
    "10-Yr Yield (basis points)",
]

# .copy() makes reduced_df an independent frame. A later cell adds a "lock_yn"
# column to it; without the copy that assignment targets a slice of combined_df
# and pandas emits SettingWithCopyWarning.
reduced_df = combined_df[selected_columns].copy()

reduced_df

In [None]:
# Historical "best" lock/float decisions, derived from the 10-year yield (basis points).
# Note: yield_list is a column of reduced_df (a pandas Series), not a Python list.
yield_list = reduced_df["10-Yr Yield (basis points)"]

# Placeholder; rebound below to the output of generate_lock_decisions.
lock_yn_list = []

def generate_lock_decisions(yields):
    """Label each observation by whether the following observation's yield is higher.

    Args:
        yields: Iterable of yields (list, NumPy array or pandas Series),
            assumed to be ordered from oldest to newest. Values are read
            by position, so a Series with any index works.

    Returns:
        A list with one entry per input value:
        1 when the next yield is strictly greater than the current one
        (the "Lock" class), 0 otherwise (the "Float" class), and the string
        "predict this" for the final entry, which has no following
        observation. An empty input yields an empty list.

    Raises:
        TypeError: if two neighbouring values cannot be compared with ``<``.
    """
    # Materialise once so positions are used instead of index labels.
    values = list(yields)

    # Pair every value with its successor; the last value has no pair.
    output = [
        1 if current < following else 0
        for current, following in zip(values, values[1:])
    ]

    # The final observation is the one whose decision is still unknown.
    if values:
        output.append("predict this")

    return output

# One label per row of reduced_df (the labels are computed from its yield column)
lock_yn_list = generate_lock_decisions(yield_list)

# Attach the labels as a new "lock_yn" column
reduced_df["lock_yn"] = lock_yn_list

# Notice the "predict this" on the last row: it marks the decision we are trying to determine
reduced_df

In [None]:
# Drop the N/As in the top 6 rows.
# NOTE(review): the row count 6 is hard-coded — confirm it still matches the
# number of leading N/A rows whenever the source data is refreshed.
dropped_df = reduced_df.iloc[6:]
dropped_df

In [None]:
# Leave out the final row (its label is the "predict this" placeholder) before
# building the ML model.
dropped_last_row_df = dropped_df.iloc[:-1]

# Leave out the first column ("Date"); what remains is the five feature
# columns followed by lock_yn.
dropped_first_column_df = dropped_last_row_df.iloc[:, 1:]
dropped_first_column_df



In [None]:
# Feature matrix: every column except the last one (lock_yn)
features = dropped_first_column_df.iloc[:, :-1]
features_data_matrix = features.to_numpy()

# Decision labels: the last column, cast to integers
labels = dropped_first_column_df.iloc[:, -1]
labels_data = labels.to_numpy().astype(int)

# Row counts of the two arrays (printed features first, then labels)
print(len(features_data_matrix))
print(len(labels_data))

In [None]:
# Inspect the feature matrix (one row per observation, one column per feature)
features_data_matrix

In [None]:
# Inspect the integer lock_yn labels
labels_data

# Machine Learning Models

Determine the independent variables (X) and the dependent variable (y), then split the data into training and test sets.

In [None]:
from sklearn.model_selection import train_test_split
# X and y are plain aliases of the arrays built above, not copies:
# modifying X or y in place also modifies features_data_matrix / labels_data.
X = features_data_matrix
y = labels_data
# Seeded split that keeps the class proportions of y in both parts (stratify=y);
# the test size is left at train_test_split's default.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5, stratify=y)
print(X.shape, y.shape)

In [None]:
# Shuffle the observations before splitting.
#
# One seeded permutation of the row positions is applied to BOTH arrays, so
# every feature row stays paired with its own label. Two things are
# deliberately avoided here:
#   * random.shuffle on a 2-D NumPy array: its element swaps operate on row
#     views and can overwrite/duplicate rows;
#   * shuffling X and y separately: that would break the feature/label pairing.
# Indexing with an integer array returns new arrays, so X and y themselves are
# left untouched for the sections further down that split them again.
shuffle_rng = np.random.default_rng(5)
row_order = shuffle_rng.permutation(len(X))

shuffled_X = X[row_order]
shuffled_y = y[row_order]

X_train, X_test, y_train, y_test = train_test_split(shuffled_X, shuffled_y, random_state=5, stratify=shuffled_y)
print(shuffled_X.shape, shuffled_y.shape)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings on the training split,
# then report mean accuracy on both the training and the test split.
classifier = LogisticRegression().fit(X_train, y_train)

train_score = classifier.score(X_train, y_train)
test_score = classifier.score(X_test, y_test)

print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

In [None]:
# Enter the new independent variables to predict the decision.
# The five values must follow the training column order: inflation, GDP, jobs
# and housing change in velocity (basis points), then the 10-yr yield (basis points).
# NOTE(review): these numbers are hard-coded — presumably copied from the most
# recent row of the data; confirm and refresh them when the data changes.
new_data = np.array([[-42.802841, 118.172447, 64.554294, -1.073848, 438.0 ]])
new_data

In [None]:
# Predict the decision of the new data point
predictions = classifier.predict(new_data)
print("Classes are either 0 (Float) or 1 (Lock)")
print(f"The new point was classified as: {predictions}")

# Explain the class that was actually predicted, rather than always printing
# the "float" interpretation. new_data holds a single row, so only
# predictions[0] is relevant.
if predictions[0] == 1:
    print("This means that it is predicting that the Treasury yield will go up and to lock. Locking is suggested")
else:
    print("This means that it is predicting that the Treasury yield will go down and to float. Floating is suggested")

In [None]:
# How would the model predict the test data?
# Put the predictions side by side with the true labels, one row per test observation.
test_predictions = classifier.predict(X_test)
results_df = pd.DataFrame({"Prediction": test_predictions, "Actual": y_test})
results_df

In [None]:
# Flag every test row: 1 when the prediction equals the actual label, else 0.
# The two columns are compared by name in one vectorised step; this replaces a
# per-row loop that read the values by integer position (row[0], row[1]),
# a lookup style pandas has deprecated for label-indexed Series.
match_yn = (results_df["Prediction"] == results_df["Actual"]).astype(int).tolist()

results_df["match_yn"] = match_yn
results_df

In [None]:
# Calculate the percentage of matched predictions.
# match_yn holds only 0s and 1s, so its sum is the number of matches. Unlike
# value_counts()[1], this also works (giving 0) when no row matched.
count_results = results_df['match_yn'].sum()
percentage = (count_results / len(results_df['match_yn'])) * 100
percentage

### Decision Tree

In [None]:
# Names of the first five columns (the model's feature columns) as a plain list.
# NOTE(review): this rebinds `features`, which an earlier cell uses for the
# feature DataFrame — re-running that earlier cell's follow-up lines out of
# order would now see a list instead.
features = dropped_first_column_df.columns[:5].tolist()
features

In [None]:
from sklearn import tree

# Fit a decision tree on the current training split and report its mean
# accuracy on the test split. random_state is fixed because scikit-learn
# randomly permutes the features at each split; without it the score can
# change from run to run.
clf = tree.DecisionTreeClassifier(random_state=5)
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

### Random Forest


###### We can use a random forest classifier to determine how important each feature is for classifying the lock_yn decision

In [None]:
from sklearn.model_selection import train_test_split
# Fresh seeded split of X and y for the random forest; it rebinds
# X_train / X_test / y_train / y_test used by the previous sections.
# Unlike the earlier splits, this one is not stratified.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# How would a random forest classifier score?
from sklearn.ensemble import RandomForestClassifier

# random_state is fixed so that the bootstrap samples and feature draws, and
# therefore the score and the feature importances, are the same on every run.
rf = RandomForestClassifier(n_estimators=100, random_state=1)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf.feature_importances_

# Take the feature names from the frame the feature matrix was built from
# (every column except the trailing lock_yn label), so each importance is
# paired with the column it really belongs to.
features_list = list(dropped_first_column_df.columns[:-1])

# zip() silently truncates to the shorter input, so fail loudly if the names
# and the importances ever get out of step.
assert len(features_list) == len(importances), "feature names do not match the fitted model"

# We can sort the features by their importance (most important first)
sorted(zip(importances, features_list), reverse=True)

### Grid Search

In [None]:
# Split data into training and testing (seeded; test size left at the default).
# This rebinds X_train / X_test / y_train / y_test once more.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Create the SVC Model (Support Vector Classifier) with a linear kernel
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model

In [None]:
# Create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV

# Only C is searched. gamma is a kernel coefficient for the 'rbf', 'poly' and
# 'sigmoid' kernels and is ignored by the linear kernel used for `model`, so
# including it would only repeat identical fits for every gamma value.
# NOTE(review): if an RBF search was intended, change the model's kernel and
# add gamma back to this grid.
param_grid = {'C': [1, 5, 10, 50]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [None]:
# Fit the model using the grid search estimator.
# This takes the SVC model and tries each combination of parameters in
# param_grid with cross-validation; progress is logged because verbose=3.
grid.fit(X_train, y_train)

In [None]:
# List the best parameters found by the grid search for this dataset
print(grid.best_params_)

In [None]:
# List the best score (the mean cross-validated score of the best parameter combination)
print(grid.best_score_)

In [None]:
# Make predictions on the test split with the hypertuned model.
# Note: this rebinds `predictions`, previously used for the single new data point.
predictions = grid.predict(X_test)

In [None]:
# Calculate classification report
from sklearn.metrics import classification_report

# target_names are matched to the labels in sorted order, and the classes in
# this notebook are 0 (Float) and 1 (Lock), so "Float" must come first.
print(classification_report(y_test, predictions,
                            target_names=["Float", "Lock"]))

In [None]:
# Number of test-set predictions produced by the grid-searched model
len(predictions)

In [None]:
# Grid-search predictions next to the true test labels, one row per test observation
res_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
res_df

In [None]:
# Flag every test row: 1 when the prediction equals the actual label, else 0.
# The two columns are compared by name in one vectorised step; this replaces a
# per-row loop that read the values by integer position (row[0], row[1]),
# a lookup style pandas has deprecated for label-indexed Series.
match_yn1 = (res_df["Prediction"] == res_df["Actual"]).astype(int).tolist()

res_df["match_yn"] = match_yn1
res_df

In [None]:
# Calculate the percentage of matched predictions.
# match_yn holds only 0s and 1s, so its sum is the number of matches. Unlike
# value_counts()[1], this also works (giving 0) when no row matched.
ct_results = res_df['match_yn'].sum()
perc = (ct_results / len(res_df['match_yn'])) * 100
perc

### PCA

In [None]:
# Inspect the feature matrix that the PCA section starts from
X

In [None]:
from sklearn.model_selection import train_test_split
# Seeded split that holds out 20% of the rows for testing; it rebinds
# X_train / X_test / y_train / y_test for the PCA section.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)


In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# Keep 5 principal components.
# NOTE(review): the feature matrix appears to have 5 columns, so this rotates
# the data without reducing its dimensionality — confirm that is intended.
pca = PCA(n_components = 5)
# Fit the projection on the training split only, then apply the same fitted
# projection to the test split. X_train / X_test now hold the transformed arrays.
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
# Share of the total variance explained by each component, plotted by component position
explained_variance = pca.explained_variance_ratio_
plt.plot(explained_variance)

In [None]:
# Inspect the PCA-transformed training features
X_train

In [None]:
from sklearn.linear_model import LogisticRegression
# Rebinds `classifier` (previously the logistic regression fitted on the
# untransformed features) to a new model fitted on the PCA-transformed
# training data, so anything passed to it must be PCA-transformed as well.
classifier = LogisticRegression(random_state = 1)
classifier.fit(X_train, y_train)

In [None]:
# Predict on the PCA-transformed test split and show how many predictions were made
y_pred = classifier.predict(X_test)
len(y_pred)

In [None]:
# PCA-model predictions next to the true test labels, one row per test observation
pca_df = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
pca_df

In [None]:
# Flag every test row: 1 when the prediction equals the actual label, else 0.
# The two columns are compared by name in one vectorised step; this replaces a
# per-row loop that read the values by integer position (row[0], row[1]),
# a lookup style pandas has deprecated for label-indexed Series.
match_yn_1 = (pca_df["Prediction"] == pca_df["Actual"]).astype(int).tolist()

pca_df["match_yn"] = match_yn_1
pca_df

In [None]:
# Calculate the percentage of matched predictions for the PCA model.
# match_yn holds only 0s and 1s, so its sum is the number of matches (and is 0,
# not a KeyError, when nothing matched). The percentage is computed from this
# section's own count, count_results_1, not from the count of an earlier model.
count_results_1 = pca_df["match_yn"].sum()
percentage_1 = (count_results_1 / len(pca_df['match_yn'])) * 100
percentage_1

In [None]:
# Inspect the PCA-transformed training features again
X_train

In [None]:
# `classifier` was fitted on PCA-transformed features in this section, so the
# raw new observation has to go through the same fitted PCA before predicting;
# feeding it the untransformed values would compare numbers in the wrong space.
new_pred = classifier.predict(pca.transform(new_data))
new_pred
# 0 = Float