In [72]:
# Importing the required libraries
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
import hvplot
import hvplot.pandas
from pathlib import Path
from pystreamlit import InsiderDataFrame
import pickle as pckl

# Suppress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [73]:

# Create charts for the stock's insider trading.
def create_charts(stock_df=None, stock=None, insider=None):
    """Display three hvplot charts describing insider trading for one stock.

    Charts displayed, in order:
      1. A scatter of the "Value" column of ``stock_df``.
      2. The stock's price line between ``insider.fromdate`` and
         ``insider.todate`` overlaid with a scatter of "ClosePrice".
      3. The full price series overlaid with markers for rows whose "Trend"
         is 1.0 (purple up-triangles) and 0.0 (orange down-triangles).

    Parameters
    ----------
    stock_df : pandas.DataFrame, optional
        Trades for a single stock; the "Value", "ClosePrice" and "Trend"
        columns are read.
    stock : str, optional
        Ticker; used in chart titles and to select ``insider.df_tickers[stock]``.
    insider : object, optional
        Object exposing ``df_tickers``, ``fromdate`` and ``todate``.

    Any argument left as ``None`` is read from the notebook-level variable of
    the same name, so existing zero-argument ``create_charts()`` calls keep
    working.

    Raises
    ------
    NameError
        If an argument is omitted and no notebook-level variable of that name
        exists (e.g. the cell is run on a fresh kernel).
    """
    # Resolve omitted arguments from the notebook namespace explicitly instead
    # of silently depending on hidden state.
    supplied = {"stock_df": stock_df, "stock": stock, "insider": insider}
    notebook_scope = globals()
    unresolved = [
        name for name, value in supplied.items()
        if value is None and name not in notebook_scope
    ]
    if unresolved:
        raise NameError(
            "create_charts() needs " + ", ".join(unresolved)
            + "; pass them as arguments or define them in the notebook first."
        )
    if stock_df is None:
        stock_df = notebook_scope["stock_df"]
    if stock is None:
        stock = notebook_scope["stock"]
    if insider is None:
        insider = notebook_scope["insider"]

    # Chart 1: value of every insider trade for the stock
    stock_scatter_plot = stock_df.hvplot.scatter(
        y="Value",
        title=f"Insider Trades for {stock} by Date",
        ylabel="Value of Trades",
        rot=90,
        cmap="plasma",
    ).opts(yformatter='$%.0f')

    # Chart 2a: closing price on the insider trading days
    stock_insider_trades_plot = stock_df.hvplot.scatter(
        y="ClosePrice",
        color="orange",
        title=f"{stock} Insider Trades by Price",
        ylabel="Stock Price",
        rot=45,
        width=1000,
    ).opts(yformatter='$%.0f')

    # Chart 2b: the stock price restricted to the insider data's date range
    price_in_range = insider.df_tickers[stock].loc[
        pd.Timestamp(insider.fromdate):pd.Timestamp(insider.todate)
    ]
    stock_price_plot = price_in_range.hvplot(
        color="blue",
        title=f"{stock} Insider Trades and Stock Price",
        ylabel="Closing Price",
        rot=45,
        width=1000,
    ).opts(yformatter='$%.0f')

    # Overlay the stock price and the insider trading data
    overlay_stock_Plot = stock_price_plot * stock_insider_trades_plot

    # Chart 3: visualize the signals.
    # Exit markers: rows whose Trend is 0 (the original compared against -0.0,
    # which is equal to 0.0). Named *_markers to avoid shadowing builtin exit().
    exit_markers = stock_df[stock_df['Trend'] == 0.0]['ClosePrice'].hvplot.scatter(
        color='orange',
        marker='v',
        size=200,
        legend=False,
        ylabel='Price in $',
        width=1000,
        height=400
    )

    # Entry markers: rows whose Trend is 1
    entry_markers = stock_df[stock_df['Trend'] == 1.0]['ClosePrice'].hvplot.scatter(
        color='purple',
        marker='^',
        size=200,
        legend=False,
        ylabel='Price in $',
        width=1000,
        height=400
    )

    # Full (un-sliced) price series for the stock
    full_price_series = insider.df_tickers[stock]
    security_close = full_price_series.hvplot(
        line_color='grey',
        ylabel='Price in $',
        width=1000,
        height=400
    )

    # Create the overlay plot and set its title
    entry_exit_plot = security_close * entry_markers * exit_markers
    entry_exit_plot.opts(
        title=f"{stock} Price Impact After Insider Trades (1 Week)"
    )

    # Display the charts
    display(stock_scatter_plot)
    display(overlay_stock_Plot)
    display(entry_exit_plot)
   


In [74]:
# Create the charts for the stocks and trades for the stock.
# create_charts() reads the notebook-level names `stock_df`, `stock` and
# `insider`; in this notebook they are assigned by the model-training loop
# further down, so on a fresh kernel this cell would raise NameError.
# Guard the call so Restart & Run All succeeds.
if all(name in globals() for name in ("stock_df", "stock", "insider")):
    create_charts()
else:
    print("create_charts() skipped: stock_df, stock and insider are not defined yet.")

In [75]:
def SVM_model(stock_df):
    """Train an SVM classifier on one stock's insider trades and pickle it.

    The frame is sorted by index and split chronologically: rows from the
    earliest index value up to (and including) 30 months later form the
    training set, and every later row forms the test set. Features are
    standard-scaled using statistics from the training set only. A
    classification report for the test set is printed, and the fitted model
    is written to ``./models/{stock}_SVM_model.sav``.

    Parameters
    ----------
    stock_df : pandas.DataFrame
        Must contain the columns 'Price', 'Qty', 'Value', 'Owned', 'ΔOwn',
        'ClosePrice' (features) and 'Trend' (target). The index must support
        adding a ``DateOffset`` to its minimum value.

    Raises
    ------
    ValueError
        If the chronological split leaves the training or test set empty.

    Notes
    -----
    The output file name uses the notebook-level variable ``stock``.
    """
    from pathlib import Path

    from pandas.tseries.offsets import DateOffset
    from sklearn import svm
    from sklearn.metrics import classification_report
    from sklearn.preprocessing import StandardScaler

    stock_df = stock_df.sort_index(ascending=True)

    feature_columns = ['Price', 'Qty', 'Value', "Owned", "ΔOwn", 'ClosePrice']

    # Drop incomplete rows from features and target together so X and y always
    # have the same rows (the original dropped NaNs from X only).
    model_data = stock_df[feature_columns + ['Trend']].dropna()
    X = model_data[feature_columns].copy()
    y = model_data['Trend'].copy()

    # Select the start of the training period
    training_begin = X.index.min()

    # Select the end of the training period: 30 months after the first row
    training_end = (X.index.min() + DateOffset(months=30)).date()

    # Label slicing is inclusive of training_end. Because the data is sorted,
    # the training rows are a prefix of X; everything after that prefix is the
    # test set, so rows dated exactly training_end are not in both sets.
    X_train = X.loc[training_begin:training_end]
    n_train = len(X_train)
    y_train = y.iloc[:n_train]
    X_test = X.iloc[n_train:]
    y_test = y.iloc[n_train:]

    if n_train == 0 or len(X_test) == 0:
        raise ValueError(
            f"Chronological split produced {n_train} training rows and "
            f"{len(X_test)} test rows; both sets must be non-empty."
        )

    # Fit the scaler on the training data only, then scale both sets
    X_scaler = StandardScaler().fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    # Create the classifier model and fit it to the scaled training data
    svm_model = svm.SVC()
    svm_model = svm_model.fit(X_train_scaled, y_train)

    # Use the trained model to predict the trading signals for the test data
    testing_signal_predictions = svm_model.predict(X_test_scaled)

    # Evaluate the model using a classification report - Testing
    testing_report = classification_report(y_test, testing_signal_predictions)
    print("---Classification report for testing data----")
    print(testing_report)

    # Persist the fitted model, creating the output directory if needed
    filename = f'./models/{stock}_SVM_model.sav'
    Path(filename).parent.mkdir(parents=True, exist_ok=True)
    with open(filename, 'wb') as file:
        pckl.dump(svm_model, file)



In [76]:
def logistic_regression_model_balanced_data(stock_df):
    """Train a logistic-regression model on oversampled data and pickle it.

    The data is split 70/30 with ``train_test_split`` (``random_state=7``),
    features are standard-scaled using the training split only, and the
    training split is balanced with ``RandomOverSampler`` before fitting.
    The balanced accuracy, confusion matrix and imbalanced classification
    report for the test split are printed/displayed, and the fitted model is
    written to ``./models/{stock}_logistics_model.sav``.

    Parameters
    ----------
    stock_df : pandas.DataFrame
        Must contain the columns 'Price', 'Qty', 'Value', 'Owned', 'ΔOwn',
        'ClosePrice' (features) and 'Trend' (target).

    Notes
    -----
    The output file name uses the notebook-level variable ``stock``.
    """
    from pathlib import Path

    from imblearn.metrics import classification_report_imbalanced
    from imblearn.over_sampling import RandomOverSampler
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import balanced_accuracy_score, confusion_matrix
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    # Create X, or features DataFrame
    features = stock_df[['Price','Qty','Value',"Owned","ΔOwn",'ClosePrice']]

    # Create y, or target Series
    target = stock_df['Trend']

    # Use train_test_split to separate the data
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=.3,random_state=7)

    # Scale the data; the scaler is fitted on the training split only
    X_scaler = StandardScaler().fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    # Balance the classes of the training split only; the test split is untouched
    random_oversampler = RandomOverSampler(random_state=1)
    X_resampled, y_resampled = random_oversampler.fit_resample(X_train_scaled,y_train)

    # Instantiate and fit the model
    logistic_regression_model_resampled = LogisticRegression(max_iter=1000,random_state=7)
    logistic_regression_model_resampled.fit(X_resampled, y_resampled)

    # Generate predictions from the model we just fit
    predictions_resampled = logistic_regression_model_resampled.predict(X_test_scaled)

    # The value printed under "Accuracy score" is the *balanced* accuracy
    baso_resampled = balanced_accuracy_score(y_test, predictions_resampled)

    print("---------------------------------------------------------------------------")
    print("Accuracy score for Oversampled, Balanced Data")
    print(baso_resampled)

    # Print the Confusion matrix for data with oversampling
    print("Confusion Matrix report for Oversampled, Balanced Data")
    display(confusion_matrix(y_test, predictions_resampled))

    # Print the classification report for the resampled data
    print("classification report for Oversampled, Balanced Data")
    print(classification_report_imbalanced(y_test, predictions_resampled))
    print("----------------------------------------------------------------------------")

    # Persist the fitted model, creating the output directory if needed
    filename = f'./models/{stock}_logistics_model.sav'
    Path(filename).parent.mkdir(parents=True, exist_ok=True)
    with open(filename,'wb') as file:
        pckl.dump(logistic_regression_model_resampled,file)


In [77]:
# Run the modelling and charting functions defined above for every ticker.
# Load the insider-trading data once; the same object serves all tickers.
insider = InsiderDataFrame("insider_data_v2.csv")

for stock in ('AMZN', 'GOOG', 'MSFT', 'TSLA'):

    # Fetch this ticker's processed trades and drop rows containing nulls
    # (these arise when the filing date falls on a non-trading day).
    stock_df = insider.get_processed_df(stock).dropna()

    # Train and save the logistic-regression model, then the SVM model
    logistic_regression_model_balanced_data(stock_df)
    SVM_model(stock_df)

    # Plot the charts; create_charts() reads stock_df, stock and insider
    # from the notebook namespace.
    create_charts()

<class 'pandas.core.frame.DataFrame'>
Index: 1509 entries, 2023-06-05 to 2019-07-02
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Ticker  1509 non-null   object 
 1   Price   1509 non-null   float64
 2   Qty     1509 non-null   int64  
 3   Owned   1509 non-null   int64  
 4   ΔOwn    1509 non-null   int64  
 5   Value   1509 non-null   int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 82.5+ KB
---------------------------------------------------------------------------
Accuracy score for Oversampled, Balanced Data
0.53125
Confusion Matrix report for Oversampled, Balanced Data


array([[21, 11],
       [19, 13]])

classification report for Oversampled, Balanced Data
                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.53      0.66      0.41      0.58      0.52      0.27        32
        1.0       0.54      0.41      0.66      0.46      0.52      0.26        32

avg / total       0.53      0.53      0.53      0.52      0.52      0.27        64

----------------------------------------------------------------------------
---Classification report for testing data----
              precision    recall  f1-score   support

         0.0       0.31      0.14      0.20        28
         1.0       0.52      0.74      0.61        35

    accuracy                           0.48        63
   macro avg       0.41      0.44      0.40        63
weighted avg       0.43      0.48      0.43        63



<class 'pandas.core.frame.DataFrame'>
Index: 1509 entries, 2023-06-05 to 2019-07-02
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Ticker  1509 non-null   object 
 1   Price   1509 non-null   float64
 2   Qty     1509 non-null   int64  
 3   Owned   1509 non-null   int64  
 4   ΔOwn    1509 non-null   int64  
 5   Value   1509 non-null   int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 82.5+ KB
---------------------------------------------------------------------------
Accuracy score for Oversampled, Balanced Data
0.4536931818181818
Confusion Matrix report for Oversampled, Balanced Data


array([[22, 42],
       [24, 31]])

classification report for Oversampled, Balanced Data
                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.48      0.34      0.56      0.40      0.44      0.19        64
        1.0       0.42      0.56      0.34      0.48      0.44      0.20        55

avg / total       0.45      0.45      0.46      0.44      0.44      0.19       119

----------------------------------------------------------------------------
---Classification report for testing data----
              precision    recall  f1-score   support

         0.0       0.54      0.97      0.69        68
         1.0       0.83      0.15      0.25        67

    accuracy                           0.56       135
   macro avg       0.68      0.56      0.47       135
weighted avg       0.68      0.56      0.47       135



<class 'pandas.core.frame.DataFrame'>
Index: 1509 entries, 2023-06-05 to 2019-07-02
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Ticker  1509 non-null   object 
 1   Price   1509 non-null   float64
 2   Qty     1509 non-null   int64  
 3   Owned   1509 non-null   int64  
 4   ΔOwn    1509 non-null   int64  
 5   Value   1509 non-null   int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 82.5+ KB
---------------------------------------------------------------------------
Accuracy score for Oversampled, Balanced Data
0.5404040404040404
Confusion Matrix report for Oversampled, Balanced Data


array([[7, 4],
       [5, 4]])

classification report for Oversampled, Balanced Data
                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.58      0.64      0.44      0.61      0.53      0.29        11
        1.0       0.50      0.44      0.64      0.47      0.53      0.28         9

avg / total       0.55      0.55      0.53      0.55      0.53      0.28        20

----------------------------------------------------------------------------
---Classification report for testing data----
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        11
         1.0       0.21      0.75      0.33         4

    accuracy                           0.20        15
   macro avg       0.11      0.38      0.17        15
weighted avg       0.06      0.20      0.09        15



<class 'pandas.core.frame.DataFrame'>
Index: 1509 entries, 2023-06-05 to 2019-07-02
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Ticker  1509 non-null   object 
 1   Price   1509 non-null   float64
 2   Qty     1509 non-null   int64  
 3   Owned   1509 non-null   int64  
 4   ΔOwn    1509 non-null   int64  
 5   Value   1509 non-null   int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 82.5+ KB
---------------------------------------------------------------------------
Accuracy score for Oversampled, Balanced Data
0.6615079365079365
Confusion Matrix report for Oversampled, Balanced Data


array([[25, 11],
       [13, 22]])

classification report for Oversampled, Balanced Data
                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.66      0.69      0.63      0.68      0.66      0.44        36
        1.0       0.67      0.63      0.69      0.65      0.66      0.43        35

avg / total       0.66      0.66      0.66      0.66      0.66      0.44        71

----------------------------------------------------------------------------
---Classification report for testing data----
              precision    recall  f1-score   support

         0.0       0.45      1.00      0.62        30
         1.0       1.00      0.14      0.25        42

    accuracy                           0.50        72
   macro avg       0.73      0.57      0.44        72
weighted avg       0.77      0.50      0.41        72

