In [200]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [201]:
# Read in and look at the data
df = pd.read_csv('stock_trans_data.csv')

df.head()

Unnamed: 0,symbol,historical_dates,open,close,purchase_date,purchase_price,sell_date,sell_price,actual_return,days_to_sell,take_profit_price,stop_out_price,hit_take_profit
0,CRM,"[datetime.date(2018, 11, 21), datetime.date(20...","[124.2, 121.33, 124.48, 125.12, 138.48, 139.85...","[123.59, 122.03, 126.41, 127.54, 140.64, 139.7...",2023-11-20,223.2,2023-11-30,249.48,315.37,10.0,231.573,220.968,1
1,NET,"[datetime.date(2019, 9, 13), datetime.date(201...","[18.1, 18.6, 18.45, 18.83, 20, 18.9, 20.1, 21....","[18, 18.63, 18.75, 19.59, 18.75, 19.87, 20.96,...",2023-11-20,73.37,2023-12-14,83.38,340.34,24.0,81.619,72.6363,1
2,RCL,"[datetime.date(2018, 11, 21), datetime.date(20...","[108.51, 107.1, 111.04, 110.53, 111.38, 113.05...","[107.37, 110.28, 111.46, 110.74, 112.54, 111.6...",2023-11-20,105.26,2023-11-21,102.96,-57.5,1.0,107.735,104.2074,0
3,SPOT,"[datetime.date(2018, 11, 21), datetime.date(20...","[130.76, 130, 131, 137.368, 140.99, 137.08, 13...","[131.98, 129.41, 138, 138.75, 138.24, 138.96, ...",2023-11-20,178.86,2023-12-04,193.85,224.84,14.0,187.075,177.0714,1
4,FERG,"[datetime.date(2021, 3, 8), datetime.date(2021...","[120.51, 123, 124, 124, 123.72, 125.48, 126.66...","[119.49, 122.35, 120.6, 125, 124.64, 126.66, 1...",2023-11-20,164.78,2023-11-20,164.63,-2.41,0.0,165.058,163.1322,0


In [202]:
# Function to calculate SMA5
def calculate_sma5(close_prices):
    sma5 = []
    for i in range(len(close_prices)):
        if i < 4:  # Predict the SMA5 for the first four dates
            sma5.append(close_prices[i])
        else:
            sma5.append(np.mean(close_prices[i-4:i+1]))
    return sma5

# Apply the function to the 'close' column
df['SMA5'] = df['close'].apply(lambda x: calculate_sma5(eval(x)))

# Inspect the result
df[['close', 'SMA5']].head()

Unnamed: 0,close,SMA5
0,"[123.59, 122.03, 126.41, 127.54, 140.64, 139.7...","[123.59, 122.03, 126.41, 127.54, 128.042, 131...."
1,"[18, 18.63, 18.75, 19.59, 18.75, 19.87, 20.96,...","[18, 18.63, 18.75, 19.59, 18.744, 19.118000000..."
2,"[107.37, 110.28, 111.46, 110.74, 112.54, 111.6...","[107.37, 110.28, 111.46, 110.74, 110.478, 111...."
3,"[131.98, 129.41, 138, 138.75, 138.24, 138.96, ...","[131.98, 129.41, 138, 138.75, 135.276, 136.672..."
4,"[119.49, 122.35, 120.6, 125, 124.64, 126.66, 1...","[119.49, 122.35, 120.6, 125, 122.4159999999999..."


In [203]:
def calculate_sma20(close_prices):
    sma20 = []
    for i in range(len(close_prices)):
        if i < 19:  # Not enough data for SMA5
            sma20.append(close_prices[i])
        else:
            sma20.append(np.mean(close_prices[i-19:i+1]))
    return sma20

# Apply the function to the 'close' column
df['SMA20'] = df['close'].apply(lambda x: calculate_sma20(eval(x)))

# Inspect the result
df[['close', 'SMA20']].head()

Unnamed: 0,close,SMA20
0,"[123.59, 122.03, 126.41, 127.54, 140.64, 139.7...","[123.59, 122.03, 126.41, 127.54, 140.64, 139.7..."
1,"[18, 18.63, 18.75, 19.59, 18.75, 19.87, 20.96,...","[18, 18.63, 18.75, 19.59, 18.75, 19.87, 20.96,..."
2,"[107.37, 110.28, 111.46, 110.74, 112.54, 111.6...","[107.37, 110.28, 111.46, 110.74, 112.54, 111.6..."
3,"[131.98, 129.41, 138, 138.75, 138.24, 138.96, ...","[131.98, 129.41, 138, 138.75, 138.24, 138.96, ..."
4,"[119.49, 122.35, 120.6, 125, 124.64, 126.66, 1...","[119.49, 122.35, 120.6, 125, 124.64, 126.66, 1..."


In [204]:
# Function to calculate SMA5 slope based on the previous 4 days
def calculate_sma5_slope_with_prediction(sma5_list):
    sma5_slope = []
    for i in range(len(sma5_list)):
        if i < 4:  # Predict the slope for the first four entries
            if i > 0:  # Calculate the slope between the first two available points
                predicted_slope = round((sma5_list[i] - sma5_list[i-1]) , 2)
            else:
                predicted_slope = 0  # If no prior data, assume flat slope
            sma5_slope.append(predicted_slope)
        else:
            # Slope = (SMA5[today] - SMA5[4 days ago]) / 4
            slope = round((sma5_list[i] - sma5_list[i-4]) / 4, 2)
            sma5_slope.append(slope)
    return sma5_slope

# Apply the function to the 'close' column
df['SMA5_Slope'] = df['SMA5'].apply(lambda x: calculate_sma5_slope_with_prediction(x))

# Inspect the result
df[['SMA5', 'SMA5_Slope']].head()

Unnamed: 0,SMA5,SMA5_Slope
0,"[123.59, 122.03, 126.41, 127.54, 128.042, 131....","[0, -1.56, 4.38, 1.13, 1.11, 2.31, 2.25, 2.86,..."
1,"[18, 18.63, 18.75, 19.59, 18.744, 19.118000000...","[0, 0.63, 0.12, 0.84, 0.19, 0.12, 0.21, 0.1, 0..."
2,"[107.37, 110.28, 111.46, 110.74, 110.478, 111....","[0, 2.91, 1.18, -0.72, 0.78, 0.26, 0.11, 0.43,..."
3,"[131.98, 129.41, 138, 138.75, 135.276, 136.672...","[0, -2.57, 8.59, 0.75, 0.82, 1.82, 0.02, -0.04..."
4,"[119.49, 122.35, 120.6, 125, 122.4159999999999...","[0, 2.86, -1.75, 4.4, 0.73, 0.38, 1.01, 0.1, 0..."


In [205]:
# Function to calculate SMA5 slope based on the previous 4 days
def calculate_sma20_slope_with_prediction(sma20_list):
    sma20_slope = []
    for i in range(len(sma20_list)):
        if i < 19:  # Predict the slope for the first four entries
            if i > 0:  # Calculate the slope between the first two available points
                predicted_slope = round((sma20_list[i] - sma20_list[i-1]) , 2)
            else:
                predicted_slope = 0  # If no prior data, assume flat slope
            sma20_slope.append(predicted_slope)
        else:
            slope = round((sma20_list[i] - sma20_list[i-4]) / 19, 2)
            sma20_slope.append(slope)
    return sma20_slope

# Apply the function to the 'close' column
df['SMA20_Slope'] = df['SMA20'].apply(lambda x: calculate_sma20_slope_with_prediction(x))

# Inspect the result
df[['SMA20', 'SMA20_Slope']].head()

Unnamed: 0,SMA20,SMA20_Slope
0,"[123.59, 122.03, 126.41, 127.54, 140.64, 139.7...","[0, -1.56, 4.38, 1.13, 13.1, -0.92, 3.04, 1.39..."
1,"[18, 18.63, 18.75, 19.59, 18.75, 19.87, 20.96,...","[0, 0.63, 0.12, 0.84, -0.84, 1.12, 1.09, -0.25..."
2,"[107.37, 110.28, 111.46, 110.74, 112.54, 111.6...","[0, 2.91, 1.18, -0.72, 1.8, -0.91, 1.44, 1.18,..."
3,"[131.98, 129.41, 138, 138.75, 138.24, 138.96, ...","[0, -2.57, 8.59, 0.75, -0.51, 0.72, -2.58, 4.3..."
4,"[119.49, 122.35, 120.6, 125, 124.64, 126.66, 1...","[0, 2.86, -1.75, 4.4, -0.36, 2.02, -0.33, -1.8..."


In [206]:
from statistics import mode

# Create a new list of 1s and 0s. 1 = increase from one sma to the next days sma, 0 = decrease from on sma to the next days sma
def calculate_increases(lst):
    probabilities = []
    for i in range(len(lst) - 1):
        if lst[i] < lst[i + 1]:
            probabilities.append(1)
        else:
            probabilities.append(0)
    probabilities.append(0)
    most_common = mode(probabilities)
    return most_common

# create new SMA<value>_prob columns and SMA<value>_slope_prob columns based on the result of the calculate increase function.
def get_increase_probability_and_create_new_column(initial_column, created_column):
    probs = []
    for x in df[initial_column]:
        prob = calculate_increases(x)
        probs.append(prob)
    df[created_column] = probs

# create a list of column names to run this on
old_columns = ['SMA5', 'SMA20', 'SMA5_Slope', 'SMA20_Slope']
for name in old_columns:
    get_increase_probability_and_create_new_column(name, name+'_prob')
    
# Calculate summary statistics for 'open' and 'close' prices
df['open_mean'] = df['open'].apply(lambda x: np.mean(eval(x)))
df['open_std'] = df['open'].apply(lambda x: np.std(eval(x)))
df['close_mean'] = df['close'].apply(lambda x: np.mean(eval(x)))
df['close_std'] = df['close'].apply(lambda x: np.std(eval(x)))
df['SMA5_last'] = df['SMA5'].apply(lambda x: x[-1] if isinstance(x, list) else np.nan)
df['SMA20_last'] = df['SMA20'].apply(lambda x: x[-1] if isinstance(x, list) else np.nan)
df['SMA5_Slope_last'] = df['SMA5_Slope'].apply(lambda x: x[-1] if isinstance(x, list) else np.nan)
df['SMA20_Slope_last'] = df['SMA20_Slope'].apply(lambda x: x[-1] if isinstance(x, list) else np.nan)


# drop all columns that contains lists as their data
df = df.drop(['SMA5','SMA20','SMA5_Slope','SMA20_Slope','open','close','historical_dates'], axis=1)
# inspect the dataframe in its current state
df

Unnamed: 0,symbol,purchase_date,purchase_price,sell_date,sell_price,actual_return,days_to_sell,take_profit_price,stop_out_price,hit_take_profit,...,SMA5_Slope_prob,SMA20_Slope_prob,open_mean,open_std,close_mean,close_std,SMA5_last,SMA20_last,SMA5_Slope_last,SMA20_Slope_last
0,CRM,2023-11-20,223.20,2023-11-30,249.48,315.37,10.0,231.5730,220.9680,1,...,0,0,194.327845,41.438023,194.336309,41.263202,221.680,209.5870,1.84,0.18
1,NET,2023-11-20,73.37,2023-12-14,83.38,340.34,24.0,81.6190,72.6363,1,...,0,0,66.673415,38.953573,66.766436,38.926920,71.524,62.7990,1.66,0.12
2,RCL,2023-11-20,105.26,2023-11-21,102.96,-57.50,1.0,107.7350,104.2074,0,...,0,0,81.516231,25.884062,81.475625,25.903143,103.990,92.5510,1.82,0.24
3,SPOT,2023-11-20,178.86,2023-12-04,193.85,224.84,14.0,187.0750,177.0714,1,...,1,0,174.503435,67.367706,174.563262,67.274711,175.872,168.8515,1.13,0.27
4,FERG,2023-11-20,164.78,2023-11-20,164.63,-2.41,0.0,165.0580,163.1322,0,...,1,0,138.396361,17.321011,138.696428,17.230391,165.182,157.6645,1.37,0.11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,REG,2024-08-07,70.38,,,,,72.4605,69.6762,0,...,0,0,59.874451,9.337154,59.853671,9.362005,69.332,66.5710,0.54,0.08
296,WELL,2024-08-12,115.96,,,,,119.4079,114.8004,0,...,1,0,78.056842,14.406598,78.034288,14.412780,115.906,111.1495,0.86,0.12
297,K,2024-08-12,74.29,2024-08-14,80.09,435.00,2.0,76.4981,73.5471,1,...,0,0,64.160693,5.450146,64.151360,5.449203,73.906,62.9790,1.99,0.18
298,VIRT,2024-08-15,30.31,,,,,31.1678,30.0069,0,...,0,0,23.034135,4.867915,23.023755,4.862777,29.348,28.3755,0.32,0.05


In [207]:
from sklearn.preprocessing import LabelEncoder
# Initialize the Label Encoder
label_encoder = LabelEncoder()
# Fit and transform the 'symbol' column
df['symbol_encoded'] = label_encoder.fit_transform(df['symbol'])
# Fit and transform the 'purchase_date' column
df['purchase_date_encoded'] = label_encoder.fit_transform(df['purchase_date'])
# Fit and transform the 'sell_date' column
df['sell_date_encoded'] = label_encoder.fit_transform(df['sell_date'])

# inspect the dataframe in its current state
df


Unnamed: 0,symbol,purchase_date,purchase_price,sell_date,sell_price,actual_return,days_to_sell,take_profit_price,stop_out_price,hit_take_profit,...,open_std,close_mean,close_std,SMA5_last,SMA20_last,SMA5_Slope_last,SMA20_Slope_last,symbol_encoded,purchase_date_encoded,sell_date_encoded
0,CRM,2023-11-20,223.20,2023-11-30,249.48,315.37,10.0,231.5730,220.9680,1,...,41.438023,194.336309,41.263202,221.680,209.5870,1.84,0.18,49,0,7
1,NET,2023-11-20,73.37,2023-12-14,83.38,340.34,24.0,81.6190,72.6363,1,...,38.953573,66.766436,38.926920,71.524,62.7990,1.66,0.12,105,0,16
2,RCL,2023-11-20,105.26,2023-11-21,102.96,-57.50,1.0,107.7350,104.2074,0,...,25.884062,81.475625,25.903143,103.990,92.5510,1.82,0.24,128,0,1
3,SPOT,2023-11-20,178.86,2023-12-04,193.85,224.84,14.0,187.0750,177.0714,1,...,67.367706,174.563262,67.274711,175.872,168.8515,1.13,0.27,147,0,9
4,FERG,2023-11-20,164.78,2023-11-20,164.63,-2.41,0.0,165.0580,163.1322,0,...,17.321011,138.696428,17.230391,165.182,157.6645,1.37,0.11,68,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,REG,2024-08-07,70.38,,,,,72.4605,69.6762,0,...,9.337154,59.853671,9.362005,69.332,66.5710,0.54,0.08,129,87,105
296,WELL,2024-08-12,115.96,,,,,119.4079,114.8004,0,...,14.406598,78.034288,14.412780,115.906,111.1495,0.86,0.12,178,88,105
297,K,2024-08-12,74.29,2024-08-14,80.09,435.00,2.0,76.4981,73.5471,1,...,5.450146,64.151360,5.449203,73.906,62.9790,1.99,0.18,93,88,104
298,VIRT,2024-08-15,30.31,,,,,31.1678,30.0069,0,...,4.867915,23.023755,4.862777,29.348,28.3755,0.32,0.05,169,89,105


In [208]:
# Create a seperate dataframe for the rows that are still showing as the position being open. This will be used for prediction
future_df = df[(df['sell_price'].isna())]
future_df = future_df.copy()

# drop the open positions from the training and test data
df = df.dropna(subset=['sell_price'])


# Now you have the last 5 open and close prices as features for your model create model used for testing future data
X = df.drop(['hit_take_profit', 'symbol', 'purchase_date', 'sell_date'], axis=1)
y = df['hit_take_profit']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

logging.infof"Accuracy: {accuracy}")
logging.info"Classification Report:\n", classification_rep)



Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        26
           1       1.00      1.00      1.00        33

    accuracy                           1.00        59
   macro avg       1.00      1.00      1.00        59
weighted avg       1.00      1.00      1.00        59



In [209]:
X_future = future_df.drop(['hit_take_profit', 'symbol', 'purchase_date', 'sell_date'], axis=1)
future_predictions = rf_model.predict(X_future)

In [210]:
future_df['hit_take_profit_predicted'] = future_predictions

In [211]:
future_df.to_csv('future_predictions.csv', index=False)

In [212]:
future_probabilities = rf_model.predict_proba(X_future)
future_probabilities

array([[0.03, 0.97],
       [0.03, 0.97],
       [0.11, 0.89],
       [0.17, 0.83],
       [0.1 , 0.9 ],
       [0.06, 0.94]])