In [1]:
# importing necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import datetime
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Data to be worked on is comprised of seven main categories

# Date: time period we are working on
# High: highest price that a stock reached
# Low: lowest price that a stock reached
# Open: price of stock soon as a market opens on a particular day
# Volume: number of shares traded
# Closing Price: cost of shares agreed on by traders at the end of a particular day
# Adjusted Close: cost of share at the end of a day, taking into consediration stock splits, new stock offerings and dividens

# For this study, the critical datapoint we are most interested in and the pivot to this study is the Closing Price

# assigning a random seed
tf.random.set_seed(42)
np.random.seed(42)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# importing the data
data= pd.read_csv('../kaggle_dataset/A.CSV')
# printing the length to verify that the data has been impored successfuly
len(data)

# printing head and tail of the data set to see the start and end date
print(data.head())
print(data.tail())

# checking for any missing data - we pass the sum function it gives us a summary
print(data.isna().sum())

# in case we have any missing data - the dropna method would drop those rows from the dataset
data= data.dropna()

# converting the Date column to datetime data type
data['Date']= pd.to_datetime(data['Date'])
# adding a new column of Year
data['Year']= pd.to_datetime(data['Date']).dt.year
# chosing a desired year
desired_year= 2001
# selecting a specific year
year_data = data[data['Date'].dt.year == desired_year]

print(f'This is my year data: {year_data}')

         Date        Low       Open    Volume       High      Close  \
0  18-11-1999  28.612303  32.546494  62546380  35.765381  31.473534   
1  19-11-1999  28.478184  30.713518  15234146  30.758226  28.880545   
2  22-11-1999  28.657009  29.551144   6577870  31.473534  31.473534   
3  23-11-1999  28.612303  30.400572   5975611  31.205294  28.612303   
4  24-11-1999  28.612303  28.701717   4843231  29.998213  29.372318   

   Adjusted Close  
0       26.929760  
1       24.711119  
2       26.929760  
3       24.481602  
4       25.131901  
            Date         Low        Open   Volume        High       Close  \
5799  06-12-2022  149.910004  152.149994  1172300  153.000000  151.350006   
5800  07-12-2022  151.130005  151.229996  1011100  154.350006  153.729996   
5801  08-12-2022  153.479996  154.419998  1289900  156.990005  156.279999   
5802  09-12-2022  152.679993  155.669998  1020200  156.699997  152.949997   
5803  12-12-2022  153.380005  154.009995   188024  155.399994  155.3

  data['Date']= pd.to_datetime(data['Date'])


In [3]:
####### feature extraction #######

# mean close price per year
annual_mean= year_data['Close'].mean()
# adding a new column of Mean
year_data.insert(loc= 8, column='Mean', value=annual_mean)

# standard deviation per year
annual_std_dev= year_data['Close'].std()
# adding a new column of Standard Deviation
year_data.insert(loc= 9, column='Standard Deviation', value=annual_std_dev)

# significantly low means per year
annual_sig_low= annual_mean - 2*annual_std_dev
# adding a new column of Significantly Low Values
year_data.insert(loc= 10, column='Significanctly Low', value=annual_sig_low)


# significantly high means per year
annual_sig_high= annual_mean + 2*annual_std_dev
# adding a new column of Significantly High Values
year_data.insert(loc= 11, column='Significanctly High', value=annual_sig_high)

# displaying output
year_data

# another important feature is market sentiment which I will add later


Unnamed: 0,Date,Low,Open,Volume,High,Close,Adjusted Close,Year,Mean,Standard Deviation,Significanctly Low,Significanctly High
282,2001-01-02,35.094776,38.537197,2261684,38.537197,36.391273,31.137539,2001,23.91235,7.308792,9.294766,38.529934
283,2001-01-03,34.021816,35.139484,4502678,40.414879,40.146637,34.350746,2001,23.91235,7.308792,9.294766,38.529934
284,2001-01-04,39.565453,40.683121,4398388,42.694920,41.666668,35.651340,2001,23.91235,7.308792,9.294766,38.529934
285,2001-01-05,38.268955,40.951359,3277052,41.666668,39.386623,33.700447,2001,23.91235,7.308792,9.294766,38.529934
286,2001-01-08,37.419529,38.805435,2273288,39.878399,38.090130,32.591145,2001,23.91235,7.308792,9.294766,38.529934
...,...,...,...,...,...,...,...,...,...,...,...,...
525,2001-12-24,20.100143,20.371960,1246177,20.457796,20.393419,17.449263,2001,23.91235,7.308792,9.294766,38.529934
526,2001-12-26,20.100143,20.529327,2467051,20.658083,20.100143,17.198328,2001,23.91235,7.308792,9.294766,38.529934
527,2001-12-27,20.035765,20.035765,1909948,20.722462,20.629471,17.651237,2001,23.91235,7.308792,9.294766,38.529934
528,2001-12-28,20.422031,20.743919,1600430,20.879827,20.708155,17.718565,2001,23.91235,7.308792,9.294766,38.529934


In [4]:
################ testing ################

# creating an empty list to store my returns for max drawdown and risk factor
features= []
features2= []


directory= ('../kaggle_dataset')

# Loop through all files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        # Load the CSV file into a DataFrame
        df = pd.read_csv(os.path.join(directory, filename))
        # in case we have any missing data - the dropna method would drop those rows from the dataset
        df= df.dropna()

        # converting the Date column to datetime data type
        df['Date']= pd.to_datetime(df['Date'], errors='coerce')
        # adding a new column of Year
        df['Year']= pd.to_datetime(df['Date']).dt.year
        # chosing a desired year
        desired_year= 2022
        # selecting a specific year
        year_data = df[df['Date'].dt.year == desired_year]

        # print(f'This is my year data: {filename, year_data}')

        prices= year_data['Close'].dropna()

        # Check if DataFrame is empty after dropping missing values
        if year_data.empty:
            print(f"No data in {filename}, skipping...")
            continue  # Skip to the next file

        # Calculate daily returns
        # assigning the percentage change in close price to returns and dropping null values
        ####returns= year_data['Close'].pct_change().dropna()
        #print(filename, returns)

        # cummulative change in returns
        # adding one to show commulative growth/decline
        # cumprod() multiplies each output by the previous
        cum_returns = (1 + year_data['Close'].pct_change().dropna()).cumprod()

        # Calculate the peak and trough points
        peak = cum_returns.max()
        trough = cum_returns.min()

        # Calculate drawdown
        drawdown = ((peak - trough) / peak) * 100
        #print(filename, drawdown)

        # Find maximum drawdown
        max_drawdown = round(drawdown.max(),2)
        #print(f' Company: {filename} Maximum Drawdownd: {max_drawdown}')

        # creating a new column to the dataset for maximum drawdown
        year_data['Maximum Drawdown']= max_drawdown
        max_drawdown_values = max_drawdown.tolist()


        #print(filename, year_data['Maximum Drawdown'])

        features.append(max_drawdown_values)

        # creating a new column to the dataset, where values <= 25 are true (=1) - meaning that the stock is not risky
        # Calculate risk factor
        risk_factor = (max_drawdown <= 25).astype(int)

        # Add 'Risk Factor' column to the year_data DataFrame
        year_data['Risk Factor'] = risk_factor
        risk_factor_values = risk_factor.tolist()

        features2.append(risk_factor_values)


        #print(filename, year_data['Maximum Drawdown'], risk_factor)

output_df = pd.DataFrame({'Maximum Drawdown': features, 'Risk Factor': features2})

# Save the sum DataFrame to a new CSV file
output_df.to_csv('output_file.csv', index=False)

################ testing ################

  df['Date']= pd.to_datetime(df['Date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_data['Maximum Drawdown']= max_drawdown
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_data['Risk Factor'] = risk_factor
  df['Date']= pd.to_datetime(df['Date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_d

No data in BHI.csv, skipping...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_data['Maximum Drawdown']= max_drawdown
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_data['Risk Factor'] = risk_factor
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_data['Maximum Drawdown']= max_drawdown
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .

No data in CPICQ.csv, skipping...
No data in CTQ.csv, skipping...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_data['Maximum Drawdown']= max_drawdown
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_data['Risk Factor'] = risk_factor
  df['Date']= pd.to_datetime(df['Date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_data['Maximum Drawdown']= max_drawdown
A value is trying to

No data in SONC.csv, skipping...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_data['Maximum Drawdown']= max_drawdown
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_data['Risk Factor'] = risk_factor
  df['Date']= pd.to_datetime(df['Date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_data['Maximum Drawdown']= max_drawdown
A value is trying to

In [5]:
####### Training #######

from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

# opening the output csv file
df= pd.read_csv('output_file.csv')
# extracting featrues and assigning to variables
x= df[['Maximum Drawdown']]
y= df[['Risk Factor']]

# to ensure reproducibility
np.random.seed(42)
# adding noise
 
noise_level = 0
noise = np.random.normal(loc=0, scale=noise_level, size=x.shape)
x_noisy = x + noise

# data split
x_train, x_test, y_train, y_test = train_test_split(x_noisy, y, test_size=0.1)

# introducing the model 
svm_model = SVC(kernel='rbf', gamma='scale')

# fitting data to classifier
svm_model.fit(x_train, y_train)

# predicting y value
y_pred = svm_model.predict(x_test)

# checking accuracy
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1_score1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print('Accuracy:', accuracy)
print("Precision:", precision)
print("F score:", f1_score1)
print("Recall:", recall)

from sklearn.metrics import matthews_corrcoef
mcc = matthews_corrcoef(y_test, y_pred)
print(f"SVM Model MCC score: {mcc}")

Accuracy: 0.975609756097561
Precision: 1.0
F score: 0.9333333333333333
Recall: 0.875
SVM Model MCC score: 0.9215555902290176


  y = column_or_1d(y, warn=True)
