In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from matplotlib.patches import Rectangle
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and newer
# releases no longer ship the old names; use whichever name this installation provides.
plt.style.use('seaborn-v0_8-deep' if 'seaborn-v0_8-deep' in plt.style.available else 'seaborn-deep')
from scipy.stats import norm
from matplotlib.pyplot import figure
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

# Working directory that holds market_data.csv and orders.csv.
# Set the WEBB_DATA_DIR environment variable to run the notebook on another machine;
# without it the original location is used.
DATA_DIR = os.environ.get('WEBB_DATA_DIR', '/Users/denislukanov/Desktop/webb_traders_task/test_data_1_1_1')
os.chdir(DATA_DIR)
warnings.filterwarnings("ignore")

In [None]:
# Show up to 200 columns and 200 rows when a DataFrame is displayed
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 200)

In [None]:
# Load the quotes and derive mid_price as the average of bid_price and ask_price
market_data = pd.read_csv('market_data.csv').assign(
    mid_price=lambda quotes: (quotes['bid_price'] + quotes['ask_price']) / 2
)

In [None]:
# Show market_data as the cell output
market_data

In [None]:
# Report, column by column, whether market_data contains any missing value
print(market_data.isna().any(axis=0))

In [None]:
# Load the orders and compute the dollar quantity of each row (price * quantity)
orders = pd.read_csv('orders.csv').assign(
    dollar_quantity=lambda order_rows: order_rows['price'] * order_rows['quantity']
)
orders

In [None]:
# Report, column by column, whether orders contains any missing value
print(orders.isna().any(axis=0))

In [None]:
# Simple EDA of the order status: relative frequency of each order_status value
# (normalize=True returns shares instead of counts)
orders['order_status'].value_counts(normalize=True)

In [None]:
# Filtering out unfilled orders: keep the CREATE rows whose immediately preceding
# row in `orders` has status FILL.
# NOTE(review): shift(periods=1) compares against the *previous* row. If a FILL event
# is logged after its CREATE event, shift(periods=-1) would be the matching direction —
# confirm against the layout of orders.csv.
is_create = orders['order_status'] == 'CREATE'
previous_is_fill = orders['order_status'].shift(periods=1) == 'FILL'
# .copy() makes the result an independent frame: later cells add and modify columns
# on it, and those writes must not be treated as writes into a slice of `orders`.
filtered_df_fill = orders[is_create & previous_is_fill].copy()
filtered_df_fill

In [None]:
# Changing the sign of the dollar quantity for SELL (short-sell) orders.
# The value is recomputed from price * quantity instead of being multiplied by -1
# in place, so running this cell more than once does not flip the sign back.
gross_dollar_quantity = filtered_df_fill['price'] * filtered_df_fill['quantity']
is_sell = filtered_df_fill['side'] == 'SELL'
filtered_df_fill['dollar_quantity'] = gross_dollar_quantity.where(~is_sell, -gross_dollar_quantity)

### I/ Market Risk

a) Define what kind of metrics you would need to monitor the market risk of the cash equity strategy

b) Using the data provided in file 1 and file 2, implement in Python one of the relevant metrics you suggested

c) Still using the Python code, how would you make your code more generic to include a wider range of risk metrics

d) Run the calculation for your risk metric of choice and comment

### The main idea behind the solution for the 1st problem is the following: 
   A) Calculate net exposure for each symbol
   
   B) Apply chosen risk metric for the net exposure
   
   C) Dynamically calculate the dollar value at symbol level on the filled-orders dataset
   
   D) The calculated risk exposure should be monitored against pre-established risk limits and/or Risk-Policy guidelines

In [None]:
# Calculation of net exposure: running sum of the signed dollar quantity per symbol,
# accumulated in the row order of filtered_df_fill.
# groupby().cumsum() does this in one vectorised pass instead of a Python loop over
# iterrows() with a cell-by-cell .at write.
# NOTE(review): cumsum skips missing values, whereas a running Python sum would turn
# NaN from the first missing dollar_quantity onwards — relevant only if nulls exist.
filtered_df_fill['net_position'] = (
    filtered_df_fill.groupby('symbol', sort=False)['dollar_quantity'].cumsum()
)

# Final net exposure per symbol (the last running value), keyed by symbol in order of
# first appearance; kept under the name the loop version exposed.
net_positions = (
    filtered_df_fill.groupby('symbol', sort=False)['net_position'].last().to_dict()
)

### Motivation for choosing CVaR risk metric

Value at Risk (VaR) and Conditional Value at Risk (CVaR) are both risk measures used to estimate the potential losses of an investment or portfolio over a specific time frame.  

There are some differences between the two measures:

1) VaR measures the potential loss of an investment or portfolio at a specific confidence level, whereas CVaR measures the expected loss beyond the VaR breakpoint.

2) CVaR is a coherent risk measure, which means it is suitable for portfolio optimization and risk management. VaR, on the other hand, is not a coherent risk measure.

Unlike VaR, which only considers the worst loss at a specific confidence level, CVaR focuses on the average loss in the worst-case scenarios, providing a more comprehensive view of the tail risk.
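Because CVaR averages over the whole tail, it reflects the magnitude of the extreme losses that VaR ignores, which makes it a more informative risk measure for assets with non-normal distributions and fat tails (at the cost of being more sensitive to tail outliers and needing more observations to estimate reliably).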

Ultimately, I choose CVaR because it provides a more comprehensive view of the tail risk and is more suitable for  risk management.

In [None]:
def calculate_cvar(mid_prices, alpha):
    """
    Calculate the Conditional Value at Risk (CVaR) of a mid-price series.

    Returns are taken between consecutive observations of ``mid_prices``
    (``pct_change``), VaR is the ``alpha``-quantile of those returns, and CVaR
    is the mean of the returns strictly below VaR with the sign flipped, so a
    loss is reported as a positive number.

    Parameters
    ----------
    mid_prices : pandas.Series
        Mid prices in the order in which returns should be computed.
    alpha : float
        Tail probability used for the VaR quantile (e.g. 0.05).

    Returns
    -------
    float
        The CVaR as a fraction of the price, or 0.0 when no return lies
        strictly below VaR (for example fewer than two prices, or a flat
        price series).
    """
    # period-over-period returns; the first element is NaN and is ignored below
    returns = mid_prices.pct_change()

    # VaR (Value at Risk): the alpha-quantile of the returns
    var = returns.quantile(q=alpha)

    # tail of the distribution: returns strictly worse than VaR
    tail_returns = returns[returns < var]

    # Only an empty tail means "no measurable tail risk". A tail made of identical
    # returns (zero variance) is still a loss and must be averaged like any other.
    if tail_returns.empty:
        return 0.0

    return -tail_returns.mean()


In [None]:
# # Calculation test for one symbol
# for i in range(len(filtered_df_fill[filtered_df_fill['symbol']=="ACN"])):
#     print(market_data[(market_data['timestamp'] < filtered_df_fill['timestamp'].iloc[i]) & (market_data['symbol'] == 'ACN')].tail(100))

In [None]:
#Heavy part of the algorithm in terms of calculation speed/time
#
# Dynamic market risk assessment: for every row of filtered_df_fill, CVaR is computed
# on the last CVAR_WINDOW mid_price observations of the same symbol that precede the
# row's timestamp, and is then scaled by the row's net position.
# Exposure risk is calculated per security, without taking portfolio diversification
# into consideration.
# Window size and alpha are hyperparameters that can be fine-tuned manually.
CVAR_WINDOW = 100   # number of most recent mid-price observations used per row
CVAR_ALPHA = 0.05   # tail probability passed to calculate_cvar

for symbol in tqdm(filtered_df_fill['symbol'].unique()):
    # Both symbol slices are the same for every row of the inner loop, so they are
    # taken once per symbol instead of re-filtering the full frames for every row.
    symbol_quotes = market_data[market_data['symbol'] == symbol]
    symbol_orders = filtered_df_fill[filtered_df_fill['symbol'] == symbol]
    for i, row in symbol_orders.iterrows():
        window = symbol_quotes[symbol_quotes['timestamp'] < row['timestamp']].tail(CVAR_WINDOW)
        # The window can be empty when no quote precedes the row, so nothing here
        # indexes into it; the per-row debug prints were dropped for that reason
        # (values[-1] raised IndexError) and because tqdm already reports progress.
        # NOTE(review): the trailing * 100 is kept from the original formula; its
        # meaning (percent scaling?) is not evident from the code — confirm.
        cvar = calculate_cvar(window['mid_price'], CVAR_ALPHA) * row['net_position'] * 100
        filtered_df_fill.loc[i, 'CVar'] = cvar

# Rows are visited symbol by symbol, so a list appended inside the loop would not be
# in row order. Read the values back from the label-aligned column instead, so that
# cvar_data[k] always belongs to the k-th row of filtered_df_fill.
cvar_data = filtered_df_fill['CVar'].tolist()

In [None]:
# adding calculated CVaR values into the main dataset
# The CVaR loop writes every value into the 'CVar' column at its own row label, and
# it visits the rows symbol by symbol, so a list collected in visiting order is not
# guaranteed to be in row order. Assigning such a list positionally would attach
# values to the wrong rows, so the values are taken from the label-aligned column.
cvar_data = filtered_df_fill['CVar'].tolist()
filtered_df_fill['CVar'] = cvar_data

In [None]:
# Show filtered_df_fill as the cell output
filtered_df_fill

### II/ Market Surveillance

 a) You now want to know whether the algo is misbehaving (from a market regulation perspective). What kind of metrics would you look at?

 b) In a similar fashion as above, implement in Python one of those metrics

 c) Run the calculation for your metric of choice and comment


### Theoretical inference: the activities that could potentially fall under financial market misbehaviour and/or manipulation can be aggregated into the following main groups:  
#### Abnormal Trading Activity: 
Monitoring for abnormal trading activity, such as sudden price movements or abnormal trading patterns, can help detect potential market manipulation. Statistical analysis, pattern recognition, and anomaly detection algorithms are used to identify potential manipulative behavior. 
#### Volume: 
Monitoring the trading volume of a security or market can help identify potential market manipulation. Unusually high or low trading volumes may indicate manipulative behavior
#### Order Book Imbalances: 
Monitoring the order book imbalances, which is the difference between the number of buy and sell orders at a specific price level, can help identify potential market manipulation. Large imbalances may indicate manipulative behavior. 
#### Price Movements: 
Monitoring the price movements of a security or market can help identify potential market manipulation. Sudden and significant price movements that are not aligned with market fundamentals may indicate manipulative behavior. 
#### Correlations: 
Monitoring the correlations between different securities or markets can help identify potential market manipulation. Unusual correlations or patterns may indicate manipulative behavior 

It should be noted that these quantitative measures are not definitive proof of market manipulation, but they can provide valuable insights into potential manipulative behavior. 

### The main idea behind the solution for the 2nd problem is the following: 
   A) I choose price and quantity data to analyse for potential manipulation
   
   B) Visualise the data chosen in A) for each symbol
   
   C) Apply Isolation forest algo to detect potential manipulation trades
   
   D) Visualise and map potentially manipulative trading activity

In [None]:
# For this problem `df` is bound as a second name for the filtered dataset.
# This is an alias, not a copy: df and filtered_df_fill refer to the same DataFrame object.

df=filtered_df_fill

In [None]:
#First thing is to visualise the data on a symbol level
symbols = df['symbol'].unique()

# Define a list of colors to use for the plots
colors = ['blue', 'green', 'red', 'orange', 'purple']


def plot_symbol_series(symbol_data, column, label, symbol, color):
    """Draw one line chart of `column` against 'date' for a single symbol and show it."""
    plt.figure()
    plt.plot(symbol_data['date'], symbol_data[column], color=color)
    plt.xlabel('Date')
    plt.ylabel(label)
    plt.title('{} {}'.format(symbol, label))
    plt.show()


for symbol in symbols:
    symbol_data = df[df['symbol'] == symbol]
    # Quantity is always drawn in red (colors[2]) and price in green (colors[1]),
    # the values the constant indices 2 % 4 and 1 % 4 evaluate to.
    plot_symbol_series(symbol_data, 'quantity', 'Quantity', symbol, colors[2])
    plot_symbol_series(symbol_data, 'price', 'Price', symbol, colors[1])

In [None]:
# Step 1: Perform Anomaly Detection
# Isolation Forest is randomised; a fixed seed makes the flagged rows reproducible
# from one run of the notebook to the next.
ISOLATION_FOREST_SEED = 42

symbols = df['symbol'].unique()
anomaly_frames = []
for symbol in symbols:
    symbol_df = df[df['symbol'] == symbol]
    # Normalize the data (fitted per symbol)
    scaler = StandardScaler()
    normalized_data = scaler.fit_transform(symbol_df[['price', 'quantity']])
    # Apply Isolation Forest algorithm
    clf = IsolationForest(contamination=0.05, random_state=ISOLATION_FOREST_SEED)
    clf.fit(normalized_data)
    anomaly_scores = clf.decision_function(normalized_data)
    # Identify anomalies: rows with a negative decision-function score
    anomaly_frames.append(symbol_df[anomaly_scores < 0])

# Concatenate once at the end instead of growing a frame inside the loop; this also
# keeps the column dtypes of df, which an empty seed frame would not.
# With no symbols at all, fall back to an empty frame with df's columns.
anomalies = pd.concat(anomaly_frames) if anomaly_frames else df.iloc[0:0]

# Step 2: Create Separate Plots for Each Symbol
# (column to plot, axis/legend label, title suffix) for the two charts drawn per symbol
series_to_plot = (
    ('price', 'Price', 'Stock Prices'),
    ('quantity', 'Quantity', 'Stock Quantities'),
)
for symbol in symbols:
    symbol_df = df[df['symbol'] == symbol]
    symbol_anomalies = anomalies[anomalies['symbol'] == symbol]
    for column, label, title_suffix in series_to_plot:
        # Full series as a line, flagged rows overlaid as red markers
        plt.plot(symbol_df['date'], symbol_df[column], label=label)
        plt.scatter(symbol_anomalies['date'], symbol_anomalies[column], color='red', label='Anomaly')
        plt.xlabel('Date')
        plt.ylabel(label)
        plt.title(f'{symbol} {title_suffix}')
        plt.legend()
        plt.show()

### From the above calculations and visualisation we can derive following conclusions:
 
  a) Algo is good at capturing extreme values for quantity (we observe more extreme movements in quantity vs price)
  
  b) Algo is good at capturing "turning points" in price (price moves in a smoother manner than quantity)
  
  c) By mapping turning points in price with extreme quantity levels we get a higher probability of detecting manipulative trades on the particular symbol 

### III/ Derivatives
 f) Let's assume that instead of a cash equity strategy, the trader decides to trade a directional alpha on options. 

What additional risk metrics would you suggest in addition to question a)

### Solution for problem 3:
As soon as we are adding derivatives to our portfolio we need to start tracking at least two characteristics:

A) Delta, which characterises the change in the option price relative to the price of the underlying symbol (mathematically - the 1st derivative)

B) Gamma, which characterises the change of the option's delta relative to the price of the underlying symbol (mathematically - the 2nd derivative)

C) Keep constant track of each metric given our Risk appetite