In [4]:
import pandas as pd

### Part 1. Feature Engineering

The reasoning behind each feature is given in Part 1 of the document titled "Reasoning and Paper Summary".

Feature 1: Trade Duration

In [5]:
def tradeDuration(data):
    """
    Compute how long each trade took, from order placement to execution.

    Inputs:
    data: pd.DataFrame with the columns "OrderTime" and "ExecutionTime"

    Outputs:
    pd.Series holding "ExecutionTime" minus "OrderTime" for every row
    (a timedelta when both columns are datetimes)
    """
    placed_at = data['OrderTime']
    executed_at = data['ExecutionTime']
    return executed_at - placed_at

# test example: three orders executed 1, 5 and 10 minutes after being placed
data = pd.DataFrame(dict(
    OrderTime=pd.to_datetime(['2024-06-01 09:30:00', '2024-06-01 09:45:00', '2024-06-01 10:00:00']),
    ExecutionTime=pd.to_datetime(['2024-06-01 09:31:00', '2024-06-01 09:50:00', '2024-06-01 10:10:00']),
))
# append the computed feature as a new column and show the result
data = data.assign(Trade_Duration=tradeDuration(data))
print(data)

            OrderTime       ExecutionTime  Trade_Duration
0 2024-06-01 09:30:00 2024-06-01 09:31:00 0 days 00:01:00
1 2024-06-01 09:45:00 2024-06-01 09:50:00 0 days 00:05:00
2 2024-06-01 10:00:00 2024-06-01 10:10:00 0 days 00:10:00


Feature 2: Market Sentiment Score

In [6]:
from textblob import TextBlob

def marketSentiment(data, news_data):
    """
    Score each stock by the mean TextBlob polarity of the news written about it.

    The input frames are left untouched: the scores are returned as a new
    Series and it is up to the caller to attach them to a DataFrame.

    inputs:
    data: pd.DataFrame that contains a column "Stock".
    news_data: pd.DataFrame that contains news articles with a column "Text"
               and a column "Company"; articles are matched to stocks by
               exact equality between "Company" and "Stock".

    outputs:
    pd.Series named "Market_Sentiment", aligned with data's index, holding
    the mean polarity of the matching articles for each row's stock.
    Stocks without any matching article get NaN (mean of an empty selection).
    """
    def _polarity(text):
        # TextBlob's polarity score for a single article
        return TextBlob(text).sentiment.polarity

    sentiment_scores = {}

    # score every distinct stock once, even if it appears on many rows
    for company in data['Stock'].unique():
        company_news = news_data.loc[news_data['Company'] == company, 'Text']
        sentiment_scores[company] = company_news.apply(_polarity).mean()

    # broadcast the per-company scores back onto the rows of data without
    # writing a new column into the caller's DataFrame
    return data['Stock'].map(sentiment_scores).rename('Market_Sentiment')

# test example: two headlines per company, scored for each stock
news_data = pd.DataFrame(
    [
        ("Nvidia reports excellent profits.", 'Nvidia'),
        ("Apple is facing a lawsuit", 'Apple'),
        ("Nvidia launches new AI product.", 'Nvidia'),
        ("Apple releases new iPhone.", 'Apple'),
    ],
    columns=['Text', 'Company'],
)
data = pd.DataFrame({'Stock': ['Nvidia', 'Apple']})

# attach the per-stock sentiment score and show the result
data = data.assign(Market_Sentiment=marketSentiment(data, news_data))
print(data)

    Stock  Market_Sentiment
0  Nvidia          0.568182
1   Apple          0.068182


Feature 3: Order Cancellation Rate

In [7]:
def cancelationRate(data):
    """
    Compute the share of orders that were cancelled, row by row.

    Inputs:
    data: pd.DataFrame with the columns 'TotalOrders' and 'CancelledOrders'.

    Outputs:
    pd.Series holding 'CancelledOrders' divided by 'TotalOrders' for every row.
    """
    cancelled = data['CancelledOrders']
    total = data['TotalOrders']
    return cancelled / total

# test example: 10, 97 and 23 cancellations out of 100, 200 and 150 orders
data = pd.DataFrame(
    {'TotalOrders': [100, 200, 150], 'CancelledOrders': [10, 97, 23]}
)
# append the computed feature as a new column and show the result
data = data.assign(Order_Cancellation_Rate=cancelationRate(data))
print(data)

   TotalOrders  CancelledOrders  Order_Cancellation_Rate
0          100               10                 0.100000
1          200               97                 0.485000
2          150               23                 0.153333


Feature 4: Order Execution Speed

In [8]:
def executionSpeed(data):
    """
    Measure, in seconds, the time between placing and executing each order.

    Inputs:
    data: pd.DataFrame with 'OrderTime' and 'ExecutionTime' columns whose
          difference is a timedelta (e.g. both columns are datetimes).

    Returns:
    pd.Series with the elapsed time of every row expressed in total seconds.
    """
    elapsed = data['ExecutionTime'] - data['OrderTime']
    # convert the timedelta values into plain seconds
    seconds = elapsed.dt.total_seconds()
    return seconds

# test example: three orders executed 60, 300 and 600 seconds after being placed
data = pd.DataFrame(dict(
    OrderTime=pd.to_datetime(['2024-06-01 09:30:00', '2024-06-01 09:45:00', '2024-06-01 10:00:00']),
    ExecutionTime=pd.to_datetime(['2024-06-01 09:31:00', '2024-06-01 09:50:00', '2024-06-01 10:10:00']),
))
# append the computed feature as a new column and show the result
data = data.assign(Order_Execution_Speed=executionSpeed(data))
print(data)


            OrderTime       ExecutionTime  Order_Execution_Speed
0 2024-06-01 09:30:00 2024-06-01 09:31:00                   60.0
1 2024-06-01 09:45:00 2024-06-01 09:50:00                  300.0
2 2024-06-01 10:00:00 2024-06-01 10:10:00                  600.0


Feature 5: Order-to-Trade Ratio

In [9]:

def orderToTradeRatio(data):
    """
    Compute how many orders were placed per executed trade, row by row.

    Inputs:
    data: pd.DataFrame with the columns 'TotalOrders' and 'TotalTrades'.

    Outputs:
    pd.Series holding 'TotalOrders' divided by 'TotalTrades' for every row.
    """
    orders = data['TotalOrders']
    trades = data['TotalTrades']
    return orders / trades

# test example: seven rows where orders slightly outnumber trades
data = pd.DataFrame(
    list(zip(
        [100, 200, 150, 120, 130, 160, 140],
        [80, 190, 140, 110, 125, 155, 135],
    )),
    columns=['TotalOrders', 'TotalTrades'],
)

# append the computed feature as a new column and show the result
data = data.assign(Order_to_Trade_Ratio=orderToTradeRatio(data))
print(data)


   TotalOrders  TotalTrades  Order_to_Trade_Ratio
0          100           80              1.250000
1          200          190              1.052632
2          150          140              1.071429
3          120          110              1.090909
4          130          125              1.040000
5          160          155              1.032258
6          140          135              1.037037
