# Data Preprocessing
## Introduction
This notebook focuses on loading the raw data, performing initial data cleaning

In [3]:

import pandas as pd 

raw_data_path = "../data/raw/sovereign_quant_developer_assignment_data.csv"

data = pd.read_csv(raw_data_path)

print(data.head())
print(data.info())



   Unnamed: 0    signal  equity_curve
0           1 -0.023860      0.369910
1           2 -0.010445      0.366245
2           3 -0.007646      0.368794
3           4 -0.000148      0.367254
4           5 -0.011530      0.361481
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49998 entries, 0 to 49997
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    49998 non-null  int64  
 1   signal        49998 non-null  float64
 2   equity_curve  49998 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 1.1 MB
None


## Data Cleaning

In [5]:
data['signal'] = pd.to_numeric(data['signal'], errors='coerce')
data['equity_curve'] = pd.to_numeric(data['equity_curve'], errors='coerce')

non_numeric_signals = data['signal'].isna().sum()
non_numeric_equity_curve = data['equity_curve'].isna().sum()

print(f"Non-numeric values in 'signal': {non_numeric_signals}")
print(f"Non-numeric values in 'equity_curve': {non_numeric_equity_curve}")

data = data.dropna()

print(data.head())
print(data.info())


Non-numeric values in 'signal': 0
Non-numeric values in 'equity_curve': 0
   Unnamed: 0    signal  equity_curve
0           1 -0.023860      0.369910
1           2 -0.010445      0.366245
2           3 -0.007646      0.368794
3           4 -0.000148      0.367254
4           5 -0.011530      0.361481
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49998 entries, 0 to 49997
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    49998 non-null  int64  
 1   signal        49998 non-null  float64
 2   equity_curve  49998 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 1.1 MB
None
   equity_curve  equity_returns
0      0.369910       -0.009910
1      0.366245        0.006960
2      0.368794       -0.004174
3      0.367254       -0.015721
4      0.361481       -0.006456
<class 'pandas.core.frame.DataFrame'>
Index: 49997 entries, 0 to 49996
Data columns (total 4 columns):
 #   Column          Non-

In this analysis, we use the `equity_curve` column to represent the value of the portfolio over time. Each row in our dataset corresponds to a specific time period, with the index representing the passage of time sequentially from the start to the end of our dataset. 


In [6]:
# Shift the returns to align with the signals

data['equity_returns'] = data['equity_curve'].pct_change()
data['equity_returns'] = data['equity_returns'].shift(-1)

data = data.dropna()

print(data[['equity_curve', 'equity_returns']].head())
print(data.info())

   equity_curve  equity_returns
0      0.369910       -0.009910
1      0.366245        0.006960
2      0.368794       -0.004174
3      0.367254       -0.015721
4      0.361481       -0.006456
<class 'pandas.core.frame.DataFrame'>
Index: 49996 entries, 0 to 49995
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      49996 non-null  int64  
 1   signal          49996 non-null  float64
 2   equity_curve    49996 non-null  float64
 3   equity_returns  49996 non-null  float64
dtypes: float64(3), int64(1)
memory usage: 1.9 MB
None
