In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
import numpy as np

import plotly.express as px  # For interactive plots
import matplotlib.pyplot as plt  # For additional plotting
import seaborn as sns  # For enhanced visuals with Matplotlib

# Dataframe

In [None]:
df= pd.read_csv('~/Small-Cap-Scout/raw_data/merged_data.csv')


# Check the initial structure of the data

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,cik,date,Assets,AssetsCurrent,Cash,AssetsNoncurrent,Liabilities,LiabilitiesCurrent,LiabilitiesNoncurrent,...,year,GDP,interest_rate,unemployment_rate,median_cpi,Ticker,Monthly Avg Market Cap,Monthly_Avg_Close,Monthly_Volume_Total,Monthly_Volatility
0,0,2098,2017-03-31,100204000.0,67589000.0,6175000.0,32615000.0,53601000.0,8935000.0,44666000.0,...,2017,1.568188,0.79,4.5,1.914384,ACU,85216690.0,25.470456,135033.333333,0.230126
1,1,2098,2017-06-30,109362000.0,76345000.0,5674000.0,33017000.0,60014000.0,12713000.0,47301000.0,...,2017,3.279595,1.04,4.4,1.896834,ACU,92655590.0,27.592711,161800.0,0.186883
2,2,2098,2017-09-30,110938000.0,77847000.0,7021000.0,33091000.0,60218000.0,13983000.0,46235000.0,...,2017,4.237048,1.15,4.2,2.870257,ACU,88992800.0,26.41621,103233.333333,0.255606
3,3,2098,2017-12-31,114729806.0,77817645.0,9338269.0,36912161.0,64791813.0,16782971.0,48008842.0,...,2017,3.816009,1.3,4.1,2.955747,ACU,79190700.0,23.377893,152933.333333,0.20984
4,4,2098,2018-03-31,107246000.0,70217000.0,1065000.0,37029000.0,56754000.0,11153000.0,45601000.0,...,2018,1.777373,1.51,4.1,1.9619,ACU,78046430.0,22.93848,146166.666667,0.251369


In [4]:
df.info()  # Get an overview of the data types and non-null counts

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24266 entries, 0 to 24265
Data columns (total 56 columns):
 #   Column                                                           Non-Null Count  Dtype  
---  ------                                                           --------------  -----  
 0   Unnamed: 0                                                       24266 non-null  int64  
 1   cik                                                              24266 non-null  int64  
 2   date                                                             24266 non-null  object 
 3   Assets                                                           24266 non-null  float64
 4   AssetsCurrent                                                    24266 non-null  float64
 5   Cash                                                             24266 non-null  float64
 6   AssetsNoncurrent                                                 24266 non-null  float64
 7   Liabilities                             

In [5]:
df.describe(include='all')  # Summary statistics for numerical and categorical data

Unnamed: 0.1,Unnamed: 0,cik,date,Assets,AssetsCurrent,Cash,AssetsNoncurrent,Liabilities,LiabilitiesCurrent,LiabilitiesNoncurrent,...,year,GDP,interest_rate,unemployment_rate,median_cpi,Ticker,Monthly Avg Market Cap,Monthly_Avg_Close,Monthly_Volume_Total,Monthly_Volatility
count,24266.0,24266.0,24266,24266.0,24266.0,24266.0,24266.0,24266.0,24266.0,24266.0,...,24266.0,24266.0,24266.0,24266.0,24266.0,24266,24266.0,24266.0,24266.0,24263.0
unique,,,28,,,,,,,,...,,,,,,972,,,,
top,,,2017-06-30,,,,,,,,...,,,,,,ACU,,,,
freq,,,884,,,,,,,,...,,,,,,28,,,,
mean,12132.5,921771.0,,29055870000.0,22866000000.0,1260872000.0,6031649000.0,24025520000.0,19053440000.0,4895993000.0,...,2019.968433,2.636835,1.768607,4.531703,3.578773,,38413690000.0,2167.109,37243670.0,0.275854
std,7005.135152,506135.8,,188498400000.0,186694700000.0,9188435000.0,25120580000.0,170370100000.0,159464200000.0,20717940000.0,...,1.996802,10.900845,1.653036,1.644833,1.808526,,1334975000000.0,121938.4,153253100.0,0.181517
min,0.0,2098.0,,446000.0,15301.0,-27475000.0,-482203000.0,57000.0,-31370000.0,-31349250.0,...,2017.0,-44.340164,0.07,3.5,1.092895,,2342323.0,0.2635072,0.0,0.015726
25%,6066.25,731766.0,,319507000.0,161514000.0,26326250.0,5265897.0,132331600.0,57250500.0,8248950.0,...,2018.0,1.463561,0.2,3.6,2.333723,,444005300.0,13.94495,2111275.0,0.156328
50%,12132.5,1000753.0,,1940510000.0,711549500.0,119341000.0,267916000.0,1120182000.0,372941000.0,248184500.0,...,2020.0,3.152954,1.3,3.9,2.955747,,1782443000.0,34.14776,8230000.0,0.226001
75%,18198.75,1337298.0,,8977338000.0,3587863000.0,452000000.0,2448632000.0,6238900000.0,2556718000.0,2202425000.0,...,2022.0,4.237048,2.38,4.4,4.624751,,6322972000.0,72.88468,26383020.0,0.342499


In [6]:
# Check for duplicated rows, bitte
print(f"Number of duplicated rows: {df.duplicated().sum()}")

Number of duplicated rows: 0


In [7]:
# Counting the number of NaN for each column
df.isnull().sum().sort_values(ascending=False)

stprba                                                             4326
afs                                                                3380
nciks                                                              3380
Monthly_Volatility                                                    3
Monthly Avg Market Cap                                                0
Ticker                                                                0
NetCashProvidedByUsedInOperatingActivitiesContinuingOperations        0
NetCashProvidedByUsedInFinancingActivitiesContinuingOperations        0
NetCashProvidedByUsedInInvestingActivitiesContinuingOperations        0
NetCashProvidedByUsedInOperatingActivities                            0
NetCashProvidedByUsedInFinancingActivities                            0
NetCashProvidedByUsedInInvestingActivities                            0
CashProvidedByUsedInOperatingActivitiesDiscontinuedOperations         0
CashProvidedByUsedInInvestingActivitiesDiscontinuedOperations   

In [8]:
# Counting the percentage of NaN for each column
df.isnull().sum().sort_values(ascending=False) / len(df) #NaN percentage for each column

stprba                                                             0.178274
afs                                                                0.139290
nciks                                                              0.139290
Monthly_Volatility                                                 0.000124
Monthly Avg Market Cap                                             0.000000
Ticker                                                             0.000000
NetCashProvidedByUsedInOperatingActivitiesContinuingOperations     0.000000
NetCashProvidedByUsedInFinancingActivitiesContinuingOperations     0.000000
NetCashProvidedByUsedInInvestingActivitiesContinuingOperations     0.000000
NetCashProvidedByUsedInOperatingActivities                         0.000000
NetCashProvidedByUsedInFinancingActivities                         0.000000
NetCashProvidedByUsedInInvestingActivities                         0.000000
CashProvidedByUsedInOperatingActivitiesDiscontinuedOperations      0.000000
CashProvided

# Visualization

In [9]:
# Plotting box plots for significant features to detect outliers
significant_features = ['Revenues', 'Assets', 'Cash', 'Liabilities', 'interest_rate', 'unemployment_rate']  # Replace with your actual feature names

for feature in significant_features:
    fig = px.box(df, y=feature, title=f"Box plot of {feature} - Detecting Outliers")
    fig.show()

# Train/Test split + dropping features

In [10]:
# Step 1: Dropping irrelevant columns and splitting the data by company ticker.
# Achtung! We'll start by dropping columns that we don't need. For now just n_cik, but there may be more, pass auf!
# Right now, we’re just simulating the drop.
df = df.drop(columns=['Unnamed: 0'])  # Tschüüüüss!

In [11]:
# Step 2: Splitting the data by company ticker.
# First, let’s set up our train/test split while grouping by 'Ticker'.
# The holy grail of machine learning: keeping our test data untouched to avoid data leakage!

# Creating a custom function for the group split
def group_train_test_split(X, y=None, test_size=0.2, random_state=None):
    # We split by groups (company ticker) while keeping the data structure intact.
    unique_groups = X['Ticker'].unique()
    train_groups, test_groups = train_test_split(unique_groups, test_size=test_size, random_state=random_state)
    train_data = X[X['Ticker'].isin(train_groups)]
    test_data = X[X['Ticker'].isin(test_groups)]
    return train_data, test_data

# Muy bien! Now apply the custom train_test_split function:
train_data, test_data = group_train_test_split(df, test_size=0.2, random_state=42)

y_train = train_data['Monthly Avg Market Cap']
X_train = train_data.drop('Monthly Avg Market Cap', axis = 1)

y_test = test_data['Monthly Avg Market Cap']
X_test = test_data.drop('Monthly Avg Market Cap', axis = 1)

# Preprocessing pipeline setup

In [12]:
# Step 3: Setting up the preprocessing pipeline.

# We'll need to identify the columns for different types of preprocessing. Supergeil!
# Function to identify numerical and categorical features
def identify_feature_types(df):
    numerical_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = df.select_dtypes(include=['object']).columns.tolist()

    return numerical_features, categorical_features

# Automatically identify feature types
numerical_features, categorical_features = identify_feature_types(df)

# Preprocessing for numerical data: RobustScaler to make our numbers más fuerte.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # We handle NaNs by taking the median
    ('scaler', RobustScaler())  # Scaling the features using RobustScaler
])

# Preprocessing for categorical data: OneHotEncoder to give each category its own columm...
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Let's fill missing categories with the most common one
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # Ignore unknown categories
])

# Combine the numerical and categorical transformers into one big ColumnTransformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)


# Final preprocessing pipeline

In [13]:
# Step 4: Create the final preprocessing pipeline.

# Our pipeline does two things: applies the preprocessing steps and then returns the preprocessed data.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Ya'stá! Now we can fit and transform our train data.
X_train_processed = pipeline.fit_transform(train_data)
X_test_processed = pipeline.transform(test_data)

# If you're curious to see what this all looks like:
print(pd.DataFrame(X_train_processed).head())

# BAM! You'd use X_train_processed and X_test_processed to feed into your models.

        0         1         2         3         4         5         6    \
0 -1.525305 -0.222987 -0.194591 -0.274200 -0.105040 -0.171233 -0.154879   
1 -1.525305 -0.221836 -0.191891 -0.275396 -0.104873 -0.170119 -0.153234   
2 -1.525305 -0.221638 -0.191428 -0.272180 -0.104842 -0.170083 -0.152682   
3 -1.525305 -0.221162 -0.191437 -0.266648 -0.103253 -0.169288 -0.151463   
4 -1.525305 -0.222102 -0.193781 -0.286400 -0.103204 -0.170685 -0.153913   

        7         8         9    ...  985  986  987  988  989  990  991  992  \
0 -0.095681 -0.254458 -0.253903  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
1 -0.094427 -0.253244 -0.252680  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
2 -0.094934 -0.252636 -0.252069  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
3 -0.094090 -0.252982 -0.252417  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
4 -0.095236 -0.252737 -0.252170  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   

   993  994  
0  0.0  0.0  
1  0.0  0.0  
2  0.0  0.0  
3  0.0  0.0 

In [19]:
pd.DataFrame(X_train_processed).describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,19410.0,-0.053026,0.831488,-1.525305,-0.362918,0.0,0.637082,1.384457
1,19410.0,3.766722,26.210195,-0.235514,-0.194276,0.0,0.805724,496.673838
2,19410.0,7.780994,63.934255,-0.215430,-0.163587,0.0,0.836413,1219.341171
3,19410.0,2.858117,23.966846,-0.354540,-0.224113,0.0,0.775887,718.366172
4,19410.0,2.262321,9.906302,-0.319158,-0.116057,0.0,0.883943,208.631503
...,...,...,...,...,...,...,...,...
990,19410.0,0.001443,0.037955,0.000000,0.000000,0.0,0.000000,1.000000
991,19410.0,0.001443,0.037955,0.000000,0.000000,0.0,0.000000,1.000000
992,19410.0,0.001030,0.032084,0.000000,0.000000,0.0,0.000000,1.000000
993,19410.0,0.000876,0.029582,0.000000,0.000000,0.0,0.000000,1.000000


In [None]:
X_train_processed_df.to_csv('~/Small-Cap-Scout/raw_data/X_train_processed.csv', index=False)
X_test_processed_df.to_csv('~/Small-Cap-Scout/raw_data/X_test_processed.csv', index=False)