Python version == 3.9.13

# Min Max Normalization

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def NormalizeData(data: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize the numeric columns of a dataframe using MinMaxScaler.

    Every numeric column is rescaled independently with a default
    ``MinMaxScaler``. Non-numeric columns are passed through untouched and
    the input dataframe is never modified.

    Args:
        data (pd.DataFrame): The dataframe with the data.

    Returns:
        normalised_df (pd.DataFrame): A copy of ``data`` whose numeric columns
            have been min-max scaled. When ``data`` has no rows or no numeric
            columns there is nothing to scale and an unchanged copy is returned.
    """

    # Work on a copy so the caller's dataframe is left intact
    normalised_df = data.copy()

    # Only numeric columns can be scaled
    numeric_data = normalised_df.select_dtypes(include='number')

    # Nothing to scale: the scaler rejects inputs with zero rows or zero
    # columns, so hand back the untouched copy instead of raising
    if numeric_data.empty:
        return normalised_df

    # Fit the scaler on the numeric block and write the scaled values back
    scaler = MinMaxScaler()
    normalised_df[numeric_data.columns] = scaler.fit_transform(numeric_data)

    return normalised_df

In [2]:
# Toy frame: three numeric columns on different scales plus one text column
data_to_normalise = pd.DataFrame(
    {
        'a': range(10),
        'b': range(0, 20, 2),
        'c': [i / 10 for i in range(10)],
        'd': ['d1'] * 5 + ['d2'] * 5,
    }
)
data_to_normalise

Unnamed: 0,a,b,c,d
0,0,0,0.0,d1
1,1,2,0.1,d1
2,2,4,0.2,d1
3,3,6,0.3,d1
4,4,8,0.4,d1
5,5,10,0.5,d2
6,6,12,0.6,d2
7,7,14,0.7,d2
8,8,16,0.8,d2
9,9,18,0.9,d2


In [3]:
# Scale the numeric columns of the toy frame; the text column 'd' is not numeric and is left as it is
normalised_data = NormalizeData(data=data_to_normalise)
normalised_data

Unnamed: 0,a,b,c,d
0,0.0,0.0,0.0,d1
1,0.111111,0.111111,0.111111,d1
2,0.222222,0.222222,0.222222,d1
3,0.333333,0.333333,0.333333,d1
4,0.444444,0.444444,0.444444,d1
5,0.555556,0.555556,0.555556,d2
6,0.666667,0.666667,0.666667,d2
7,0.777778,0.777778,0.777778,d2
8,0.888889,0.888889,0.888889,d2
9,1.0,1.0,1.0,d2


In [58]:
# Check the normalised data: the summary should show min 0.0 and max 1.0 for every numeric column
normalised_data.describe()

Unnamed: 0,a,b,c
count,10.0,10.0,10.0
mean,0.5,0.5,0.5
std,0.336406,0.336406,0.336406
min,0.0,0.0,0.0
25%,0.25,0.25,0.25
50%,0.5,0.5,0.5
75%,0.75,0.75,0.75
max,1.0,1.0,1.0


# Undersampling

In [4]:
import pandas as pd

def Undersampling(data: pd.DataFrame, target: str, random_state: int = 17) -> pd.DataFrame:
    """
    Undersample the data so that every class has as many rows as the smallest class.

    For each class found in ``target`` a random sample of ``n`` rows is drawn
    (without replacement), where ``n`` is the size of the least frequent class.
    Rows whose target is missing (NaN/None) belong to no class and are left
    out of the result. The input dataframe is not modified.

    Args:
        data (pd.DataFrame): The dataframe with the data.
        target (str): The name of the target column.
        random_state (int): Seed used for the random draw of every class.
            Defaults to 17.

    Returns:
        balanced_df (pd.DataFrame): The balanced rows, sorted by index. If
            ``data`` has no row with a non-missing target, an empty dataframe
            with the same columns as ``data`` is returned.
    """

    # Class frequencies; value_counts ignores missing targets
    class_counts = data[target].value_counts()

    # No class at all (empty input or only missing targets): keep the schema, return no rows
    if class_counts.empty:
        return data.iloc[0:0].copy()

    # Every class is cut down to the size of the smallest one
    n_samples = int(class_counts.min())

    # Classes in order of first appearance. Missing values are skipped because
    # `== NaN` matches no rows, which would make the sampling below fail.
    data_classes = data[target].dropna().unique()

    # Sample each class with the same seed, then concatenate once
    # (concatenating inside the loop would re-copy the accumulated frame every time)
    sampled_classes = [
        data[data[target] == c].sample(n=n_samples, random_state=random_state)
        for c in data_classes
    ]
    balanced_df = pd.concat(sampled_classes)

    # Restore the original row order
    return balanced_df.sort_index()

In [5]:
# Toy frame with three classes of different sizes in 'y' (2, 3 and 5 rows)
unbalanced_data = pd.DataFrame(
    {
        'a': range(10),
        'b': range(10, 20),
        'y': [0] * 2 + [1] * 3 + [2] * 5,
    }
)
unbalanced_data

Unnamed: 0,a,b,y
0,0,10,0
1,1,11,0
2,2,12,1
3,3,13,1
4,4,14,1
5,5,15,2
6,6,16,2
7,7,17,2
8,8,18,2
9,9,19,2


In [6]:
# Balance the classes of column 'y' by undersampling
undersampled_data = Undersampling(data=unbalanced_data, target='y')
undersampled_data

Unnamed: 0,a,b,y
0,0,10,0
1,1,11,0
2,2,12,1
4,4,14,1
5,5,15,2
9,9,19,2


In [47]:
# Check the number of samples for each class (after undersampling all counts should be equal)
undersampled_data['y'].value_counts()

0    2
1    2
2    2
Name: y, dtype: int64

# Oversampling (SMOTE) 

In [50]:
import pandas as pd
from imblearn.over_sampling import SMOTE

def Oversampling(data: pd.DataFrame, target: str, k_neighbors: int = 5)->pd.DataFrame:
    """
    Balance the classes of a dataframe by oversampling it with SMOTE.

    Args:
        data (pd.DataFrame): The dataframe with the data.
        target (str): The name of the target column.
        k_neighbors (int): The number of neighbors handed to SMOTE.

    Returns:
        df_balanced (pd.DataFrame): The resampled feature columns followed by
            the resampled target column.
    """

    # Work on a private copy of the input
    frame = data.copy()

    # Separate the predictors from the label column
    features = frame.drop(columns=target)
    labels = frame[target]

    # Resample with a fixed seed (17) so repeated runs give the same rows
    sampler = SMOTE(k_neighbors=k_neighbors, random_state=17)
    resampled_features, resampled_labels = sampler.fit_resample(features, labels)

    # Put features and target back together column-wise
    return pd.concat([resampled_features, resampled_labels], axis=1)

In [51]:
# Same toy frame as in the undersampling example: classes 0, 1 and 2 with 2, 3 and 5 rows
unbalanced_data = pd.DataFrame(
    {
        'a': range(10),
        'b': range(10, 20),
        'y': [0] * 2 + [1] * 3 + [2] * 5,
    }
)
unbalanced_data

Unnamed: 0,a,b,y
0,0,10,0
1,1,11,0
2,2,12,1
3,3,13,1
4,4,14,1
5,5,15,2
6,6,16,2
7,7,17,2
8,8,18,2
9,9,19,2


In [54]:
# Balance the classes of column 'y' with SMOTE.
# k_neighbors is set to 1 instead of the default 5; the smallest class here has only 2 rows
# (presumably SMOTE needs more rows per class than neighbours - confirm in the imblearn docs).
oversampled_data = Oversampling(data=unbalanced_data, target='y', k_neighbors=1)
oversampled_data

Unnamed: 0,a,b,y
0,0,10,0
1,1,11,0
2,2,12,1
3,3,13,1
4,4,14,1
5,5,15,2
6,6,16,2
7,7,17,2
8,8,18,2
9,9,19,2


In [55]:
# Check the number of samples for each class (after oversampling all counts should be equal)
oversampled_data['y'].value_counts()

0    5
1    5
2    5
Name: y, dtype: int64

# Basic Data Split

In [58]:
import pandas as pd

def split_data(data: pd.DataFrame, val_size: float, test_size: float, shuffle: bool = False, random_state: int = 17) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Split the data into training, validation and test sets.

    The rows are cut into three consecutive blocks: the first rows form the
    training set, the next ones the validation set and the last ones the test
    set. The number of rows of the validation and test sets is the requested
    fraction of the total, truncated to an integer. The input dataframe is
    not modified.

    Args:
        data (pd.DataFrame): The dataframe with the data.
        val_size (float): Fraction of the rows (between 0 and 1) for the validation set.
        test_size (float): Fraction of the rows (between 0 and 1) for the test set.
        shuffle (bool): Whether to shuffle the data before splitting.
        random_state (int): Seed used when ``shuffle`` is True. Defaults to 17.

    Returns:
        train (pd.DataFrame): The training set.
        val (pd.DataFrame): The validation set.
        test (pd.DataFrame): The test set.

    Raises:
        ValueError: If a size is outside [0, 1] or both sets together need
            more rows than ``data`` has.
    """

    if not 0 <= val_size <= 1 or not 0 <= test_size <= 1:
        raise ValueError("val_size and test_size must be fractions between 0 and 1")

    # Never hand back views of the caller's frame: shuffle into a new frame or copy
    if shuffle:
        df = data.sample(frac=1, random_state=random_state)
    else:
        df = data.copy()

    # Number of rows of each held-out set (truncated towards zero)
    n_rows = len(df)
    n_val = int(n_rows * val_size)
    n_test = int(n_rows * test_size)

    if n_val + n_test > n_rows:
        raise ValueError("val_size and test_size together exceed the available rows")

    # Use positions counted from the start. Negative slices break when a size
    # is 0, because `iloc[-0:]` selects the whole frame instead of nothing.
    train_end = n_rows - n_val - n_test
    val_end = n_rows - n_test

    train = df.iloc[:train_end]
    val = df.iloc[train_end:val_end]
    test = df.iloc[val_end:]

    return train, val, test

In [59]:
# Toy frame with ten ordered rows, so it is easy to see which rows land in each split
data_to_split = pd.DataFrame(
    {
        'a': range(0, 10),
        'b': range(10, 20),
        'c': range(20, 30),
    }
)
data_to_split

Unnamed: 0,a,b,c
0,0,10,20
1,1,11,21
2,2,12,22
3,3,13,23
4,4,14,24
5,5,15,25
6,6,16,26
7,7,17,27
8,8,18,28
9,9,19,29


In [62]:
# Split without shuffling: 20% of the rows for validation and 20% for test, the rest for training
train, val, test = split_data(data_to_split, val_size=0.2, test_size=0.2)
display(train, val, test)

Unnamed: 0,a,b,c
0,0,10,20
1,1,11,21
2,2,12,22
3,3,13,23
4,4,14,24
5,5,15,25


Unnamed: 0,a,b,c
6,6,16,26
7,7,17,27


Unnamed: 0,a,b,c
8,8,18,28
9,9,19,29


In [64]:
# Same split sizes, but the rows are shuffled before being cut into the three sets
train, val, test = split_data(data_to_split, val_size=0.2, test_size=0.2, shuffle=True)
display(train, val, test)

Unnamed: 0,a,b,c
7,7,17,27
2,2,12,22
5,5,15,25
3,3,13,23
4,4,14,24
0,0,10,20


Unnamed: 0,a,b,c
9,9,19,29
8,8,18,28


Unnamed: 0,a,b,c
6,6,16,26
1,1,11,21


# Temporal Data Split

Sometimes, when a dataset contains dates, we need to split it so that the training set contains the oldest data and the test set contains the newest data. One way to do this is to sort the data by date and then split it into three consecutive blocks: the first block is the training set, the next one is the validation set and the last one is the test set. This can be done by sorting the data by date and then using the previous function (`split_data` with `shuffle=False`) to split it into three parts.
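An alternative is to use the following function. The test set is selected in the same way as before (the newest rows), but the validation set is built by taking evenly spaced rows from the remaining data, i.e. the rows whose position is a multiple of a given step. The step is the number of rows left after removing the test set divided (integer division) by the desired number of validation rows. For example, with a step of 5 we choose the 5th, the 10th, the 15th rows, etc.; in the example below (10 rows, 20% for validation and 20% for test) 8 rows remain and 2 are needed for validation, so the step is 4 and the 4th and 8th rows are chosen. The rest of the data will be the training set.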

In [40]:
import pandas as pd

def split_temporal_data(data: pd.DataFrame, val_size: float, test_size: float, date_col: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Split the data into train, validation and test sets using a temporal approach.

    The rows are sorted by ``date_col``. The last rows (the newest ones) form
    the test set. From the remaining rows, the validation set takes evenly
    spaced rows (every ``jumps``-th row) and the training set keeps the rest.
    The input dataframe is not modified.

    Args:
        data (pd.DataFrame): The dataframe with the data.
        val_size (float): Fraction of the rows (between 0 and 1) for the validation set.
        test_size (float): Fraction of the rows (between 0 and 1) for the test set.
        date_col (str): The name of the column with the date.

    Returns:
        train (pd.DataFrame): The train set.
        val (pd.DataFrame): The validation set.
        test (pd.DataFrame): The test set.

    Raises:
        ValueError: If a size is outside [0, 1] or both sets together need
            more rows than ``data`` has.
    """

    if not 0 <= val_size <= 1 or not 0 <= test_size <= 1:
        raise ValueError("val_size and test_size must be fractions between 0 and 1")

    # sort_values returns a new frame, so the caller's data stays untouched
    df = data.sort_values(by=date_col)

    # Number of rows of the validation and test sets (truncated towards zero)
    n_rows = len(df)
    rows_val = int(n_rows * val_size)
    rows_test = int(n_rows * test_size)

    if rows_val + rows_test > n_rows:
        raise ValueError("val_size and test_size together exceed the available rows")

    # The test set is the newest block. The split is positional and counted
    # from the start: `iloc[-0:]` would select everything when rows_test is 0,
    # and dropping by label would remove extra rows on a non-unique index.
    n_remaining = n_rows - rows_test
    test = df.iloc[n_remaining:]
    remaining = df.iloc[:n_remaining]

    # No validation rows requested: everything that is left is training data
    if rows_val == 0:
        return remaining, remaining.iloc[0:0], test

    # Distance between two consecutive validation rows (>= 1 thanks to the check above)
    jumps = n_remaining // rows_val

    # Validation rows sit at positions jumps-1, 2*jumps-1, ... The second
    # condition caps the selection at exactly rows_val rows, because the bare
    # stride can yield more rows when n_remaining is not a multiple of rows_val.
    positions = pd.RangeIndex(n_remaining).to_numpy()
    is_val = (positions % jumps == jumps - 1) & (positions < jumps * rows_val)

    val = remaining.iloc[is_val]
    train = remaining.iloc[~is_val]

    return train, val, test

In [41]:
# Ten consecutive days, with the rows shuffled so the frame does not arrive already sorted by date
data_to_split = pd.DataFrame(
    {
        'date': pd.date_range(start='1/1/2024', end='1/10/2024'),
        'b': range(10, 20),
        'c': range(20, 30),
    }
).sample(frac=1, random_state=17)
data_to_split

Unnamed: 0,date,b,c
7,2024-01-08,17,27
2,2024-01-03,12,22
5,2024-01-06,15,25
3,2024-01-04,13,23
4,2024-01-05,14,24
0,2024-01-01,10,20
9,2024-01-10,19,29
8,2024-01-09,18,28
6,2024-01-07,16,26
1,2024-01-02,11,21


In [42]:
# Temporal split: 20% of the rows for validation and 20% for test, using the 'date' column for ordering
train, val, test = split_temporal_data(data_to_split, val_size=0.2, test_size=0.2, date_col='date')
display(train,val,test)

Unnamed: 0,date,b,c
0,2024-01-01,10,20
1,2024-01-02,11,21
2,2024-01-03,12,22
4,2024-01-05,14,24
5,2024-01-06,15,25
6,2024-01-07,16,26


Unnamed: 0,date,b,c
3,2024-01-04,13,23
7,2024-01-08,17,27


Unnamed: 0,date,b,c
8,2024-01-09,18,28
9,2024-01-10,19,29
