In [2]:
import pandas as pd

# Read the cm_features CSV into a DataFrame and print its column/dtype summary.
# NOTE(review): this is an absolute, machine-specific path — the cell only runs where that file exists.
CM_FEATURES_CSV = '/Users/zakotianskyi/PycharmProjects/prediction_competition_2023/data/cm_features_v0.6.csv'
cm_features = pd.read_csv(CM_FEATURES_CSV)
cm_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62501 entries, 0 to 62500
Columns: 148 entries, month_id to ged_sb_y_18
dtypes: float64(120), int64(25), object(3)
memory usage: 70.6+ MB


In [None]:
# Names of the target columns (not referenced again in the cells visible here).
TARGET_COLUMNS = ['a_ged_sb_y_18', 'b_ged_sb_y_18']

# NOTE(review): `dyad_df` is not created in any cell visible above — it must be
# defined before this cell runs (Restart & Run All will fail otherwise).

# Explicitly list columns to be dropped before scaling.
# The previous list contained 'country_id' twice and an empty-string label '';
# DataFrame.drop raises KeyError for a label that is not a column, so the
# empty entry is removed.
columns_to_drop = [
    'month_id',
    'date',
    'country_id',
    'ccode',
    'country_name',
]

# Categorical columns are excluded from the numeric feature list as well.
categorical = [
    'has_active_riv',
    'principal',
    'asymmetric_principal',
    'positional',
    'spatial',
    'ideological',
    'interventionary',
    'a_is_major',
    'b_is_major',
]

columns_to_drop.extend(categorical)
columns_to_drop.extend(['a_country_name', 'b_country_name'])

# Also, drop one-hot encoded country identifiers if they are already in the dataframe
columns_to_drop.extend(dyad_df.filter(regex='^country_id_a_').columns.tolist())
columns_to_drop.extend(dyad_df.filter(regex='^country_id_b_').columns.tolist())

# Drop every column whose name contains 'decay', 'splag' or 'vdem'.
columns_to_drop.extend(dyad_df.filter(regex='decay|splag|vdem').columns.tolist())

# The regex matches may repeat labels listed above; de-duplicate, keeping first-seen order.
columns_to_drop = list(dict.fromkeys(columns_to_drop))

# Determine numeric columns by excluding the ones to drop from the dataframe.
# drop() is left strict on purpose: a misspelled or missing label raises KeyError
# instead of being silently ignored.
numeric_columns = dyad_df.drop(columns=columns_to_drop).columns.tolist()
numeric_columns

In [None]:
import matplotlib.pyplot as plt
import shutil

PLOT_FIGURES = False


def plot_column_distributions(dataframe, columns, folder='plots', img_size=(4, 3)):
    """
    Classify columns by skewness and, when PLOT_FIGURES is true, save a histogram per column.

    For every processed column the min, max and sample skewness (all rounded to
    two decimals, NaNs dropped) are printed. A column whose rounded skewness is
    greater than 1 is reported as skewed, otherwise as "other".

    Paired columns: an ``a_<name>`` column is classified from its own values and
    its ``b_<name>`` twin is placed in the same group, provided the twin exists
    in ``dataframe``. A ``b_<name>`` column is therefore skipped when its
    ``a_<name>`` twin is also being processed; a ``b_`` column without such a
    twin is classified on its own instead of being silently left out.

    Parameters:
    - dataframe: pandas DataFrame containing the data.
    - columns: List of column names to inspect.
    - folder: String specifying the directory to save the plots. It is emptied
      and recreated only when PLOT_FIGURES is true, so existing plots survive a
      statistics-only run.
    - img_size: Tuple specifying the size of the images.

    Returns:
    - (skewed_columns, other_columns): two lists of column names, all of which
      exist in ``dataframe``.
    """
    import os

    if PLOT_FIGURES:
        # Start from an empty output directory, but only when the figures are
        # actually going to be regenerated.
        if os.path.exists(folder):
            shutil.rmtree(folder)
        os.makedirs(folder, exist_ok=True)

    requested = set(columns)
    available = set(dataframe.columns)

    skewed_columns = []
    other_columns = []

    for col in columns:
        if col.startswith('b_'):
            a_twin = 'a_' + col[2:]
            # The b_ side is covered when its a_ twin is processed; skip it only then.
            if a_twin in requested and a_twin in available:
                continue

        # Check if the column exists in the dataframe to avoid KeyError
        if col not in available:
            print(f"Column {col} not found in dataframe.")
            continue

        col_data = dataframe[col].dropna()

        # Summary statistics, rounded to 2 decimal places.
        min_val = round(col_data.min(), 2)
        max_val = round(col_data.max(), 2)
        skewness = round(col_data.skew(), 2)

        cols = [col]
        if col.startswith('a_'):
            b_twin = 'b_' + col[2:]
            # Only report the twin if it really is a column; otherwise the
            # returned list would name a column downstream code cannot select.
            if b_twin in available:
                cols.append(b_twin)

        if skewness > 1:
            skewed_columns.extend(cols)
        else:
            # NaN skewness (e.g. too few values) also lands here.
            other_columns.extend(cols)

        print(f"Column: {col} - Min: {min_val}, Max: {max_val} - Skew: {skewness}")
        if PLOT_FIGURES:
            plt.figure(figsize=img_size)
            col_data.hist(bins=12, alpha=0.75)
            plt.title(f"{col} - Min: {min_val}, Max: {max_val} - Skew: {skewness}")
            plt.xlabel('Value')
            plt.ylabel('Frequency')

            # Save the plot
            plt.savefig(f"{folder}/{col}.png", format='png', dpi=100)
            plt.close()  # Close the figure to free memory

    return skewed_columns, other_columns


# Split the candidate feature columns into skewed / non-skewed groups.
# NOTE(review): numeric_columns was derived from dyad_df, while the statistics
# here are computed on cm_features — confirm both frames share these columns.
skewed_cols, other_columns = plot_column_distributions(
    dataframe=cm_features,
    columns=numeric_columns,
)

In [None]:
import pandas as pd
from sklearn.preprocessing import QuantileTransformer, StandardScaler
from sklearn.compose import ColumnTransformer
import joblib  # For saving scikit-learn models


def fit_and_transform_data(train_df, test_df, skewed_cols, other_cols,
                           scaler_path='data_scaler.joblib', random_state=42):
    """
    Fits scalers to the train_df and transforms both train_df and test_df.
    Saves the fitted preprocessor for later use.

    Columns in ``skewed_cols`` go through a QuantileTransformer with a normal
    output distribution, columns in ``other_cols`` through a StandardScaler,
    and every remaining column is passed through unchanged.

    Parameters:
    - train_df: pandas DataFrame, training data (used for fitting).
    - test_df: pandas DataFrame, testing data (only transformed).
    - skewed_cols: List of column names that are heavily right-skewed.
    - other_cols: List of column names that are not heavily right-skewed.
    - scaler_path: Where the fitted ColumnTransformer is dumped with joblib.
    - random_state: Seed for the QuantileTransformer so repeated fits give
      the same quantiles when it subsamples the data.

    Returns:
    - train_df_scaled: Scaled training data, same column order as train_df.
    - test_df_scaled: Scaled testing data, same column order as test_df.

    Raises:
    - ValueError: if a column is listed in both skewed_cols and other_cols.
    """
    skewed_cols = list(skewed_cols)
    other_cols = list(other_cols)

    overlap = set(skewed_cols) & set(other_cols)
    if overlap:
        raise ValueError(
            f"Columns listed as both skewed and non-skewed: {sorted(overlap)}"
        )

    # Define the transformations for each group of features
    transformers = [
        ('quantile',
         QuantileTransformer(output_distribution='normal', random_state=random_state),
         skewed_cols),
        ('standard', StandardScaler(), other_cols),
    ]

    # Create a ColumnTransformer to apply the scaling
    preprocessor = ColumnTransformer(transformers, remainder='passthrough')

    # ColumnTransformer does NOT keep the input column order: its output is the
    # transformers' columns in the order listed above, followed by the
    # passthrough remainder in original order. Labelling the array with
    # train_df.columns directly would attach values to the wrong names.
    scaled_names = set(skewed_cols) | set(other_cols)
    passthrough_cols = [c for c in train_df.columns if c not in scaled_names]
    output_columns = skewed_cols + other_cols + passthrough_cols

    def _as_frame(values, like_df):
        """Label the transformed array correctly, then restore like_df's column order."""
        frame = pd.DataFrame(values, columns=output_columns, index=like_df.index)
        ordered = [c for c in like_df.columns if c in scaled_names or c in passthrough_cols]
        # A single object passthrough column makes the whole array object-typed;
        # infer_objects() gives the numeric columns their numeric dtype back.
        return frame[ordered].infer_objects()

    # Fit and transform the training data
    train_df_scaled = _as_frame(preprocessor.fit_transform(train_df), train_df)

    # Transform the testing data using the fitted scalers from the training data
    test_df_scaled = _as_frame(preprocessor.transform(test_df), test_df)

    # Save the preprocessor model for inverse transform or further transformations
    joblib.dump(preprocessor, scaler_path)

    return train_df_scaled, test_df_scaled


# Requires 'train_df' and 'test_df' to be loaded already, plus the column
# groups computed by plot_column_distributions.
train_df_scaled, test_df_scaled = fit_and_transform_data(
    train_df=train_df,
    test_df=test_df,
    skewed_cols=skewed_cols,
    other_cols=other_columns,
)

# Preview the first rows of each scaled frame (train first, then test).
for scaled_frame in (train_df_scaled, test_df_scaled):
    print(scaled_frame.head())