# Data Exploration and Preprocessing

In [1]:
import pandas as pd
import numpy as np
from glob import glob
import matplotlib.pyplot as plt
import seaborn as sns

## Load and Partition Training Data

In [2]:
# Location of the training parquet data.
train_path = "data/train.parquet/"
# NOTE: the pattern contains no wildcard, so glob() yields at most the path
# itself, and an empty list when that path does not exist.
train_files = sorted(glob(train_path))
if not train_files:
    # Fail here with a clear message; concatenating an empty list of frames
    # would otherwise raise an unrelated "No objects to concatenate" error.
    raise FileNotFoundError(f"No training data found at {train_path!r}")

In [None]:
# Display the list of matched training paths.
train_files

In [4]:
# Read every matched path and stack the frames under one fresh RangeIndex.
train_df = pd.concat(map(pd.read_parquet, train_files), ignore_index=True)

In [None]:
# Report the size and the column labels of the combined training frame.
for label, value in (
    ("Train DataFrame shape:", train_df.shape),
    ("Columns in Train DataFrame:", train_df.columns),
):
    print(label, value)

In [6]:
# Path to a single partition directory (date_id=0) of the lags parquet data.
lags_path = "data/lags.parquet/date_id=0"
# Only this one partition is loaded into lags_df.
lags_df = pd.read_parquet(lags_path)

In [7]:
# CSV files holding the feature and responder metadata tables.
features_path = "data/features.csv"
responders_path = "data/responders.csv"

# Load both tables in one pass, in the same order as the paths above.
features_metadata, responders_metadata = (
    pd.read_csv(csv_path) for csv_path in (features_path, responders_path)
)

In [None]:
# Print the (rows, columns) shape of each auxiliary table.
for label, frame in (
    ("Lags DataFrame shape:", lags_df),
    ("Features Metadata shape:", features_metadata),
    ("Responders Metadata shape:", responders_metadata),
):
    print(label, frame.shape)

In [None]:
# Show the first rows of every loaded table, each under its own heading.
previews = (
    ("Train Data Preview:", train_df),
    ("\nLagged Responders Preview:", lags_df),
    ("\nFeatures Metadata Preview:", features_metadata),
    ("\nResponders Metadata Preview:", responders_metadata),
)
for heading, frame in previews:
    print(heading)
    print(frame.head())

In [10]:
# Cast the id, weight, feature and responder columns to 32-bit dtypes
# (the notebook describes this step as a data-type optimization).
column_dtypes = {
    'date_id': 'int32',
    'time_id': 'int32',
    'symbol_id': 'int32',
    'weight': 'float32',
}
# feature_00 .. feature_78 and responder_0 .. responder_8 become float32.
column_dtypes.update((f'feature_{i:02}', 'float32') for i in range(79))
column_dtypes.update((f'responder_{i}', 'float32') for i in range(9))

train_df = train_df.astype(column_dtypes)

In [None]:
# Display the training frame after the dtype cast.
train_df

## Exploratory Data Analysis (EDA)

#### Step 1: Check Data Summary Statistics

In [None]:
# One row per column: transpose describe() so the statistics run across.
print("Summary Statistics for Train Data:")
print(train_df.describe().T)

#### Step 2: Evaluate Feature Distributions

In [None]:
# The first five feature columns are skipped because NaN is their main/only
# value; feature_05 .. feature_09 are plotted instead.
plot_indices = range(5, 10)

plt.figure(figsize=(15, 8))
for slot, feature_idx in enumerate(plot_indices, start=1):
    column = f'feature_{feature_idx:02}'
    # Slots 1..5 of a 2x3 grid; the sixth slot is left unused.
    plt.subplot(2, 3, slot)
    sns.histplot(train_df[column], kde=True, bins=50)
    plt.title(f'Feature {feature_idx:02} Distribution')
plt.tight_layout()
plt.show()

#### Step 3: Correlation Analysis

In [None]:
# Correlation matrix for a subset of features and all nine responders.
# feature_05 .. feature_09 are used here: the distribution plots in this
# notebook skip feature_00 .. feature_04 as NaN-dominated, and correlating
# such columns would only produce empty rows/columns in the heatmap.
# NOTE(review): switch back to range(5) if those columns are populated.
feature_cols = [f'feature_{i:02}' for i in range(5, 10)]
responder_cols = [f'responder_{i}' for i in range(9)]
correlation_matrix = train_df[feature_cols + responder_cols].corr()

# Visualize the correlation matrix, with the colour scale centred on zero.
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title("Correlation Matrix for Selected Features and Responders")
plt.show()


#### Step 4: Time Series Analysis

In [None]:
# Plot `responder_6` against date_id for up to five distinct symbols.
# Sample from the de-duplicated symbol ids: sampling rows first and calling
# unique() afterwards can yield fewer than five symbols whenever the same
# symbol is drawn more than once.
distinct_symbols = train_df['symbol_id'].drop_duplicates()
sample_symbols = distinct_symbols.sample(
    n=min(5, len(distinct_symbols)), random_state=0
).to_numpy()

plt.figure(figsize=(14, 7))
for symbol in sample_symbols:
    # All rows belonging to this symbol, in their existing frame order.
    symbol_data = train_df[train_df['symbol_id'] == symbol]
    plt.plot(symbol_data['date_id'], symbol_data['responder_6'], label=f'Symbol {symbol}')
plt.title("Responder_6 Over Time for Sample Symbols")
plt.xlabel("Date ID")
plt.ylabel("Responder_6")
plt.legend()
plt.show()


#### Step 5: Investigate `Weight` Distribution

In [None]:
# Histogram (50 bins) of the sample weights with a KDE overlay.
plt.figure(figsize=(10, 6))
weight_ax = sns.histplot(train_df['weight'], bins=50, kde=True)
weight_ax.set_title("Distribution of Weight")
weight_ax.set_xlabel("Weight")
weight_ax.set_ylabel("Frequency")
plt.show()

#### Step 6: Anomaly Detection

In [None]:
# Count how many responder_6 values sit exactly on the -5 / 5 boundaries.
# reindex (instead of .loc with a list) reports 0 for a boundary value that
# never occurs, where .loc would raise a KeyError.
clipping_counts = (
    train_df['responder_6'].value_counts().reindex([-5, 5], fill_value=0)
)
print("Clipping Boundary Counts for Responder_6:\n", clipping_counts)

#### Step 7: Explore Missing Data

In [None]:
# Per-column count of missing entries; only columns with at least one gap
# are printed.
missing_values = train_df.isna().sum()
columns_with_gaps = missing_values.loc[missing_values.gt(0)]
print("Missing Values by Column:\n", columns_with_gaps)

## Feature Engineering and Weight Normalization

#### Step 1: Feature Engineering

In [None]:
# Attach the lagged responders to the training rows.
# - how='left' keeps every training row; the default inner join silently
#   drops rows with no matching lag record (lags_df holds a single date_id
#   partition only).
# - time_id is added to the join keys when both frames carry it, so one
#   training row cannot match several lag rows of the same date/symbol.
#   NOTE(review): assumes lags are keyed per time_id — confirm against the
#   lags schema.
merge_keys = [
    key
    for key in ('date_id', 'time_id', 'symbol_id')
    if key in train_df.columns and key in lags_df.columns
]
train_df = train_df.merge(
    lags_df, on=merge_keys, how='left', suffixes=('', '_lag1')
)

print("Train Data with Lagged Responders:")
print(train_df.head())


In [20]:
# Per-symbol rolling statistics (window of 5 rows, at least 1 observation)
# for feature_05 .. feature_09.
for feature_idx in range(5, 10):
    feature = f'feature_{feature_idx:02}'
    per_symbol = train_df.groupby('symbol_id')[feature]

    # Compute both statistics before writing any new column back.
    rolling_mean = per_symbol.transform(
        lambda values: values.rolling(window=5, min_periods=1).mean()
    )
    rolling_std = per_symbol.transform(
        lambda values: values.rolling(window=5, min_periods=1).std()
    )

    train_df[f'{feature}_rolling_mean'] = rolling_mean
    train_df[f'{feature}_rolling_std'] = rolling_std


In [21]:
# Element-wise product of feature_05 and feature_06 as an interaction term.
train_df['feature_5_6_interaction'] = train_df['feature_05'].mul(train_df['feature_06'])

#### Step 2: Weight Normalization

In [None]:
def _share_of_total(weights):
    """Return each weight divided by the sum of its group."""
    return weights / weights.sum()


# Normalize weights within each date_id group.
train_df['normalized_weight'] = train_df.groupby('date_id')['weight'].transform(_share_of_total)

# Sanity check: print the per-date totals of the normalized weights.
print("Check Weight Normalization:")
daily_totals = train_df.groupby('date_id')['normalized_weight'].sum()
print(daily_totals.head())

In [None]:
# Show the resulting column set, then the first rows of the engineered frame.
for heading, payload in (
    ("Columns after Feature Engineering:", train_df.columns),
    ("\nSample of Feature-Engineered Data:", train_df.head()),
):
    print(heading)
    print(payload)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Correlation of every column with responder_6.
# Only the responder_6 column is plotted here, so it is computed directly
# with corrwith() instead of building the full column-by-column matrix.
# The result is kept as a one-column DataFrame so that both
# correlation_matrix['responder_6'] and correlation_matrix[['responder_6']]
# keep working.
correlation_matrix = train_df.corrwith(train_df['responder_6']).to_frame('responder_6')

# Focus on correlations with responder_6, on a fixed [-1, 1] colour scale.
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix[['responder_6']], annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation with Responder 6')
plt.show()

In [None]:
# Scatter plots for the five features most strongly correlated with
# responder_6.
# - responder_6 is dropped by name rather than assumed to be the first entry.
# - Ranking uses the absolute correlation so strong negative relationships
#   are included as well.
responder_corr = correlation_matrix['responder_6'].drop('responder_6', errors='ignore')
high_correlation_features = responder_corr.abs().nlargest(5).index.tolist()

plt.figure(figsize=(15, 10))
for i, feature in enumerate(high_correlation_features):
    plt.subplot(2, 3, i + 1)
    sns.scatterplot(x=train_df[feature], y=train_df['responder_6'], alpha=0.5)
    plt.title(f'Responder 6 vs {feature}')
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.impute import SimpleImputer

# Load the training data. This rebinds train_df, so any columns added to the
# previous train_df are no longer present from here on.
train_df = pd.read_parquet('data/train.parquet')

# Check the initial size of the DataFrame
print(f'Initial number of rows: {len(train_df)}')

# Separate features (feature_00 .. feature_78) and target
X = train_df[[f'feature_{i:02}' for i in range(79)]]
y = train_df['responder_6']

# Check for NaN values in X and y
print(f'NaN values in X: {X.isna().sum().sum()}')
print(f'NaN values in y: {y.isna().sum()}')

# Split the data into training and validation sets (80/20 split) BEFORE
# imputing, so the validation rows do not contribute to the imputation means.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing feature values using statistics from the training split only
imputer = SimpleImputer(strategy='mean')  # You can change this strategy if needed
X_train = imputer.fit_transform(X_train_raw)
X_val = imputer.transform(X_val_raw)

# Check for remaining NaN values across both splits
remaining_nans = (
    pd.DataFrame(X_train).isna().sum().sum()
    + pd.DataFrame(X_val).isna().sum().sum()
)
print(f'NaN values in X after imputation: {remaining_nans}')

# Fit a plain linear regression on the training split
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = model.predict(X_val)

# Calculate evaluation metrics
r2 = r2_score(y_val, y_pred)
mae = mean_absolute_error(y_val, y_pred)

print(f'R-squared: {r2:.4f}')
print(f'Mean Absolute Error: {mae:.4f}')


In [124]:
import itertools

def generate_feature_combinations(n, *, min_size=4, max_size=None):
    """Build named subsets of the features ``feature_05`` .. ``feature_<n>``.

    Combinations are enumerated by increasing size and numbered consecutively
    from 1 across all sizes, so every ``Set_<k>`` key is unique. Combinations
    smaller than ``min_size`` still consume a number but are not returned.

    Args:
        n: Index of the last feature to include (inclusive). Values below 5
            give an empty result.
        min_size: Smallest combination size to keep. The default of 4 matches
            the previous "more than three features" filter.
        max_size: Largest combination size to generate; ``None`` means up to
            all available features.

    Returns:
        dict mapping ``'Set_<k>'`` to a list of feature names.
    """
    feature_names = [f'feature_{i:02}' for i in range(5, n + 1)]
    largest = len(feature_names)
    if max_size is not None:
        largest = min(max_size, largest)

    feature_sets = {}
    # Running counter shared by all sizes. Advancing it once per combination
    # is what keeps keys from colliding between successive sizes.
    index = 1
    for r in range(1, largest + 1):
        for combo in itertools.combinations(feature_names, r):
            if r >= min_size:
                feature_sets[f'Set_{index}'] = list(combo)
            index += 1
    return feature_sets


In [125]:
# Build the candidate feature subsets drawn from feature_05 .. feature_25.
feature_sets = generate_feature_combinations(25)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# Evaluate one linear regression per feature set in `feature_sets`
# (a mapping of set name -> list of feature column names), for example:
#     {'Set_A': ['feature_05', 'feature_16', 'feature_09'],
#      'Set_B': ['feature_07', 'feature_08']}
target = 'responder_6'

# The target does not depend on the feature set, so select it once.
y = train_df[target]

results = {}
for set_name, features in feature_sets.items():
    X = train_df[features]

    # Split the data into training and validation sets (80/20 split). The
    # fixed random_state gives every feature set the same row partition.
    X_train_raw, X_val_raw, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the imputer on the training split only, so validation rows do not
    # influence the imputation means.
    imputer = SimpleImputer(strategy='mean')  # You can change this strategy if needed
    X_train = imputer.fit_transform(X_train_raw)
    X_val = imputer.transform(X_val_raw)

    # Fit the model on the training data
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Make predictions on the validation set
    y_pred = model.predict(X_val)

    # Store results
    results[set_name] = {
        'R-squared': r2_score(y_val, y_pred),
        'Mean Absolute Error': mean_absolute_error(y_val, y_pred)
    }

# Display results
for set_name, metrics in results.items():
    print(f"{set_name}: R-squared = {metrics['R-squared']:.4f}, MAE = {metrics['Mean Absolute Error']:.4f}")


### OLS v1

In [None]:
import statsmodels.api as sm

# Select features (you can adjust based on correlation results)
selected_features = ['feature_05', 'feature_06', 'feature_07', 'feature_08', 'feature_09']  # Adjust as needed
X = train_df[selected_features]
y = train_df['responder_6']

# Add a constant to the model (intercept)
X = sm.add_constant(X)

# Fit the OLS model with the QR solver. missing='drop' removes rows that
# contain NaN in y or X; by default statsmodels performs no missing-value
# handling, so NaNs would otherwise flow straight into the fit.
ols_model = sm.OLS(y, X, missing='drop').fit(method='qr')

# Output the summary
print(ols_model.summary())


In [None]:
# Calculate correlations with responder_6.
# corrwith() computes just this one column instead of the full
# column-by-column matrix that train_df.corr() would build.
correlations = train_df.corrwith(train_df['responder_6']).drop('responder_6')

# Keep the columns whose absolute correlation exceeds 0.3
high_corr_features = correlations[correlations.abs() > 0.3].index.tolist()
print("High Correlation Features with Responder 6:")
print(high_corr_features)


In [None]:
# Fit one single-feature OLS model per high-correlation feature and collect
# its coefficient, p-value and R-squared.
result_columns = ['Feature', 'Coefficient', 'P-value', 'R-squared']
results = []

# The target is the same for every model, so select it once.
y = train_df['responder_6']

for feature in high_corr_features:
    # Add a constant to the model (intercept)
    X = sm.add_constant(train_df[[feature]])

    # missing='drop' removes rows with NaN in y or X before fitting; by
    # default statsmodels performs no missing-value handling.
    ols_model = sm.OLS(y, X, missing='drop').fit()

    # Store the summary statistics
    results.append({
        'Feature': feature,
        'Coefficient': ols_model.params[feature],
        'P-value': ols_model.pvalues[feature],
        'R-squared': ols_model.rsquared
    })

# Convert results to a DataFrame. Naming the columns explicitly keeps the
# sort below valid even when no feature passed the correlation filter.
results_df = pd.DataFrame(results, columns=result_columns)

# Sort by R-squared value in descending order
results_df = results_df.sort_values(by='R-squared', ascending=False)

# Display the results
print("Linear Regression Results:")
print(results_df)

In [None]:
# Bar chart of the coefficients for the first ten rows of results_df.
plt.figure(figsize=(12, 6))
top_features = results_df.head(10)
coef_ax = sns.barplot(x='Coefficient', y='Feature', data=top_features)
coef_ax.set_title('Top Features from Linear Regression against Responder 6')
coef_ax.set_xlabel('Coefficient')
coef_ax.set_ylabel('Feature')
plt.show()
