## Sqrt transformation using all the predictors

In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Define square root transformer
def sqrt_transform(X):
    return np.sqrt(X)

# Load training data
raw_train = pd.read_csv("training.csv")
X = raw_train.drop(['relevance', 'query_id', 'url_id', 'id'], axis=1)
y = raw_train['relevance']

# Apply square root transformation
X_sqrt = sqrt_transform(X)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_sqrt, y, test_size=0.2, random_state=42)

# Scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Define base models
base_models = [
    ('svc', SVC(kernel='rbf', C=10.0, gamma='scale')),
    ('rf', RandomForestClassifier(n_estimators=100)),
    ('lr', LogisticRegression())
]

# Define stacking classifier with a logistic regression meta-model
stacking_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),
    cv=5
)

# Train the stacking classifier
stacking_clf.fit(X_train_scaled, y_train)

# Predict and evaluate on validation set
y_pred_val = stacking_clf.predict(X_val_scaled)
accuracy = accuracy_score(y_val, y_pred_val)

print(f'Validation Accuracy: {accuracy:.4f}')

Validation Accuracy: 0.6621


## Log transformation using all the predictors

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Define square root transformer
def sqrt_transform(X):
    return np.sqrt(X)
def log_transform(X):
    return np.log1p(X)
# Load training data
raw_train = pd.read_csv("training.csv")
X = raw_train.drop(['relevance', 'query_id', 'url_id', 'id'], axis=1)
y = raw_train['relevance']

# Apply square root transformation
X_log = log_transform(X)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_log, y, test_size=0.2, random_state=0)

# Scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Define base models
base_models = [
    ('svc', SVC(kernel='rbf', C=10.0, gamma='scale')),
    ('rf', RandomForestClassifier(n_estimators=100)),
    ('lr', LogisticRegression())
]

# Define stacking classifier with a logistic regression meta-model
stacking_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),
    cv=5
)

# Train the stacking classifier
stacking_clf.fit(X_train_scaled, y_train)

# Predict and evaluate on validation set
y_pred_val = stacking_clf.predict(X_val_scaled)
accuracy = accuracy_score(y_val, y_pred_val)

print(f'Validation Accuracy: {accuracy:.4f}')

Validation Accuracy: 0.6673


In [4]:
raw_test = pd.read_csv("test.csv")
X_test = raw_test.drop(['query_id', 'url_id', 'id'], axis=1)

X_test_log = log_transform(X_test)
X_test_scaled_log = scaler.transform(X_test_log)

y_pred_test_log = stacking_clf.predict(X_test_scaled_log)

# Create a DataFrame for predictions
predictions_df_log = pd.DataFrame({
    'predicted_relevance': y_pred_test_log,
    'id': raw_test['id'] # Include the 'id' column to match the original test data format
})

# Save predictions to CSV
predictions_df_log.to_csv("predictions_log_ensemble_SVM_RF_Logistic.csv", index=False)

## Log transformation without sig5

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Define square root transformer
def sqrt_transform(X):
    return np.sqrt(X)
def log_transform(X):
    return np.log1p(X)
# Load training data
raw_train = pd.read_csv("training.csv")
X = raw_train.drop(['relevance', 'query_id', 'url_id', 'id', 'sig5'], axis=1)
y = raw_train['relevance']

# Apply square root transformation
X_log = log_transform(X)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_log, y, test_size=0.2, random_state=0)

# Scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Define base models
base_models = [
    ('svc', SVC(kernel='rbf', C=10.0, gamma='scale')),
    ('rf', RandomForestClassifier(n_estimators=100)),
    ('lr', LogisticRegression())
]

# Define stacking classifier with a logistic regression meta-model
stacking_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),
    cv=5
)

# Train the stacking classifier
stacking_clf.fit(X_train_scaled, y_train)

# Predict and evaluate on validation set
y_pred_val = stacking_clf.predict(X_val_scaled)
accuracy = accuracy_score(y_val, y_pred_val)

print(f'Validation Accuracy: {accuracy:.4f}')

Validation Accuracy: 0.6662


In [2]:
raw_test = pd.read_csv("test.csv")
X_test = raw_test.drop(['query_id', 'url_id', 'id', 'sig5'], axis=1)

X_test_log = log_transform(X_test)
X_test_scaled_log = scaler.transform(X_test_log)

y_pred_test_log = stacking_clf.predict(X_test_scaled_log)

# Create a DataFrame for predictions
predictions_df_log = pd.DataFrame({
    'predicted_relevance': y_pred_test_log,
    'id': raw_test['id'] # Include the 'id' column to match the original test data format
})

# Save predictions to CSV
predictions_df_log.to_csv("predictions_log_ensemble_SVM_RF_Logistic_remove_sig5.csv", index=False)

## power transformation with sig5

In [1]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures, PowerTransformer
import numpy as np

transformers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler(),
    'LogTransformer': lambda X: np.log1p(X),  # Log transformation with offset to handle zero values
    'SquareRootTransformer': lambda X: np.sqrt(X),
    'PolynomialFeatures': PolynomialFeatures(degree=2),  # Include interactions up to degree 2
    'PowerTransformer': PowerTransformer(method='yeo-johnson')  # Box-Cox for non-negative values
}

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


raw_train = pd.read_csv("training.csv")
X = raw_train.drop(['relevance', 'query_id', 'url_id', 'id'], axis=1)
y = raw_train['relevance']
chosen_transformer_name = 'PowerTransformer'  # Example: choose 'SquareRootTransformer'
chosen_transformer = transformers[chosen_transformer_name]

if callable(chosen_transformer):
    X_train_transformed = chosen_transformer(X)
else:
    X_train_transformed = chosen_transformer.fit_transform(X)
scaler = StandardScaler()
X_train_scaled_power = scaler.fit_transform(X_train_transformed)
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_scaled_power, y, test_size=0.1, random_state=0)

# Define base models
base_models = [
    ('svc', SVC(kernel='rbf', C=10.0, gamma='scale')),
    ('rf', RandomForestClassifier(n_estimators=100)),
    ('lr', LogisticRegression())
]

# Define stacking classifier with a logistic regression meta-model
stacking_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),
    cv=5
)

# Train the stacking classifier
stacking_clf.fit(X_train, y_train)

# Predict and evaluate on validation set
y_pred_val = stacking_clf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred_val)

print(f'Validation Accuracy: {accuracy:.4f}')

Validation Accuracy: 0.6687


In [3]:
raw_test = pd.read_csv("test.csv")
X_test = raw_test.drop(['query_id', 'url_id', 'id'], axis=1)

if callable(chosen_transformer):
    X_test_power = chosen_transformer(X_test)
else:
    X_test_power = chosen_transformer.transform(X_test)
X_test_scaled_power = scaler.transform(X_test_power)

y_pred_test_power = stacking_clf.predict(X_test_scaled_power)

# Create a DataFrame for predictions
predictions_df_power = pd.DataFrame({
    'predicted_relevance': y_pred_test_power,
    'id': raw_test['id'] # Include the 'id' column to match the original test data format
})

# Save predictions to CSV
predictions_df_power.to_csv("predictions_Ensembling_power_svm_RF_LR_.csv", index=False)

## POwer without sig5

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


raw_train = pd.read_csv("training.csv")
X = raw_train.drop(['relevance', 'query_id', 'url_id', 'id', 'sig5'], axis=1)
y = raw_train['relevance']
chosen_transformer_name = 'PowerTransformer'  # Example: choose 'SquareRootTransformer'
chosen_transformer = transformers[chosen_transformer_name]

if callable(chosen_transformer):
    X_train_transformed = chosen_transformer(X)
else:
    X_train_transformed = chosen_transformer.fit_transform(X)
scaler = StandardScaler()
X_train_scaled_power = scaler.fit_transform(X_train_transformed)
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_scaled_power, y, test_size=0.1, random_state=0)

# Define base models
base_models = [
    ('svc', SVC(kernel='rbf', C=10.0, gamma='scale')),
    ('rf', RandomForestClassifier(n_estimators=100)),
    ('lr', LogisticRegression())
]

# Define stacking classifier with a logistic regression meta-model
stacking_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),
    cv=5
)

# Train the stacking classifier
stacking_clf.fit(X_train, y_train)

# Predict and evaluate on validation set
y_pred_val = stacking_clf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred_val)

print(f'Validation Accuracy: {accuracy:.4f}')

Validation Accuracy: 0.6708


In [5]:
raw_test = pd.read_csv("test.csv")
X_test = raw_test.drop(['query_id', 'url_id', 'id', 'sig5'], axis=1)

if callable(chosen_transformer):
    X_test_power = chosen_transformer(X_test)
else:
    X_test_power = chosen_transformer.transform(X_test)
X_test_scaled_power = scaler.transform(X_test_power)

y_pred_test_power = stacking_clf.predict(X_test_scaled_power)

# Create a DataFrame for predictions
predictions_df_power = pd.DataFrame({
    'predicted_relevance': y_pred_test_power,
    'id': raw_test['id'] # Include the 'id' column to match the original test data format
})

# Save predictions to CSV
predictions_df_power.to_csv("predictions_Ensembling_power_svm_RF_LR_remove_sig5.csv", index=False)