In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures, PowerTransformer
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

# Define transformers
transformers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler(),
    'LogTransformer': lambda X: np.log1p(X),
    'SquareRootTransformer': lambda X: np.sqrt(X),
    'PowerTransformer': PowerTransformer(method='yeo-johnson')
}

# Load training data
raw_train = pd.read_csv("training.csv")
X_train = raw_train.drop(['relevance', 'query_id', 'url_id', 'id'], axis=1)
y_train = raw_train['relevance']

# Initialize results list
results = []

for name, transformer in transformers.items():
    # Apply the transformer to the training data
    if callable(transformer):
        X_train_transformed = transformer(X_train)
    else:
        X_train_transformed = transformer.fit_transform(X_train)
    
    # Standardize the transformed training data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_transformed)

    # Split the data
    X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=0)

    # Train an SVM model
    svm = SVC(kernel='rbf', C=10.0, gamma='scale')  # Example SVM configuration
    svm.fit(X_train_split, y_train_split)

    # Evaluate the model
    y_pred_split = svm.predict(X_test_split)
    accuracy = accuracy_score(y_test_split, y_pred_split)
    results.append((name, accuracy))
    print(f'{name} - Accuracy: {accuracy:.4f}')

StandardScaler - Accuracy: 0.6668
MinMaxScaler - Accuracy: 0.6668
LogTransformer - Accuracy: 0.6674
SquareRootTransformer - Accuracy: 0.6688
PowerTransformer - Accuracy: 0.6667


In [3]:
chosen_transformer_name = 'SquareRootTransformer'  # Example: choose 'SquareRootTransformer'
chosen_transformer = transformers[chosen_transformer_name]

# Apply the chosen transformer to the training data
if callable(chosen_transformer):
    X_train_sqrt = chosen_transformer(X_train)
else:
    X_train_sqrt = chosen_transformer.fit_transform(X_train)
scaler = StandardScaler()
X_train_scaled_sqrt = scaler.fit_transform(X_train_sqrt)

svm_sqrt = SVC(kernel='rbf', C=10.0, gamma='scale')
svm_sqrt.fit(X_train_scaled_sqrt, y_train)

# Load test data
raw_test = pd.read_csv("test.csv")
X_test = raw_test.drop(['query_id', 'url_id', 'id'], axis=1)

if callable(chosen_transformer):
    X_test_sqrt = chosen_transformer(X_test)
else:
    X_test_sqrt = chosen_transformer.transform(X_test)
X_test_scaled_sqrt = scaler.transform(X_test_sqrt)

y_pred_test_sqrt = svm.predict(X_test_scaled_sqrt)

# Create a DataFrame for predictions
predictions_df_sqrt = pd.DataFrame({
    'predicted_relevance': y_pred_test_sqrt,
    'id': raw_test['id'] # Include the 'id' column to match the original test data format
})

# Save predictions to CSV
predictions_df_sqrt.to_csv("predictions_sqrt_svm.csv", index=False)

In [4]:
chosen_transformer_name = 'LogTransformer'  # Example: choose 'SquareRootTransformer'
chosen_transformer = transformers[chosen_transformer_name]

# Apply the chosen transformer to the training data
if callable(chosen_transformer):
    X_train_log = chosen_transformer(X_train)
else:
    X_train_log = chosen_transformer.fit_transform(X_train)
scaler = StandardScaler()
X_train_scaled_log = scaler.fit_transform(X_train_log)

svm_log = SVC(kernel='rbf', C=10.0, gamma='scale')
svm_log.fit(X_train_scaled_log, y_train)

# Load test data
raw_test = pd.read_csv("test.csv")
X_test = raw_test.drop(['query_id', 'url_id', 'id'], axis=1)

if callable(chosen_transformer):
    X_test_log = chosen_transformer(X_test)
else:
    X_test_log = chosen_transformer.transform(X_test)
X_test_scaled_log = scaler.transform(X_test_log)

y_pred_test_log = svm.predict(X_test_scaled_log)

# Create a DataFrame for predictions
predictions_df_log = pd.DataFrame({
    'predicted_relevance': y_pred_test_log,
    'id': raw_test['id'] # Include the 'id' column to match the original test data format
})

# Save predictions to CSV
predictions_df_log.to_csv("predictions_log_svm.csv", index=False)