In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from scipy.sparse import vstack
from sklearn.linear_model import LogisticRegression  # Meta-learner
from tqdm import tqdm  # Import tqdm for progress tracking

# Load and prepare data
df_domain1 = pd.read_json("/content/domain1_train_data.json", lines=True)
df_domain1['text'] = df_domain1['text'].apply(lambda x: ' '.join(map(str, x)))

df_domain2 = pd.read_json("/content/domain2_train_data.json", lines=True)
df_domain2['text'] = df_domain2['text'].apply(lambda x: ' '.join(map(str, x)))

# Vectorize text
vectorizer = TfidfVectorizer()
X_domain1 = vectorizer.fit_transform(df_domain1['text'])
y_domain1 = df_domain1['label']

X_domain2 = vectorizer.transform(df_domain2['text'])
y_domain2 = df_domain2['label']

# Apply SMOTE to Domain 2
smote = SMOTE(random_state=42)
X_domain2_balanced, y_domain2_balanced = smote.fit_resample(X_domain2, y_domain2)

# Combine the datasets
X_combined = vstack([X_domain1, X_domain2_balanced])
y_combined = pd.concat([pd.Series(y_domain1), pd.Series(y_domain2_balanced)])

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)

# Define models
models = {
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(probability=True,kernel='linear', C=0.1),
    'Random Forest': RandomForestClassifier()
}

# Train and evaluate each model, collect predictions
train_preds = []
test_preds = []

# Initialize tqdm for progress tracking
pbar = tqdm(total=len(models))

for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Get predictions
    train_pred = model.predict_proba(X_train)[:, 1]  # Use predict_proba for meta-learner training
    test_pred = model.predict_proba(X_test)[:, 1]

    # Append predictions
    train_preds.append(train_pred)
    test_preds.append(test_pred)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, model.predict(X_test))

    # Print progress message
    print(f"{name} trained. Accuracy: {accuracy}")

    # Update progress bar
    pbar.update(1)

# Close progress bar
pbar.close()

# Stack predictions as new features for the meta-learner
X_train_meta = np.column_stack(train_preds)
X_test_meta = np.column_stack(test_preds)

# Define and train the meta-learner
meta_learner = LogisticRegression()
meta_learner.fit(X_train_meta, y_train)

# Evaluate the ensemble
final_predictions = meta_learner.predict(X_test_meta)
ensemble_accuracy = accuracy_score(y_test, final_predictions)
print("Ensemble Model Accuracy:", ensemble_accuracy)


 33%|███▎      | 1/3 [00:03<00:06,  3.30s/it]

Logistic Regression trained. Accuracy: 0.8560714285714286


 67%|██████▋   | 2/3 [52:55<31:07, 1867.38s/it]

SVM trained. Accuracy: 0.7992857142857143


100%|██████████| 3/3 [53:53<00:00, 1077.89s/it]

Random Forest trained. Accuracy: 0.9042857142857142
Ensemble Model Accuracy: 0.9146428571428571





In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from scipy.sparse import vstack
from tqdm import tqdm

# Assuming your training code is here...

# Load the test dataset
df_test = pd.read_json("/content/test_data.json", lines=True)
# If 'text' contains lists of token IDs, assume they need to be converted to strings
df_test['text'] = df_test['text'].apply(lambda ids: ' '.join(str(id) for id in ids))

# Vectorize the test data using the existing vectorizer
X_test_new = vectorizer.transform(df_test['text'])

# Collect predictions from each base model for the new test data
test_preds_new = []
for name, model in models.items():
    test_pred_new = model.predict_proba(X_test_new)[:, 1]  # assuming binary classification
    test_preds_new.append(test_pred_new)

# Stack these predictions to use as input for the meta-learner
X_test_meta_new = np.column_stack(test_preds_new)

# Use the meta-learner to make final predictions
final_predictions_new = meta_learner.predict(X_test_meta_new)

# Export the predictions to a CSV file
results_df = pd.DataFrame({
    'ID': df_test['id'],
    'Predicted_Label': final_predictions_new
})
results_df.to_csv('predictions.csv', index=False)

print("Predictions exported to predictions.csv.")


Predictions exported to predictions.csv.
