In [None]:
import urllib.request
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.metrics import f1_score, confusion_matrix, classification_report

# Fetch the competition files into the working directory (overwrites any
# existing copies, exactly like two explicit urlretrieve calls would).
DATA_BASE_URL = "https://storage.googleapis.com/aiolympiadmy/baku-or-pasar/"
for data_file in ("train.jsonl", "test.jsonl"):
    urllib.request.urlretrieve(DATA_BASE_URL + data_file, data_file)

# JSON Lines: one JSON object per line. The encoding is given explicitly so
# the text decodes the same way on every platform instead of depending on the
# locale default; blank lines (e.g. a trailing newline) are skipped rather
# than crashing json.loads.
with open("train.jsonl", "r", encoding="utf-8") as f:
    train = [json.loads(line) for line in f if line.strip()]

X_train_full = [item["text"] for item in train]
# Binary target: class 0 stays 0, any other value is mapped to 1.
y_train_full = [0 if item["class"] == 0 else 1 for item in train]

with open("test.jsonl", "r", encoding="utf-8") as f:
    test = [json.loads(line) for line in f if line.strip()]

X_test = [item["text"] for item in test]
test_ids = [item["id"] for item in test]

# Baseline classifier: default TF-IDF features feeding a logistic regression.
model = make_pipeline(
    TfidfVectorizer(),
    LogisticRegression(random_state=42, max_iter=1000),
)

print("Evaluating model performance...\n")

# --- Estimate 1: 5-fold stratified cross-validation scored with 'f1' ---
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
cv_scores = cross_val_score(
    model,
    X_train_full,
    y_train_full,
    scoring='f1',
    cv=cv,
)

print(f"Cross-validated F1 scores: {[round(s, 4) for s in cv_scores]}")
print(f"Mean F1: {round(cv_scores.mean(), 4)} (±{round(cv_scores.std(), 4)})")

# --- Estimate 2: one stratified 80/20 hold-out split for a detailed report ---
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    stratify=y_train_full,
    test_size=0.2,
    random_state=42,
)

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
val_preds = model.fit(X_train, y_train).predict(X_val)

print("\nValidation set performance:")
print(f"F1 Score: {round(f1_score(y_val, val_preds), 4)}")
print("Confusion Matrix:")
print(confusion_matrix(y_val, val_preds))
print("\nClassification Report:")
print(classification_report(y_val, val_preds, digits=4))

# --- Final model: refit on every labelled example, then label the test set ---
print("\nTraining final model on full dataset...")
model.fit(X_train_full, y_train_full)
test_preds = model.predict(X_test)

# One {"id", "class"} record per test example, in test-file order.
answer = [
    {"id": sample_id, "class": int(label)}
    for sample_id, label in zip(test_ids, test_preds)
]

print("\nFinal predictions:")
print(json.dumps(answer, indent=2))

ModuleNotFoundError: No module named 'sklearn'

In [None]:
import urllib.request
import json
import re
import emoji
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# ------------------------
# 1. Enhanced Text Preprocessing
# ------------------------
class MalayTextNormalizer(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that cleans raw text before vectorization.

    Every document goes through, in order: emoji replacement, lowercasing,
    expansion of the abbreviations in ``self.replacements``, shortening of
    stretched vowels, punctuation stripping and whitespace collapsing.
    """

    def __init__(self):
        # Regex pattern -> replacement text, applied in insertion order.
        # NOTE(review): 'wk' -> 'week' expands to an English word and
        # 'gk' -> 'juga' may be a mistaken mapping; confirm against the corpus.
        self.replacements = {
            r'\bwk\b': 'week',
            r'\bskg\b': 'sekarang',
            r'\bjt\b': 'juta',
            r'\bbyk\b': 'banyak',
            r'\bgk\b': 'juga',
            r'\btdk\b': 'tidak',
            r'\bkyk\b': 'seperti',
            r'\bbsr\b': 'besar',
            r'\bblh\b': 'boleh'
        }

    def fit(self, X, y=None):
        """Learn nothing; exists so the class can be used as a pipeline step."""
        return self

    def transform(self, X):
        """Return a list holding the normalized form of each document in ``X``."""
        return list(map(self._normalize_text, X))

    def _normalize_text(self, text):
        """Normalize a single document and return the cleaned string."""
        # Emojis are replaced by a space, then the text is lowercased so the
        # lowercase abbreviation patterns below can match.
        cleaned = emoji.replace_emoji(text, replace=' ').lower()

        # Expand whole-word abbreviations.
        for abbreviation, expansion in self.replacements.items():
            cleaned = re.sub(abbreviation, expansion, cleaned)

        # A run of three or more identical vowels is shortened to two.
        cleaned = re.sub(r'([aeiou])\1{2,}', r'\1\1', cleaned)

        # Every character that is neither a word character nor whitespace
        # (i.e. punctuation and symbols) becomes a space.
        cleaned = re.sub(r'[^\w\s]', ' ', cleaned)

        # Squeeze whitespace runs to one space and trim both ends.
        return re.sub(r'\s+', ' ', cleaned).strip()

# ------------------------
# 2. Download and load data
# ------------------------
# Fetch the competition files into the working directory (overwrites any
# existing copies, exactly like two explicit urlretrieve calls would).
DATA_BASE_URL = "https://storage.googleapis.com/aiolympiadmy/baku-or-pasar/"
for data_file in ("train.jsonl", "test.jsonl"):
    urllib.request.urlretrieve(DATA_BASE_URL + data_file, data_file)

# JSON Lines: one JSON object per line. The encoding is given explicitly so
# the text (including emoji) decodes the same way on every platform instead
# of depending on the locale default; blank lines are skipped rather than
# crashing json.loads.
with open("train.jsonl", "r", encoding="utf-8") as f:
    train = [json.loads(line) for line in f if line.strip()]

X_train = [item["text"] for item in train]
# Binary target: class 0 stays 0, any other value is mapped to 1.
y_train = [0 if item["class"] == 0 else 1 for item in train]

with open("test.jsonl", "r", encoding="utf-8") as f:
    test = [json.loads(line) for line in f if line.strip()]

test_ids = [item["id"] for item in test]

# ------------------------
# 3. Optimized Feature Engineering
# ------------------------
# Text normalization -> character n-gram TF-IDF (2 to 5 characters, built
# inside word boundaries) -> class-balanced logistic regression.
pipeline = make_pipeline(
    MalayTextNormalizer(),
    TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 5)),
    # liblinear shuffles the data internally; fixing random_state keeps the
    # fitted model (and therefore the grid-search ranking) reproducible.
    LogisticRegression(class_weight='balanced', solver='liblinear', random_state=42)
)

# ------------------------
# 4. Precision Parameter Tuning
# ------------------------
# Keys use the step names that make_pipeline derives from the class names.
param_grid = {
    'tfidfvectorizer__max_df': [0.75, 0.8],
    'tfidfvectorizer__min_df': [1, 2],
    'tfidfvectorizer__sublinear_tf': [True],
    'logisticregression__C': np.logspace(-2, 1, 10),
    'logisticregression__penalty': ['l1', 'l2']
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring='f1',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

print(f"Best F1: {grid_search.best_score_:.4f}")
best_model = grid_search.best_estimator_

# ------------------------
# 5. Final Predictions
# ------------------------
# Probability cut-off for predicting class 1 (lower than the default 0.5, so
# class 1 is predicted more readily).
# NOTE(review): hand-picked value; nothing in this cell derives it from
# validation data, and best_score_ above was measured at the default cut-off.
DECISION_THRESHOLD = 0.43

# Single best model from the grid search (no ensembling): threshold its
# class-1 probability instead of calling predict().
test_preds_proba = best_model.predict_proba([item["text"] for item in test])
test_preds = (test_preds_proba[:, 1] > DECISION_THRESHOLD).astype(int)

answer = [{"id": tid, "class": int(pred)} for tid, pred in zip(test_ids, test_preds)]

print(answer)

In [None]:
import urllib.request
import json
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score, classification_report
from sklearn.base import BaseEstimator, TransformerMixin

# ------------------------
# 1. Text Preprocessing
# ------------------------
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that cleans raw text before vectorization.

    Each document has punctuation/symbols replaced by spaces, is lowercased,
    has runs of a repeated character shortened, and is stripped of leading and
    trailing whitespace.
    """

    def __init__(self):
        """No hyperparameters to configure."""

    def fit(self, X, y=None):
        """Learn nothing; exists so the class can be used as a pipeline step."""
        return self

    def transform(self, X):
        """Return a list holding the cleaned form of each document in ``X``."""
        return [self._clean_text(text) for text in X]

    def _clean_text(self, text):
        """Clean a single document and return the resulting string."""
        # Replace everything that is neither a word character nor whitespace
        # (punctuation, symbols, emojis) with a space.
        text = re.sub(r'[^\w\s]', ' ', text)
        # Lowercase BEFORE collapsing repeats: the back-reference below is
        # case-sensitive, so a mixed-case stretch such as "Aaa" would
        # otherwise slip through and remain "aaa" in the output.
        text = text.lower()
        # Shorten any character repeated three or more times to two
        # (e.g., jaaa -> jaa).
        text = re.sub(r'(.)\1{2,}', r'\1\1', text)
        return text.strip()

# ------------------------
# 2. Download and load data
# ------------------------
# Fetch the competition files into the working directory (overwrites any
# existing copies, exactly like two explicit urlretrieve calls would).
DATA_BASE_URL = "https://storage.googleapis.com/aiolympiadmy/baku-or-pasar/"
for data_file in ("train.jsonl", "test.jsonl"):
    urllib.request.urlretrieve(DATA_BASE_URL + data_file, data_file)

# JSON Lines: one JSON object per line. The encoding is given explicitly so
# the text decodes the same way on every platform instead of depending on the
# locale default; blank lines (e.g. a trailing newline) are skipped rather
# than crashing json.loads.
with open("train.jsonl", "r", encoding="utf-8") as f:
    train = [json.loads(line) for line in f if line.strip()]

X_train_full = [item["text"] for item in train]
# Binary target: class 0 stays 0, any other value is mapped to 1.
y_train_full = [0 if item["class"] == 0 else 1 for item in train]

with open("test.jsonl", "r", encoding="utf-8") as f:
    test = [json.loads(line) for line in f if line.strip()]

test_ids = [item["id"] for item in test]

# ------------------------
# 3. Optimized Pipeline
# ------------------------
# Text cleaning -> word-level TF-IDF -> class-balanced logistic regression.
pipeline = make_pipeline(
    TextPreprocessor(),
    TfidfVectorizer(ngram_range=(1, 2)),  # Unigrams and bigrams
    LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42)
)

# ------------------------
# 4. Hyperparameter Tuning
# ------------------------
# Keys use the step names that make_pipeline derives from the class names.
param_grid = {
    'tfidfvectorizer__max_df': [0.85, 0.9],
    'tfidfvectorizer__min_df': [2, 3],
    'logisticregression__C': [0.5, 1, 2],
}

# One seeded splitter shared by the grid search and the follow-up
# cross-validation, so both are scored on the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=cv,
    scoring='f1',
    n_jobs=-1
)

print("Performing grid search...")
grid_search.fit(X_train_full, y_train_full)

print("\nBest parameters:")
print(grid_search.best_params_)
print(f"Best F1: {grid_search.best_score_:.4f}")

# ------------------------
# 5. Final Model
# ------------------------
# With the default refit=True, GridSearchCV has already retrained the best
# configuration on the full training set, so best_estimator_ is ready to use
# and no additional fit call is needed.
best_model = grid_search.best_estimator_

# Per-fold scores for the winning configuration (cross_val_score fits clones,
# leaving best_model itself untouched).
cv_scores = cross_val_score(
    best_model,
    X_train_full,
    y_train_full,
    cv=cv,
    scoring='f1'
)

print("\nOptimized model cross-validation:")
print(f"F1 scores: {[round(s, 4) for s in cv_scores]}")
# cv_scores is a NumPy array, so its own mean()/std() methods are used; this
# cell never imports numpy, and np.mean/np.std would raise NameError on a
# fresh kernel.
print(f"Mean F1: {round(cv_scores.mean(), 4)} (±{round(cv_scores.std(), 4)})")

# ------------------------
# 6. Generate Predictions
# ------------------------
test_preds = best_model.predict([item["text"] for item in test])

answer = [{"id": tid, "class": int(pred)} for tid, pred in zip(test_ids, test_preds)]

# Submit answer
print(answer)