In [5]:
# API key for the competition server; post_answer sends it in the "X-API-Key" header.
# NOTE(review): this credential is hard-coded in the notebook. Consider loading it
# from an environment variable before sharing or committing this file.
auth_token = "maio_cf75a54b_d24c_4ca3_b9c2_4b90d9d61d6f"
import requests

# Endpoint that post_answer POSTs the submission payload to.
url = "https://competitions.aiolympiad.my/api/maio_2025/maio_2025_baku_or_pasar"

def post_answer(data: dict, timeout: float = 60):
    """Submit ``data`` as a JSON body to the competition endpoint.

    The request is a POST to the module-level ``url`` with the module-level
    ``auth_token`` sent in the ``X-API-Key`` header.

    Args:
        data: JSON-serialisable payload, e.g. ``{"solution": answer}``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits forever, which would freeze the
            notebook if the server stalls.

    Returns:
        The decoded JSON response when the server answers with HTTP 200,
        otherwise a string containing the status code and response text.

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    response = requests.post(
        url=url,
        json=data,
        headers={"X-API-Key": auth_token},
        timeout=timeout,
    )
    if response.status_code == 200:
        return response.json()
    # Non-200: report the failure as text rather than raising, so the caller
    # can simply display the result.
    return f"Failed to submit, status code is {response.status_code}\n{response.text}"

train.jsonl: https://storage.googleapis.com/aiolympiadmy/baku-or-pasar/train.jsonl

test.jsonl: https://storage.googleapis.com/aiolympiadmy/baku-or-pasar/test.jsonl

Using the data in `train.jsonl`, use any classifier to generate class predictions of 0 or 1 for the text in `test.jsonl`. Write your output in the format below, and submit it to our competition server using the provided `post_answer` function.

```python
answer = [
  {'id': 0, 'class': 0},
  {'id': 1, 'class': 0},
  {'id': 2, 'class': 0},
  {'id': 3, 'class': 0},
  {'id': 4, 'class': 0},
  {'id': 5, 'class': 0},
  ...
]

post_answer({"solution": answer})
```

Make sure your answer has the same format as this sample answer:

```python
# Note: if submitted, this sample answer will return a score of 0 because of how the F1 score works! Do you know why?
sample_answer = [{"id": i, "class": 0} for i in range(len(test))]
```

Remember to only produce predictions that are either 0 or 1!

This challenge will be graded primarily via API submission. Scoring is as follows:

- Up to 10 pts for model performance, F1 score X 10. F1 score will be calculated via the autograder. 10 submissions allowed. Last submission will be used for scoring.
- Please submit your notebook as well! Random notebooks will be inspected to verify API submission results. In addition, partial credit may be granted for incomplete work at the graders' discretion.

In [None]:
#my work
import urllib.request
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.metrics import f1_score, confusion_matrix, classification_report

# Fetch the competition data files and save them next to the notebook.
# Each entry pairs the remote URL with the local filename it is stored under.
for remote_url, local_name in (
    ("https://storage.googleapis.com/aiolympiadmy/baku-or-pasar/train.jsonl", "train.jsonl"),
    ("https://storage.googleapis.com/aiolympiadmy/baku-or-pasar/test.jsonl", "test.jsonl"),
):
    urllib.request.urlretrieve(remote_url, local_name)

def _read_jsonl(path):
    """Return the JSON objects stored one per line in the file at ``path``.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default encoding, and blank lines (such as a trailing
    newline at the end of the file) are skipped instead of crashing
    ``json.loads``.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


# Training records: the input text and a binary label derived from "class".
train = _read_jsonl("train.jsonl")
X_train_full = [item["text"] for item in train]
# A "class" of 0 stays 0; any other value is mapped to 1.
y_train_full = [0 if item["class"] == 0 else 1 for item in train]

# Test records: the input text and the ids needed to build the submission.
test = _read_jsonl("test.jsonl")
X_test = [item["text"] for item in test]
test_ids = [item["id"] for item in test]

# Text classifier: TF-IDF features fed into a logistic regression.
tfidf_step = TfidfVectorizer()
logreg_step = LogisticRegression(max_iter=1000, random_state=42)
model = make_pipeline(tfidf_step, logreg_step)

print("Evaluating model performance...\n")

# Cross-validated F1 score over 5 shuffled, stratified folds of the training data.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X_train_full, y_train_full, cv=cv, scoring='f1')

# Convert each score to a built-in float before rounding: rounding a NumPy
# scalar yields another NumPy scalar, and with NumPy >= 2 a list of those
# prints as "[np.float64(0.91), ...]" instead of plain numbers.
print(f"Cross-validated F1 scores: {[round(float(s), 4) for s in cv_scores]}")
print(f"Mean F1: {round(cv_scores.mean(), 4)} (±{round(cv_scores.std(), 4)})")

# Hold-out check: keep 20% of the training data aside (stratified by label)
# and measure how the model does on it.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    random_state=42,
    stratify=y_train_full,
    test_size=0.2,
)

# fit() returns the fitted pipeline, so fitting and predicting can be chained.
val_preds = model.fit(X_train, y_train).predict(X_val)

print("\nValidation set performance:")
print(f"F1 Score: {round(f1_score(y_val, val_preds), 4)}")
print("Confusion Matrix:", confusion_matrix(y_val, val_preds), sep="\n")
print("\nClassification Report:", classification_report(y_val, val_preds, digits=4), sep="\n")

# ------------------------
# Final predictions and submission
# ------------------------
# Refit on every training example, then label the test set.
print("\nTraining final model on full dataset...")
test_preds = model.fit(X_train_full, y_train_full).predict(X_test)

# One {"id", "class"} record per test row; int() turns each predicted label
# into a plain integer so the payload can be serialised as JSON.
answer = [
    {"id": row_id, "class": int(label)}
    for row_id, label in zip(test_ids, test_preds)
]

print("\nFinal predictions:", json.dumps(answer, indent=2), sep="\n")

# Every execution of this line sends a submission to the competition server.
post_answer({"solution": answer})