## Load and split the data, and save it for further use

In [51]:
from datasets import load_dataset
# Fetch the AG News dataset (provides "train" and "test" splits)
dataset = load_dataset("ag_news")

# Show the class names that the integer labels map to
label_names = dataset["train"].features["label"].names
print(label_names)

# Materialize each split as a pandas DataFrame
data_train = dataset["train"].to_pandas()
data_test = dataset["test"].to_pandas()

# Separate the text inputs from the label targets for each split
X_train = data_train["text"]
y_train = data_train["label"]
X_test = data_test["text"]
y_test = data_test["label"]

# Persist both splits as CSV files in the current working directory.
# NOTE(review): with pandas defaults the DataFrame index is written as an extra
# unnamed first column - confirm downstream readers expect that.
data_train.to_csv("data_train.csv")
data_test.to_csv("data_test.csv")

['World', 'Sports', 'Business', 'Sci/Tech']


## Prepare the initial model

In [50]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Train a simple baseline classifier: token counts -> TF-IDF weighting -> decision tree
text_clf = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        # Fix the seed: the tree's split search is randomized, so without it
        # the fitted model (and the logged summary) can differ between re-runs.
        ("clf", DecisionTreeClassifier(random_state=42)),
    ]
)

# Fit on the training split; as the last expression, the fitted pipeline is displayed
text_clf.fit(X_train, y_train)

## Set environment variable

In [54]:
import os
from getpass import getpass
# Prompt for the token only when it is not already set (or is empty), so that
# re-running the cell does not overwrite a configured token and an exported
# NEPTUNE_API_TOKEN lets the notebook run without interactive input.
# getpass keeps the token out of the notebook source and rendered output.
if not os.environ.get("NEPTUNE_API_TOKEN"):
    os.environ["NEPTUNE_API_TOKEN"] = getpass("Enter your API key here: ")


## Log the initial model to the Neptune app

In [55]:
import neptune
import neptune.integrations.sklearn as npt_utils

# Create the initial Neptune run and log the classifier summary using the sklearn integration
run = neptune.init_run(
    name="initial run",
    tags=["DecisionTreeClassifier", "classification"],
    project="dagm.solska/ag-news-classification",
)

try:
    # Build the summary for the fitted pipeline from the train/test splits
    # and store it under the "cls_summary" namespace of the run.
    run["cls_summary"] = npt_utils.create_classifier_summary(
        text_clf, X_train, X_test, y_train, y_test
    )
finally:
    # Always stop logging to the active run, even if building the summary
    # raises, so the run is not left open in the kernel.
    run.stop()

https://app.neptune.ai/dagm.solska/ag-news-classification/e/AG-8
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 64 operations to synchronize with Neptune. Do not kill this process.
All 64 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/dagm.solska/ag-news-classification/e/AG-8/metadata


## As this is the initial notebook, upload it to the project and track the datasets

In [56]:
# Open the Neptune project to store project-level metadata
project = neptune.init_project(project="dagm.solska/ag-news-classification")

try:
    # Upload this notebook file to the project
    project["initial_experiment"].upload("ag_news_classification_task.ipynb")

    # Track the saved CSV splits as version v0.1.
    # NOTE(review): wait=True is presumably there so tracking is synced before
    # the fetch calls below - confirm against the Neptune docs.
    project["dataset_train/v0.1"].track_files("data_train.csv", wait=True)
    project["dataset_test/v0.1"].track_files("data_test.csv", wait=True)

    # Point the "latest" fields at the v0.1 dataset versions
    project["dataset_train/latest"] = project["dataset_train/v0.1"].fetch()
    project["dataset_test/latest"] = project["dataset_test/v0.1"].fetch()
finally:
    # Always stop the Neptune project object, even if an upload or tracking call fails
    project.stop()

https://app.neptune.ai/dagm.solska/ag-news-classification/
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 2 operations to synchronize with Neptune. Do not kill this process.
All 2 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/dagm.solska/ag-news-classification/metadata
