## Set up the Neptune API token environment variable

In [19]:
import os
from getpass import getpass
# Ask for the Neptune API token only when the environment does not already
# provide one, so a pre-configured kernel (or a non-interactive "Run All")
# is not blocked on a prompt and an existing token is not overwritten.
# getpass reads the value without echoing it into the cell output.
if not os.environ.get("NEPTUNE_API_TOKEN"):
    os.environ["NEPTUNE_API_TOKEN"] = getpass("Enter your API key here: ")

## Initialize Neptune objects and validate the datasets

In [20]:
import neptune

# Workspace/project identifier shared by the run and the project handle
# created in this cell, kept in one place so the two cannot drift apart.
PROJECT_NAME = "dagm.solska/ag-news-classification"

# Start the run that records this experiment's metadata.
run = neptune.init_run(
    name="Random Forest with preprocessing",
    tags=["RandomForest", "classification", "preprocessing"],
    project=PROJECT_NAME,
)

# Project-level handle; read below for the reference dataset hashes.
project = neptune.init_project(project=PROJECT_NAME)

# Log the current dataset files as artifacts on the run.
# wait=True so each call returns only after the operation has been
# synchronized with Neptune, before the hashes are fetched below.
TRAIN_DATASET_PATH = "data_train.csv"
run["datasets/train"].track_files(TRAIN_DATASET_PATH, wait=True)
TEST_DATASET_PATH = "data_test.csv"
run["datasets/test"].track_files(TEST_DATASET_PATH, wait=True)

# Fail fast if the local files are not the versions registered as "latest"
# at project level; the message names the dataset that is out of date.
assert (
    run["datasets/train"].fetch_hash()
    == project["dataset_train/latest"].fetch_hash()
), f"{TRAIN_DATASET_PATH} does not match the project's dataset_train/latest artifact"
assert (
    run["datasets/test"].fetch_hash()
    == project["dataset_test/latest"].fetch_hash()
), f"{TEST_DATASET_PATH} does not match the project's dataset_test/latest artifact"

https://app.neptune.ai/dagm.solska/ag-news-classification/e/AG-14
https://app.neptune.ai/dagm.solska/ag-news-classification/


## Prepare data (preprocessing)

In [21]:
import pandas as pd

# Load exactly the files that were tracked and hash-checked against the
# project earlier, instead of repeating the file names as separate literals.
data_train = pd.read_csv(TRAIN_DATASET_PATH)
data_test = pd.read_csv(TEST_DATASET_PATH)

# Preprocessing: lower-case the text column with the vectorized string
# accessor. Unlike a per-element lambda, missing values pass through as NaN
# instead of raising AttributeError.
data_train["text"] = data_train["text"].str.lower()
data_test["text"] = data_test["text"].str.lower()

# Features are the text column; targets are the label column.
X_train, y_train = data_train["text"], data_train["label"]
X_test, y_test = data_test["text"], data_test["label"]

## Train new model with parameters based on data from Neptune and log results to Neptune

In [22]:
import neptune.integrations.sklearn as npt_utils
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Fixed seed so the random forest is reproducible across kernel restarts;
# without it every re-run trains a different forest and logs different
# metrics for the same data and hyperparameters.
RANDOM_SEED = 42

# Token counts -> TF-IDF weighting -> random forest classifier.
text_clf = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        (
            "clf",
            RandomForestClassifier(
                n_estimators=200,
                min_samples_split=5,
                random_state=RANDOM_SEED,
            ),
        ),
    ]
)

text_clf.fit(X_train, y_train)

# Store the summary built by the Neptune scikit-learn integration under the
# run's "cls_summary" namespace.
run["cls_summary"] = npt_utils.create_classifier_summary(
    text_clf, X_train, X_test, y_train, y_test
)



## Upload as the first production model

In [23]:
from neptune.exceptions import NeptuneModelKeyAlreadyExistsError

# Register the model in the project's model registry. Creating a model with
# a key that is already taken raises, which made this cell fail on every
# re-run; in that case reconnect to the existing model instead.
try:
    model = neptune.init_model(
        name="Initial model",
        project="dagm.solska/ag-news-classification",
        key="AG",
    )
except NeptuneModelKeyAlreadyExistsError:
    # Model ID is "<project key>-<model key>", i.e. "AG-AG" for this project.
    model = neptune.init_model(
        with_id="AG-AG",
        project="dagm.solska/ag-news-classification",
    )

https://app.neptune.ai/dagm.solska/ag-news-classification/m/AG-AG


In [24]:
# Create a new version of the registered "AG-AG" model and link it back to
# the training run by storing the run's ID and URL on the version.
# NOTE(review): only the run reference is attached in this cell; no model
# file is uploaded here — confirm that this is intended.
model_version = neptune.init_model_version(
    project="dagm.solska/ag-news-classification",
    model="AG-AG",
)

run_id = run["sys/id"].fetch()
run_url = run.get_url()

model_version["run/id"] = run_id
model_version["run/url"] = run_url

https://app.neptune.ai/dagm.solska/ag-news-classification/m/AG-AG/v/AG-AG-1


## Log notebook to the app and close Neptune objects

In [25]:
# Store a copy of this notebook at project level.
project["initial_model_with_preprocessing"].upload(
    "ag_news_classification_preprocessing.ipynb"
)

# Stop every Neptune object opened in this notebook so queued operations are
# synchronized before the session ends. This includes the model version,
# which has pending writes (run/id, run/url) of its own.
model_version.stop()
model.stop()
run.stop()
project.stop()

Shutting down background jobs, please wait a moment...
Done!
All 0 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/dagm.solska/ag-news-classification/m/AG-AG/metadata
Shutting down background jobs, please wait a moment...
Done!
All 0 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/dagm.solska/ag-news-classification/e/AG-14/metadata
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 1 operations to synchronize with Neptune. Do not kill this process.
All 1 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/dagm.solska/ag-news-classification/metadata
