## 1.2 Install and load libraries

In [1]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.16.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.40-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.39.0-py2.py3-none-any.whl (254 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m254.0/254.0 kB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->wa

In [2]:
import wandb
import pandas as pd
import numpy as np
import tempfile
import logging
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

## 1.3 Preprocessing

### 1.3.1 Download raw_data artifact from Wandb

In [3]:
# Log in to Weights & Biases; --relogin forces a fresh API-key prompt
# even if credentials are already stored.
!wandb login --relogin

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [4]:
# Settings for the artifacts handled by this preprocessing step.

# Reference to the source artifact.
# NOTE(review): the visible cells never read this value; the download cell
# hard-codes its own artifact path -- confirm which one is intended.
input_artifact = "the_simpsons_characters/raw_data.csv:latest"

# Name, type and description of the cleaned output artifact.
artifact_name = "preprocessed_data.csv"
artifact_type = "clean_data"
artifact_description = "Data after preprocessing"

### 1.3.2 Set up wandb project and clean the dataset

In [5]:
# Start a W&B run in the project, tagged with the "process_data" job type.
run = wandb.init(
    project="the_simpsons_characters",
    job_type="process_data",
)

[34m[1mwandb[0m: Currently logged in as: [33mnatalia-simoes[0m ([33mflateam[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
# Fetch the raw_data artifact; note the reference pins version v0, not "latest"
artifact = run.use_artifact('flateam/the_simpsons_characters/raw_data:v0', type='dataset')

# Load the artifact's file into a DataFrame
df = pd.read_csv(artifact.file())

In [8]:
# Remove exact duplicate rows, rebinding df instead of mutating it in place
df = df.drop_duplicates()

# Write the cleaned data to a local CSV named after the output artifact
df.to_csv(artifact_name, index=False)

In [9]:
# Inspect the dtype of each column
df.dtypes

img path     object
character    object
dtype: object

In [10]:
# Summarise the columns; .T transposes the result so each column becomes a row.
df.describe().T

Unnamed: 0,count,unique,top,freq
img path,20975,20975,/content/gdrive/My Drive/the_simpsons_cnn/data...,1
character,20975,43,homer_simpson,2246


In [11]:
# Show the distinct values of each column.
# Printing every value floods the output for high-cardinality columns (the
# image-path column has one distinct value per row), so report the total count
# and only a bounded preview of the values.
max_preview = 50
for column in df.columns:
    distinct = df[column].unique()
    print(column, ":", len(distinct), "unique values; first",
          min(len(distinct), max_preview), ":", distinct[:max_preview].tolist())

Output hidden; open in https://colab.research.google.com to view.

In [12]:
# Preview the first rows of the cleaned data
df.head()

Unnamed: 0,img path,character
0,/content/gdrive/My Drive/the_simpsons_cnn/data...,abraham_grampa_simpson
1,/content/gdrive/My Drive/the_simpsons_cnn/data...,abraham_grampa_simpson
2,/content/gdrive/My Drive/the_simpsons_cnn/data...,abraham_grampa_simpson
3,/content/gdrive/My Drive/the_simpsons_cnn/data...,abraham_grampa_simpson
4,/content/gdrive/My Drive/the_simpsons_cnn/data...,abraham_grampa_simpson


In [13]:
# Configure logging so progress messages carry a timestamp.
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s %(message)s",
                    datefmt='%d-%m-%Y %H:%M:%S')

# Root logger used for the progress message below.
logger = logging.getLogger()

# Stage the cleaned CSV in a temporary directory and upload it as a W&B artifact.
with tempfile.TemporaryDirectory() as tmp_dir:
    temp_path = os.path.join(tmp_dir, artifact_name)
    df.to_csv(temp_path, index=False)

    # Take the description from the configuration cell rather than a separate
    # hard-coded string, so all artifact settings live in one place.
    artifact = wandb.Artifact(
        name=artifact_name,
        type=artifact_type,
        description=artifact_description,
    )
    artifact.add_file(temp_path)

    logger.info("Logging artifact")
    run.log_artifact(artifact)

    # This waits for the artifact to be uploaded to W&B. Without it, the
    # temp directory might be removed before W&B had a chance to upload
    # the file, and the upload might fail.
    artifact.wait()

In [14]:
# The artifact was already logged (and waited on) in the previous cell, so it
# is not logged a second time here; just display it to confirm.
artifact

<Artifact QXJ0aWZhY3Q6NjY1NjE3NjU5>

In [15]:
# Close the run.
# Wait a moment after running the previous cell before executing this one.
run.finish()

VBox(children=(Label(value='2.408 MB of 2.417 MB uploaded\r'), FloatProgress(value=0.9963187574069814, max=1.0…