# 1. Installing libraries

In [1]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.16.0-py3-none-any.whl (2.1 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/2.1 MB[0m [31m3.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/2.1 MB[0m [31m15.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.1/2.1 MB[0m [31m25.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.40-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-

In [2]:
!pip install pytest pytest-sugar

Collecting pytest-sugar
  Downloading pytest_sugar-0.9.7-py2.py3-none-any.whl (10 kB)
Installing collected packages: pytest-sugar
Successfully installed pytest-sugar-0.9.7


In [3]:
import wandb

In [8]:
# Log in to Weights & Biases. The command prompts for the API key
# interactively, so no key is stored in the notebook source.
# NOTE(review): --relogin presumably forces the prompt even when a key is
# already saved -- confirm with `wandb login --help`.
!wandb login --relogin

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


# 2. Pytest

## 2.1 Tests



In [9]:
#CREATE A FILE TO PERFORM THE TEST
%%file test_data.py
import pytest
import wandb
import pandas as pd

# Start one W&B run when this module is imported (project "divorce_kmeans",
# job type "data_checks"). It is kept at module level so that all tests in
# this file are collected under the same run.
run = wandb.init(project="divorce_kmeans", job_type="data_checks")

@pytest.fixture(scope="session")
def data():
    """
    Session-scoped fixture that provides the dataset under test.

    The "divorce_kmeans/preprocessed_data.csv:latest" artifact is requested
    through the module-level W&B run, resolved to a file path with
    ``.file()``, and read into a pandas DataFrame.
    """
    artifact = run.use_artifact("divorce_kmeans/preprocessed_data.csv:latest")
    csv_path = artifact.file()
    return pd.read_csv(csv_path)

def test_data_length(data):
    """
    We test that we have enough data to continue
    """
    assert len(data) > 149


def test_number_of_columns(data):
    """
    We test that we have enough data to continue
    """
    assert data.shape[1] == 55

def test_column_presence_and_type(data):
    """
    We test with each column has the right type
    """
    required_columns = {
        "Q1": pd.api.types.is_int64_dtype,
        "Q2": pd.api.types.is_int64_dtype,
        "Q3": pd.api.types.is_int64_dtype,
        "Q4": pd.api.types.is_int64_dtype,
        "Q5": pd.api.types.is_int64_dtype,
        "Q6": pd.api.types.is_int64_dtype,
        "Q7": pd.api.types.is_int64_dtype,
        "Q8": pd.api.types.is_int64_dtype,
        "Q10": pd.api.types.is_int64_dtype,
        "Q11": pd.api.types.is_int64_dtype,
        "Q12": pd.api.types.is_int64_dtype,
        "Q13": pd.api.types.is_int64_dtype,
        "Q14": pd.api.types.is_int64_dtype,
        "Q15": pd.api.types.is_int64_dtype,
        "Q16": pd.api.types.is_int64_dtype,
        "Q17": pd.api.types.is_int64_dtype,
        "Q18": pd.api.types.is_int64_dtype,
        "Q19": pd.api.types.is_int64_dtype,
        "Q20": pd.api.types.is_int64_dtype,
        "Q30": pd.api.types.is_int64_dtype,
        "Q31": pd.api.types.is_int64_dtype,
        "Q32": pd.api.types.is_int64_dtype,
        "Q33": pd.api.types.is_int64_dtype,
        "Q34": pd.api.types.is_int64_dtype,
        "Q35": pd.api.types.is_int64_dtype,
        "Q36": pd.api.types.is_int64_dtype,
        "Q37": pd.api.types.is_int64_dtype,
        "Q38": pd.api.types.is_int64_dtype,
        "Q39": pd.api.types.is_int64_dtype,
        "Q40": pd.api.types.is_int64_dtype,
        "Q41": pd.api.types.is_int64_dtype,
        "Q42": pd.api.types.is_int64_dtype,
        "Q43": pd.api.types.is_int64_dtype,
        "Q44": pd.api.types.is_int64_dtype,
        "Q45": pd.api.types.is_int64_dtype,
        "Q46": pd.api.types.is_int64_dtype,
        "Q47": pd.api.types.is_int64_dtype,
        "Q48": pd.api.types.is_int64_dtype,
        "Q49": pd.api.types.is_int64_dtype,
        "Q50": pd.api.types.is_int64_dtype,
        "Divorce": pd.api.types.is_int64_dtype
    }

    # Check column presence
    assert set(data.columns.values).issuperset(set(required_columns.keys()))

    for col_name, format_verification_funct in required_columns.items():

        assert format_verification_funct(data[col_name]), f"Column {col_name} failed test {format_verification_funct}"


def test_class_names(data):

    # Check that only the known classes are present
    known_classes = [0,1]

    assert data["Divorce"].isin(known_classes).all()


def test_column_ranges(data):

    ranges = {
        "Divorce": (0, 1)
    }

    for col_name, (minimum, maximum) in ranges.items():

        assert data[col_name].dropna().between(minimum, maximum).all(), (
            f"Column {col_name} failed the test. Should be between {minimum} and {maximum}, "
            f"instead min={data[col_name].min()} and max={data[col_name].max()}"
        )

Overwriting test_data.py


In [10]:
# Run pytest on the current directory (where test_data.py was written above)
# with extra-verbose (-vv) output.
!pytest . -vv

[1mTest session starts (platform: linux, Python 3.10.12, pytest 7.4.3, pytest-sugar 0.9.7)[0m
cachedir: .pytest_cache
rootdir: /content
plugins: sugar-0.9.7, anyio-3.7.1
collected 5 items                                                                                  [0m

 [36mtest_data.py[0m::test_data_length[0m [32m✓[0m                                                     [32m20% [0m[40m[32m█[0m[40m[32m█        [0m
 [36mtest_data.py[0m::test_number_of_columns[0m [32m✓[0m                                               [32m40% [0m[40m[32m█[0m[40m[32m█[0m[40m[32m█[0m[40m[32m█      [0m
 [36mtest_data.py[0m::test_column_presence_and_type[0m [32m✓[0m                                        [32m60% [0m[40m[32m█[0m[40m[32m█[0m[40m[32m█[0m[40m[32m█[0m[40m[32m█[0m[40m[32m█    [0m
 [36mtest_data.py[0m::test_class_names[0m [32m✓[0m                                                     [32m80% [0m[40m[32m█[0m[40m[32m█[0m[40m[3

In [13]:


# Compute the pairwise correlation matrix of the columns of `df`.
# NOTE(review): `df`, `plt` and `sns` are not defined in any cell visible here
# and the recorded output of this cell is a NameError -- confirm that an
# earlier cell loads `df` and imports the plotting libraries (presumably
# matplotlib.pyplot as plt and seaborn as sns) before this cell runs.
correl = df.corr()

# Set the figure size
plt.figure(figsize=(30, 20))

# Draw the heatmap with each cell annotated with its value
sns.heatmap(correl, annot=True)

# Display the figure
plt.show()

NameError: ignored