In [1]:
import numpy as np
import pandas as pd

import scipy as sp
import scipy.stats as stats

import arviz as az
import pymc as pm

import matplotlib.pyplot as plt
import seaborn as sns

# Apply matplotlib's 'ggplot' style sheet so figures created after this
# point share one consistent look.
plt.style.use('ggplot')

## Data

In [3]:
import zipfile, glob
from pathlib import Path
from urllib.request import urlretrieve
from urllib.parse import urljoin

In [4]:
# Fetch and unpack the dataset archive into a local `data/` directory.
# The cell is safe to re-run: the download is skipped when the zip is already
# present, and extraction simply overwrites the previously extracted files.
url = 'https://archive.ics.uci.edu/static/public/17/breast+cancer+wisconsin+diagnostic.zip'
data_path = Path('data')
# Local archive name is the last path component of the URL.
filename = data_path / url.split('/')[-1]

if not data_path.exists():
    print('Creating directory')
    # exist_ok/parents keep this robust if the directory appears between the
    # check and the call, or if `data_path` is later changed to a nested path.
    data_path.mkdir(parents=True, exist_ok=True)
if not filename.exists():
    print('Downloading...', url)
    # Download under a temporary name and rename only on success. Writing
    # straight to `filename` would leave a truncated zip behind if the transfer
    # were interrupted; later runs would then skip the download (the file
    # "exists") and fail while extracting.
    partial = filename.with_name(filename.name + '.part')
    try:
        urlretrieve(url, partial)
        partial.replace(filename)
    finally:
        # Only present here if the download or the rename failed.
        if partial.exists():
            partial.unlink()

if filename.exists():
    # Derived from `data_path` instead of repeating the literal, so the two
    # cannot drift apart; kept as a str because it is concatenated below.
    extract_dir = str(data_path)
    print('Extracting files...')
    with zipfile.ZipFile(filename) as zf:
        zf.extractall(extract_dir)
    # List everything now under the data directory (the directory itself,
    # the archive and the extracted members).
    for file in glob.glob(extract_dir + '/**', recursive=True):
        print(file)

print('...DONE')

Creating directory
Downloading... https://archive.ics.uci.edu/static/public/17/breast+cancer+wisconsin+diagnostic.zip
Extracting files...
data/
data/wdbc.data
data/breast+cancer+wisconsin+diagnostic.zip
data/wdbc.names
...DONE


## EDA

In [10]:
# Load the raw data file extracted by the download cell. `header=None` tells
# pandas the file has no header row, so columns get integer labels (0..31 in
# the preview below).
# Presumably column 0 is a sample ID and column 1 the diagnosis label --
# TODO confirm against data/wdbc.names.
# NOTE(review): the variable name is uninformative; consider a descriptive one
# (e.g. `wdbc_raw`) once every later cell that references it can be updated.
tits = pd.read_csv('data/wdbc.data', header=None)
# Preview the first five rows.
tits.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
