## Preliminaries

A Bayesian classification example that illustrates the **sheets/cards** family.

In [1]:
# Delete every *.sh file in the current working directory; presumably this clears
# set-up scripts left behind by a previous run, before a fresh copy is fetched.
!rm -rf *.sh

<br>

### Packages

In [2]:
import subprocess

In [3]:
# On Google Colab only: download the project's set-up script, make it executable, and run it.
# check=True raises subprocess.CalledProcessError as soon as a step fails, rather than
# silently continuing to the next step with a missing or broken script.
if 'google.colab' in str(get_ipython()):
    for command in ('wget -q https://raw.githubusercontent.com/exhypotheses/beans/develop/scripts.sh',
                    'chmod u+x scripts.sh',
                    './scripts.sh'):
        subprocess.run(command, shell=True, check=True)

<br>

### Paths

In [4]:
import os
import pathlib
import sys

In [5]:
# The current working directory is the starting point in both environments
notebooks = os.getcwd()

if 'google.colab' in str(get_ipython()):
    # Google Colab: the working directory itself serves as the parent directory
    parent = notebooks
else:
    # Elsewhere: the parent is one level above the working directory, and it is added
    # to the module search path.  The membership check keeps sys.path free of
    # duplicates when this cell is re-run.
    parent = str(pathlib.Path(notebooks).parent)
    if parent not in sys.path:
        sys.path.append(parent)

<br>

Hence

In [6]:
# Directory for the files written for modelling
warehouse_directory = os.path.join(parent, 'warehouse', 'data')

# Directory for the local copy of the raw data
raw_data_directory = os.path.join(parent, 'data')

# exist_ok=True creates a directory only if it is missing, without a separate
# check-then-create step that can race with another process.
for directory in (warehouse_directory, raw_data_directory):
    os.makedirs(directory, exist_ok=True)

<br>

### Libraries

In [7]:
import collections
import io
import json
import logging

import numpy as np
import pandas as pd

import sklearn.preprocessing

<br>

### Logging

In [8]:
# Root logging set-up at INFO level: each record is emitted as the message followed,
# on the next line, by a timestamp with millisecond precision.
logging.basicConfig(
    level=logging.INFO,
    format='%(message)s\n%(asctime)s.%(msecs)03d',
    datefmt='%Y-%m-%d %H:%M:%S'
)

# The logger used by the cells of this notebook
logger = logging.getLogger(__name__)

<br>

### Custom

In [9]:
import beans.src.archetype

<br>
<br>

## Data

### Raw

Set-up

* `https://archive.ics.uci.edu/ml/machine-learning-databases/00602/DryBeanDataset.zip`

* `https://github.com/miscellane/hub/raw/develop/data/beans/beans.zip`

In [10]:
# Location of the comma separated beans data
url = 'https://raw.githubusercontent.com/miscellane/hub/develop/data/beans/beans.csv'

# The fields to read
usecols = ['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength', 'AspectRation', 'Eccentricity', 'ConvexArea', 'EquivDiameter',
            'Extent', 'Solidity', 'roundness', 'Compactness', 'ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3', 'ShapeFactor4', 'Class']

# The type of each field.  The built-in int & float replace np.int & np.float: those aliases
# were deprecated in NumPy 1.20 and removed in NumPy 1.24, and they referred to the same built-in types.
dtype = {'Area': int, 'Perimeter': float, 'MajorAxisLength': float, 'MinorAxisLength': float, 'AspectRation': float,
            'Eccentricity': float, 'ConvexArea': int, 'EquivDiameter': float, 'Extent': float, 'Solidity': float,
            'roundness': float, 'Compactness': float, 'ShapeFactor1': float, 'ShapeFactor2': float, 'ShapeFactor3': float,
            'ShapeFactor4': float, 'Class': str}

<br>

Read

In [11]:
# Read the data.  If the read fails with an OSError, an Exception carrying the error's
# message is raised, with the original error chained as its cause.
try:
    data = pd.read_csv(filepath_or_buffer=url, header=0, usecols=usecols, dtype=dtype, encoding='utf-8')
except OSError as err:
    raise Exception(err.strerror) from err

# DataFrame.info() writes its summary to a buffer (stdout by default) and returns None;
# hence the summary is captured in a text buffer, and the captured text is logged.
summary = io.StringIO()
data.info(buf=summary)
logger.info(summary.getvalue())

None
2021-06-04 07:33:56.846


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13611 entries, 0 to 13610
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Area             13611 non-null  int32  
 1   Perimeter        13611 non-null  float64
 2   MajorAxisLength  13611 non-null  float64
 3   MinorAxisLength  13611 non-null  float64
 4   AspectRation     13611 non-null  float64
 5   Eccentricity     13611 non-null  float64
 6   ConvexArea       13611 non-null  int32  
 7   EquivDiameter    13611 non-null  float64
 8   Extent           13611 non-null  float64
 9   Solidity         13611 non-null  float64
 10  roundness        13611 non-null  float64
 11  Compactness      13611 non-null  float64
 12  ShapeFactor1     13611 non-null  float64
 13  ShapeFactor2     13611 non-null  float64
 14  ShapeFactor3     13611 non-null  float64
 15  ShapeFactor4     13611 non-null  float64
 16  Class            13611 non-null  object 
dtypes: float64(1

<br>

Write

In [12]:
# Save a copy of the data, as read, in the raw data directory
raw_data_file = os.path.join(raw_data_directory, 'beans.csv')
data.to_csv(path_or_buf=raw_data_file, header=True, index=False, encoding='utf-8')

<br>
<br>

## Features

### Renaming

In [13]:
# Lower-case every field name, then correct the misspelt 'aspectration' field name
data = data.rename(columns=str.lower).rename(columns={'aspectration': 'aspectratio'})

# DataFrame.info() writes its summary to a buffer (stdout by default) and returns None;
# hence the summary is captured in a text buffer, and the captured text is logged.
summary = io.StringIO()
data.info(buf=summary)
logger.info(summary.getvalue())

None
2021-06-04 07:33:57.298


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13611 entries, 0 to 13610
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   area             13611 non-null  int32  
 1   perimeter        13611 non-null  float64
 2   majoraxislength  13611 non-null  float64
 3   minoraxislength  13611 non-null  float64
 4   aspectratio      13611 non-null  float64
 5   eccentricity     13611 non-null  float64
 6   convexarea       13611 non-null  int32  
 7   equivdiameter    13611 non-null  float64
 8   extent           13611 non-null  float64
 9   solidity         13611 non-null  float64
 10  roundness        13611 non-null  float64
 11  compactness      13611 non-null  float64
 12  shapefactor1     13611 non-null  float64
 13  shapefactor2     13611 non-null  float64
 14  shapefactor3     13611 non-null  float64
 15  shapefactor4     13611 non-null  float64
 16  class            13611 non-null  object 
dtypes: float64(1

<br>

### Write

For modelling

In [14]:
# Save the data, with its current field names, in the warehouse directory
baseline_file = os.path.join(warehouse_directory, 'baseline.csv')
data.to_csv(path_or_buf=baseline_file, header=True, index=False, encoding='utf-8')

<br>

### Fields Help

In [15]:
# The name of the target field, i.e., the field that holds the class labels
target = 'class'

In [16]:
# Set the target aside, then list the names of the remaining fields whose type is not object
predictors = data.drop(columns=target)
numeric = predictors.select_dtypes(exclude=object).columns.to_list()

In [17]:
# A single-entry list that describes the modelling file: its name, its numeric
# fields, its categorical fields (none), and its target field.
dictionary = [{'source': 'baseline.csv', 'numeric': numeric, 'categorical': [], 'target': target}]

In [18]:
# Save the fields description, as JSON, in the warehouse directory
fields_file = os.path.join(warehouse_directory, 'fields.json')
with open(fields_file, 'w') as disk:
    json.dump(dictionary, disk)