# Copernicus Test: Tabular
In this notebook we experiment with several datasets whose features are continuous only, categorical only, or a mix of both. This allows us to make our Tabular script more robust to dynamic variable settings.

All the files we use live in `./data`. To download them, open the `copernicus_datasets` notebook, which shows step by step how to download each file, or simply run its cells.

In [1]:
from exp.nb_copernicusTabular import *
import pandas as pd
import os
from path import Path

In [2]:
# Every dataset used below lives under this root directory
DATA_ROOT = './data'
root = Path(DATA_ROOT)

# Show the entries available under the root
root.listdir()

[Path('./data\\car_evaluation.csv'),
 Path('./data\\clinvar'),
 Path('./data\\eeg'),
 Path('./data\\ibm_attrition'),
 Path('./data\\kepler'),
 Path('./data\\kepler_exoplanet'),
 Path('./data\\lending_club'),
 Path('./data\\lol'),
 Path('./data\\MNIST'),
 Path('./data\\mobile_pricce'),
 Path('./data\\mushroom'),
 Path('./data\\ptb'),
 Path('./data\\pulsar'),
 Path('./data\\StudentsPerformance.csv'),
 Path('./data\\telco'),
 Path('./data\\wids'),
 Path('./data\\wine_quality')]

## EEG
dataset: https://www.kaggle.com/wanghaohan/confused-eeg

In [3]:
### 1. ###
# Load the EEG data: read the first entry listed under the 'eeg' folder
# NOTE(review): listdir() order is decided by the OS/filesystem, so index 0
# assumes the wanted CSV is listed first; confirm if the folder gains files.
eeg_dir = root / 'eeg'
f = eeg_dir.listdir()[0]
data = pd.read_csv(f)

In [4]:
### 2. ###
# Column dropping is left to the UI, so the manual drop stays disabled:
# data.drop(columns='predefinedlabel', inplace=True)

# Target column for the EEG data
dependent = 'user-definedlabeln'

# Build the tabularbunch object with AutoTabularBunch (verbose=False)
tabularbunch = AutoTabularBunch(
    df=data,
    dependent=dependent,
    verbose=False,
)

In [5]:
### 3. ###
# Build the data object (the core of our Learner object) from the bunch,
# using bs=32 and num_workers=1
data = TabularData(
    tabularbunch,
    bs=32,
    num_workers=1,
)

In [6]:
# Peek at the `y` array stored on the training dataloader's dataset
# (presumably the encoded dependent column; confirm against TabularData)
data.train_dl.dataset.y

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

## IBM Attrition
dataset: https://www.kaggle.com/pavansubhasht/ibm-hr-analytics-attrition-dataset

In [7]:
### 1. ###
# Load the IBM attrition data: first entry listed under the 'ibm_attrition' folder
ibm_dir = root / 'ibm_attrition'
f = ibm_dir.listdir()[0]
data = pd.read_csv(f)

In [8]:
### 2. ###
# Target column for the IBM attrition data
dependent = 'Attrition'

# Build the tabularbunch with AutoTabularBunch (verbose=False)
tabularbunch = AutoTabularBunch(
    df=data,
    dependent=dependent,
    verbose=False,
)

In [9]:
### 3. ###
# Build the data object from the bunch (bs=32, num_workers=1)
data = TabularData(
    tabularbunch,
    bs=32,
    num_workers=1,
)

In [10]:
# Peek at the `y` array stored on the training dataloader's dataset
data.train_dl.dataset.y

array([1, 0, 1, ..., 1, 0, 0])

## Kepler
dataset: https://www.kaggle.com/keplersmachines/kepler-labelled-time-series-data

In [11]:
### 1. ###
# Grabbing datasets | We stack both files row-wise as they have matching columns.
# List the folder once so both picks come from the same directory listing.
kepler_files = (root/'kepler').listdir()
f1 = kepler_files[0]
f2 = kepler_files[1]

df1 = pd.read_csv(f1)
df2 = pd.read_csv(f2)

# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of df2 repeat those of df1, which breaks label-based lookups.
data = pd.concat([df1, df2], axis=0, ignore_index=True)

In [12]:
### 2. ###
# Target column for the Kepler data
dependent = 'LABEL'

# Build our tabular bunch with AutoTabularBunch (verbose=False)
tabularbunch = AutoTabularBunch(
    df=data,
    dependent=dependent,
    verbose=False,
)

In [13]:
### 3. ###
# Build the data object from the bunch (bs=32, num_workers=1)
data = TabularData(
    tabularbunch,
    bs=32,
    num_workers=1,
)

In [14]:
# Peek at the `y` array stored on the training dataloader's dataset
data.train_dl.dataset.y

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

## Kepler ExoPlanet
dataset: https://www.kaggle.com/nasa/kepler-exoplanet-search-results

In [15]:
### 1. ###
# Load the exoplanet data: first entry listed under the 'kepler_exoplanet' folder
exoplanet_dir = root / 'kepler_exoplanet'
f = exoplanet_dir.listdir()[0]
data = pd.read_csv(f)

In [16]:
### 2a. ###
# Creating our tabular bunch
# We declare this as a regression problem_type
# This dataset contains some NaN values in our dependent column,
# which makes AutoTabularBunch raise a ValueError. The error is the point of
# this cell, so we catch and print it rather than let it halt "Run All".
# To fix this we will use purge_='aggressive', which keeps only the rows
# that have a value for the dependent var.
dependent = 'koi_score'
try:
    tabularbunch = AutoTabularBunch(df=data, dependent=dependent, problem_type='regression', verbose=False)
except ValueError as err:
    # Show the message in the same form the uncaught error would have had
    print('ValueError: {}'.format(err))

ValueError: koi_score column seems to have some missing values or NaN values. This will cause an error when training the model. Suggestion: Use aggressive purge_ type or pre-process the data to have no-nan values on this specific column

In [17]:
### 2b. ###
# Retry with purge_='aggressive' to get past the missing dependent values.
# Removing those null values ourselves beforehand would also work,
# but that would make for bad UX.
dependent = 'koi_score'
tabularbunch = AutoTabularBunch(
    df=data,
    dependent=dependent,
    problem_type='regression',
    purge_='aggressive',
    verbose=False,
)

In [18]:
### 3. ###
# With the bunch built, create the data object (bs=32, num_workers=1)
data = TabularData(
    tabularbunch,
    bs=32,
    num_workers=1,
)

In [19]:
# Peek at the `y` array stored on the training dataloader's dataset
# (float values here, since this bunch was built with problem_type='regression')
data.train_dl.dataset.y

array([0.   , 0.   , 1.   , ..., 0.996, 0.008, 0.   ], dtype=float32)

## League of Legends

In [20]:
### 1. ###
# Load the League of Legends data: first entry listed under the 'lol' folder
lol_dir = root / 'lol'
f = lol_dir.listdir()[0]
data = pd.read_csv(f)

In [21]:
### 2. ###
# Target column for the League of Legends data
dependent = 'winner'

# Build our tabularbunch, stating the problem type explicitly
tabularbunch = AutoTabularBunch(
    df=data,
    dependent=dependent,
    problem_type='classification',
    verbose=False,
)

In [22]:
### 3. ###
# Build the data object from the bunch (bs=32, num_workers=1)
data = TabularData(
    tabularbunch,
    bs=32,
    num_workers=1,
)

In [23]:
# Peek at the `y` array stored on the training dataloader's dataset
data.train_dl.dataset.y

array([1, 0, 0, ..., 1, 0, 1], dtype=int64)

In [24]:
# Inspect X_state on the underlying bunch
# (presumably flags which feature types were detected: continuous only,
# categorical only, or both; confirm against AutoTabularBunch)
data.tabularbunch.X_state

'contonly'

## Mobile Price

In [25]:
### 1. ###
# Locating our data files
# List the folder once so both picks come from the same directory listing
# instead of scanning the directory separately for each index.
mobile_files = (root/'mobile_pricce').listdir()
f1 = mobile_files[0]
f2 = mobile_files[1]

In [26]:
# f2 contains the labels, so it is the file we load for training
data = pd.read_csv(f2)

In [27]:
### 2. ###
# Target column for the mobile price data
dependent = 'price_range'

# Build our tabularbunch object with AutoTabularBunch (verbose=False)
tabularbunch = AutoTabularBunch(
    df=data,
    dependent=dependent,
    verbose=False,
)

In [28]:
### 3. ###
# Build the data object from the bunch (bs=32, num_workers=1)
data = TabularData(
    tabularbunch,
    bs=32,
    num_workers=1,
)

In [29]:
# Inspect X_state on the underlying bunch
# (presumably the detected feature-type state; confirm against AutoTabularBunch)
data.tabularbunch.X_state

'contonly'

In [30]:
# Inspect problem_type on the bunch; none was passed to AutoTabularBunch for
# this dataset, so this shows the value the bunch ended up with
data.tabularbunch.problem_type

'classification'

## Mushroom

In [31]:
### 1. ###
# Load the mushroom data: first entry listed under the 'mushroom' folder
mushroom_dir = root / 'mushroom'
f = mushroom_dir.listdir()[0]
data = pd.read_csv(f)

In [32]:
### 2. ###
# Target column for the mushroom data
dependent = 'class'

# Build our tabularbunch object with AutoTabularBunch (verbose=False)
tabularbunch = AutoTabularBunch(
    df=data,
    dependent=dependent,
    verbose=False,
)

In [33]:
### 3. ###
# Build the data object from the bunch (bs=32, num_workers=1)
data = TabularData(
    tabularbunch,
    bs=32,
    num_workers=1,
)

## Pulsar

In [42]:
### 1. ###
# Load the pulsar data: first entry listed under the 'pulsar' folder
pulsar_dir = root / 'pulsar'
f = pulsar_dir.listdir()[0]
data = pd.read_csv(f)

In [44]:
### 2. ###
# Target column for the pulsar data
dependent = 'target_class'

# Build the tabularbunch object with AutoTabularBunch (verbose=False)
tabularbunch = AutoTabularBunch(
    df=data,
    dependent=dependent,
    verbose=False,
)

In [46]:
### 3. ###
# Build the data object from the bunch (bs=32, num_workers=1)
data = TabularData(
    tabularbunch,
    bs=32,
    num_workers=1,
)

In [47]:
# Inspect X_state on the underlying bunch
# (presumably the detected feature-type state; confirm against AutoTabularBunch)
data.tabularbunch.X_state

'contonly'

In [49]:
# Peek at the `y` array stored on the training dataloader's dataset
data.train_dl.dataset.y

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

## Telco Churn Prediction

In [54]:
### 1. ###
# Load the telco churn data: first entry listed under the 'telco' folder
telco_dir = root / 'telco'
f = telco_dir.listdir()[0]
data = pd.read_csv(f)

In [56]:
### 2. ###
# Target column for the telco churn data
dependent = 'Churn'

# Build the tabularbunch object with AutoTabularBunch (verbose=False)
tabularbunch = AutoTabularBunch(
    df=data,
    dependent=dependent,
    verbose=False,
)

In [57]:
### 3. ###
# Build the data object from the bunch (bs=32, num_workers=1)
data = TabularData(
    tabularbunch,
    bs=32,
    num_workers=1,
)

In [58]:
# Inspect X_state on the underlying bunch
# (presumably the detected feature-type state; confirm against AutoTabularBunch)
data.tabularbunch.X_state

'both'

## WiDS

In [87]:
### 1. ###
# Load the WiDS data: third entry listed under the 'wids' folder
wids_dir = root / 'wids'
f = wids_dir.listdir()[2]
# low_memory=False: parse the file in one go instead of in chunks, which
# avoids mixed-type inference on a file this wide
data = pd.read_csv(f, low_memory=False)

In [70]:
### 2. ###
# Target column for the WiDS data
dependent = 'is_female'

# Build the tabularbunch object with AutoTabularBunch (verbose=False)
tabularbunch = AutoTabularBunch(
    df=data,
    dependent=dependent,
    verbose=False,
)

In [72]:
# Summary of the raw frame as read from CSV (column count, dtypes, memory),
# for comparison with the feature frame kept on the bunch
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18255 entries, 0 to 18254
Columns: 1235 entries, train_id to GN5_OTHERS
dtypes: float64(900), int64(239), object(96)
memory usage: 172.0+ MB


In [75]:
# Summary of the feature frame held on the bunch (tabularbunch.X), to see
# how many columns and which dtypes remain after AutoTabularBunch
tabularbunch.X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18255 entries, 0 to 18254
Columns: 261 entries, train_id to GN5
dtypes: float64(23), int64(238)
memory usage: 36.4 MB


In [84]:
### 3. ###
# Build the data object from the bunch (bs=32, num_workers=1)
data = TabularData(
    tabularbunch,
    bs=32,
    num_workers=1,
)

In [86]:
# Inspect X_state on the underlying bunch
# (presumably the detected feature-type state; confirm against AutoTabularBunch)
data.tabularbunch.X_state

'contonly'