In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
# Plots
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.io as pio
from plotly import figure_factory
# Make "ggplot2" the default plotly template for figures created after this point.
pio.templates.default = "ggplot2"
# Hex colour constants — presumably the palette for later plots; they are not
# referenced in the cells visible here (TODO confirm they are still needed).
PRIMARY_COLOR = '#EB89B5'
SECONDARY_COLOR = '#330C73'

## Lendo dados e definindo target numérico

In [6]:
def read_data(path: str, sep=','):
    """Load a CSV file and normalise it for modelling.

    Column names are lower-cased and their whitespace-separated tokens are
    joined with a single underscore. The ``class`` column is recoded into an
    integer target: 1 where the value is exactly ``'Positive'``, 0 otherwise.

    Parameters
    ----------
    path : str
        Location of the CSV file, passed straight to ``pandas.read_csv``.
    sep : str, default ','
        Field delimiter.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with normalised column names and a numeric ``class``.
    """
    frame = pd.read_csv(path, sep=sep)
    # Lower-case each header and replace inner whitespace with underscores
    # (leading/trailing whitespace is dropped by ``split``).
    frame.columns = ['_'.join(name.lower().split()) for name in frame.columns]
    is_positive = frame['class'] == 'Positive'
    frame['class'] = np.where(is_positive, 1, 0)
    return frame

### Novas features

In [7]:
# Load the raw diabetes CSV; read_data lower-cases/underscores the column names
# and recodes `class` to 1 ('Positive') / 0 (anything else).
data = read_data('../data/raw/diabetes_data_upload.csv')

### Proporção dos principais fatores

In [None]:
# Binary indicators for the main symptoms: 1 where the answer is exactly 'Yes', else 0.
has_polyuria = np.where(data.polyuria.eq('Yes'), 1, 0)
has_polydipsia = np.where(data.polydipsia.eq('Yes'), 1, 0)
has_polyphagia = np.where(data.polyphagia.eq('Yes'), 1, 0)
has_sudden_weight_loss = np.where(data.sudden_weight_loss.eq('Yes'), 1, 0)
has_partial_paresis = np.where(data.partial_paresis.eq('Yes'), 1, 0)

# Number of the three core symptoms present in each row.
main_symptoms_3_count = has_polyuria + has_polydipsia + has_sudden_weight_loss

# NOTE(review): polyphagia is weighted twice here. This reproduces the original
# cell, which listed it twice among the six summed terms, so only five distinct
# symptoms feed `prop_main_symptoms_6`. Confirm whether a different sixth
# symptom was intended before relying on this feature.
main_symptoms_6_count = main_symptoms_3_count + has_partial_paresis + 2 * has_polyphagia

data['prop_main_symptoms_6'] = main_symptoms_6_count / 6
data['prop_main_symptoms_3'] = main_symptoms_3_count / 3

### Gênero feminino + idade
Observando o gênero feminino, parece haver dois picos de diabetes: um dos 31 aos 40 anos, que pode estar vinculado à diabetes gestacional, e outro dos 46 aos 55 anos.


In [None]:
# Flag women whose age falls in either peak band (31-40 or 46-55, bounds included).
# Series.between is inclusive on both ends by default.
has_age_31_40 = data.age.between(31, 40)
has_age_46_55 = data.age.between(46, 55)
has_age = has_age_31_40 | has_age_46_55
is_female = data.gender.eq('Female')
data['is_female_with_peak_age'] = np.where(is_female & has_age, 1, 0)

### Classe de aplicação


In [9]:
class FeatureAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends engineered features to a frame.

    ``transform`` works on a copy of its input and adds three columns:
    ``prop_main_symptoms_6``, ``prop_main_symptoms_3`` and
    ``is_female_with_peak_age``. Nothing is learned in ``fit``.
    """

    def __init__(self):
        """No hyper-parameters to store."""

    def fit(self, X, y=None):
        """No-op fit: returns ``self`` without inspecting ``X`` or ``y``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns added."""
        enriched = X.copy()
        for step in (self._main_symptoms, self._female_peak_age):
            enriched = step(enriched)
        return enriched

    @staticmethod
    def _yes_flag(answers):
        """Return an integer array: 1 where ``answers`` equals 'Yes', else 0."""
        return np.where(answers == 'Yes', 1, 0)

    def _main_symptoms(self, data):
        """Add the two symptom-proportion columns to ``data`` and return it.

        ``data`` is modified in place; ``transform`` passes in a copy.
        """
        polyuria = self._yes_flag(data.polyuria)
        polydipsia = self._yes_flag(data.polydipsia)
        polyphagia = self._yes_flag(data.polyphagia)
        weight_loss = self._yes_flag(data.sudden_weight_loss)
        paresis = self._yes_flag(data.partial_paresis)

        core_three = polyuria + polydipsia + weight_loss
        # NOTE(review): polyphagia carries a weight of 2. This reproduces the
        # original sum, which listed polyphagia twice among its six terms, so
        # only five distinct symptoms feed `prop_main_symptoms_6`. Confirm
        # whether a different sixth symptom was intended.
        data['prop_main_symptoms_6'] = (core_three + paresis + 2 * polyphagia) / 6
        data['prop_main_symptoms_3'] = core_three / 3
        return data

    def _female_peak_age(self, data):
        """Add ``is_female_with_peak_age`` to ``data`` and return it.

        The flag is 1 for rows where gender is 'Female' and age lies in
        31-40 or 46-55 (bounds included), 0 otherwise.
        """
        age = data.age
        # Series.between is inclusive on both ends by default.
        in_peak_band = age.between(31, 40) | age.between(46, 55)
        is_female = data.gender == 'Female'
        data['is_female_with_peak_age'] = np.where(is_female & in_peak_band, 1, 0)
        return data

In [15]:
class CategoricalConverter(BaseEstimator, TransformerMixin):
    """Encode every column of a frame as integer category codes.

    ``fit`` records, for each column of the training frame, the sorted distinct
    non-missing values. ``transform`` replaces each value by its position in
    that list. Values that are missing, or that were not seen during ``fit``,
    are encoded as ``-1`` (the ``pandas.Categorical`` convention).

    All columns present at fit time are encoded, numeric ones included.

    Attributes
    ----------
    columns : list
        Column labels seen during ``fit`` (empty before fitting).
    categ_mapping : dict
        Maps each fitted column label to its sorted list of categories
        (empty before fitting).
    """
    def __init__(self):
        self.columns = []
        # A dict from the start, so the attribute has the same type before and
        # after fitting.
        self.categ_mapping = {}

    def fit(self, X, y=None):
        """Learn the category vocabulary of every column in ``X``; returns ``self``."""
        self.columns = list(X.columns)
        self.categ_mapping = self._col_mapping(X)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each fitted column replaced by its codes.

        ``X`` must contain every column seen during ``fit``; a missing one
        raises ``KeyError``.
        """
        _X = X.copy()
        for col in self.columns:
            _X[col] = pd.Categorical(_X[col], categories=self.categ_mapping[col]).codes
        return _X

    def _col_mapping(self, data):
        """Build ``{column: sorted distinct non-missing values}`` for ``data``."""
        # Missing values are dropped before np.unique: sorting an object column
        # that mixes strings with NaN/None raises TypeError, and a NaN category
        # is rejected by pd.Categorical. Dropped values end up encoded as -1.
        return {col: list(np.unique(data[col].dropna())) for col in data.columns}

In [19]:
# Show the loaded frame for inspection (pandas truncates the middle rows in the display).
data

Unnamed: 0,age,gender,polyuria,polydipsia,sudden_weight_loss,weakness,polyphagia,genital_thrush,visual_blurring,itching,irritability,delayed_healing,partial_paresis,muscle_stiffness,alopecia,obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,1
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,1
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,1
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,1
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,39,Female,Yes,Yes,Yes,No,Yes,No,No,Yes,No,Yes,Yes,No,No,No,1
516,48,Female,Yes,Yes,Yes,Yes,Yes,No,No,Yes,Yes,Yes,Yes,No,No,No,1
517,58,Female,Yes,Yes,Yes,Yes,Yes,No,Yes,No,No,No,Yes,Yes,No,Yes,1
518,32,Female,No,No,No,Yes,No,No,Yes,Yes,No,Yes,No,No,Yes,No,0


In [18]:
# Quick check of the converter on the full frame. Every column is recoded,
# including numeric ones: e.g. age 40 in row 0 becomes code 16 in the output.
CategoricalConverter().fit(data).transform(data)

Unnamed: 0,age,gender,polyuria,polydipsia,sudden_weight_loss,weakness,polyphagia,genital_thrush,visual_blurring,itching,irritability,delayed_healing,partial_paresis,muscle_stiffness,alopecia,obesity,class
0,16,1,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,34,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,17,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,21,1,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,36,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,15,0,1,1,1,0,1,0,0,1,0,1,1,0,0,0,1
516,24,0,1,1,1,1,1,0,0,1,1,1,1,0,0,0,1
517,34,0,1,1,1,1,1,0,1,0,0,0,1,1,0,1,1
518,8,0,0,0,0,1,0,0,1,1,0,1,0,0,1,0,0
