# Construção de dataset com o sklearn
Fonte: https://proclusacademy.com/blog/sklearn_make_classification/

In [1]:
from sklearn.datasets import make_classification
import pandas as pd 

In [2]:
# Generate a synthetic classification problem: X holds the features and
# y holds the class label of each observation.
X, y = make_classification(
    # fixed seed, so the generated data is the same on every run
    random_state=999,
    # binary target/label
    n_classes=2,
    # 1000 observations, 5 features in total, 3 of them 'useful'
    n_samples=1000,
    n_features=5,
    n_informative=3,
)

In [3]:
# Derive the feature names (X1, X2, ...) from the actual width of X instead of
# hard-coding five labels, so this cell keeps working if n_features is changed
# in the generation step.
feature_names = [f'X{i}' for i in range(1, X.shape[1] + 1)]

# Create the DataFrame with the features as named columns
dataset = pd.DataFrame(X, columns=feature_names)
# Now add the label as a column
dataset['y'] = y

# Summary of columns, non-null counts and dtypes
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X1      1000 non-null   float64
 1   X2      1000 non-null   float64
 2   X3      1000 non-null   float64
 3   X4      1000 non-null   float64
 4   X5      1000 non-null   float64
 5   y       1000 non-null   int64  
dtypes: float64(5), int64(1)
memory usage: 47.0 KB


In [4]:
# Number of rows for each label value, to check how balanced the classes are
dataset.loc[:, 'y'].value_counts()

1    502
0    498
Name: y, dtype: int64

In [5]:
# Preview the first five rows of the dataset
dataset.head(n=5)

Unnamed: 0,X1,X2,X3,X4,X5,y
0,2.501284,-0.159155,0.672438,3.469991,0.949268,0
1,2.203247,-0.331271,0.794319,3.259963,0.832451,0
2,-1.524573,-0.870737,1.004304,-1.028624,-0.717383,1
3,1.801498,3.106336,1.490633,-0.297404,-0.607484,0
4,-0.125146,0.987915,0.880293,-0.937299,-0.626822,0


In [8]:
# Save the generated dataset as a CSV file; index=False leaves the
# row index out of the file
dataset.to_csv(path_or_buf='classification_df.csv', index=False)

# Um exemplo de classificador

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

# Initialize the classifier with a fixed seed. Without random_state the
# forest is built differently on every run, so the reported metrics would
# not be reproducible after a kernel restart.
classifier = RandomForestClassifier(random_state=999)

# Run cross validation with 10 folds
cv_results = cross_validate(
    classifier, X, y, cv=10,
    # measure score for a list of classification metrics
    scoring=['accuracy', 'precision', 'recall', 'f1']
)

# Keep the raw result and the DataFrame under different names so that
# re-running this cell never feeds a DataFrame back into pd.DataFrame.
scores = pd.DataFrame(cv_results)
# Average of each timing/metric column over the folds, rounded to 4 decimals
scores.mean().round(4)

fit_time          0.1549
score_time        0.0082
test_accuracy     0.8760
test_precision    0.8780
test_recall       0.8764
test_f1           0.8763
dtype: float64

- As métricas calculadas estão em torno de 88%, o que não é ruim, uma vez que não realizamos nenhum pré-processamento nos dados.
- Os bons resultados também se devem ao fato de termos um dataset com classes balanceadas. Classes desbalanceadas podem ser mais difíceis de classificar, gerando métricas com valores não satisfatórios. 
- Mais exemplos podem ser vistos no site indicado no início do arquivo.