# Preparar para classificação

In [1]:
import numpy as np
import pandas as pd

In [1]:
# Parameters cell for this notebook.
# NOTE(review): `upstream` and `product` are subscripted like mappings further
# down (upstream['agrupar_dados']['data'], product['data_X']), so the values
# assigned here are presumably placeholders that the pipeline runner replaces
# before execution — confirm.
# declare a list of tasks whose products you want to use as inputs
upstream = ['agrupar_dados']
product = None
# Columns of the input frame that are left out of the feature matrix.
not_classification_columns = []
feature_selection_percentage = 0.8 # fraction of each column group kept by feature selection
dimensionality_reduction_percentage = 0.1 # fraction of each column group kept as components; reduction only runs when < 1.0
dimensionality_reduction_algoritm = 'svd' # 'pca' or 'svd'

In [None]:
# Read the parquet file referenced by the 'data' entry of the upstream
# 'agrupar_dados' task, then display the resulting frame.
grouped_data_path = upstream['agrupar_dados']['data']
df_reg = pd.read_parquet(grouped_data_path)
df_reg

## Seleção de features

In [None]:
# Feature matrix: every column of df_reg except the ones listed in
# not_classification_columns. Target: the 'destaque' column.
# NOTE(review): unless 'destaque' is listed in not_classification_columns it
# also stays inside df_X — confirm the configured value excludes it.
classification_columns = df_reg.columns.difference(not_classification_columns)
df_X = df_reg.loc[:, classification_columns]
df_y = df_reg.loc[:, 'destaque']

In [433]:
def _columns_with_prefix(prefix):
    """Return the df_reg column names that start with `prefix`, in frame order."""
    return [name for name in df_reg.columns if name.startswith(prefix)]


# Column groups used by the selection and reduction steps further down.
category_columns = _columns_with_prefix('category_')
attributes_columns = _columns_with_prefix('attribute_')
hours_columns = _columns_with_prefix('business_open_')
geo_columns = ['latitude', 'longitude']

# Report the size of every group.
for label, group in (
    ("category lenght:", category_columns),
    ("attributes lenght:", attributes_columns),
    ("hours lenght:", hours_columns),
    ("geo lenght:", geo_columns),
):
    print(label, len(group))

category lenght: 204
attributes lenght: 150
hours lenght: 28


In [435]:
from sklearn.feature_selection import SelectKBest, f_classif
import math

def select_best(columns):
    """Keep the best-scoring columns of one feature group.

    Every column in ``columns`` is scored against the target ``df_y`` with
    ``f_classif`` and the top
    ``ceil(feature_selection_percentage * len(columns))`` are kept.

    Parameters
    ----------
    columns : list of str
        Names of the ``df_X`` columns that make up the group.

    Returns
    -------
    pandas.DataFrame
        The selected columns under their original names, indexed like
        ``df_X``. An empty group yields a frame with no columns.
    """
    columns = list(columns)
    if not columns:
        # Nothing to rank; the selector would reject a zero-feature input.
        return pd.DataFrame(index=df_X.index)

    features_num = math.ceil(feature_selection_percentage * len(columns))
    selector = SelectKBest(f_classif, k=features_num)
    selected_values = selector.fit_transform(df_X[columns], df_y)

    # get_support() is a boolean mask aligned with `columns`.
    selected_names = [name for name, keep in zip(columns, selector.get_support()) if keep]

    # Reuse df_X's index so a later pd.concat(axis=1) lines rows up by label
    # instead of by a fresh 0..n-1 index.
    return pd.DataFrame(selected_values, columns=selected_names, index=df_X.index)

# Run the selection independently on each column group.
df_X_best_cat = select_best(category_columns)
df_X_best_attr = select_best(attributes_columns)
df_X_best_hours = select_best(hours_columns)

# Swap the original group columns for their selected subsets; every other
# column of df_X is carried over unchanged.
grouped_columns = category_columns + attributes_columns + hours_columns
df_X_select = pd.concat(
    [df_X.drop(columns=grouped_columns), df_X_best_attr, df_X_best_cat, df_X_best_hours],
    axis=1,
)
df_X_select

Unnamed: 0,latitude,longitude,page_rank,review_count,stars,score,is_page_rank_outlier,attribute_Caters_False,attribute_Caters_True,attribute_WiFi_free,...,business_open_Thursday_night,business_open_Saturday_morning,business_open_Saturday_afternoon,business_open_Saturday_night,business_open_Sunday_morning,business_open_Sunday_afternoon,business_open_Sunday_night,business_open_Wednesday_morning,business_open_Wednesday_afternoon,business_open_Wednesday_night
0,-0.686427,-0.425989,0.000087,1.190565,0.582949,0.532271,0.0,1.0,0.0,0.0,...,0.000000,0.000000,0.083333,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000
1,-0.569901,-0.956974,0.000030,-0.400175,0.791667,0.577041,0.0,0.0,0.0,0.0,...,0.000000,0.500000,0.500000,0.000000,0.000000,0.0,0.0,0.500000,0.833333,0.000000
2,-0.697652,0.198579,0.000016,-0.151622,0.340136,0.706820,0.0,0.0,0.0,0.0,...,0.000000,0.333333,0.666667,0.000000,0.000000,0.0,0.0,0.833333,1.000000,0.000000
3,2.177144,-0.889070,0.000030,-0.400175,0.666667,0.400056,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000
4,-0.312937,-1.101228,0.000008,-0.400175,0.006757,0.511156,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17577,2.264886,2.676289,0.000011,-0.184762,0.320359,0.567692,0.0,0.0,1.0,1.0,...,0.000000,0.166667,1.000000,0.000000,0.000000,1.0,0.0,0.166667,1.000000,0.000000
17578,-0.196596,0.115073,0.000030,-0.383605,0.772727,0.757226,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000
17579,0.029843,0.907477,0.000047,-0.317324,0.568000,0.657578,0.0,0.0,0.0,0.0,...,0.000000,0.333333,1.000000,0.000000,0.000000,0.0,0.0,0.333333,1.000000,0.000000
17580,1.553877,2.925831,0.000018,-0.300754,0.146341,0.696177,0.0,0.0,1.0,0.0,...,0.166667,0.083333,1.000000,0.333333,0.083333,1.0,0.0,0.083333,1.000000,0.166667


## Redução de dimensionalidade

In [436]:
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

def svd_reduction(columns, features_number):
    """Reduce the given ``df_pca_input`` columns with truncated SVD.

    Parameters
    ----------
    columns : list of str
        Columns of ``df_pca_input`` to feed to the decomposition.
    features_number : int
        Number of components to keep.

    Returns
    -------
    The transformed data produced by ``TruncatedSVD.fit_transform``.
    """
    reducer = TruncatedSVD(random_state=43, n_iter=30, n_components=features_number)
    group_values = df_pca_input[columns]
    return reducer.fit_transform(group_values)
    
def pca_reduction(columns, features_number):
    """Reduce the given ``df_pca_input`` columns with PCA.

    Parameters
    ----------
    columns : list of str
        Columns of ``df_pca_input`` to feed to the decomposition.
    features_number : int
        Number of principal components to keep.

    Returns
    -------
    The transformed data produced by ``PCA.fit_transform``.
    """
    # PCA may choose a randomized solver; a fixed seed (the same value the
    # SVD path uses) keeps repeated runs reproducible.
    pca = PCA(n_components=features_number, random_state=43)
    return pca.fit_transform(df_pca_input[columns])

# Frame the reduction helpers read their input columns from.
df_pca_input = df_X_select

# Resolve the configured algorithm name to the matching reduction function.
if dimensionality_reduction_algoritm == 'pca':
    reduction_algorithm = pca_reduction
elif dimensionality_reduction_algoritm == 'svd':
    reduction_algorithm = svd_reduction
else:
    # Python 3 only allows raising exception instances/classes; raising the
    # bare string would surface as an unrelated TypeError.
    raise ValueError(f'algoritmo de redução de dimensionalidade desconhecido: {dimensionality_reduction_algoritm}')

def _kept_from(group):
    """Return the df_pca_input columns that belong to `group`, in frame order."""
    return [name for name in df_pca_input.columns if name in group]


# Members of each group that are still present in df_pca_input.
filtered_cat_cols = _kept_from(category_columns)
filtered_attr_cols = _kept_from(attributes_columns)
filtered_hours_cols = _kept_from(hours_columns)

def reduce_dimensions(columns, columns_prefix):
    """Reduce one group of columns to a smaller set of components.

    The number of components is
    ``ceil(len(columns) * dimensionality_reduction_percentage)`` and the
    work is delegated to the configured ``reduction_algorithm``.

    Parameters
    ----------
    columns : list of str
        Columns of ``df_pca_input`` that make up the group.
    columns_prefix : str
        Prefix of the generated column names; component ``i`` is named
        ``columns_prefix + str(i)``.

    Returns
    -------
    pandas.DataFrame
        One column per component, indexed like ``df_pca_input``. An empty
        group yields a frame with no columns.
    """
    columns = list(columns)
    if not columns:
        # Nothing to decompose; avoids asking the reducer for 0 components.
        return pd.DataFrame(index=df_pca_input.index)

    features_number = math.ceil(len(columns) * dimensionality_reduction_percentage)
    component_names = [columns_prefix + str(i) for i in range(features_number)]
    components = reduction_algorithm(columns, features_number)

    # Reuse the input index so a later pd.concat(axis=1) lines rows up by
    # label instead of by a fresh 0..n-1 index.
    return pd.DataFrame(components, columns=component_names, index=df_pca_input.index)
    
if dimensionality_reduction_percentage < 1.0:
    # Reduce each column group on its own.
    df_X_pca_cat = reduce_dimensions(filtered_cat_cols, 'cat_pca_')
    df_X_pca_attr = reduce_dimensions(filtered_attr_cols, 'attr_pca_')
    df_X_pca_hours = reduce_dimensions(filtered_hours_cols, 'hours_pca_')
    df_X_pca_geo = reduce_dimensions(geo_columns, 'geolocalization_')

    # Swap the original group columns for their components; every other
    # column of df_pca_input is carried over unchanged.
    reduced_source_cols = filtered_cat_cols + filtered_attr_cols + filtered_hours_cols + geo_columns
    df_X_pca = pd.concat(
        [
            df_pca_input.drop(columns=reduced_source_cols),
            df_X_pca_attr,
            df_X_pca_cat,
            df_X_pca_hours,
            df_X_pca_geo,
        ],
        axis=1,
    )
else:
    # Reduction disabled: pass the feature-selected frame through. Falling
    # back to df_X here would silently discard the feature-selection step.
    df_X_pca = df_pca_input

df_X_pca

Unnamed: 0,page_rank,review_count,stars,score,is_page_rank_outlier,attr_pca_0,attr_pca_1,cat_pca_0,cat_pca_1,hours_pca_0,geolocalization0
0,0.000087,1.190565,0.582949,0.532271,0.0,4.361661,-0.608353,0.256410,0.850164,0.371962,-0.786597
1,0.000030,-0.400175,0.791667,0.577041,0.0,1.560980,1.958592,0.217939,-0.128457,2.064800,-1.079663
2,0.000016,-0.151622,0.340136,0.706820,0.0,0.009820,0.068860,0.008773,-0.005035,2.733563,-0.352898
3,0.000030,-0.400175,0.666667,0.400056,0.0,0.016681,0.102161,0.935826,-0.282302,0.000000,0.910806
4,0.000008,-0.400175,0.006757,0.511156,0.0,0.028192,0.064674,0.218228,-0.124252,0.000000,-0.999966
...,...,...,...,...,...,...,...,...,...,...,...
17577,0.000011,-0.184762,0.320359,0.567692,0.0,4.380547,-1.348234,0.264833,0.868632,2.788913,3.493938
17578,0.000030,-0.383605,0.772727,0.757226,0.0,0.317827,0.080611,0.182972,0.508817,0.000000,-0.057645
17579,0.000047,-0.317324,0.568000,0.657578,0.0,1.530460,1.924575,0.234606,-0.134171,2.368747,0.662786
17580,0.000018,-0.300754,0.146341,0.696177,0.0,3.784400,-0.108334,0.269339,0.915859,2.824429,3.167632


In [None]:
# Write the final feature matrix and the target to the parquet paths given
# by the 'data_X' and 'data_y' entries of `product`.
features_path = product['data_X']
df_X_pca.to_parquet(features_path)

target_frame = pd.DataFrame(df_y)
target_path = product['data_y']
target_frame.to_parquet(target_path)