In [40]:
import pandas as pd
import numpy as np

from collections import OrderedDict


import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, precision_score, accuracy_score
N_FOLDS = 5


In [41]:
df = pd.read_csv("../data/questionaries.csv")
df

Unnamed: 0,empresa,0,1,2,3,4,5,6,7,8,...,7441,7442,7443,7444,7445,7446,7447,7448,7449,7450
0,CONCESSIONÁRIA ECOVIAS DOS IMIGRANTES S.A._2015,,,,,,,0.0,1.0,0.0,...,,,,66.650000,1.0,1.0,1.0,1.0,0.0,1.0
1,CONCESSIONÁRIA ECOVIA CAMINHO DO MAR S.A._2015,,,,,,,0.0,1.0,0.0,...,,,,66.650000,1.0,1.0,1.0,1.0,0.0,1.0
2,TELEFÔNICA BRASIL S.A_2015,,,,,,,0.0,1.0,0.0,...,,,,28.530000,0.0,0.0,0.0,0.0,1.0,0.0
3,Companhia Paulista de Força e Luz_2015,,,,,,,0.0,1.0,0.0,...,,,,72.030000,0.0,0.0,1.0,1.0,0.0,1.0
4,Companhia Energética de Alagoas_2015,,,,,,,0.0,1.0,0.0,...,,,,222.222495,1.0,1.0,1.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,COSAN S.A._2020,,,,,,,,,,...,,,,21.790000,,,,,,
404,NEOENERGIA S.A._2020,,,,,,,,,,...,,,,-5.390000,,,,,,
405,CCR S.A._2020,,,,,,,,,,...,,,,-11.630000,,,,,,
406,TIM PARTICIPACOES S.A._2020,,,,,,,,,,...,,,,-19.980000,,,,,,


In [42]:
f = df.replace([0,'NA'], np.nan).apply(lambda x: any(~x.isnull()))
df = df.loc[:,f]

In [43]:
def labelize(num):
    if num < -15:
        return 0
    if num < -5:
        return 1
    if num < 5:
        return 2
    if num < 15:
        return 3
    return 4
    

In [44]:
df['7444'] = df['7444'].apply(labelize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [45]:
df['7444'].value_counts()

4    179
0     75
1     72
2     44
3     38
Name: 7444, dtype: int64

In [25]:
df.to_csv("questionaries_class.csv",index=False)

In [50]:
DEFAULT_LGB_PARAMS = {
    "max_bin": 512,
    "learning_rate": 0.05,
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "metric": "auc_mu",
    "num_leaves": 50,
    "verbose": -1,
    "min_data": 5,
    "boost_from_average": True,
    "random_state": 1
}

In [70]:
def eval_features(df, features, target, random_state):
    """
    5-fold cross validation with LGBMClassifier
    Used to collect results for SHAP analysis
    """
    stats = OrderedDict()
    
    X = df[features].values
    y = df[target].values.ravel()

    a = [] #array to store AUCs values for each fold
    b = [] #array to store PPVs values for each fold
    cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=random_state)
    max_auc = 0
    iter_max_auc = 0
    for i, (train, val) in enumerate(cv.split(X, y)):
        classifier = lgb.LGBMClassifier(**DEFAULT_LGB_PARAMS)
        classifier = classifier.fit(X[train], y[train])
        
        probas_ = classifier.predict_proba(X[val])
        #print(probas_)
        auc = roc_auc_score(y[val], probas_,multi_class='ovr')
        pred_test = classifier.predict(X[val]) #making predictions for test data
        pred_train = classifier.predict(X[train]) #making predictions for train data
        ppv = precision_score(y[val], pred_test,average='macro') #PPV is also the precision of the positive class
        a.append(auc)
        b.append(ppv)
        
        if auc > max_auc:
            max_auc = auc
            iter_max_auc = i
        
        
        stats[i] = {}
        stats[i]['model'] = classifier
        stats[i]['auc'] = auc
        stats[i]['X_train'] = X[train]
        stats[i]['y_train'] = y[train]
        stats[i]['y_pred_train'] = pred_train
        stats[i]['X_test'] = X[val]
        stats[i]['y_test'] = y[val]
        stats[i]['y_pred_test'] = pred_test
        
    stats['mean_auc'] = np.mean(a)
    stats['max_auc'] = np.max(a)
    stats['iter_max_auc'] = iter_max_auc
    stats['mean_ppv'] = np.mean(b)
    return stats, classifier


In [73]:
LABEL_COLUMN_NAME = '7444'
UNWANTED_COLUMNS = ['empresa']
all_features = list(df.columns)
features = [f for f in all_features  if f not in UNWANTED_COLUMNS]
features = features[0:5]

In [74]:
stats, classifier = eval_features(df,features,[LABEL_COLUMN_NAME],1)



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
