In [1]:
import aidds.sys.config as cfg
from aidds.sys.utils.data_io import read_data
from aidds.sys.utils.evaluation import regression_evals
from aidds.sys.utils.exception import AiddsException
from sklearn.model_selection import train_test_split

In [2]:
ppdf = read_data(file_code='data.pp.last')

In [3]:
ppdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13416 entries, 0 to 13415
Data columns (total 100 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   acc_no              13416 non-null  object 
 1   cons_cost           13416 non-null  int64  
 2   cont_cap            13416 non-null  int64  
 3   sup_type            13416 non-null  int64  
 4   office_id           13416 non-null  int64  
 5   pole_cnt            13416 non-null  float64
 6   line_cnt            13416 non-null  float64
 7   sl_cnt              13416 non-null  float64
 8   pole_shape_G        13416 non-null  int64  
 9   pole_shape_O        13416 non-null  int64  
 10  pole_shape_V        13416 non-null  int64  
 11  pole_type_0         13416 non-null  int64  
 12  pole_type_1         13416 non-null  int64  
 13  pole_type_B         13416 non-null  int64  
 14  pole_type_C         13416 non-null  int64  
 15  pole_type_E         13416 non-null  int64  
 16  pol

In [4]:
target_col = cfg.col.target
modeling_cols = ppdf.columns[2:].tolist()

In [5]:
modeling_cols[:3]

['cont_cap', 'sup_type', 'office_id']

In [6]:
check_df = ppdf[['acc_no', 'cons_cost']].copy()
X = ppdf[modeling_cols].copy()
y = ppdf[target_col].copy()
type(check_df), type(X), type(y)

(pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.series.Series)

In [7]:
train_X, test_X, train_y, test_y = \
    train_test_split(X, y, test_size=0.25)

In [8]:
train_X.shape, test_X.shape

((10062, 98), (3354, 98))

In [9]:
try:
    for id in cfg.model.ids:
        print(f'MODEL: {id.upper()}')
        model = eval(f'cfg.model.ml.{id}')
        model.fit(train_X, train_y)
        pred_y = model.predict(test_X)
        _, message = regression_evals(y=test_y.to_numpy(), p=pred_y)
        print(message)
except AiddsException as ae:
    ae.print()
except Exception as e:
    print(e)

MODEL: LIN
++++ data_size = 3354
++++ data_type: y-<class 'numpy.ndarray'>, p-<class 'numpy.ndarray'>
++++ data_shape: y-(3354,), p-(3354,)
MAPE: 0.198996(19.8996), R2SCORE: 0.663678
MODEL: LASSO


  model = cd_fast.enet_coordinate_descent(


++++ data_size = 3354
++++ data_type: y-<class 'numpy.ndarray'>, p-<class 'numpy.ndarray'>
++++ data_shape: y-(3354,), p-(3354,)
MAPE: 0.199000(19.9000), R2SCORE: 0.663577
MODEL: RIDGE
++++ data_size = 3354
++++ data_type: y-<class 'numpy.ndarray'>, p-<class 'numpy.ndarray'>
++++ data_shape: y-(3354,), p-(3354,)
MAPE: 0.198802(19.8802), R2SCORE: 0.665605
MODEL: KNR
++++ data_size = 3354
++++ data_type: y-<class 'numpy.ndarray'>, p-<class 'numpy.ndarray'>
++++ data_shape: y-(3354,), p-(3354,)
MAPE: 0.256227(25.6227), R2SCORE: 0.446347
MODEL: DTR
++++ data_size = 3354
++++ data_type: y-<class 'numpy.ndarray'>, p-<class 'numpy.ndarray'>
++++ data_shape: y-(3354,), p-(3354,)
MAPE: 0.280103(28.0103), R2SCORE: 0.295596
MODEL: RFR
++++ data_size = 3354
++++ data_type: y-<class 'numpy.ndarray'>, p-<class 'numpy.ndarray'>
++++ data_shape: y-(3354,), p-(3354,)
MAPE: 0.214190(21.4190), R2SCORE: 0.650770
MODEL: GBR
++++ data_size = 3354
++++ data_type: y-<class 'numpy.ndarray'>, p-<class 'numpy.nd

In [10]:
yy = test_y.to_numpy()

In [11]:
# GBR
model = cfg.model.ml.gbr
model.fit(train_X, train_y)
pred_y = model.predict(test_X)
_, message = regression_evals(y=yy, p=pred_y)
print(message)

++++ data_size = 3354
++++ data_type: y-<class 'numpy.ndarray'>, p-<class 'numpy.ndarray'>
++++ data_shape: y-(3354,), p-(3354,)
MAPE: 0.192298(19.2298), R2SCORE: 0.678604


In [12]:
pred_all_y = model.predict(X)
y_all = y.to_numpy()
_, message = regression_evals(y=y_all, p=pred_all_y)
print(message)

++++ data_size = 13416
++++ data_type: y-<class 'numpy.ndarray'>, p-<class 'numpy.ndarray'>
++++ data_shape: y-(13416,), p-(13416,)
MAPE: 0.185906(18.5906), R2SCORE: 0.740145


In [13]:
check_df['pred'] = pred_all_y

In [18]:
def calculate_mape(y, p):
    return abs((y-p)/y) * 100

check_df['mape'] = calculate_mape(check_df.cons_cost, check_df.pred)
check_df['best'] = check_df['mape'].apply(lambda x: '*' if x<=30 else '')

In [19]:
check_df.best.value_counts()

best
*    10938
      2478
Name: count, dtype: int64

In [20]:
best_df = check_df[check_df.best=='*'].copy()

In [22]:
_, message = regression_evals(
    y=best_df.cons_cost.to_numpy(), 
    p=best_df.pred.to_numpy()
)
print(message)

++++ data_size = 10938
++++ data_type: y-<class 'numpy.ndarray'>, p-<class 'numpy.ndarray'>
++++ data_shape: y-(10938,), p-(10938,)
MAPE: 0.109029(10.9029), R2SCORE: 0.940731


In [23]:
best_df

Unnamed: 0,acc_no,cons_cost,pred,mape,best
0,477420193243,11598900,1.213587e+07,4.629491,*
1,477420193827,5362339,5.894058e+06,9.915810,*
3,477420203272,3132017,4.021677e+06,28.405350,*
4,477420203306,16941740,1.542726e+07,8.939367,*
5,477420203444,2585476,3.182896e+06,23.106789,*
...,...,...,...,...,...
13411,474620234559,2383062,2.436585e+06,2.245989,*
13412,474620234576,2030411,2.560096e+06,26.087558,*
13413,474620234577,2425666,2.985935e+06,23.097517,*
13414,474620234622,2106904,2.555056e+06,21.270627,*
