# Module: Preprocess
# Author: Fahad Akbar <m.akbar@queensu.ca>
# License: MIT
import pandas as pd
import numpy as np
import ipywidgets as wg
from IPython.display import display
from ipywidgets import Layout
from sklearn.base import BaseEstimator , TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.manifold import TSNE
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import KBinsDiscretizer
from pyod.models.knn import KNN
from pyod.models.iforest import IForest
from pyod.models.pca import PCA as PCA_od
from sklearn import cluster
from scipy import stats
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.ensemble import RandomForestRegressor as rfr
from lightgbm import LGBMClassifier as lgbmc
from lightgbm import LGBMRegressor as lgbmr
import sys
from sklearn.pipeline import Pipeline
from sklearn import metrics
import datefinder
from datetime import datetime
import calendar
from sklearn.preprocessing import LabelEncoder
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
#ignore warnings
import warnings
warnings.filterwarnings('ignore')
#_____________________________________________________________________________________________________________________________
class DataTypes_Auto_infer(BaseEstimator,TransformerMixin):
'''
- This will try to infer data types automatically; an option to override the learnt data types is also available.
- It also automatically deletes duplicate columns (same values or same column name), removes rows where the target variable is null,
and removes columns and rows where all records are null
'''
def __init__(self,target,ml_usecase,categorical_features=[],numerical_features=[],time_features=[],features_todrop=[],display_types=True): # nothing to define
'''
User to define the target (y) variable
args:
target: string, name of the target variable
ml_usecase: string, 'regression' or 'classification'. For now, only two-class classification is supported
- this is useful in case the target variable is an object / string; the strings will be replaced with integers
categorical_features: list of categorical features, default None; when None a best guess will be used to identify categorical features
numerical_features: list of numerical features, default None; when None a best guess will be used to identify numerical features
time_features: list of date/time features, default None; when None a best guess will be used to identify date/time features
'''
self.target = target
self.ml_usecase= ml_usecase
self.categorical_features =categorical_features
self.numerical_features = numerical_features
self.time_features =time_features
self.features_todrop = features_todrop
self.display_types = display_types
def fit(self,dataset,y=None): # learning data types of all the columns
'''
Args:
data: accepts a pandas data frame
Returns:
Panda Data Frame
'''
data = dataset.copy()
# remove special characters from column names
#data.columns= data.columns.str.replace('[,]','')
# we will treat float as numeric and object as categorical from the beginning
# for int64, we will check the proportion of unique counts to the total length of the data
# if the proportion is low, then it is probably categorical
# however, the proportion can be low / distorted due to a smaller denominator (total length / number of samples)
# so we will take the following chart
# 0-50 samples, threshold is 24%
# 50-100 samples, th is 12%
# 100-250 samples, th is 4.8%
# 250-500 samples, th is 2.4%
# 500 and above, 2% or below
# # if there are inf or -inf then replace them with NaN
# data.replace([np.inf,-np.inf],np.NaN,inplace=True)
# if somehow everything came in as object, we can try converting the columns to int64
for i in data.select_dtypes(include=['object']).columns:
try:
data[i] = data[i].astype('int64')
except:
None
# if data type is bool , convert to categorical
for i in data.columns:
if data[i].dtype=='bool':
data[i] = data[i].astype('object')
# sometimes there is an id column in the data set; we will try to find it and drop it if found
len_samples = len(data)
self.id_columns = []
for i in data.drop(self.target,axis=1).columns:
if data[i].dtype in ['int64','float64']:
if i not in self.numerical_features:
if sum(data[i].isna()) == 0:
if len(data[i].unique()) == len_samples:
min_number = min(data[i])
max_number = max(data[i])
arr = np.arange(min_number,max_number+1,1)
try:
all_match = sum(data[i].sort_values() == arr)
if all_match == len_samples:
self.id_columns.append(i)
except:
None
data_len = len(data)
# with csv, if we have any null in a column that was int, pandas will read it as float.
# so first we need to convert any such floats that have NaN and 20 or fewer unique values
for i in data.drop(self.target,axis=1).columns:
if data[i].dtypes == 'float64':
# count how many NAs there are
na_count = sum(data[i].isna())
# count how many values have decimals
count_float = np.nansum([ False if r.is_integer() else True for r in data[i]])
# total decimal values
count_float = count_float - na_count # reducing it because we know NaN is counted as a float digit
# now if there are no decimal values, unique levels are 20 or fewer, and there are NAs, then convert it to object
if ( (count_float == 0) & (len(data[i].unique()) <=20) & (na_count>0) ):
data[i] = data[i].astype('object')
# should really be an absolute number say 20
# length = len(data.iloc[:,0])
# if length in range(0,51):
# th=.25
# elif length in range(51,101):
# th=.12
# elif length in range(101,251):
# th=.048
# elif length in range(251,501):
# th=.024
# elif length > 500:
# th=.02
# if column is int64: treat it as categorical when it has 20 or fewer unique values, otherwise cast it to float64 (exclude target)
for i in data.drop(self.target,axis=1).columns:
if data[i].dtypes == 'int64': #((data[i].dtypes == 'int64') & (len(data[i].unique())>2))
if len(data[i].unique()) <=20: #hard coded
data[i]= data[i].apply(str)
else:
data[i]= data[i].astype('float64')
# if a column is float and has only two unique values, it is probably one hot encoded
# make it object
for i in data.columns:
if ((data[i].dtypes == 'float64') & (len(data[i].unique())==2)):
data[i]= data[i].apply(str)
#for time & dates
#self.drop_time = [] # for now we are deleting time columns
for i in data.drop(self.target,axis=1).columns:
# we check the first value of every column to see if it is a date
match = datefinder.find_dates(data[i].values[0]) # to get the first value
try:
for m in match:
if isinstance(m, datetime) == True:
data[i] = pd.to_datetime(data[i])
#self.drop_time.append(i) # for now we are deleting time columns
except:
continue
# now, in case we were given any specific column dtypes in advance, we will override the inferred ones
if len(self.categorical_features) > 0:
for i in self.categorical_features:
try:
data[i]=data[i].apply(str)
except:
data[i]=dataset[i].apply(str)
if len(self.numerical_features) > 0:
for i in self.numerical_features:
try:
data[i]=data[i].astype('float64')
except:
data[i]=dataset[i].astype('float64')
if len(self.time_features) > 0:
for i in self.time_features:
try:
data[i]=pd.to_datetime(data[i])
except:
data[i]=pd.to_datetime(dataset[i])
# table of learnt dtypes
self.learent_dtypes = data.dtypes
#self.training_columns = data.drop(self.target,axis=1).columns
# if there are inf or -inf then replace them with NaN
data.replace([np.inf,-np.inf],np.NaN,inplace=True)
# lets remove duplicates
# remove duplicate columns (columns with same values)
#(too expensive on bigger data sets)
# data_c = data.T.drop_duplicates()
# data = data_c.T
#remove columns with duplicate name
data = data.loc[:,~data.columns.duplicated()]
# Remove NAs
data.dropna(axis=0, how='all', inplace=True)
data.dropna(axis=1, how='all', inplace=True)
# remove the row if target column has NA
data = data[~data[self.target].isnull()]
#self.training_columns = data.drop(self.target,axis=1).columns
# since due to transpose , all data types have changed, lets change the dtypes to original---- not required any more since not transposing any more
# for i in data.columns: # we are taking all the columns in test , so we dot have to worry about droping target column
# data[i] = data[i].astype(self.learent_dtypes[self.learent_dtypes.index==i])
if self.display_types == True:
display(wg.Text(value="Following data types have been inferred automatically, if they are correct press enter to continue or type 'quit' otherwise.",layout =Layout(width='100%')),display_id='m1')
dt_print_out = pd.DataFrame(self.learent_dtypes, columns=['Feature_Type'])
dt_print_out['Data Type'] = ""
for i in dt_print_out.index:
if i != self.target:
if dt_print_out.loc[i,'Feature_Type'] == 'object':
dt_print_out.loc[i,'Data Type'] = 'Categorical'
elif dt_print_out.loc[i,'Feature_Type'] == 'float64':
dt_print_out.loc[i,'Data Type'] = 'Numeric'
elif dt_print_out.loc[i,'Feature_Type'] == 'datetime64[ns]':
dt_print_out.loc[i,'Data Type'] = 'Date'
#elif dt_print_out.loc[i,'Feature_Type'] == 'int64':
# dt_print_out.loc[i,'Data Type'] = 'Categorical'
else:
dt_print_out.loc[i,'Data Type'] = 'Label'
# for ID column:
for i in dt_print_out.index:
if i in self.id_columns:
dt_print_out.loc[i,'Data Type'] = 'ID Column'
# if we added the dummy target column , then drop it
dt_print_out.drop(index='dummy_target',errors='ignore',inplace=True)
# drop any columns that were asked to drop
dt_print_out.drop(index=self.features_todrop,errors='ignore',inplace=True)
display(dt_print_out[['Data Type']])
self.response = input()
if self.response in ['quit','Quit','exit','EXIT','q','Q','e','E','QUIT','Exit']:
sys.exit('Read the documentation of setup to learn how to overwrite data types over the inferred types. setup function must run again before you continue modeling.')
# drop time columns
#data.drop(self.drop_time,axis=1,errors='ignore',inplace=True)
# drop id columns
data.drop(self.id_columns,axis=1,errors='ignore',inplace=True)
return(data)
def transform(self,dataset,y=None):
'''
Args:
data: accepts a pandas data frame
Returns:
Panda Data Frame
'''
data = dataset.copy()
# remove special characters from column names
#data.columns= data.columns.str.replace('[,]','')
# the very first thing we need to do is to check that the training and test data have the same columns
#exception checking
import sys
for i in self.final_training_columns:
if i not in data.columns:
sys.exit('(Type Error): test data does not have column ' + str(i) + " which was used for training")
# we only need the test columns that were used in training (test data in production may have many more columns)
data = data[self.final_training_columns]
# just keep picking the data and keep applying to the test data set (be mindful of target variable)
for i in data.columns: # we are taking all the columns in test, so we don't have to worry about dropping the target column
data[i] = data[i].astype(self.learent_dtypes[self.learent_dtypes.index==i])
# drop time columns
#data.drop(self.drop_time,axis=1,errors='ignore',inplace=True)
# drop id columns
data.drop(self.id_columns,axis=1,errors='ignore',inplace=True)
# drop custom columns
data.drop(self.features_todrop,axis=1,errors='ignore',inplace=True)
return(data)
# fit_transform
def fit_transform(self,dataset,y=None):
data= dataset.copy()
# since this is for training, we don't need any transformation here because it has already been done in fit
data = self.fit(data)
# additionally we just need to treat the target variable
# for ml use case
if ((self.ml_usecase == 'classification') & (data[self.target].dtype=='object')):
le = LabelEncoder()
data[self.target] = le.fit_transform(np.array(data[self.target]))
# now get the replacement dict
rev= le.inverse_transform(range(0,len(le.classes_)))
rep = np.array(range(0,len(le.classes_)))
self.replacement={}
for i,k in zip(rev,rep):
self.replacement[i] = k
# self.u = list(pd.unique(data[self.target]))
# self.replacement = np.arange(0,len(self.u))
# data[self.target]= data[self.target].replace(self.u,self.replacement)
# data[self.target] = data[self.target].astype('int64')
# self.replacement = pd.DataFrame(dict(target_variable=self.u,replaced_with=self.replacement))
# drop time columns
#data.drop(self.drop_time,axis=1,errors='ignore',inplace=True)
# drop id columns
data.drop(self.id_columns,axis=1,errors='ignore',inplace=True)
# drop custom columns
data.drop(self.features_todrop,axis=1,errors='ignore',inplace=True)
# finally save a list of columns that we would need from test data set
self.final_training_columns = data.drop(self.target,axis=1).columns
return(data)
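# Usage sketch (illustrative only; 'train_df', 'test_df' and the column name 'target' below are
# hypothetical and not part of this module):
#   infer = DataTypes_Auto_infer(target='target', ml_usecase='classification', display_types=False)
#   train = infer.fit_transform(train_df)   # infers dtypes, label-encodes a string target, drops id columns
#   test  = infer.transform(test_df)        # re-applies the learnt dtypes / drops to unseen data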
# _______________________________________________________________________________________________________________________
# Imputation
class Simple_Imputer(BaseEstimator,TransformerMixin):
'''
Imputes all types of data (numerical, categorical & time).
Highly recommended to run the Define_dataTypes class first
Numerical values can be imputed with mean or median
Categorical missing values will be replaced with "not_available" or the most frequent level
Time values are imputed with the most frequent value
Ignores target (y) variable
Args:
numeric_strategy: string , all possible values {'mean','median'}
categorical_strategy: string , all possible values {'not_available','most frequent'}
target_variable: string , name of the target variable
'''
def __init__(self,numeric_strategy,categorical_strategy,target_variable):
self.numeric_strategy = numeric_strategy
self.target = target_variable
self.categorical_strategy = categorical_strategy
def fit(self,dataset,y=None): #
data = dataset.copy()
# make a table for numerical variable with strategy stats
if self.numeric_strategy == 'mean':
self.numeric_stats = data.drop(self.target,axis=1).select_dtypes(include=['float64','int64']).apply(np.nanmean)
else:
self.numeric_stats = data.drop(self.target,axis=1).select_dtypes(include=['float64','int64']).apply(np.nanmedian)
self.numeric_columns = data.drop(self.target,axis=1).select_dtypes(include=['float64','int64']).columns
# for categorical
if self.categorical_strategy == 'most frequent':
self.categorical_columns = data.drop(self.target,axis=1).select_dtypes(include=['object']).columns
self.categorical_stats = pd.DataFrame(columns=self.categorical_columns) # place holder
for i in (self.categorical_stats.columns):
self.categorical_stats.loc[0,i] = data[i].value_counts().index[0]
else:
self.categorical_columns = data.drop(self.target,axis=1).select_dtypes(include=['object']).columns
# for time, there is only one way, pick up the most frequent one
self.time_columns = data.drop(self.target,axis=1).select_dtypes(include=['datetime64[ns]']).columns
self.time_stats = pd.DataFrame(columns=self.time_columns) # place holder
for i in (self.time_columns):
self.time_stats.loc[0,i] = data[i].value_counts().index[0]
return(data)
def transform(self,dataset,y=None):
data = dataset.copy()
# for numeric columns
for i,s in zip(data[self.numeric_columns].columns,self.numeric_stats):
data[i].fillna(s,inplace=True)
# for categorical columns
if self.categorical_strategy == 'most frequent':
for i in (self.categorical_stats.columns):
#data[i].fillna(self.categorical_stats.loc[0,i],inplace=True)
data[i] = data[i].fillna(self.categorical_stats.loc[0,i])
data[i] = data[i].apply(str)
else: # this means replace na with "not_available"
for i in (self.categorical_columns):
data[i].fillna("not_available",inplace=True)
data[i] = data[i].apply(str)
# for time
for i in (self.time_stats.columns):
data[i].fillna(self.time_stats.loc[0,i],inplace=True)
return(data)
def fit_transform(self,dataset,y=None):
data = dataset.copy()
data= self.fit(data)
return(self.transform(data))
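# Usage sketch (illustrative only; 'train_df' and 'test_df' are hypothetical):
#   imputer = Simple_Imputer(numeric_strategy='mean', categorical_strategy='most frequent', target_variable='target')
#   train = imputer.fit_transform(train_df)  # learns means / most frequent levels from the training data
#   test  = imputer.transform(test_df)       # fills missing test values with the statistics learnt above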
# _______________________________________________________________________________________________________________________
# Imputation with surrogate columns
class Surrogate_Imputer(BaseEstimator,TransformerMixin):
'''
Imputes features and creates surrogate (missing-value indicator) columns (numerical, categorical & time).
- Highly recommended to run the Define_dataTypes class first
- it is also recommended to only apply this where it makes business sense to create a surrogate column
- Numerical values can be imputed with mean or median
- Categorical missing values will be replaced with "not_available" or the most frequent level
- Time values are imputed with the most frequent value
- Ignores target (y) variable
Args:
numeric_strategy: string , all possible values {'mean','median'}
categorical_strategy: string , all possible values {'not_available','most frequent'}
target_variable: string , name of the target variable
'''
def __init__(self,numeric_strategy,categorical_strategy,target_variable):
self.numeric_strategy = numeric_strategy
self.target = target_variable
self.categorical_strategy = categorical_strategy
def fit(self,dataset,y=None): #
data = dataset.copy()
# make a table for numerical variable with strategy stats
if self.numeric_strategy == 'mean':
self.numeric_stats = data.drop(self.target,axis=1).select_dtypes(include=['float64','int64']).apply(np.nanmean)
else:
self.numeric_stats = data.drop(self.target,axis=1).select_dtypes(include=['float64','int64']).apply(np.nanmedian)
self.numeric_columns = data.drop(self.target,axis=1).select_dtypes(include=['float64','int64']).columns
# also need to learn if any columns had NA in training
self.numeric_na = pd.DataFrame(columns=self.numeric_columns)
for i in self.numeric_columns:
if data[i].isna().any() == True:
self.numeric_na.loc[0,i] = True
else:
self.numeric_na.loc[0,i] = False
# for categorical
if self.categorical_strategy == 'most frequent':
self.categorical_columns = data.drop(self.target,axis=1).select_dtypes(include=['object']).columns
self.categorical_stats = pd.DataFrame(columns=self.categorical_columns) # place holder
for i in (self.categorical_stats.columns):
self.categorical_stats.loc[0,i] = data[i].value_counts().index[0]
# also need to learn if any columns had NA in training, but this is only valid if strategy is "most frequent"
self.categorical_na = pd.DataFrame(columns=self.categorical_columns)
for i in self.categorical_columns:
if sum(data[i].isna()) > 0:
self.categorical_na.loc[0,i] = True
else:
self.categorical_na.loc[0,i] = False
else:
self.categorical_columns = data.drop(self.target,axis=1).select_dtypes(include=['object']).columns
self.categorical_na = pd.DataFrame(columns=self.categorical_columns)
self.categorical_na.loc[0,:] = False #(in this situation we are not making any surrogate column)
# for time, there is only one way, pick up the most frequent one
self.time_columns = data.drop(self.target,axis=1).select_dtypes(include=['datetime64[ns]']).columns
self.time_stats = pd.DataFrame(columns=self.time_columns) # place holder
self.time_na = pd.DataFrame(columns=self.time_columns)
for i in (self.time_columns):
self.time_stats.loc[0,i] = data[i].value_counts().index[0]
# learn if time columns were NA
for i in self.time_columns:
if data[i].isna().any() == True:
self.time_na.loc[0,i] = True
else:
self.time_na.loc[0,i] = False
return(data) # nothing to return
def transform(self,dataset,y=None):
data = dataset.copy()
# for numeric columns
for i,s in zip(data[self.numeric_columns].columns,self.numeric_stats):
array = data[i].isna()
data[i].fillna(s,inplace=True)
# make a surrogate column if there was any
if self.numeric_na.loc[0,i] == True:
data[i+"_surrogate"]= array
# make it string
data[i+"_surrogate"]= data[i+"_surrogate"].apply(str)
# for categorical columns
if self.categorical_strategy == 'most frequent':
for i in (self.categorical_stats.columns):
#data[i].fillna(self.categorical_stats.loc[0,i],inplace=True)
array = data[i].isna()
data[i] = data[i].fillna(self.categorical_stats.loc[0,i])
data[i] = data[i].apply(str)
# make surrogate column
if self.categorical_na.loc[0,i] == True:
data[i+"_surrogate"]= array
# make it string
data[i+"_surrogate"]= data[i+"_surrogate"].apply(str)
else: # this means replace na with "not_available"
for i in (self.categorical_columns):
data[i].fillna("not_available",inplace=True)
data[i] = data[i].apply(str)
# no need to make a surrogate since "not_available" is itself a new level
# for time
for i in (self.time_stats.columns):
array = data[i].isna()
data[i].fillna(self.time_stats.loc[0,i],inplace=True)
# make surrogate column
if self.time_na.loc[0,i] == True:
data[i+"_surrogate"]= array
# make it string
data[i+"_surrogate"]= data[i+"_surrogate"].apply(str)
return(data)
def fit_transform(self,dataset,y=None):
data = dataset.copy()
data= self.fit(data)
return(self.transform(data))
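# Usage sketch (illustrative only; 'train_df' is hypothetical). For every column that had missing values
# in training, a '<column>_surrogate' indicator column (the strings 'True'/'False') is added next to the imputed values:
#   imputer = Surrogate_Imputer(numeric_strategy='median', categorical_strategy='most frequent', target_variable='target')
#   train = imputer.fit_transform(train_df)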
# _______________________________________________________________________________________________________________________
# Zero and Near Zero Variance
class Zroe_NearZero_Variance(BaseEstimator,TransformerMixin):
'''
- it eliminates the features having zero variance
- it eliminates the features having near zero variance
- Near zero variance is determined by
-1) Count of unique points divided by the total length of the feature has to be lower than a pre specified threshold
-2) Most common point (count) divided by the second most common point (count) in the feature being greater than a pre specified threshold
Once both conditions are met, the feature is dropped
-Ignores target variable
Args:
threshold_1: float (between 0.0 and 1.0), default is 0.10
threshold_2: int (between 1 and 100), default is 20
target: string, name of the target variable
'''
def __init__(self,target,threshold_1=0.1,threshold_2=20):
self.threshold_1 = threshold_1
self.threshold_2 = threshold_2
self.target = target
def fit(self,dataset,y=None): # from training data set we are going to learn what columns to drop
data = dataset.copy()
self.to_drop= []
self.sampl_len = len(data[self.target])
for i in data.drop(self.target,axis=1).columns:
# get the number of unique counts
u = pd.DataFrame( data[i].value_counts()).sort_values(by=i,ascending=False, inplace=False)
# take the length of u and divide it by the total number of samples; this checks the 1st rule (has to be low, say 10%)
#import pdb; pdb.set_trace()
first=len(u)/self.sampl_len
# then check if most common divided by 2nd most common ratio is 20 or more
if len(u[i]) == 1: # if the column has only one level (zero variance), make the ratio large so it gets dropped
second=100
else:
second = u.iloc[0,0]/u.iloc[1,0]
# if both conditions are true then drop the column; however, we don't want to drop the surrogate columns that indicate NA's
if ((first <= self.threshold_1) and (second >= self.threshold_2) and (i[-10:]!='_surrogate')):
self.to_drop.append(i)
# now drop if the column has zero variance
if (((second ==100) and (i[-10:]!='_surrogate'))):
self.to_drop.append(i)
def transform(self,dataset,y=None): # since it is only for training data set , nothing here
data= dataset.copy()
data.drop(self.to_drop,axis=1,inplace=True)
return(data)
def fit_transform(self,dataset,y=None):
data= dataset.copy()
self.fit(data)
return(self.transform(data))
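# Worked example of the two rules (illustrative numbers): for a feature with 1000 rows, 5 unique values
# and level counts {950, 30, 10, 5, 5}, rule 1 gives 5/1000 = 0.005 <= 0.10 and rule 2 gives 950/30 ≈ 31.7 >= 20,
# so the feature is flagged as near-zero variance and dropped (unless it is a '_surrogate' indicator column).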
#____________________________________________________________________________________________________________________________
# rare categorical variables
class Catagorical_variables_With_Rare_levels(BaseEstimator,TransformerMixin):
'''
-Merges infrequent levels in categorical features into a single new level if they appear at or below a threshold count
e.g. Col=[a,a,a,a,b,b,c,c]
if the threshold count works out to 2, then b and c will both be merged into the new level because both are at or below it
There have to be at least two levels at or below the threshold for this to work
-Only handles categorical features
-It is recommended to run the Zroe_NearZero_Variance and Define_dataTypes first
-Ignores target variable
Args:
threshold: float, default 0.05; quantile of the level counts used to determine the threshold count
target: string , name of the target variable
new_level_name: string , name given to the new merged level, default 'others_infrequent'
'''
def __init__(self,target,new_level_name='others_infrequent',threshold=.05):
self.threshold = threshold
self.target = target
self.new_level_name = new_level_name
def fit(self,dataset,y=None): # learn, for each column, which levels to merge into the new level
# every level of the categorical feature has to appear more often than the threshold count; if not, it will be clubbed together with the other rare levels
# in order to apply, there should be at least two levels below the threshold!
# create a place holder
data = dataset.copy()
self.ph = pd.DataFrame(columns=data.drop(self.target,axis=1).select_dtypes(include="object").columns)
#ph.columns = df.columns# catagorical only
for i in data[self.ph.columns].columns:
# determine the infrequent count
v_c = data[i].value_counts()
count_th = round(v_c.quantile(self.threshold))
a = np.sum(pd.DataFrame(data[i].value_counts().sort_values()) [i] <= count_th)
if a >= 2: # there have to be at least two rare levels
count = pd.DataFrame( data[i].value_counts().sort_values())
count.columns = ['fre']
count = count[count['fre']<=count_th]
to_club = list(count.index)
self.ph.loc[0,i] = to_club
else:
self.ph.loc[0,i] = []
# # also need to make a place holder that keep records of all the levels , and in case a new level appears in test we will change it to others
# self.ph_level = pd.DataFrame(columns=data.drop(self.target,axis=1).select_dtypes(include="object").columns)
# for i in self.ph_level.columns:
# self.ph_level.loc[0,i] = list(data[i].value_counts().sort_values().index)
def transform(self,dataset,y=None): #
# transform
data = dataset.copy()
for i in data[self.ph.columns].columns:
t_replace = self.ph.loc[0,i]
data[i].replace(to_replace=t_replace,value=self.new_level_name,inplace=True)
return(data)
def fit_transform(self,dataset,y=None):
data = dataset.copy()
self.fit(data)
return(self.transform(data))
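# Worked example (illustrative numbers): with threshold=0.05 the cutoff is the 0.05 quantile of the level
# counts, rounded. For level counts {a: 500, b: 40, c: 2, d: 2} the cutoff is 2; since two levels (c and d)
# sit at or below it, both are replaced with 'others_infrequent'. With only one such level, nothing is merged.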
# _______________________________________________________________________________________________________________________
# new categorical levels in test
class New_Catagorical_Levels_in_TestData(BaseEstimator,TransformerMixin):
'''
-This handles the case where a new level appears in a categorical feature of the test dataset (i.e. a level on which the model was not trained previously)
-It simply replaces the new level in the test data set with the most frequent or least frequent level of the same feature in the training data set
-It is recommended to run the Zroe_NearZero_Variance and Define_dataTypes first
-Ignores target variable
Args:
target: string , name of the target variable
replacement_strategy: string , 'least frequent' or 'most frequent' (default 'most frequent')
'''
def __init__(self,target,replacement_strategy='most frequent'):
self.target = target
self.replacement_strategy = replacement_strategy
def fit(self,data,y=None):
# make a place holder that keeps a record of all the training levels, so that if a new level appears in test we can replace it with a training level
self.ph_train_level = pd.DataFrame(columns=data.drop(self.target,axis=1).select_dtypes(include="object").columns)
for i in self.ph_train_level.columns:
if self.replacement_strategy == "least frequent":
self.ph_train_level.loc[0,i] = list(data[i].value_counts().sort_values().index)
else:
self.ph_train_level.loc[0,i] = list(data[i].value_counts().index)
def transform(self,data,y=None): #
# transform
# we need to learn the same for the test data, and then compare to check which levels are new there
self.ph_test_level = pd.DataFrame(columns=data.drop(self.target,axis=1,errors='ignore').select_dtypes(include="object").columns)
for i in self.ph_test_level.columns:
self.ph_test_level.loc[0,i] = list(data[i].value_counts().sort_values().index)
# now we have levels for both test and train; we will compare and replace levels in the test set (only if the test set has new levels)
for i in self.ph_test_level.columns:
new = list((set(self.ph_test_level.loc[0,i]) - set(self.ph_train_level.loc[0,i])))
# now if there is a difference , only then replace it
if len(new) > 0 :
data[i].replace(new,self.ph_train_level.loc[0,i][0],inplace=True)
return(data)
def fit_transform(self,data,y=None): # there is no transformation happening on the training data set; it is all about the test set
self.fit(data)
return(data)
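# Usage sketch (illustrative only; 'train_df', 'test_df' and the column values are hypothetical):
# a level seen only in test, e.g. city == 'new_city', is replaced by the most frequent city from training:
#   fix_levels = New_Catagorical_Levels_in_TestData(target='target', replacement_strategy='most frequent')
#   fix_levels.fit_transform(train_df)    # records the training levels; returns the training data unchanged
#   test = fix_levels.transform(test_df)  # replaces unseen levels in the test data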
# _______________________________________________________________________________________________________________________
# Group akin features
class Group_Similar_Features(BaseEstimator,TransformerMixin):
'''
- Given a list of feature groups, it creates aggregate features
- features created are Min, Max, Mean, Median, Mode & Std
- Only works on numerical features
Args:
list_of_grouped_features: list of lists of feature names, e.g. [['col1','col2'],['col3','col4']]
group_name: list, group name/names to be added as prefix to the aggregate features, e.g. ['group1','group2']
'''
def __init__(self,group_name=[],list_of_grouped_features=[[]]):
self.list_of_similar_features = list_of_grouped_features
self.group_name = group_name
# if a list of lists was not given, raise an error
try:
np.array(self.list_of_similar_features).shape[0]
except:
raise TypeError("Group_Similar_Features: list_of_grouped_features is not provided as a list of lists")
def fit(self,data,y=None):
# nothing to learn
return(None)
def transform(self,dataset,y=None):
data = dataset.copy()
# only process if grouped features were actually provided
if len(self.list_of_similar_features) > 0:
for f,g in zip(self.list_of_similar_features,self.group_name):
data[g+'_Min'] = data[f].apply(np.min,1)
data[g+'_Max'] = data[f].apply(np.max,1)
data[g+'_Mean'] = data[f].apply(np.mean,1)
data[g+'_Median'] = data[f].apply(np.median,1)
data[g+'_Mode'] = stats.mode(data[f],1)[0]
data[g+'_Std'] = data[f].apply(np.std,1)
return(data)
else:
return(data)
def fit_transform(self,data,y=None):
self.fit(data)
return(self.transform(data))
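# Usage sketch (illustrative only; the column names are hypothetical): for each group it appends
# '<group>_Min', '<group>_Max', '<group>_Mean', '<group>_Median', '<group>_Mode' and '<group>_Std',
# computed row-wise across the grouped columns:
#   grouper = Group_Similar_Features(group_name=['sales'], list_of_grouped_features=[['sales_q1','sales_q2','sales_q3','sales_q4']])
#   df = grouper.fit_transform(df)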
#____________________________________________________________________________________________________________________________________________________________________
# Binning for continuous features
class Binning(BaseEstimator,TransformerMixin):
'''
- Converts numerical variables to categorical variables through binning
- The number of bins is automatically determined through the Sturges method
- Once discretized, the original feature will be dropped
Args:
features_to_discretize: list of feature names to be binned
'''
def __init__(self, features_to_discretize):
self.features_to_discretize =features_to_discretize
return(None)
def fit(self,data,y=None):
return(None)
def transform(self,dataset,y=None):
data = dataset.copy()
#only do if features are provided
if len(self.features_to_discretize) > 0:
data_t = self.disc.transform(np.array(data[self.features_to_discretize]).reshape(-1,self.len_columns))
# make pandas data frame
data_t = pd.DataFrame(data_t,columns=self.features_to_discretize,index=data.index)
# all these columns are categorical
data_t = data_t.astype(str)
# drop original columns
data.drop(self.features_to_discretize,axis=1,inplace=True)
# add newly created columns
data = pd.concat((data,data_t),axis=1)
return(data)
def fit_transform(self,dataset,y=None):
data = dataset.copy()
# only do if features are given
if len(self.features_to_discretize) > 0:
# place holder for the number of bins for each feature
self.binns = []
for i in self.features_to_discretize:
# get the number of bins
hist, bin_edg = np.histogram(data[i],bins='sturges')
self.binns.append(len(hist))
# how many columns to deal with
self.len_columns = len(self.features_to_discretize)
# now do fit transform
self.disc = KBinsDiscretizer(n_bins=self.binns, encode='ordinal', strategy='kmeans')
data_t = self.disc.fit_transform(np.array(data[self.features_to_discretize]).reshape(-1,self.len_columns))
# make pandas data frame
data_t = pd.DataFrame(data_t,columns=self.features_to_discretize,index=data.index)
# all these columns are categorical
data_t = data_t.astype(str)
# drop original columns
data.drop(self.features_to_discretize,axis=1,inplace=True)
# add newly created columns
data = pd.concat((data,data_t),axis=1)
return(data)
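# Note on bin counts (illustrative only): the Sturges rule used by np.histogram(bins='sturges') gives
# roughly ceil(log2(n)) + 1 bins, e.g. about 11 bins for 1,000 rows; KBinsDiscretizer then places the
# bin edges with k-means, and the resulting ordinal codes are kept as string / categorical columns.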
# ______________________________________________________________________________________________________________________
# Scaling & Power Transform
class Scaling_and_Power_transformation(BaseEstimator,TransformerMixin):
'''
-Given a data set, applies MinMax, Standard Scaler or Power Transformation (yeo-johnson)
-it is recommended to run Define_dataTypes first
- ignores target variable
Args:
target: string , name of the target variable
function_to_apply: string , default 'zscore' (standard scaler); other options {'minmax','yj','quantile','robust','maxabs'} (min-max, yeo-johnson & quantile transformation, robust and MaxAbs scaler)
'''
def __init__(self,target,function_to_apply='zscore',random_state_quantile=42):
self.target = target
self.function_to_apply = function_to_apply
self.random_state_quantile = random_state_quantile
# self.transform_target = transform_target
# self.ml_usecase = ml_usecase
def fit(self,dataset,y=None):
data=dataset.copy()
# we only want to apply if there are numeric columns
self.numeric_features = data.drop(self.target,axis=1,errors='ignore').select_dtypes(include=["float64",'int64']).columns
if len(self.numeric_features) > 0:
if self.function_to_apply == 'zscore':
self.scale_and_power = StandardScaler()
self.scale_and_power.fit(data[self.numeric_features])
elif self.function_to_apply == 'minmax':
self.scale_and_power = MinMaxScaler()
self.scale_and_power.fit(data[self.numeric_features])
elif self.function_to_apply == 'yj':
self.scale_and_power = PowerTransformer(method='yeo-johnson',standardize=True)
self.scale_and_power.fit(data[self.numeric_features])
elif self.function_to_apply == 'quantile':
self.scale_and_power = QuantileTransformer(random_state=self.random_state_quantile,output_distribution='normal')
self.scale_and_power.fit(data[self.numeric_features])
elif self.function_to_apply == 'robust':
self.scale_and_power = RobustScaler()
self.scale_and_power.fit(data[self.numeric_features])
elif self.function_to_apply == 'maxabs':
self.scale_and_power = MaxAbsScaler()
self.scale_and_power.fit(data[self.numeric_features])
else:
return(None)
else:
return(None)
def transform(self,dataset,y=None):
data = dataset.copy()
if len(self.numeric_features) > 0:
self.data_t = pd.DataFrame(self.scale_and_power.transform(data[self.numeric_features]))
# we need to set the same index as original data
self.data_t.index = data.index
self.data_t.columns = self.numeric_features
for i in self.numeric_features:
data[i]= self.data_t[i]
return(data)
else:
return(data)
def fit_transform(self,dataset,y=None):
data = dataset.copy()
self.fit(data)
# convert target if appropriate
# default behavious is quantile transformer
# if ((self.ml_usecase == 'regression') and (self.transform_target == True)):
# self.scale_and_power_target = QuantileTransformer(random_state=self.random_state_quantile,output_distribution='normal')
# data[self.target]=self.scale_and_power_target.fit_transform(np.array(data[self.target]).reshape(-1,1))
return(self.transform(data))
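# Usage sketch (illustrative only; 'train_df' and 'test_df' are hypothetical):
#   scaler = Scaling_and_Power_transformation(target='target', function_to_apply='yj')
#   train = scaler.fit_transform(train_df)  # fits the transformer on the numeric columns only
#   test  = scaler.transform(test_df)       # applies the fitted transformer to the test data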
# ______________________________________________________________________________________________________________________
# Target Transformation
class Target_Transformation(BaseEstimator,TransformerMixin):
'''
- Applies a power transformation (yeo-johnson, Box-Cox) to the target variable (applicable to regression only)
- 'bc' for Box-Cox & 'yj' for yeo-johnson; default is Box-Cox
- if the target contains negative / zero values, yeo-johnson is automatically selected
'''
def __init__(self,target,function_to_apply='bc'):
self.target = target
self.function_to_apply = function_to_apply
if self.function_to_apply == 'bc':
self.function_to_apply = 'box-cox'
else:
self.function_to_apply = 'yeo-johnson'
def fit(self,dataset,y=None):
return(None)
def transform(self,dataset,y=None):
return(dataset)
def fit_transform(self,dataset,y=None):
data = dataset.copy()
# if target has zero or negative values use yj instead
if any(data[self.target]<= 0):
self.function_to_apply = 'yeo-johnson'
# apply transformation
self.p_transform_target = PowerTransformer(method=self.function_to_apply)
data[self.target]=self.p_transform_target.fit_transform(np.array(data[self.target]).reshape(-1,1))
return(data)
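# Usage sketch (illustrative only; 'train_df' is hypothetical). Note that transform() is a no-op here,
# so the target is only transformed on the training data via fit_transform:
#   tt = Target_Transformation(target='target', function_to_apply='bc')
#   train = tt.fit_transform(train_df)  # falls back to yeo-johnson if the target has zero/negative values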
# __________________________________________________________________________________________________________________________
# Time feature extractor
class Make_Time_Features(BaseEstimator,TransformerMixin):
'''
-Given a time feature, it extracts more features
- Only accepts / works where the feature / data type is datetime64[ns]
- full list of features is:
['month','weekday','is_month_end','is_month_start','hour']
- all extracted features are defined as string / object
-it is recommended to run Define_dataTypes first
Args:
time_feature: list of feature names of type datetime64[ns]; default empty/None, in which case it will try to pick up date columns automatically where the data type is datetime64[ns]
list_of_features: list of required features, default value ['month','weekday','is_month_end','is_month_start','hour']
'''
def __init__(self,time_feature=[],list_of_features=['month','weekday','is_month_end','is_month_start','hour']):
self.time_feature = time_feature
self.list_of_features_o = list_of_features
return(None)
def fit(self,data,y=None):
return(None)
def transform(self,dataset,y=None):
data = dataset.copy()
# run fit transform first
# start making features for every column in the time list
for i in self.time_feature:
# make month column if month is chosen
if 'month' in self.list_of_features_o:
data[i+"_month"] = [datetime.date(r).month for r in data[i]]
data[i+"_month"] = data[i+"_month"].apply(str)
# make weekday column if weekday is chosen (0 for Monday, 6 for Sunday)
if 'weekday' in self.list_of_features_o:
data[i+"_weekday"] = [datetime.weekday(r) for r in data[i]]
data[i+"_weekday"] = data[i+"_weekday"].apply(str)