In [311]:
import pandas as pd
import numpy as np
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

#### Dataset содержащий данные о нестабильностях в сетях электроснабжения.
    Описание данных - https://archive.ics.uci.edu/ml/datasets/Electrical+Grid+Stability+Simulated+Data+#

In [312]:
df = pd.read_csv("Data_for_UCI_named.csv")
df.head(3)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,unstable


### Проведем feature engineering
#### Посмотрим на распределение классов:

In [313]:
df['stabf'].value_counts()

unstable    6380
stable      3620
Name: stabf, dtype: int64

#### Приведем поле stabf к бинарному виду, где значение unstable означает не стабилиный, а  stable стабильный.

In [314]:
df.loc[df['stabf']=="unstable",'stabf'] = 0
df.loc[df['stabf']=="stable",'stabf'] = 1
df.head(3)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,0
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,1
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,0


#### Проанализируем Dataset на наличие пропущенных данных и на их соответствие типам данных.

In [315]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  object 
dtypes: float64(13), object(1)
memory usage: 1.1+ MB


Приведем тип данных object, столбца «stabf» к типу int

In [316]:
df['stabf'] = df['stabf'].astype('int')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   tau1    10000 non-null  float64
 1   tau2    10000 non-null  float64
 2   tau3    10000 non-null  float64
 3   tau4    10000 non-null  float64
 4   p1      10000 non-null  float64
 5   p2      10000 non-null  float64
 6   p3      10000 non-null  float64
 7   p4      10000 non-null  float64
 8   g1      10000 non-null  float64
 9   g2      10000 non-null  float64
 10  g3      10000 non-null  float64
 11  g4      10000 non-null  float64
 12  stab    10000 non-null  float64
 13  stabf   10000 non-null  int32  
dtypes: float64(13), int32(1)
memory usage: 1.0 MB


In [331]:
#соберем pipeline
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Класс возвращает выбранный столбец Dataframe. Выбор определяется именем столбца содержащимся в параметре key.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

In [318]:
categorical_columns = list(df.columns[:-1]) #Имена столбцов для обработки в pipeline NumberSelector
final_transformers = list()
for cat_col in categorical_columns:
    pipeline =  Pipeline([('selector', NumberSelector(key=cat_col)),
                          ('standard', StandardScaler())
                         ])
    final_transformers.append((cat_col, pipeline))

In [319]:
#разделим данные на train/test и Удалим признак CustomerId,Exited
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,:-1], df['stabf'], random_state=0)

In [320]:
feats = FeatureUnion(final_transformers)
feature_processing = Pipeline([('feats', feats)])# Итоговый pipeline для обработки данный Dataframe
feature_processing.fit_transform(X_train)

array([[-1.64853458,  0.10761394, -1.5913945 , ..., -0.34985854,
        -1.39504804, -1.42009312],
       [ 0.20738286,  0.25867125,  1.34842558, ..., -1.4503097 ,
        -1.04449708,  1.0875973 ],
       [ 1.70997874,  0.76184697,  0.25611334, ...,  0.43472247,
        -0.85337194,  0.56378675],
       ...,
       [-1.17437952,  0.41854392,  0.82330869, ...,  0.16042152,
        -1.35518334, -0.63353745],
       [-1.60308121,  0.15727679, -1.34852594, ...,  1.2011598 ,
        -0.44667147, -1.96119401],
       [ 1.10139233, -1.26879381,  0.59608872, ...,  1.26963642,
        -1.04238586,  0.38818401]])

#### Соберем результирующий Pipeline

In [321]:
PipelineRandomForestClassifier = Pipeline([('features', feats),
                                           ('classifier', RandomForestClassifier(random_state=42))])

#### Проведем обучение модели «Случайный лес»

In [322]:
PipelineRandomForestClassifier.fit(X_train, y_train)    

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('tau1',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='tau1')),
                                                                 ('standard',
                                                                  StandardScaler())])),
                                                ('tau2',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='tau2')),
                                                                 ('standard',
                                                                  StandardScaler())])),
                                                ('tau3',
                                                 Pipeline(steps=[('selector',
                                            

Делаем предсказание

In [323]:
y_predict = PipelineRandomForestClassifier.predict(X_test)

In [324]:
def evaluate_results(y_test, y_predict):
    """
    Функция рассчитывает соответствующие метрики    
    """
    return {'f1':f1_score(y_test, y_predict),
             'roc':roc_auc_score(y_test, y_predict),
             'rec':recall_score(y_test, y_predict, average='binary'),
             'prc':precision_score(y_test, y_predict, average='binary')}

#### Рассчитаем метрики и сведем их в таблицу.

In [325]:
metrics_model = evaluate_results(y_test, y_predict)
table_res = pd.DataFrame({"RandomForestClassifier":metrics_model})
table_res

Unnamed: 0,RandomForestClassifier
f1,0.999463
prc,0.998927
rec,1.0
roc,0.999681


### Представим данную задачу как задачу lookalike и решим ее методом random negative sampling
    Определим долю позитивных примеров

In [326]:
P = 0.25 # доля позитивных примеров
mod_data = df.copy()
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
np.random.shuffle(pos_ind)
pos_sample_len = int(np.ceil(P * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 905/3620 as positives and unlabeling the rest


Присоединем дополнительный столбец

In [327]:
mod_data['stabf_test'] = 0
mod_data.loc[pos_sample,'stabf_test'] = 1
print('target variable:\n', mod_data['stabf_test'].value_counts())
mod_data.head(3)

target variable:
 0    9095
1     905
Name: stabf_test, dtype: int64


Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf,stabf_test
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,0,0
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,1,0
2,8.971707,8.848428,3.046479,1.214518,3.405158,-1.207456,-1.27721,-0.920492,0.163041,0.766689,0.839444,0.109853,0.003471,0,0


In [328]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['stabf_test']==0][:len(mod_data[mod_data['stabf_test']==1])]
sample_test = mod_data[mod_data['stabf_test']==0][len(mod_data[mod_data['stabf_test']==1]):]
pos_sample = mod_data[mod_data['stabf_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(905, 15) (905, 15)


#### Обучим модель

In [329]:
PipelineRandomForestClassifier.fit(sample_train.iloc[:,:-2], sample_train['stabf_test'])

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('tau1',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='tau1')),
                                                                 ('standard',
                                                                  StandardScaler())])),
                                                ('tau2',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='tau2')),
                                                                 ('standard',
                                                                  StandardScaler())])),
                                                ('tau3',
                                                 Pipeline(steps=[('selector',
                                            

In [330]:
y_predict = PipelineRandomForestClassifier.predict(sample_test.iloc[:,:-2])
metrics_model = evaluate_results(sample_test['stabf'].values, y_predict)
table_res['Random negative sampling']= pd.DataFrame({"RandomForestClassifier":metrics_model})
table_res

Unnamed: 0,RandomForestClassifier,Random negative sampling
f1,0.999463,0.994022
prc,0.998927,1.0
rec,1.0,0.988115
roc,0.999681,0.994057


In [None]:
Модель хорошо справилась с поиском похожих объектов.