In [57]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression, HuberRegressor
from sklearn.model_selection import StratifiedKFold
from scipy.stats import rankdata
import pickle

In [58]:
# Read the three competition files in one pass: labelled training rows,
# unlabelled test rows, and the submission template.
train, test, submission = map(pd.read_csv, (
    '../input/tabular-playground-series-aug-2022/train.csv',
    '../input/tabular-playground-series-aug-2022/test.csv',
    '../input/tabular-playground-series-aug-2022/sample_submission.csv',
))

In [59]:
# Stack train and test so feature engineering and imputation see every row.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of train and test are simply appended, so labels can repeat
# and any label-based lookup or alignment on `data` becomes ambiguous.
data = pd.concat([train, test], ignore_index=True)

# Record which rows were originally missing measurement_3 / measurement_5,
# before any imputation overwrites the NaNs.
data['m3_missing'] = data['measurement_3'].isnull().astype(np.int8)
data['m5_missing'] = data['measurement_5'].isnull().astype(np.int8)

# Replace loading with log(1 + loading).
data['loading'] = np.log1p(data['loading'])

In [60]:
# Columns handed to the KNN imputer: `loading` plus every `measurement*`
# column, kept in the order they appear in the test frame.
feature = [col for col in test.columns if col == 'loading' or col.startswith('measurement')]

# Per product code, the measurement columns used as regression inputs when
# filling missing measurement_17 values. Stored as measurement numbers and
# expanded to the full column names.
fill_dict = {
    code: [f'measurement_{number}' for number in numbers]
    for code, numbers in (
        ('A', (5, 6, 8)),
        ('B', (4, 5, 7)),
        ('C', (5, 7, 8, 9)),
        ('D', (5, 6, 7, 8)),
        ('E', (4, 5, 6, 8)),
        ('F', (4, 5, 6, 7)),
        ('G', (4, 6, 8, 9)),
        ('H', (4, 5, 7, 8, 9)),
        ('I', (3, 7, 8)),
    )
}

# Two-stage imputation, done separately for every product code:
#   1. Fit a HuberRegressor of measurement_17 on that code's columns from
#      fill_dict, using rows where the predictors and the target are all
#      present, then fill measurement_17 on rows where only the target is
#      missing.
#   2. KNN-impute every value still missing in the `feature` columns.
# Train and test rows are imputed together because `data` holds both.
for code in data.product_code.unique():
    code_rows = data.product_code == code
    predictors = fill_dict[code]

    # Row masks are built once on `data` and reused for fitting, predicting
    # and writing back, so the three steps cannot drift apart.
    predictors_present = data[predictors].notnull().all(axis=1)
    target_missing = data['measurement_17'].isnull()
    fit_rows = code_rows & predictors_present & ~target_missing
    fill_rows = code_rows & predictors_present & target_missing

    # predict() raises on an empty frame, so skip the regression step when
    # this product code has no row to fill.
    if fill_rows.any():
        model = HuberRegressor()
        model.fit(data.loc[fit_rows, predictors], data.loc[fit_rows, 'measurement_17'])
        data.loc[fill_rows, 'measurement_17'] = model.predict(data.loc[fill_rows, predictors])

    # Whatever is still missing (including measurement_17 on rows whose
    # predictors were incomplete) is filled from the 5 nearest neighbours
    # within the same product code.
    model2 = KNNImputer(n_neighbors=5)
    data.loc[code_rows, feature] = model2.fit_transform(data.loc[code_rows, feature])

code A has 386 samples to fill nan
KNN imputing code A
code B has 418 samples to fill nan
KNN imputing code B
code C has 391 samples to fill nan
KNN imputing code C
code D has 398 samples to fill nan
KNN imputing code D
code E has 429 samples to fill nan
KNN imputing code E
code F has 420 samples to fill nan
KNN imputing code F
code G has 373 samples to fill nan
KNN imputing code G
code H has 361 samples to fill nan
KNN imputing code H
code I has 377 samples to fill nan
KNN imputing code I


In [61]:
# Undo the earlier concat: rows with a known `failure` label are training
# rows, rows without one are test rows.
train = data.loc[data['failure'].notna()]
X, Y = train.drop(columns=['failure']), train['failure'].astype(int)

# The test frame never carries the (all-NaN) label column.
test = data.loc[data['failure'].isna()].drop(columns=['failure'])

(26570, 28) (20775, 27)


In [62]:
# 5-fold stratified CV of a strongly regularised logistic regression on the
# first feature set.
#   lr1     - out-of-fold failure probabilities for the training rows
#   lr2     - out-of-fold hard class predictions for the training rows
#   lr_test - test-set failure probability averaged over the folds
lr1 = np.zeros(len(train))
lr2 = np.zeros(len(train))
lr_test = np.zeros(len(test))

# Load the feature list. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code - only load this file
# if its origin is trusted.
with open("../input/111111/outfile0.p", "rb") as feature_file:
    select_feature_1 = pickle.load(feature_file)

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
n_folds = kf.get_n_splits()  # keeps the test-set average in sync with n_splits
for train_idx, val_idx in kf.split(X, Y):
    y_train = Y.iloc[train_idx]

    # Fit the scaler on the training fold only so no validation/test
    # statistics leak into the model; only the selected columns are needed,
    # so the full frames are not copied.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(X.iloc[train_idx][select_feature_1])
    x_val = scaler.transform(X.iloc[val_idx][select_feature_1])
    x_test = scaler.transform(test[select_feature_1])

    model = LogisticRegression(max_iter=1000, C=0.0001, penalty='l2', solver='newton-cg')
    model.fit(x_train, y_train)

    lr1[val_idx] = model.predict_proba(x_val)[:, 1]
    lr2[val_idx] = model.predict(x_val)
    lr_test += model.predict_proba(x_test)[:, 1] / n_folds



In [63]:
# Keep the fold-averaged test probabilities of the first feature set.
submission.loc[:, 'lr0'] = lr_test

In [64]:
# 5-fold stratified CV of a strongly regularised logistic regression on the
# second feature set (same folds, since the splitter uses the same seed).
#   lr1     - out-of-fold failure probabilities for the training rows
#   lr2     - out-of-fold hard class predictions for the training rows
#   lr_test - test-set failure probability averaged over the folds
lr1 = np.zeros(len(train))
lr2 = np.zeros(len(train))
lr_test = np.zeros(len(test))

# Load the feature list. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code - only load this file
# if its origin is trusted.
with open("../input/111111/outfile1.p", "rb") as feature_file:
    select_feature_2 = pickle.load(feature_file)

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
n_folds = kf.get_n_splits()  # keeps the test-set average in sync with n_splits
for train_idx, val_idx in kf.split(X, Y):
    y_train = Y.iloc[train_idx]

    # Fit the scaler on the training fold only so no validation/test
    # statistics leak into the model; only the selected columns are needed,
    # so the full frames are not copied.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(X.iloc[train_idx][select_feature_2])
    x_val = scaler.transform(X.iloc[val_idx][select_feature_2])
    x_test = scaler.transform(test[select_feature_2])

    model = LogisticRegression(max_iter=1000, C=0.0001, penalty='l2', solver='newton-cg') # , class_weight='balanced'
    model.fit(x_train, y_train)

    lr1[val_idx] = model.predict_proba(x_val)[:, 1]
    lr2[val_idx] = model.predict(x_val)
    lr_test += model.predict_proba(x_test)[:, 1] / n_folds

In [65]:
# Keep the fold-averaged test probabilities of the second feature set.
submission.loc[:, 'lr1'] = lr_test

In [66]:
# Preview the first rows with both models' probabilities side by side.
submission.head(5)

Unnamed: 0,id,failure,lr0,lr1
0,26570,0.0,0.208906,0.208996
1,26571,0.0,0.201129,0.201224
2,26572,0.0,0.204884,0.204971
3,26573,0.0,0.206612,0.206694
4,26574,0.0,0.23922,0.239314


In [67]:
# Convert each model's probabilities to ranks (ties share the average rank)
# so the two models are blended on a common scale.
for rank_col, prob_col in (('rank0', 'lr0'), ('rank1', 'lr1')):
    submission[rank_col] = rankdata(submission[prob_col])

In [75]:
# Final score: 75% weight on the first model's rank, 25% on the second's.
# The result is a rank blend, not a probability.
submission['failure'] = 0.75 * submission['rank0'] + 0.25 * submission['rank1']

In [76]:
# Preview the blended `failure` score next to its inputs.
submission.head(5)

Unnamed: 0,id,failure,lr0,lr1,rank0,rank1
0,26570,8719.5,0.208906,0.208996,8721.0,8715.0
1,26571,4704.5,0.201129,0.201224,4711.0,4685.0
2,26572,6517.25,0.204884,0.204971,6523.0,6500.0
3,26573,7450.5,0.206612,0.206694,7457.0,7431.0
4,26574,20034.5,0.23922,0.239314,20018.0,20084.0


In [77]:
# Write only the two columns the competition expects; helper columns
# (lr*, rank*) stay out of the file.
submission.to_csv('submission.csv', columns=['id', 'failure'], index=False)