In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import KNNImputer
from sklearn.feature_selection import SelectKBest, f_regression, chi2, SelectPercentile, mutual_info_regression
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.neighbors import LocalOutlierFactor

In [2]:
# Pinned seed for NumPy's global RNG. The commented-out alternative in the
# original was a fresh draw via np.random.randint(2**32).
SEED = 4_426_008
np.random.seed(SEED)

In [3]:
def full_X(x_tr = None, x_ts = None):
  """Stack two feature frames row-wise, keeping only the columns they share.

  An argument left as None falls back to the notebook-level `x_train`
  (for `x_tr`) or `x_test` (for `x_ts`), looked up at call time.
  Rows of `x_tr` come first in the result.
  """
  train_part = x_train if x_tr is None else x_tr
  test_part = x_test if x_ts is None else x_ts
  return pd.concat([train_part, test_part], join="inner")

def full_nans(nans_tr = None, nans_ts = None):
  """Stack two `nans_*` frames row-wise, keeping only the columns they share.

  An argument left as None falls back to the notebook-level `nans_train`
  (for `nans_tr`) or `nans_test` (for `nans_ts`), looked up at call time.
  Presumably these frames are missing-value indicators; confirm against the
  notebook that writes the `nans_*_preprocessed.csv` files.
  """
  train_part = nans_train if nans_tr is None else nans_tr
  test_part = nans_test if nans_ts is None else nans_ts
  return pd.concat([train_part, test_part], join="inner")

def X_with_na(x = None, nans = None):
    """Return `x` with the matching rows of `nans` appended as extra columns.

    Parameters
    ----------
    x : DataFrame, optional
        Feature frame; defaults to `full_X()`.
    nans : DataFrame, optional
        Companion frame indexed like `x`; defaults to `full_nans()`.

    Returns
    -------
    DataFrame
        One row per row of `x`, in `x`'s order, with the columns of `x`
        followed by the columns of `nans`.

    Notes
    -----
    `nans` is aligned to `x`'s index before concatenating. Without this, rows
    that were removed from `x` (e.g. by outlier filtering) but are still
    present in `nans` would reappear as all-NaN feature rows, and the result
    would no longer have `x.shape[0]` rows. When the two indexes are already
    identical, the behaviour is exactly that of a plain column-wise concat.
    """
    if x is None:
        x = full_X()
    if nans is None:
        nans = full_nans()
    if not nans.index.equals(x.index):
        # Keep exactly the rows of x; labels missing from nans become NaN.
        nans = nans.reindex(x.index)
    return pd.concat([x, nans], axis=1)

In [4]:
# Select the data variant: "just_scoring/" when the flag is set, otherwise "testing/".
just_scoring = False
folder = "just_scoring/" if just_scoring else "testing/"
# No-op as written; presumably a hook for appending a sub-folder suffix.
folder += ""
# Input folder read by this notebook and output folder it writes to.
raw = folder + "nan_filled/"
preprocessed = folder + "outlier_detected/"

In [5]:
# Load features and targets from the `raw` folder, indexed by the 'id' column.
x_train, x_test, y_train, y_test = [
    pd.read_csv(raw + csv_name, index_col=['id'])
    for csv_name in (
        'X_train_preprocessed.csv',
        'X_test_preprocessed.csv',
        'y_train_preprocessed.csv',
        'y_test_preprocessed.csv',
    )
]

In [6]:
# Load the companion `nans_*` frames from the same folder, indexed by 'id'.
nans_train, nans_test = [
    pd.read_csv(raw + csv_name, index_col=['id'])
    for csv_name in (
        'nans_train_preprocessed.csv',
        'nans_test_preprocessed.csv',
    )
]

In [7]:
# Fit a single RobustScaler on train and test rows together (full_X()), then
# overwrite every column of both frames with its scaled values.
s = RobustScaler().fit(full_X())
feature_cols = list(x_train.columns)
x_train[feature_cols] = s.transform(x_train)
x_test[feature_cols] = s.transform(x_test)

In [8]:
%%time
# Iterative outlier removal on the training set.
#
# Pass i == 0 uses LocalOutlierFactor, pass i == 1 uses IsolationForest. Each
# detector is fitted on train + test rows (full_X()), but only training rows
# are labelled; rows labelled -1 are dropped from x_train and y_train. The same
# detector is re-applied until a round removes no further training rows.
x_train_old = x_train.copy()

for i in range(2):
    print(i)
    first_round = True
    # Do-while: always run one round, then repeat while the previous round
    # changed the number of training rows.
    while first_round or x_train_old.shape[0] != x_train.shape[0]:
        if first_round:
            first_round = False
        else:
            print(x_train.shape)
            print(x_train_old.shape)
            x_train_old = x_train.copy()

        if i == 1:
            # A fresh random_state is drawn from the seeded global RNG each round.
            model = IsolationForest(n_estimators=5000, random_state=np.random.randint(0, 2**31), n_jobs=-1)
            model.fit(np.array(full_X()))
            is_anomaly = model.predict(np.array(x_train))
        else:
            # Neighbourhood size ~ sqrt(number of pooled rows).
            nei_size = int(np.floor(np.sqrt(full_X().shape[0])))
            # leaf_size must be an integer; a float is rejected by
            # scikit-learn's parameter validation.
            model = LocalOutlierFactor(n_neighbors=nei_size, leaf_size=int(nei_size * 1.5))
            # full_X() lists the training rows first, so the leading
            # x_train.shape[0] labels belong to the training set.
            is_anomaly = model.fit_predict(full_X())[:x_train.shape[0]]

        # Labels are +1 (inlier) / -1 (outlier); keep inliers. Masking
        # positionally avoids writing a temporary column into a sliced frame.
        keep = is_anomaly > 0
        x_train = x_train[keep]
        y_train = y_train[keep]

0




(1211, 828)
(1212, 828)
1
(1208, 828)
(1211, 828)
CPU times: total: 3min 49s
Wall time: 1min 19s


In [9]:
# Consensus filter with contamination = 0.05: a training row is kept only if
# BOTH LocalOutlierFactor (i == 0) and IsolationForest (i == 1) label it an
# inlier. Both detectors are fitted on train + test rows (full_X()).
# NOTE: despite its name, `is_anomaly` is True for rows to KEEP (inliers).
is_anomaly = np.ones(x_train.shape[0], dtype=bool)
for i in range(2):
    if i == 1:
        model = IsolationForest(n_estimators=5000, contamination=0.05, random_state=np.random.randint(0, 2**31), n_jobs=-1)
        model.fit(np.array(full_X()))
        new_anomaly = model.predict(np.array(x_train)) > 0
    else:
        nei_size = int(np.floor(np.sqrt(full_X().shape[0])))
        # leaf_size must be an integer; a float is rejected by scikit-learn's
        # parameter validation.
        model = LocalOutlierFactor(n_neighbors=nei_size, contamination=0.05, leaf_size=int(nei_size * 1.5))
        # Training rows come first in full_X(), so slice off their labels.
        new_anomaly = model.fit_predict(full_X())[:x_train.shape[0]] > 0
    is_anomaly &= new_anomaly
# Positional boolean mask; no temporary 'anomaly' column is needed.
x_train = x_train[is_anomaly]
y_train = y_train[is_anomaly]
print(x_train.shape)

(1111, 828)


In [10]:
# Union filter with contamination = 0.1: a training row is kept if EITHER
# LocalOutlierFactor (i == 0) or IsolationForest (i == 1) labels it an inlier,
# i.e. it is dropped only when both detectors flag it.
# NOTE: despite its name, `is_anomaly` is True for rows to KEEP (inliers).
# NOTE(review): the previous cell intersects the two detectors while this one
# unions them -- confirm that the difference is intended.
is_anomaly = np.zeros(x_train.shape[0], dtype=bool)
for i in range(2):
    if i == 1:
        model = IsolationForest(n_estimators=5000, contamination=0.1, random_state=np.random.randint(0, 2**31), n_jobs=-1)
        model.fit(np.array(full_X()))
        new_anomaly = model.predict(np.array(x_train)) > 0
    else:
        nei_size = int(np.floor(np.sqrt(full_X().shape[0])))
        # leaf_size must be an integer; a float is rejected by scikit-learn's
        # parameter validation.
        model = LocalOutlierFactor(n_neighbors=nei_size, contamination=0.1, leaf_size=int(nei_size * 1.5))
        # Training rows come first in full_X(), so slice off their labels.
        new_anomaly = model.fit_predict(full_X())[:x_train.shape[0]] > 0
    is_anomaly |= new_anomaly
# Positional boolean mask; no temporary 'anomaly' column is needed.
x_train = x_train[is_anomaly]
y_train = y_train[is_anomaly]
print(x_train.shape)

(1087, 828)


In [13]:
# Write the current train and test frames to the `preprocessed` folder,
# labelling the index column "id" so the files can be re-read with index_col=['id'].
for frame, csv_name in (
    (x_train, 'X_train_preprocessed.csv'),
    (x_test, 'X_test_preprocessed.csv'),
    (y_train, 'y_train_preprocessed.csv'),
    (y_test, 'y_test_preprocessed.csv'),
):
    frame.to_csv(preprocessed + csv_name, index_label="id")

In [None]:
%%time
# Variant "auto, Nonan": contamination left at the estimator's default, features
# only (no `nans_*` columns). Averages the +1/-1 labels of `n_attempts`
# independently seeded forests, so each entry ends up in [-1, 1].
n_attempts = 50
fit_matrix = np.array(full_X())      # train + test rows, built once
score_matrix = np.array(x_train)     # rows that receive a vote
is_anomaly = np.zeros(x_train.shape[0])
for _ in range(n_attempts):
    model = IsolationForest(n_estimators=150, random_state=np.random.randint(0, 2**31), n_jobs=-1)
    model.fit(fit_matrix)
    is_anomaly += model.predict(score_matrix) / n_attempts

In [None]:
%%time
# Variant "contamination, Nonan": contamination fixed at 0.2, features only
# (no `nans_*` columns). Averages the +1/-1 labels of `n_attempts`
# independently seeded forests.
n_attempts = 50
fit_matrix = np.array(full_X())      # train + test rows, built once
score_matrix = np.array(x_train)     # rows that receive a vote
is_anomaly4 = np.zeros(x_train.shape[0])
for _ in range(n_attempts):
    model = IsolationForest(n_estimators=150, max_samples='auto', contamination=0.2, random_state=np.random.randint(0, 2**31), n_jobs=-1)
    model.fit(fit_matrix)
    is_anomaly4 += model.predict(score_matrix) / n_attempts

In [None]:
%%time
# Variant "auto, nan": contamination left at the estimator's default, features
# plus the `nans_*` columns. Averages the +1/-1 labels of `n_attempts`
# independently seeded forests.
#
# x_train has had rows removed by the outlier filters above while nans_train
# has not, so the `nans_*` rows are aligned to the surviving feature rows of
# each split before concatenating. Otherwise the scored matrix has more rows
# than `is_anomaly3` (and contains all-NaN feature rows).
n_attempts = 50
train_rows = X_with_na(x_train, nans_train.reindex(x_train.index))
test_rows = X_with_na(x_test, nans_test.reindex(x_test.index))
fit_matrix = np.array(pd.concat([train_rows, test_rows], join="inner"))
score_matrix = np.array(train_rows)
is_anomaly3 = np.zeros(x_train.shape[0])
for _ in range(n_attempts):
    model = IsolationForest(n_estimators=150, random_state=np.random.randint(0, 2**31), n_jobs=-1)
    model.fit(fit_matrix)
    is_anomaly3 += model.predict(score_matrix) / n_attempts

In [None]:
%%time
# Variant "contamination, nan": contamination fixed at 0.2, features plus the
# `nans_*` columns. Averages the +1/-1 labels of `n_attempts` independently
# seeded forests.
#
# x_train has had rows removed by the outlier filters above while nans_train
# has not, so the `nans_*` rows are aligned to the surviving feature rows of
# each split before concatenating. Otherwise the scored matrix has more rows
# than `is_anomaly2` (and contains all-NaN feature rows).
n_attempts = 50
train_rows = X_with_na(x_train, nans_train.reindex(x_train.index))
test_rows = X_with_na(x_test, nans_test.reindex(x_test.index))
fit_matrix = np.array(pd.concat([train_rows, test_rows], join="inner"))
score_matrix = np.array(train_rows)
is_anomaly2 = np.zeros(x_train.shape[0])
for _ in range(n_attempts):
    model = IsolationForest(n_estimators=150, max_samples='auto', contamination=0.2, random_state=np.random.randint(0, 2**31), n_jobs=-1)
    model.fit(fit_matrix)
    is_anomaly2 += model.predict(score_matrix) / n_attempts

In [None]:
%%time
# Single IsolationForest (n_attempts = 1) fitted on the training rows only,
# without test rows or `nans_*` columns, contamination fixed at 0.2.
n_attempts = 1
train_matrix = np.array(x_train)     # used for both fitting and scoring
is_anomaly5 = np.zeros(x_train.shape[0])
for _ in range(n_attempts):
    model = IsolationForest(n_estimators=150, max_samples='auto', contamination=0.2, random_state=np.random.randint(0, 2**31), n_jobs=-1)
    model.fit(train_matrix)
    is_anomaly5 += model.predict(train_matrix) / n_attempts

In [None]:
# The 50 smallest averaged votes in `is_anomaly` (np.sort is ascending);
# lower values mean more forests labelled the row -1.
np.sort(is_anomaly)[:50]

In [None]:
# Index labels of the training rows whose `is_anomaly5` vote is <= 0.
# NOTE(review): the preceding cell inspects `is_anomaly`, and this exact
# expression appears again after the `is_anomaly5` sort cell -- possibly
# `is_anomaly` was intended here; confirm.
x_train.index[is_anomaly5<=0]

In [None]:
# The 50 largest values of `is_anomaly5` (tail of the ascending sort).
np.sort(is_anomaly5)[-50:]

In [None]:
# Index labels of the training rows whose `is_anomaly5` vote is <= 0.
x_train.index[is_anomaly5<=0]

In [None]:
# The 50 smallest averaged votes in `is_anomaly2` (np.sort is ascending).
np.sort(is_anomaly2)[:50]

In [None]:
# Index labels of the training rows whose averaged `is_anomaly2` vote is <= -0.7.
x_train.index[is_anomaly2<=-0.7]

In [None]:
# `is_anomaly2` votes for the rows whose `is_anomaly` vote is <= 0, i.e. a
# cross-check of one voting variant against another.
is_anomaly2[is_anomaly<=0]

In [None]:
# The 50 smallest averaged votes in `is_anomaly3` (np.sort is ascending).
np.sort(is_anomaly3)[:50]

In [None]:
# Index labels of the training rows whose averaged `is_anomaly3` vote is below 1,
# i.e. rows labelled -1 by at least one forest.
x_train.index[is_anomaly3<1]

In [None]:
# The 50 smallest averaged votes in `is_anomaly4` (np.sort is ascending).
np.sort(is_anomaly4)[:50]

In [None]:
# Index labels of the training rows whose averaged `is_anomaly4` vote is <= -0.99.
x_train.index[is_anomaly4<=-0.99]