In [1]:
import numpy as np
import pandas as pd
from late_classifier.features.preprocess import DetectionsPreprocessorZTF
from late_classifier.features.extractors import HierarchicalFeaturesComputer
from late_classifier.classifier.models import BaselineRandomForest
from late_classifier.classifier.metrics import kaggle_score, confusion_matrix
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline

## Load and inspect the training set

In [2]:
# Feature columns that are removed from both the training and the test
# feature tables before scaling.
drop_cols = [
    "Mean_1", "Mean_2",
    "delta_mag_fid_1", "delta_mag_fid_2",
    "n_det_1", "n_det_2",
    "n_neg_1", "n_neg_2",
    "n_pos_1", "n_pos_2",
    "mean_mag_1", "mean_mag_2",
]

# Read the label table (the literal string 'Infinity' is parsed as NaN)
# and index it by object id.
train_labels = pd.read_csv(
    'train_labels.csv', sep=',', na_values=['Infinity']
).set_index('challenge_oid')

In [3]:
# Auxiliary WISE table; it is reused later for the test features.
# NOTE(review): the CSV is read without an index column, so the `.loc`
# lookup below selects rows by the default integer labels — confirm these
# line up with the ids used as the feature index.
wise = pd.read_csv("WISE_data_4_hackathon_with_IDs.csv")

# NOTE: unpickling can execute arbitrary code; only load trusted files.
features = pd.read_pickle('train_features.pkl')

# Attach the WISE columns row-by-row, then remove the excluded columns.
wise_for_train = wise.loc[features.index]
features = features.join(wise_for_train).drop(columns=drop_cols)

In [4]:
# Put the labels in the same row order as the features, then re-select the
# features by the resulting label index so both tables are aligned.
feature_ids = features.index.values
train_labels = train_labels.loc[feature_ids]

label_ids = train_labels.index.values
features = features.loc[label_ids]

In [5]:
# Fit the scaler on the training features only; the same fitted `scaler`
# is applied to the test features later on.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(features)

# fit_transform returns a bare array, so restore the row/column labels.
features = pd.DataFrame(
    scaled_values,
    columns=features.columns,
    index=features.index,
)



In [12]:
# Shift every scaled value up by one, then encode missing values as -1 so
# the sentinel sits below the shifted training range.
# NOTE: this cell rebinds `features` from itself, so running it twice
# shifts the data twice — re-run the scaling cell first if you need to
# repeat it.
features = features.add(1).replace(np.nan, -1)

In [13]:
# SMOTE synthesises new samples at random; without a fixed seed the
# resampled training set (and therefore the fitted model and its score)
# changes on every run. Pin the seed so "Restart & Run All" is reproducible.
SMOTE_SEED = 42
smote = SMOTE(random_state=SMOTE_SEED)

# Oversample so the classifier is trained on a class-balanced set.
X_resampled, y_resampled = smote.fit_resample(features, train_labels)



In [14]:
# Build the test feature table with exactly the same steps as the training
# features: join WISE columns, drop excluded columns, scale, shift, fill NaN.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
test_features = pd.read_pickle("test_features.pkl")
test_features = test_features.join(wise.loc[test_features.index])
test_features = test_features.drop(columns=drop_cols)

# Re-use the scaler fitted on the training set (transform only, no refit).
test_features = pd.DataFrame(
    scaler.transform(test_features),
    index=test_features.index,
    columns=test_features.columns,
)

# The training features were shifted by +1 and had NaN replaced with -1
# before the model was fitted; the test features must be encoded the same
# way, otherwise the model sees inputs on a different scale and raw NaNs.
test_features = (test_features + 1).replace(np.nan, -1)

## Train random forest

In [15]:
# Wrap the resampled data in DataFrames carrying the original column
# names of the feature and label tables.
feature_names = features.columns
label_names = train_labels.columns

X_resampled = pd.DataFrame(X_resampled, columns=feature_names)
y_resampled = pd.DataFrame(y_resampled, columns=label_names)

In [17]:
baseline_random_forest = BaselineRandomForest()

# Tune the wrapped classifier before fitting: cap tree depth at 10 and set
# n_jobs to -1.
forest = baseline_random_forest.random_forest_classifier
forest.max_depth = 10
forest.n_jobs = -1

# Fit on the SMOTE-resampled set.
baseline_random_forest.fit(X_resampled, y_resampled)

# Score on the original (non-resampled) training rows. These rows were
# also part of the fitting data, so this is not a held-out estimate.
train_prediction_proba = baseline_random_forest.predict_proba(features)
kaggle_score_value = kaggle_score(train_prediction_proba, train_labels)

In [18]:
# Display the score computed from predictions on the training features
# against the training labels (not a held-out evaluation).
kaggle_score_value

0.566277116984665

In [20]:
# Predict class probabilities for the test set with the fitted model.
test_probs = baseline_random_forest.predict_proba(test_features)

In [23]:
# Name the index so it is written as the "challenge_oid" column header
# when the table is exported.
test_probs.index.name = "challenge_oid"

In [25]:
# Add a constant column holding 1/15 for every row.
# NOTE(review): "Oulier" looks like a typo for "Outlier" — confirm against
# the expected submission header before changing it, since this column
# name ends up in the exported CSV.
# NOTE(review): 1/15 is presumably a uniform prior over 15 classes — TODO confirm.
test_probs["Oulier"] = 1/15

In [26]:
# Write the test-set probabilities (index included) to the submission file.
test_probs.to_csv("smote_rf.csv")