In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Read the raw dataset from the parent directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='../ub_dataset.csv')
df.head(n=5)

## Numerical Transformations Testing

In [None]:
# Columns in which the value -1 is treated as "missing" and converted to NaN.
columns_fix = ['continent', 'country_code', 'city', 'region_code']

# Assign the result back explicitly. Calling `replace(..., inplace=True)` on
# `df[col]` operates on a column selection (chained assignment): it emits a
# FutureWarning on recent pandas and leaves `df` untouched once Copy-on-Write
# is enabled.
df[columns_fix] = df[columns_fix].replace(-1, np.nan)

In [None]:
# Work on an independent copy of the numeric columns: new columns are added to
# `df_num` further down, and doing that on a plain selection of `df` triggers
# SettingWithCopyWarning and leaves it unclear which frame is modified.
df_num = df[["place_within_tenant", "city", "region_code"]].copy()

In [None]:
# Min-max scale every source column on its own and store the result next to it
# in a new "<column>_mm" column.
# Columns that already carry the "_mm" suffix are skipped so that re-running
# this cell does not scale its own output and create "<column>_mm_mm" columns.
columns_to_scale = [c for c in df_num.columns if not c.endswith("_mm")]

for col in columns_to_scale:
    # MinMaxScaler requires a 2-D array, so the single feature is reshaped to
    # a column vector of shape (n_samples, 1).
    column_2d = df_num[col].values.reshape(-1, 1)
    transformer = MinMaxScaler()
    transformed_data = transformer.fit_transform(column_2d)
    df_num[col + "_mm"] = transformed_data[:, 0]

In [None]:
# Preview the first five rows, including the scaled "_mm" columns.
df_num.head(n=5)

In [None]:
# Preview the last five rows, including the scaled "_mm" columns.
df_num.tail(n=5)

In [None]:
# Show every row whose unscaled place_within_tenant value is greater than 1.
df_num[df_num['place_within_tenant'].gt(1)]

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df_num.describe(percentiles=[0.25, 0.5, 0.75])

# Scoring

In [None]:
from collections import Counter
from time import time

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV

In [None]:
# Restore the persisted model from disk.
# NOTE(review): joblib files are pickle-based; only load files from a trusted source.
model = joblib.load(filename='../final_model.joblib')

In [None]:
# Load the pickled frame used for scoring. This rebinds `df`, replacing the
# frame that was read from the CSV earlier in the notebook.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle(filepath_or_buffer="../training_df")

In [None]:
def get_train_test_sets(df, imbalance_fix, random_state=None):
    """Split ``df`` into train/test sets and optionally rebalance the training set.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'label'`` column, which is used as the target.
        All remaining columns are used as features.
    imbalance_fix : str
        ``'down'`` undersamples the majority class of the training split with
        ``RandomUnderSampler``; ``'up'`` oversamples the training split with
        ``SMOTE``. Any other value returns the training split unchanged.
    random_state : int or None, default None
        Seed forwarded to the split and to the resampler so a run can be
        reproduced. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. Only the training part is ever
        resampled; the test part is returned exactly as produced by the split.
    """
    X = df.drop(columns=['label'])
    y = df['label']  # Labels
    # 70% training and 30% test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=random_state
    )

    if imbalance_fix == 'down':
        action = 'undersampling'
        resampler = RandomUnderSampler(sampling_strategy='majority', random_state=random_state)
    elif imbalance_fix == 'up':
        # The log label used to say "undersampling" here although SMOTE oversamples.
        action = 'oversampling'
        resampler = SMOTE(random_state=random_state)
    else:
        # No (or unrecognised) strategy requested: hand back the plain split.
        return X_train, X_test, y_train, y_test

    print(f"Before {action}: ", Counter(y_train))
    X_train_resampled, y_train_resampled = resampler.fit_resample(X_train, y_train)
    print(f"After {action}: ", Counter(y_train_resampled))
    return X_train_resampled, X_test, y_train_resampled, y_test

In [None]:
# Create the train/test partitions, undersampling the training part.
X_train, X_test, y_train, y_test = get_train_test_sets(df, imbalance_fix='down')

In [None]:
# Score the held-out features with the loaded model and keep the probabilities.
# NOTE(review): presumably one row per test sample and one column per class —
# confirm against the type of the loaded model.
result = model.predict_proba(X_test)

In [None]:
# Second entry of the first prediction.
# NOTE(review): presumably the probability of the second class for the first
# test row; the column order would follow `model.classes_` — confirm.
result[0][1]

In [None]:
# Display the model's predictions for the held-out features.
model.predict(X_test)