In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics
from sklearn.impute import SimpleImputer


In [None]:
# Load the raw dataset from a CSV addressed relative to the working directory.
# The bare `df` on the last line renders the frame as the cell output.
df = pd.read_csv('ml_file.csv')
df


In [None]:
# Per-retailer summary: count of non-null transaction_id values and mean of
# `label`, followed by describe() over those two per-retailer statistics.
# The aggregation is named by string: handing the NumPy callable (np.mean) to
# groupby.agg raises a FutureWarning on recent pandas releases.
df.groupby('retailer_id').agg({'transaction_id': 'count', 'label': 'mean'}).describe()


In [None]:
# Descriptive statistics (pandas `describe`) for the raw frame.
df.describe()


In [None]:
# Count of missing values in each column of the raw frame.
df.isnull().sum()


In [None]:
# Drop every column that contains a missing value (axis=1 removes columns, not
# rows), then shuffle the rows and renumber the index from 0.
# The shuffle is seeded so that a Restart & Run All reproduces the same row
# order; the seed matches the random_state used for the classifier below.
df_no_nulls = (
    df.dropna(axis=1)
    .sample(frac=1, random_state=17)
    .reset_index(drop=True)
)
# Sanity check: every per-column count shown here should be 0.
df_no_nulls.isna().sum()


In [None]:
# Inspect the shuffled frame that has had its null-containing columns removed.
df_no_nulls


In [None]:
# Give each distinct retailer a unique integer in [0, n); the partition cell
# compares these integers against 80% of n to split the data by retailer.
# Iterating a bare set() yields an order that is not reproducible between
# interpreter sessions (string hashes are randomised), so the ids are first
# put into a canonical sorted order and then assigned a seeded random
# permutation of positions. Assumes the ids are mutually comparable.
retailer_ids = sorted(set(df_no_nulls.retailer_id))
retailer_positions = np.random.default_rng(17).permutation(len(retailer_ids))
retailer_id_dict = {
    retailer_id: int(position)
    for retailer_id, position in zip(retailer_ids, retailer_positions)
}
# Number of distinct retailers.
n = len(retailer_id_dict)
n


In [None]:
# Translate each row's retailer_id into its integer code via the lookup table
# (every id is a key of retailer_id_dict, which was built from this column).
df_no_nulls['retailer_id_int'] = df_no_nulls['retailer_id'].map(retailer_id_dict)
df_no_nulls['retailer_id_int']


In [None]:
# Rows whose retailer code is below 80% of the retailer count are labelled
# 'train'; all remaining rows are labelled 'test'.
df_no_nulls['partition'] = np.where(
    df_no_nulls['retailer_id_int'] < 0.8 * n,
    'train',
    'test',
)
# Row count per partition.
df_no_nulls.partition.value_counts()


In [None]:
# Identifier, target and bookkeeping columns that are excluded from the
# feature matrices.
drop_cols = ['retailer_id', 'transaction_id', 'label', 'partition', 'retailer_id_int']

# .copy() makes each partition an independent frame, so the prediction columns
# that later cells assign onto `train` / `test` do not raise
# SettingWithCopyWarning.
train = df_no_nulls[df_no_nulls.partition == 'train'].copy()
X_train = train.drop(columns=drop_cols)
y_train = train['label']

test = df_no_nulls[df_no_nulls.partition == 'test'].copy()
X_test = test.drop(columns=drop_cols)
y_test = test['label']

# The two partitions together must account for every row exactly once.
assert len(y_train) + len(y_test) == df_no_nulls.shape[0]
assert X_train.shape[0] + X_test.shape[0] == df_no_nulls.shape[0]


In [None]:
# Label distribution in the training partition, then in the test partition.
print(y_train.value_counts(), y_test.value_counts(), sep='\n')


In [None]:
# No retailer may appear in both the training and the test partition.
assert set(train.retailer_id).isdisjoint(test.retailer_id)


In [None]:
# Value used to build the classifier's class_weight dict ({0: 1 - wts, 1: wts}).
# NOTE(review): hard-coded with no derivation shown; presumably the
# positive-class rate of `label` — confirm, and compute it from the training
# labels if so.
wts = 0.068685


In [None]:
# Fit a random forest on the training features with a fixed seed.
# Class 0 is weighted 1 - wts and class 1 is weighted wts.
# NOTE(review): if wts is the positive-class rate, this down-weights the rarer
# class rather than up-weighting it — confirm that is intended.
rf = RandomForestClassifier(
    class_weight={0: 1 - wts, 1: wts},
    random_state=17,
)
rf.fit(X_train, y_train)


In [None]:
# Pair each training column with the fitted forest's importance score and list
# them from most to least important.
feature_imp = (
    pd.DataFrame({'feature': X_train.columns, 'importance': rf.feature_importances_})
    .sort_values(by='importance', ascending=False)
)
feature_imp


In [None]:
# Hard class predictions for the training rows, stored alongside the labels,
# followed by how many rows were assigned to each class.
train['yhat'] = rf.predict(X=X_train)
train.yhat.value_counts()


In [None]:
# Predicted probability for the training rows: the second column of
# predict_proba, i.e. the class listed second in rf.classes_.
train['yhat_prob'] = rf.predict_proba(X=X_train)[:, 1]
train.yhat_prob.describe()


In [None]:
# Training ROC AUC computed from the hard class predictions (not the
# probabilities), so it reflects a single operating point.
sklearn.metrics.roc_auc_score(y_true=train.label, y_score=train.yhat)


In [None]:
# Training ROC AUC computed from the predicted probabilities.
sklearn.metrics.roc_auc_score(y_true=train.label, y_score=train.yhat_prob)


In [None]:
# Hard class predictions for the held-out rows, followed by how many rows were
# assigned to each class.
test['yhat'] = rf.predict(X=X_test)
test.yhat.value_counts()


In [None]:
# Predicted probability for the held-out rows: the second column of
# predict_proba, i.e. the class listed second in rf.classes_.
test['yhat_prob'] = rf.predict_proba(X=X_test)[:, 1]
test.yhat_prob.describe()


In [None]:
# Held-out ROC AUC computed from the hard class predictions (not the
# probabilities), so it reflects a single operating point.
sklearn.metrics.roc_auc_score(y_true=test.label, y_score=test.yhat)


In [None]:
# Held-out ROC AUC computed from the predicted probabilities.
sklearn.metrics.roc_auc_score(y_true=test.label, y_score=test.yhat_prob)


In [None]:
# Inspect the held-out frame, which now carries the prediction columns added
# by the cells above.
test


In [None]:
# Ten held-out retailers with the largest per-retailer sum of `yhat`.
# The aggregation is named by string: handing the builtin `sum` to groupby.agg
# raises a FutureWarning on recent pandas releases.
test.groupby('retailer_id').agg({'yhat': 'sum'}).sort_values('yhat', ascending=False).head(10)


In [None]:
# Ten held-out retailers with the highest mean predicted probability.
# The aggregation is named by string: handing the NumPy callable (np.mean) to
# groupby.agg raises a FutureWarning on recent pandas releases.
test.groupby('retailer_id').agg({'yhat_prob': 'mean'}).sort_values('yhat_prob', ascending=False).head(10)


In [None]:
# Back to the raw frame (null-containing columns still present).
df


In [None]:
# Missing-value count per column of the raw frame.
df.isnull().sum()


In [None]:
# Mean-impute the training features.
# Only the columns the model is scored on (those of X_test) are selected. By
# this point `train` also carries prediction columns added by earlier cells
# (yhat, yhat_prob) that are not listed in drop_cols; dropping drop_cols alone
# would leak the previous model's output into the refit and leave the refit
# model expecting more features than X_test provides.
# NOTE(review): `train` derives from df_no_nulls, whose null-containing columns
# were already removed, so this imputer currently has nothing to fill. If the
# goal is to recover those columns, the features must be rebuilt from `df`.
feature_cols = list(X_test.columns)
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
# Wrapped back into a DataFrame so the refit model is trained with the same
# feature names (and row index) that X_test / train carry.
imputed_train = pd.DataFrame(
    imp_mean.fit_transform(train[feature_cols]),
    columns=feature_cols,
    index=train.index,
)
imputed_train


In [None]:
# Rebind X_train to the imputer output; the cells below refit and score the
# classifier with it. Re-running earlier cells after this point will therefore
# see the imputed matrix, not the original feature frame.
X_train = imputed_train


In [None]:
# Refit the same random-forest configuration (same seed, same class weights)
# on the imputed training matrix; this replaces the earlier `rf`.
rf = RandomForestClassifier(
    class_weight={0: 1 - wts, 1: wts},
    random_state=17,
)
rf.fit(X_train, y_train)

In [None]:
# Hard class predictions of the refit model on the training rows, followed by
# how many rows were assigned to each class.
train['yhat_imputed'] = rf.predict(X=X_train)
train.yhat_imputed.value_counts()


In [None]:
# Refit model's predicted probability on the training rows: the second column
# of predict_proba, i.e. the class listed second in rf.classes_.
train['yhat_prob_imputed'] = rf.predict_proba(X=X_train)[:, 1]
train.yhat_prob_imputed.describe()


In [None]:
# Training ROC AUC of the refit model, computed from hard class predictions.
sklearn.metrics.roc_auc_score(y_true=train['label'], y_score=train['yhat_imputed'])


In [None]:
# Training ROC AUC of the refit model, computed from predicted probabilities.
sklearn.metrics.roc_auc_score(y_true=train['label'], y_score=train['yhat_prob_imputed'])


In [None]:
# Hard class predictions of the refit model on the held-out rows. X_test is
# still the feature frame built in the split cell; it has not been passed
# through the imputer.
test['yhat_imputed'] = rf.predict(X=X_test)
test.yhat_imputed.value_counts()


In [None]:
# Refit model's predicted probability on the held-out rows: the second column
# of predict_proba, i.e. the class listed second in rf.classes_.
test['yhat_prob_imputed'] = rf.predict_proba(X=X_test)[:, 1]
test.yhat_prob_imputed.describe()


In [None]:
# Held-out ROC AUC of the refit model, computed from hard class predictions.
sklearn.metrics.roc_auc_score(y_true=test['label'], y_score=test['yhat_imputed'])


In [None]:
# Held-out ROC AUC of the refit model, computed from predicted probabilities.
sklearn.metrics.roc_auc_score(y_true=test['label'], y_score=test['yhat_prob_imputed'])
