In [6]:
import featuretools as ft
from dask import bag
from dask.diagnostics import ProgressBar
import pandas as pd
import utilsLoad
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import os

'0.23.4'

## 部分数据

In [None]:
# Build the entity set by pointing the project loader at the data/ directory.
es = utilsLoad.load_entityset("data/")

In [None]:
# Last expression of the cell, so the notebook displays whatever plot() returns.
es.plot()

In [None]:
# Build the labels for the "Banana" product at a single cutoff time.
# The 60-day training window is the same value later handed to ft.dfs.
label_times = utilsLoad.make_labels(
    es=es,
    product_name="Banana",
    cutoff_time=pd.Timestamp('March 15, 2015'),
    prediction_window=ft.Timedelta("4 weeks"),
    training_window=ft.Timedelta("60 days"),
)
# Preview the first five rows.
label_times.head(5)

In [None]:
# Tally how often each value of the "label" column occurs.
label_times["label"].value_counts()

In [None]:
# Run Deep Feature Synthesis on the "users" entity, passing label_times as the
# cutoff times and the same 60-day training window used to build the labels.
feature_matrix, features = ft.dfs(
    entityset=es,
    target_entity="users",
    cutoff_time=label_times,
    training_window=ft.Timedelta("60 days"),
    verbose=True,
)

# Encode categorical values.
fm_encoded, features_encoded = ft.encode_features(feature_matrix, features)

print("Number of features %s" % len(features_encoded))
fm_encoded.head(10)

In [None]:
# Join the encoded feature matrix with its labels. The notebook only imports
# `utilsLoad` (there is no `utils` name in scope), so call the helper there.
X = utilsLoad.merge_features_labels(fm_encoded, label_times)
# Drop the "user_id" and "time" columns and replace missing values with 0.
# Chained, non-inplace calls keep the merged frame itself unmodified.
X = X.drop(columns=["user_id", "time"]).fillna(0)
# Split the target column off; X keeps the remaining columns.
y = X.pop("label")

In [None]:
# Seed the forest so the reported AUC (and any feature ranking derived from
# this estimator afterwards) is reproducible across kernel restarts.
clf = RandomForestClassifier(n_estimators=400, n_jobs=-1, random_state=0)
# 3-fold cross-validation scored with ROC AUC.
scores = cross_val_score(estimator=clf, X=X, y=y, cv=3,
                         scoring="roc_auc", verbose=True)

# Last expression: mean and standard deviation of the fold scores.
"AUC %.2f +/- %.2f" % (scores.mean(), scores.std())

In [None]:
# Fit on every row, then rank the encoded features with the project helper.
clf.fit(X, y)
# `utils` is never imported in this notebook; the helper module is `utilsLoad`.
# n=20 presumably keeps the 20 most important features -- confirm in utilsLoad.
top_features = utilsLoad.feature_importances(clf, features_encoded, n=20)

In [None]:
# Persist the selected feature definitions under the name "top_features" so
# they can be read back later with ft.load_features.
ft.save_features(top_features, "top_features")

## 全部数据（使用 Dask 或 Spark 并行处理）

In [None]:
# Create a dask ProgressBar and register it so that subsequent dask
# computations in this kernel report their progress.
pbar = ProgressBar()
pbar.register()

In [None]:
path = "partitioned_data/"
# os.listdir returns entries in arbitrary order; sort them so the partition
# order (and therefore the order of anything built from it) is deterministic.
dirnames = [os.path.join(path, d) for d in sorted(os.listdir(path))]
# One bag element per partition directory.
b = bag.from_sequence(dirnames)
# `utils` is never imported in this notebook; the loader is the same
# utilsLoad.load_entityset used for the single-directory case.
entity_sets = b.map(utilsLoad.load_entityset)

In [None]:
# Lazily build labels for every partition's entity set. The keyword arguments
# are forwarded by bag.map to the helper for each element.
# `utils` is never imported in this notebook; the helper module is `utilsLoad`.
label_times = entity_sets.map(
    utilsLoad.dask_make_labels,
    product_name="Banana",
    cutoff_time=pd.Timestamp('March 1, 2015'),
    prediction_window=ft.Timedelta("4 weeks"),
    training_window=ft.Timedelta("60 days"),
)
# Last expression: show the (still lazy) bag.
label_times

In [None]:
# Reload the feature definitions saved under "top_features".
top_features = ft.load_features("top_features")
# Lazily compute a feature matrix per partition using only those features.
# `utils` is never imported in this notebook; the helper module is `utilsLoad`.
feature_matrices = label_times.map(
    utilsLoad.calculate_feature_matrix,
    features=top_features,
)

In [None]:
# Trigger the dask computation; this is where the per-partition work runs.
fms_out = feature_matrices.compute()
# Stack the per-partition results into a single pandas object.
X = pd.concat(fms_out)

In [None]:
# Remove the "user_id" and "time" columns, then replace missing values with 0.
X = X.drop(columns=["user_id", "time"]).fillna(0)
# Split the target column off; X keeps the remaining columns.
y = X.pop("label")

In [None]:
# Seed the forest so the reported AUC (and any feature ranking derived from
# this estimator afterwards) is reproducible across kernel restarts.
clf = RandomForestClassifier(n_estimators=400, n_jobs=-1, random_state=0)
# 3-fold cross-validation scored with ROC AUC.
scores = cross_val_score(estimator=clf, X=X, y=y, cv=3,
                         scoring="roc_auc", verbose=True)

# Last expression: mean and standard deviation of the fold scores.
"AUC %.2f +/- %.2f" % (scores.mean(), scores.std())

In [None]:
# Fit on every row, then re-rank the loaded features with the project helper.
clf.fit(X, y)
# `utils` is never imported in this notebook; the helper module is `utilsLoad`.
# n=20 presumably keeps the 20 most important features -- confirm in utilsLoad.
top_features = utilsLoad.feature_importances(clf, top_features, n=20)