In [None]:
# Databricks notebook source

# Implementation of Everywhere library to create an events recommendation engine


In [34]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import numpy as np
import pandas as pd
import random
import os
import everywhere as ev
import lightgbm as lgb # Mac users require cmake & libomp to import lightgbm
from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score, ndcg_score
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import ClusterCentroids
from collections import Counter
from dotenv import load_dotenv
from everywhere.datasets.load_data import (
    get_users_data,
    get_events_data,
    get_train_data,
    get_user_friends_data,
    get_event_attendees_data,
)
from everywhere.datasets.feature_extractor import (
    get_friends_attendee_nums,
    get_event_attendee_nums,
)
from sklearn.metrics import (
    accuracy_score, 
    confusion_matrix, 
    roc_auc_score
)
from sklearn.ensemble import RandomForestClassifier


In [2]:
print(f"Everywhere version: {ev.__version__}")


Everywhere version: 0.0.0


## Data prep


### Load Datasets


In [3]:
load_dotenv()  # load environment variables


True

In [4]:
df_users = get_users_data()
print(df_users.shape)
df_users.head()


Found users in local 🎉
(38209, 7)


Unnamed: 0,user_id,locale,birthyear,gender,joinedAt,location,timezone
0,3197468391,id_ID,1993,male,2012-10-02T06:40:55.524Z,Medan Indonesia,480.0
1,3537982273,id_ID,1992,male,2012-09-29T18:03:12.111Z,Medan Indonesia,420.0
2,823183725,en_US,1975,male,2012-10-06T03:14:07.149Z,Stratford Ontario,-240.0
3,1872223848,en_US,1991,female,2012-11-04T08:59:43.783Z,Tehran Iran,210.0
4,3429017717,id_ID,1995,female,2012-09-10T16:06:53.132Z,,420.0


In [5]:
df_user_friends = get_user_friends_data()
print(df_user_friends.shape)
df_user_friends.head()


Found user_friends in local 🎉
(38202, 2)


Unnamed: 0,user,friends
0,3197468391,1346449342 3873244116 4226080662 1222907620 54...
1,3537982273,1491560444 395798035 2036380346 899375619 3534...
2,823183725,1484954627 1950387873 1652977611 4185960823 42...
3,1872223848,83361640 723814682 557944478 1724049724 253059...
4,3429017717,4253303705 2130310957 1838389374 3928735761 71...


In [6]:
df_event_attendees = get_event_attendees_data()
print(df_event_attendees.shape)
df_event_attendees.head()


Found event_attendees in local 🎉
(24144, 5)


Unnamed: 0,event,yes,maybe,invited,no
0,1159822043,1975964455 252302513 4226086795 3805886383 142...,2733420590 517546982 1350834692 532087573 5831...,1723091036 3795873583 4109144917 3560622906 31...,3575574655 1077296663
1,686467261,2394228942 2686116898 1056558062 3792942231 41...,1498184352 645689144 3770076778 331335845 4239...,1788073374 733302094 1830571649 676508092 7081...,
2,1186208412,,3320380166 3810793697,1379121209 440668682,1728988561 2950720854
3,2621578336,,,,
4,855842686,2406118796 3550897984 294255260 1125817077 109...,2671721559 1761448345 2356975806 2666669465 10...,1518670705 880919237 2326414227 2673818347 332...,3500235232


In [7]:
df_train = get_train_data()
print(df_train.shape)
df_train.head()


Found train in local 🎉
(15398, 6)


Unnamed: 0,user,event,invited,timestamp,interested,not_interested
0,3044012,1918771225,0,2012-10-02 15:53:05.754000+00:00,0,0
1,3044012,1502284248,0,2012-10-02 15:53:05.754000+00:00,0,0
2,3044012,2529072432,0,2012-10-02 15:53:05.754000+00:00,1,0
3,3044012,3072478280,0,2012-10-02 15:53:05.754000+00:00,0,0
4,3044012,1390707377,0,2012-10-02 15:53:05.754000+00:00,0,0


In [8]:
df_events = get_events_data()
print(df_events.shape)
df_events.head()


Found events in local 🎉
(3137972, 110)


Unnamed: 0,event_id,user_id,start_time,city,state,zip,country,lat,lng,c_1,...,c_92,c_93,c_94,c_95,c_96,c_97,c_98,c_99,c_100,c_other
0,684921758,3647864012,2012-10-31T00:00:00.001Z,,,,,,,2,...,0,1,0,0,0,0,0,0,0,9
1,244999119,3476440521,2012-11-03T00:00:00.001Z,,,,,,,2,...,0,0,0,0,0,0,0,0,0,7
2,3928440935,517514445,2012-11-05T00:00:00.001Z,,,,,,,0,...,0,0,0,0,0,0,0,0,0,12
3,2582345152,781585781,2012-10-30T00:00:00.001Z,,,,,,,1,...,0,0,0,0,0,0,0,0,0,8
4,1051165850,1016098580,2012-09-27T00:00:00.001Z,,,,,,,1,...,0,0,0,0,0,0,0,0,0,9


### Data Engineering


In [9]:
df_train = get_friends_attendee_nums(df_train, df_user_friends, df_event_attendees)
df_train


Unnamed: 0,user,event,invited,timestamp,interested,not_interested,friends,friends_attending,friends_not_attending,friends_maybe_attending,friends_invited
0,3044012,1918771225,0,2012-10-02 15:53:05.754000+00:00,0,0,862,0,3,0,3
1,3044012,1502284248,0,2012-10-02 15:53:05.754000+00:00,0,0,862,0,0,0,1
2,3044012,2529072432,0,2012-10-02 15:53:05.754000+00:00,1,0,862,0,0,0,2
3,3044012,3072478280,0,2012-10-02 15:53:05.754000+00:00,0,0,862,1,0,0,0
4,3044012,1390707377,0,2012-10-02 15:53:05.754000+00:00,0,0,862,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...
15393,4293103086,2750873665,0,2012-12-08 03:59:43.169000+00:00,0,0,480,0,0,1,0
15394,4293103086,4084655790,0,2012-12-08 03:59:43.169000+00:00,0,0,480,1,0,0,1
15395,4293103086,598708806,0,2012-12-08 03:59:43.169000+00:00,0,0,480,1,1,1,3
15396,4293103086,604179853,0,2012-12-08 03:59:43.169000+00:00,0,0,480,0,1,1,0


In [10]:
df_train = get_event_attendee_nums(df_train, df_event_attendees)
print(df_train.shape)
df_train.head()


(15398, 19)


Unnamed: 0,user,event,invited_x,timestamp,interested,not_interested,friends,friends_attending,friends_not_attending,friends_maybe_attending,friends_invited,yes,maybe,invited_y,no,users_yes,users_no,users_maybe,users_invited_count
0,3044012,1918771225,0,2012-10-02 15:53:05.754000+00:00,0,0,862,0,3,0,3,671846165 3400950561 3622121962 2914089126 427...,3777634824 707808605,243449741 2705495661 2053207729 2228554219 229...,3648238803 3898796217 2892558795 2522528816 65...,8,25,2,23
1,1906220044,1918771225,0,2012-10-01 05:08:48.533000+00:00,0,0,930,1,0,0,0,671846165 3400950561 3622121962 2914089126 427...,3777634824 707808605,243449741 2705495661 2053207729 2228554219 229...,3648238803 3898796217 2892558795 2522528816 65...,8,25,2,23
2,3044012,1502284248,0,2012-10-02 15:53:05.754000+00:00,0,0,862,0,0,0,1,2897626992 3265030005 593212332 1566080870 171...,4221735584 821549713 1573381076 939778463 4110...,2823348035 1461073321 1603391812 1511769431 35...,3928943237 1419223904,10,2,6,122
3,1302145719,1502284248,0,2012-09-30 13:31:33.569000+00:00,0,0,2265,0,0,0,6,2897626992 3265030005 593212332 1566080870 171...,4221735584 821549713 1573381076 939778463 4110...,2823348035 1461073321 1603391812 1511769431 35...,3928943237 1419223904,10,2,6,122
4,3194014105,1502284248,0,2012-10-01 17:20:14.536000+00:00,0,0,619,0,0,0,0,2897626992 3265030005 593212332 1566080870 171...,4221735584 821549713 1573381076 939778463 4110...,2823348035 1461073321 1603391812 1511769431 35...,3928943237 1419223904,10,2,6,122


In [11]:
df_events_clean = df_events.drop(
    columns=["user_id", "start_time", "city", "state", "zip", "country", "lat", "lng"]
)
# Merge with events
df_train = pd.merge(
    df_train, df_events_clean, how="inner", left_on="event", right_on="event_id"
)
df_train = df_train.drop(columns=["invited_y", "timestamp", "yes", "maybe", "no", "not_interested"])
df_train


Unnamed: 0,user,event,invited_x,interested,friends,friends_attending,friends_not_attending,friends_maybe_attending,friends_invited,users_yes,...,c_92,c_93,c_94,c_95,c_96,c_97,c_98,c_99,c_100,c_other
0,3044012,1918771225,0,0,862,0,3,0,3,8,...,0,0,0,0,0,0,0,0,0,2
1,1906220044,1918771225,0,0,930,1,0,0,0,8,...,0,0,0,0,0,0,0,0,0,2
2,3044012,1502284248,0,0,862,0,0,0,1,10,...,0,0,0,0,0,0,0,0,0,24
3,1302145719,1502284248,0,0,2265,0,0,0,6,10,...,0,0,0,0,0,0,0,0,0,24
4,3194014105,1502284248,0,0,619,0,0,0,0,10,...,0,0,0,0,0,0,0,0,0,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15393,4293103086,2750873665,0,0,480,0,0,1,0,111,...,0,0,0,0,0,0,0,0,0,80
15394,4293103086,4084655790,0,0,480,1,0,0,1,37,...,0,0,0,0,0,0,0,0,0,80
15395,4293103086,598708806,0,0,480,1,1,1,3,181,...,0,0,0,0,0,0,0,0,0,60
15396,4293103086,604179853,0,0,480,0,1,1,0,436,...,0,0,0,0,0,0,0,0,0,24


# First Aproach With Balancing Datasets

## Balance dataset - Centroid Aproach


In [226]:
data = df_train.copy()
X = data.drop("interested", axis=1)
y = data["interested"]
print("Original dataset shape {}".format(Counter(y)))
cc = ClusterCentroids(random_state=42)
X_res, y_res = cc.fit_resample(X, y)
print("Resampled dataset shape {}".format(Counter(y_res)))


Original dataset shape Counter({0: 11267, 1: 4131})




KeyboardInterrupt: 

In [None]:
# shuffle the data
final = pd.merge(X_res, y_res, left_index=True, right_index=True)
final = final.iloc[np.random.permutation(final.index)].reset_index(drop=True)
final


Unnamed: 0,user,event,invited_x,not_interested,friends,friends_attending,friends_not_attending,friends_maybe_attending,friends_invited,users_yes,...,c_93,c_94,c_95,c_96,c_97,c_98,c_99,c_100,c_other,interested
0,4024053152,910169576,0,0,1482,2,0,1,42,92,...,0,0,0,0,0,0,0,0,50,1
1,1959680039,1151885141,0,0,397,8,0,0,6,19,...,0,0,0,0,0,0,0,0,11,0
2,4024104633,1509550896,0,0,6,0,0,0,1,18,...,0,1,0,0,0,1,0,1,43,1
3,3594731327,1195636083,0,0,2484,1,0,0,0,49,...,0,0,0,0,0,0,0,0,78,1
4,1374205441,4002798045,0,0,96,0,0,1,6,451,...,0,0,0,0,0,0,0,0,48,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8257,4199616576,955398943,0,0,1534,0,0,0,4,621,...,0,0,1,0,0,0,0,0,37,1
8258,3200119975,2453993629,0,0,786,0,0,0,1,73,...,0,0,0,0,1,0,0,0,223,1
8259,2081393559,1896310801,0,0,816,0,0,0,1,26,...,0,0,0,0,0,0,0,0,9,1
8260,1553019839,2615984474,0,0,3684,2,1,2,18,48,...,0,0,0,0,0,0,0,1,187,1


## Model & Evaluations


In [12]:
#without balance
final= df_train.copy()

In [13]:
# split train and test
X = final.drop("interested", axis=1)
y = final["interested"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [14]:
# random forest
rf = RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# with probabilities
y_pred_proba = rf.predict_proba(X_test)[:, 1]


In [31]:
# accuracy
print("Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred) * 100.0))
# confusion matrix
print(confusion_matrix(y_test, y_pred))
# roc_auc_score
print("ROC AUC score: %.2f%%" % (roc_auc_score(y_test, y_pred_proba) * 100.0))

precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")
# calculate recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")
# calculate F1-score
f1 = f1_score(y_test, y_pred)
print(f"f1: {f1}")
# calculate mean average precision (MAP)
map_score = average_precision_score(y_test, y_pred)
print(f"MAP: {map_score}")
# calculate normalized discounted cumulative gain (NDCG)
ndcg = ndcg_score([y_test], [y_pred])
print(f"NDCG (normalized discounted cumulative gain) : {ndcg}")


Accuracy: 73.59%
[[3702   16]
 [1326   38]]
ROC AUC score: 64.82%
Precision: 0.7037037037037037
Recall: 0.02785923753665689
f1: 0.053596614950634704
MAP: 0.28052554592144036
NDCG (normalized discounted cumulative gain) : 0.8364984052303476


### Implementing LGBM with bagging

In [69]:
# LightGBM with bagging
clf = lgb.LGBMClassifier(bagging_freq=5, bagging_fraction=0.7)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# with probabilities
y_pred_proba = clf.predict_proba(X_test)[:, 1]



#### Model Evaluation


In [70]:
# accuracy
print("Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred) * 100.0))
# confusion matrix
print(confusion_matrix(y_test, y_pred))
# roc_auc_score
print("ROC AUC score: %.2f%%" % (roc_auc_score(y_test, y_pred_proba) * 100.0))
# calculate precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")
# calculate recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")
# calculate F1-score
f1 = f1_score(y_test, y_pred)
print(f"f1: {f1}")
# calculate mean average precision (MAP)
map_score = average_precision_score(y_test, y_pred)
print(f"MAP: {map_score}")
# calculate normalized discounted cumulative gain (NDCG)
ndcg = ndcg_score([y_test], [y_pred])
print(f"NDCG (normalized discounted cumulative gain) : {ndcg}")

Accuracy: 82.25%
[[1100  241]
 [ 243 1143]]
ROC AUC score: 89.30%
Precision: 0.8258670520231214
Recall: 0.8246753246753247
f1: 0.8252707581227438
MAP: 0.7701810901569099
NDCG (normalized discounted cumulative gain) : 0.9625357875986742


### Implementing LGBM without bagging

In [53]:
# Lightgbm
clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# with probabilities
y_pred_proba = clf.predict_proba(X_test)[:, 1]


#### Model Evaluation


In [54]:
# WE NEED TO ADD THOSE TO THE ENVIRONEMENT 
from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score, ndcg_score
# accuracy
print("Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred) * 100.0))
# confusion matrix
print(confusion_matrix(y_test, y_pred))
# roc_auc_score
print("ROC AUC score: %.2f%%" % (roc_auc_score(y_test, y_pred_proba) * 100.0))
# calculate precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")
# calculate recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")
# calculate F1-score
f1 = f1_score(y_test, y_pred)
print(f"f1: {f1}")
# calculate mean average precision (MAP)
map_score = average_precision_score(y_test, y_pred)
print(f"MAP: {map_score}")
# calculate normalized discounted cumulative gain (NDCG)
ndcg = ndcg_score([y_test], [y_pred])
print(f"NDCG (normalized discounted cumulative gain) : {ndcg}")

Accuracy: 80.60%
[[1050  307]
 [ 222 1148]]
ROC AUC score: 87.93%
Precision: 0.7890034364261168
Recall: 0.8379562043795621
f1: 0.8127433628318583
MAP: 0.7425584656441414
NDCG (normalized discounted cumulative gain) : 0.9558480806076763


## Balance Data Set using Random UnderSampler 

In [55]:
data = df_train.copy()
X = data.drop("interested", axis=1)
y = data["interested"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
print("Original dataset shape {}".format(Counter(y)))
# define the random under sampler with strategy 1 , meaning there will be the same amount of interrested = 1 and 0
rus = RandomUnderSampler(sampling_strategy=1)
# fit and transform the data
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
print("Resampled dataset shape {}".format(Counter(y_resampled)))

Original dataset shape Counter({0: 11267, 1: 4131})
Resampled dataset shape Counter({0: 2767, 1: 2767})


In [56]:
# LightGBM with bagging
clf = lgb.LGBMClassifier(bagging_freq=5, bagging_fraction=0.7)
clf.fit(X_resampled, y_resampled)
y_pred = clf.predict(X_test)

# with probabilities
y_pred_proba = clf.predict_proba(X_test)[:, 1]



In [57]:
# accuracy
print("Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred) * 100.0))
# confusion matrix
print(confusion_matrix(y_test, y_pred))
# roc_auc_score
print("ROC AUC score: %.2f%%" % (roc_auc_score(y_test, y_pred_proba) * 100.0))
# calculate precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")
# calculate recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")
# calculate F1-score
f1 = f1_score(y_test, y_pred)
print(f"f1: {f1}")
# calculate mean average precision (MAP)
map_score = average_precision_score(y_test, y_pred)
print(f"MAP: {map_score}")
# calculate normalized discounted cumulative gain (NDCG)
ndcg = ndcg_score([y_test], [y_pred])
print(f"NDCG (normalized discounted cumulative gain) : {ndcg}")

Accuracy: 63.44%
[[2369 1349]
 [ 509  855]]
ROC AUC score: 69.25%
Precision: 0.3879310344827586
Recall: 0.6268328445747801
f1: 0.4792600896860987
MAP: 0.3433253321829012
NDCG (normalized discounted cumulative gain) : 0.8503260304471508


## Balanced Data Set using weights on LIGHTGBM

In [58]:
data = df_train.copy()
X = data.drop("interested", axis=1)
y = data["interested"]
# Calculate class weights
# split train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)


class_weights = len(y_train) / (2 * np.bincount(y_train))
class_weights={0:class_weights[0],1:class_weights[1]}
clf = lgb.LGBMClassifier(class_weight=class_weights, bagging_freq=5, bagging_fraction=0.7)
clf.fit(X_train, y_train)

# Predict the labels of the test set
y_pred = clf.predict(X_test)
# with probabilities
y_pred_proba = clf.predict_proba(X_test)[:, 1]



In [59]:
# accuracy
print("Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred) * 100.0))
# confusion matrix
print(confusion_matrix(y_test, y_pred))
# roc_auc_score
print("ROC AUC score: %.2f%%" % (roc_auc_score(y_test, y_pred_proba) * 100.0))
# calculate precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")
# calculate recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")
# calculate F1-score
f1 = f1_score(y_test, y_pred)
print(f"f1 : {f1}")
# calculate mean average precision (MAP)
map_score = average_precision_score(y_test, y_pred)
print(f"MAP: {map_score}")
# calculate normalized discounted cumulative gain (NDCG)
ndcg = ndcg_score([y_test], [y_pred])
print(f"NDCG (normalized discounted cumulative gain) : {ndcg}")

Accuracy: 68.28%
[[2711 1007]
 [ 605  759]]
ROC AUC score: 70.35%
Precision: 0.4297848244620612
Recall: 0.5564516129032258
f1 : 0.4849840255591054
MAP: 0.3582020778208628
NDCG (normalized discounted cumulative gain) : 0.85901225098661


#### Data-Set Not Balanced LIGHTGBM

In [60]:
data = df_train.copy()
X = data.drop("interested", axis=1)
y = data["interested"]
# Calculate class weights
# split train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)


class_weights = len(y_train) / (2 * np.bincount(y_train))

clf = lgb.LGBMClassifier(bagging_freq=5, bagging_fraction=0.7)
clf.fit(X_train, y_train)

# Predict the labels of the test set
y_pred = clf.predict(X_test)
# with probabilities
y_pred_proba = clf.predict_proba(X_test)[:, 1]



In [61]:
# accuracy
print("Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred) * 100.0))
# confusion matrix
print(confusion_matrix(y_test, y_pred))
# roc_auc_score
print("ROC AUC score: %.2f%%" % (roc_auc_score(y_test, y_pred_proba) * 100.0))
# calculate precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")
# calculate recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")
# calculate F1-score
f1 = f1_score(y_test, y_pred)
print(f"f1: {f1}")
# calculate mean average precision (MAP)
map_score = average_precision_score(y_test, y_pred)
print(f"MAP: {map_score}")
# calculate normalized discounted cumulative gain (NDCG)
ndcg = ndcg_score([y_test], [y_pred])
print(f"NDCG (normalized discounted cumulative gain) : {ndcg}")

Accuracy: 74.66%
[[3487  231]
 [1057  307]]
ROC AUC score: 71.01%
Precision: 0.570631970260223
Recall: 0.2250733137829912
f1: 0.3228180862250263
MAP: 0.33642300921323914
NDCG (normalized discounted cumulative gain) : 0.8637738190347137


# Recommendation

## Option 1

Take the events in the test data( X_test) set as the future events on which we will make recommendations.

In [15]:
futureEvents = X_test.copy()
# add the predictions to the dataframe as probabilities
futureEvents["predictions"] = y_pred_proba
# take the info we are interested in
futureEvents= futureEvents[["user", "event" ,"predictions"]]


In [16]:
#a list of the users that are in that df
userEventList= futureEvents.user.unique()

In [17]:
#take a random user 
import random
user = random.choice(userEventList)
futureEvents[futureEvents.user == user].sort_values(by="predictions", ascending=False)

Unnamed: 0,user,event,predictions
41,605903903,2529072432,0.468848
6693,605903903,1872758613,0.278818
6692,605903903,30650936,0.209585


## Option 2

Use (test.csv + X_test) as the future events

In [18]:
test= pd.read_csv("../local_data/test.csv")
print(test.shape)
test.head()

(10237, 4)


Unnamed: 0,user,event,invited,timestamp
0,1776192,2877501688,0,2012-11-30 11:39:01.230000+00:00
1,1776192,3025444328,0,2012-11-30 11:39:01.230000+00:00
2,1776192,4078218285,0,2012-11-30 11:39:01.230000+00:00
3,1776192,1024025121,0,2012-11-30 11:39:01.230000+00:00
4,1776192,2972428928,0,2012-11-30 11:39:21.985000+00:00


In [19]:
#change the users in test to the ones in futureEvents
test.user = test.user.apply(lambda x: x if x in userEventList else random.choice(userEventList))
test.head()

Unnamed: 0,user,event,invited,timestamp
0,3131909049,2877501688,0,2012-11-30 11:39:01.230000+00:00
1,2673355078,3025444328,0,2012-11-30 11:39:01.230000+00:00
2,3377526627,4078218285,0,2012-11-30 11:39:01.230000+00:00
3,2378242378,1024025121,0,2012-11-30 11:39:01.230000+00:00
4,121528378,2972428928,0,2012-11-30 11:39:21.985000+00:00


In [20]:
df_FE = get_friends_attendee_nums(test, df_user_friends, df_event_attendees)
df_FE

Unnamed: 0,user,event,invited,timestamp,friends,friends_attending,friends_not_attending,friends_maybe_attending,friends_invited
0,3131909049,2877501688,0,2012-11-30 11:39:01.230000+00:00,885,0,0,0,0
1,3131909049,2887226822,0,2012-10-31 23:45:24.221000+00:00,885,0,0,0,0
2,2673355078,3025444328,0,2012-11-30 11:39:01.230000+00:00,899,0,0,0,0
3,2673355078,2828969498,0,2012-10-29 05:37:06.773000+00:00,899,0,0,0,0
4,2673355078,787628032,1,2012-11-08 21:15:34.761000+00:00,899,0,0,0,0
...,...,...,...,...,...,...,...,...,...
10232,3833491880,771676713,0,2012-10-23 21:53:15.318000+00:00,1009,1,0,0,9
10233,3833491880,1704179171,0,2012-10-27 11:26:31.303000+00:00,1009,0,0,0,0
10234,1176550521,695330828,0,2012-11-10 07:06:25.899000+00:00,2206,0,0,0,0
10235,2006663136,1568177677,0,2012-10-30 12:27:56.101000+00:00,339,0,0,0,0


In [21]:
df_FE = get_event_attendee_nums(df_FE, df_event_attendees)
print(df_FE.shape)
df_FE.head()

(10237, 17)


Unnamed: 0,user,event,invited_x,timestamp,friends,friends_attending,friends_not_attending,friends_maybe_attending,friends_invited,yes,maybe,invited_y,no,users_yes,users_no,users_maybe,users_invited_count
0,3131909049,2877501688,0,2012-11-30 11:39:01.230000+00:00,885,0,0,0,0,2532906069 4229871354 1047349682 2489056284 33...,357306026 3230138404 39482604 4061350910 42551...,2808011777 2590224671 3645689275 3559244521 14...,,61,0,19,27
1,3131909049,2887226822,0,2012-10-31 23:45:24.221000+00:00,885,0,0,0,0,3701190247 1766723247 293422467 2159959540 191...,3796947225 678183115 103079719 492817173 35323...,4251775106 161899142 2283109883 1531855342 230...,4254586212,31,1,36,94
2,2673355078,3025444328,0,2012-11-30 11:39:01.230000+00:00,899,0,0,0,0,1395337575 1656012408 928666438 688643300 2948...,2654039819 283086056 38342381 3595068697 14494...,1837706333 3389983109 1483668942 1680181747 42...,530974838 1288190384 3062558961 3466028827 119...,18,5,10,71
3,3508414044,3025444328,0,2012-11-29 17:40:27.434000+00:00,1565,0,0,0,0,1395337575 1656012408 928666438 688643300 2948...,2654039819 283086056 38342381 3595068697 14494...,1837706333 3389983109 1483668942 1680181747 42...,530974838 1288190384 3062558961 3466028827 119...,18,5,10,71
4,2673355078,2828969498,0,2012-10-29 05:37:06.773000+00:00,899,0,0,0,0,1763718487 3579810243 2216213181 631145387 346...,1102819221 1012620247 2129002889 372578279 140...,3401393786 2080573470 4067405875 112741544 392...,2328516114 4038558739 2927143650 1432169815 35...,151,24,94,870


In [22]:
df_events_clean = df_events.drop(
    columns=["user_id", "start_time", "city", "state", "zip", "country", "lat", "lng"]
)
# Merge with events
df_FE = pd.merge(
    df_FE, df_events_clean, how="inner", left_on="event", right_on="event_id"
)
df_FE = df_FE.drop(columns=["invited_y", "timestamp", "yes", "maybe", "no"])
df_FE

Unnamed: 0,user,event,invited_x,friends,friends_attending,friends_not_attending,friends_maybe_attending,friends_invited,users_yes,users_no,...,c_92,c_93,c_94,c_95,c_96,c_97,c_98,c_99,c_100,c_other
0,3131909049,2877501688,0,885,0,0,0,0,61,0,...,0,0,0,0,0,0,0,0,0,15
1,3131909049,2887226822,0,885,0,0,0,0,31,1,...,0,0,0,0,0,0,0,0,0,136
2,2673355078,3025444328,0,899,0,0,0,0,18,5,...,0,0,0,0,0,0,0,0,0,51
3,3508414044,3025444328,0,1565,0,0,0,0,18,5,...,0,0,0,0,0,0,0,0,0,51
4,2673355078,2828969498,0,899,0,0,0,0,151,24,...,0,0,1,0,0,0,0,0,0,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10232,2129690960,2346511460,0,492,0,0,0,0,110,110,...,0,0,0,0,0,0,0,0,0,78
10233,3442212683,3192741670,0,674,0,0,0,0,7,0,...,0,0,0,0,0,0,0,0,0,10
10234,3833491880,739312213,0,1009,0,0,0,0,40,5,...,0,0,0,0,0,0,0,0,0,46
10235,2006663136,1568177677,0,339,0,0,0,0,5,0,...,0,0,0,0,0,0,0,0,0,5


In [23]:
#concat X_test with df_FE
all_future_events = pd.concat([X_test, df_FE], axis=0)
print(all_future_events.shape)

(15319, 114)


In [24]:
#predict the probabilities of all_future_events
y_pred_proba = rf.predict_proba(all_future_events)[:, 1]
all_future_events["predictions"] = y_pred_proba
all_future_events.head()


Unnamed: 0,user,event,invited_x,friends,friends_attending,friends_not_attending,friends_maybe_attending,friends_invited,users_yes,users_no,...,c_93,c_94,c_95,c_96,c_97,c_98,c_99,c_100,c_other,predictions
411,1749528606,152418051,0,1476,25,22,23,182,842,596,...,0,0,0,0,0,0,0,0,111,0.368376
5076,282487230,571271773,0,526,0,0,0,2,22,13,...,0,0,1,0,0,0,0,0,53,0.267041
8146,1053832375,3728112255,0,1013,0,0,0,1,9,0,...,0,2,0,0,0,0,0,0,5,0.208119
8304,1075126714,2091251917,0,729,0,4,0,85,18,29,...,0,0,0,0,0,0,0,0,66,0.239284
4147,1737346733,2290279491,0,1079,0,1,0,2,163,55,...,0,0,0,0,0,0,0,0,4,0.293526


In [25]:
futureEvents= all_future_events[[ "user", "event" ,"predictions"]]
futureEvents.head()

Unnamed: 0,user,event,predictions
411,1749528606,152418051,0.368376
5076,282487230,571271773,0.267041
8146,1053832375,3728112255,0.208119
8304,1075126714,2091251917,0.239284
4147,1737346733,2290279491,0.293526


In [26]:
#take a random user and sort the events by the predictions
user = random.choice(userEventList)
futureEvents[futureEvents.user == user].sort_values(by="predictions", ascending=False)

Unnamed: 0,user,event,predictions
31,492612031,2529072432,0.468668
801,492612031,2529072432,0.468668
1110,492612031,861118590,0.33572
8575,492612031,619118259,0.306338
6481,492612031,2205661933,0.295697
6248,492612031,126225732,0.280153
6247,492612031,2799591418,0.279474
8570,492612031,4010465283,0.263412
8574,492612031,1394896327,0.2502
8571,492612031,1064885201,0.244881


In [45]:
#change the format of the df
df= futureEvents.copy()
df
# new column which will be a tuple of the event and the probability
df["event_prob"] = df.apply(lambda x: (x["event"], x["predictions"]), axis=1)
# group by user and take the list of the events and probabilities
df = df.groupby("user")["event_prob"].apply(list).reset_index()
df

Unnamed: 0,user,event_prob
0,3044012,"[(1532377761.0, 0.3146773321134172), (19187712..."
1,4236494,"[(4203627753.0, 0.32715005030247246), (7997824..."
2,5574997,"[(1423412400.0, 0.20541125871692062), (1212947..."
3,7547671,"[(3428807031.0, 0.2623703194477715), (15008080..."
4,10329108,"[(40254649.0, 0.2714617326945659), (1287318780..."
...,...,...
1894,4282954307,"[(610427555.0, 0.26428528580519917), (29853017..."
1895,4286635694,"[(1868620616.0, 0.20851395827055985), (3162029..."
1896,4288238269,"[(3783800124.0, 0.23992691700278806), (1347934..."
1897,4291083982,"[(2072616125.0, 0.3491065453863136), (40407649..."
