# Implementation of Everywhere library to create an events recommendation engine


In [34]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
import numpy as np
import pandas as pd
import os
import everywhere as ev
import lightgbm as lgb # Mac users require cmake & libomp to import lightgbm
from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score, ndcg_score
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import ClusterCentroids
from collections import Counter
from dotenv import load_dotenv
from everywhere.datasets.load_data import (
    get_users_data,
    get_events_data,
    get_train_data,
    get_user_friends_data,
    get_event_attendees_data,
)
from everywhere.datasets.feature_extractor import (
    get_friends_attendee_nums,
    get_event_attendee_nums,
)
from sklearn.metrics import (
    accuracy_score, 
    confusion_matrix, 
    roc_auc_score
)
from sklearn.ensemble import RandomForestClassifier


In [36]:
print(f"Everywhere version: {ev.__version__}")


Everywhere version: 0.0.0


## Data prep


### Load Datasets


In [37]:
load_dotenv()  # load environment variables


True

In [38]:
df_users = get_users_data()
print(df_users.shape)
df_users.head()


Found users in local 🎉
(38209, 7)


Unnamed: 0,user_id,locale,birthyear,gender,joinedAt,location,timezone
0,3197468391,id_ID,1993,male,2012-10-02T06:40:55.524Z,Medan Indonesia,480.0
1,3537982273,id_ID,1992,male,2012-09-29T18:03:12.111Z,Medan Indonesia,420.0
2,823183725,en_US,1975,male,2012-10-06T03:14:07.149Z,Stratford Ontario,-240.0
3,1872223848,en_US,1991,female,2012-11-04T08:59:43.783Z,Tehran Iran,210.0
4,3429017717,id_ID,1995,female,2012-09-10T16:06:53.132Z,,420.0


In [39]:
df_user_friends = get_user_friends_data()
print(df_user_friends.shape)
df_user_friends.head()


Found user_friends in local 🎉
(38202, 2)


Unnamed: 0,user,friends
0,3197468391,1346449342 3873244116 4226080662 1222907620 54...
1,3537982273,1491560444 395798035 2036380346 899375619 3534...
2,823183725,1484954627 1950387873 1652977611 4185960823 42...
3,1872223848,83361640 723814682 557944478 1724049724 253059...
4,3429017717,4253303705 2130310957 1838389374 3928735761 71...


In [40]:
df_event_attendees = get_event_attendees_data()
print(df_event_attendees.shape)
df_event_attendees.head()


Found event_attendees in local 🎉
(24144, 5)


Unnamed: 0,event,yes,maybe,invited,no
0,1159822043,1975964455 252302513 4226086795 3805886383 142...,2733420590 517546982 1350834692 532087573 5831...,1723091036 3795873583 4109144917 3560622906 31...,3575574655 1077296663
1,686467261,2394228942 2686116898 1056558062 3792942231 41...,1498184352 645689144 3770076778 331335845 4239...,1788073374 733302094 1830571649 676508092 7081...,
2,1186208412,,3320380166 3810793697,1379121209 440668682,1728988561 2950720854
3,2621578336,,,,
4,855842686,2406118796 3550897984 294255260 1125817077 109...,2671721559 1761448345 2356975806 2666669465 10...,1518670705 880919237 2326414227 2673818347 332...,3500235232


In [41]:
df_train = get_train_data()
print(df_train.shape)
df_train.head()


Found train in local 🎉
(15398, 6)


Unnamed: 0,user,event,invited,timestamp,interested,not_interested
0,3044012,1918771225,0,2012-10-02 15:53:05.754000+00:00,0,0
1,3044012,1502284248,0,2012-10-02 15:53:05.754000+00:00,0,0
2,3044012,2529072432,0,2012-10-02 15:53:05.754000+00:00,1,0
3,3044012,3072478280,0,2012-10-02 15:53:05.754000+00:00,0,0
4,3044012,1390707377,0,2012-10-02 15:53:05.754000+00:00,0,0


In [42]:
df_events = get_events_data()
print(df_events.shape)
df_events.head()


Found events in local 🎉
(3137972, 110)


Unnamed: 0,event_id,user_id,start_time,city,state,zip,country,lat,lng,c_1,...,c_92,c_93,c_94,c_95,c_96,c_97,c_98,c_99,c_100,c_other
0,684921758,3647864012,2012-10-31T00:00:00.001Z,,,,,,,2,...,0,1,0,0,0,0,0,0,0,9
1,244999119,3476440521,2012-11-03T00:00:00.001Z,,,,,,,2,...,0,0,0,0,0,0,0,0,0,7
2,3928440935,517514445,2012-11-05T00:00:00.001Z,,,,,,,0,...,0,0,0,0,0,0,0,0,0,12
3,2582345152,781585781,2012-10-30T00:00:00.001Z,,,,,,,1,...,0,0,0,0,0,0,0,0,0,8
4,1051165850,1016098580,2012-09-27T00:00:00.001Z,,,,,,,1,...,0,0,0,0,0,0,0,0,0,9


### Data Engineering


In [43]:
df_train = get_friends_attendee_nums(df_train, df_user_friends, df_event_attendees)
df_train


Unnamed: 0,user,event,invited,timestamp,interested,not_interested,friends,friends_attending,friends_not_attending,friends_maybe_attending,friends_invited
0,3044012,1918771225,0,2012-10-02 15:53:05.754000+00:00,0,0,862,0,3,0,3
1,3044012,1502284248,0,2012-10-02 15:53:05.754000+00:00,0,0,862,0,0,0,1
2,3044012,2529072432,0,2012-10-02 15:53:05.754000+00:00,1,0,862,0,0,0,2
3,3044012,3072478280,0,2012-10-02 15:53:05.754000+00:00,0,0,862,1,0,0,0
4,3044012,1390707377,0,2012-10-02 15:53:05.754000+00:00,0,0,862,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...
15393,4293103086,2750873665,0,2012-12-08 03:59:43.169000+00:00,0,0,480,0,0,1,0
15394,4293103086,4084655790,0,2012-12-08 03:59:43.169000+00:00,0,0,480,1,0,0,1
15395,4293103086,598708806,0,2012-12-08 03:59:43.169000+00:00,0,0,480,1,1,1,3
15396,4293103086,604179853,0,2012-12-08 03:59:43.169000+00:00,0,0,480,0,1,1,0


In [44]:
df_train = get_event_attendee_nums(df_train, df_event_attendees)
print(df_train.shape)
df_train.head()


(15398, 19)


Unnamed: 0,user,event,invited_x,timestamp,interested,not_interested,friends,friends_attending,friends_not_attending,friends_maybe_attending,friends_invited,yes,maybe,invited_y,no,users_yes,users_no,users_maybe,users_invited_count
0,3044012,1918771225,0,2012-10-02 15:53:05.754000+00:00,0,0,862,0,3,0,3,671846165 3400950561 3622121962 2914089126 427...,3777634824 707808605,243449741 2705495661 2053207729 2228554219 229...,3648238803 3898796217 2892558795 2522528816 65...,8,25,2,23
1,1906220044,1918771225,0,2012-10-01 05:08:48.533000+00:00,0,0,930,1,0,0,0,671846165 3400950561 3622121962 2914089126 427...,3777634824 707808605,243449741 2705495661 2053207729 2228554219 229...,3648238803 3898796217 2892558795 2522528816 65...,8,25,2,23
2,3044012,1502284248,0,2012-10-02 15:53:05.754000+00:00,0,0,862,0,0,0,1,2897626992 3265030005 593212332 1566080870 171...,4221735584 821549713 1573381076 939778463 4110...,2823348035 1461073321 1603391812 1511769431 35...,3928943237 1419223904,10,2,6,122
3,1302145719,1502284248,0,2012-09-30 13:31:33.569000+00:00,0,0,2265,0,0,0,6,2897626992 3265030005 593212332 1566080870 171...,4221735584 821549713 1573381076 939778463 4110...,2823348035 1461073321 1603391812 1511769431 35...,3928943237 1419223904,10,2,6,122
4,3194014105,1502284248,0,2012-10-01 17:20:14.536000+00:00,0,0,619,0,0,0,0,2897626992 3265030005 593212332 1566080870 171...,4221735584 821549713 1573381076 939778463 4110...,2823348035 1461073321 1603391812 1511769431 35...,3928943237 1419223904,10,2,6,122


In [45]:
df_events_clean = df_events.drop(
    columns=["user_id", "start_time", "city", "state", "zip", "country", "lat", "lng"]
)
# Merge with events
df_train = pd.merge(
    df_train, df_events_clean, how="inner", left_on="event", right_on="event_id"
)
df_train = df_train.drop(columns=["invited_y", "timestamp", "yes", "maybe", "no"])
df_train


Unnamed: 0,user,event,invited_x,interested,not_interested,friends,friends_attending,friends_not_attending,friends_maybe_attending,friends_invited,...,c_92,c_93,c_94,c_95,c_96,c_97,c_98,c_99,c_100,c_other
0,3044012,1918771225,0,0,0,862,0,3,0,3,...,0,0,0,0,0,0,0,0,0,2
1,1906220044,1918771225,0,0,0,930,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,3044012,1502284248,0,0,0,862,0,0,0,1,...,0,0,0,0,0,0,0,0,0,24
3,1302145719,1502284248,0,0,0,2265,0,0,0,6,...,0,0,0,0,0,0,0,0,0,24
4,3194014105,1502284248,0,0,0,619,0,0,0,0,...,0,0,0,0,0,0,0,0,0,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15393,4293103086,2750873665,0,0,0,480,0,0,1,0,...,0,0,0,0,0,0,0,0,0,80
15394,4293103086,4084655790,0,0,0,480,1,0,0,1,...,0,0,0,0,0,0,0,0,0,80
15395,4293103086,598708806,0,0,0,480,1,1,1,3,...,0,0,0,0,0,0,0,0,0,60
15396,4293103086,604179853,0,0,0,480,0,1,1,0,...,0,0,0,0,0,0,0,0,0,24


# First Aproach With Balancing Datasets

## Balance dataset - Centroid Aproach


In [46]:
data = df_train.copy()
X = data.drop("interested", axis=1)
y = data["interested"]
print("Original dataset shape {}".format(Counter(y)))
cc = ClusterCentroids(random_state=42)
X_res, y_res = cc.fit_resample(X, y)
print("Resampled dataset shape {}".format(Counter(y_res)))


Original dataset shape Counter({0: 11267, 1: 4131})




Resampled dataset shape Counter({0: 4131, 1: 4131})


In [47]:
# shuffle the data
final = pd.merge(X_res, y_res, left_index=True, right_index=True)
final = final.iloc[np.random.permutation(final.index)].reset_index(drop=True)
final


Unnamed: 0,user,event,invited_x,not_interested,friends,friends_attending,friends_not_attending,friends_maybe_attending,friends_invited,users_yes,...,c_93,c_94,c_95,c_96,c_97,c_98,c_99,c_100,c_other,interested
0,1221420614,2278755941,0,0,1345,1,0,0,0,9999,...,2,0,0,0,0,0,0,0,60,1
1,2962725194,1076364848,0,0,1507,3,0,2,16,1144,...,1,0,0,0,0,0,0,0,131,1
2,210296799,213754032,0,0,318,0,0,0,1,17,...,0,0,0,0,0,0,0,0,13,1
3,1841192704,728286874,0,0,840,1,0,0,2,78,...,0,0,0,0,0,0,0,0,10,0
4,1141005979,1269035551,0,0,1342,2,0,2,43,510,...,0,1,0,1,0,0,0,0,36,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8257,2419536569,3512048812,0,0,561,0,0,0,29,23,...,0,0,0,0,0,0,0,0,19,0
8258,110314495,955658889,0,0,1430,2,1,0,18,467,...,0,0,0,0,0,0,0,0,35,0
8259,709445936,581011119,0,0,1541,0,0,0,2,40,...,0,0,0,0,0,0,0,0,1,1
8260,2829320199,4268976285,0,0,765,1,2,0,19,31,...,0,0,0,0,0,0,0,0,191,0


## Model & Evaluations


In [48]:
# split train and test
X = final.drop("interested", axis=1)
y = final["interested"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [49]:
# random forest
rf = RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# with probabilities
y_pred_proba = rf.predict_proba(X_test)[:, 1]


In [50]:
# accuracy
print("Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred) * 100.0))
# confusion matrix
print(confusion_matrix(y_test, y_pred))
# roc_auc_score
print("ROC AUC score: %.2f%%" % (roc_auc_score(y_test, y_pred_proba) * 100.0))

precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")
# calculate recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")
# calculate F1-score
f1 = f1_score(y_test, y_pred)
print(f"f1: {f1}")
# calculate mean average precision (MAP)
map_score = average_precision_score(y_test, y_pred)
print(f"MAP: {map_score}")
# calculate normalized discounted cumulative gain (NDCG)
ndcg = ndcg_score([y_test], [y_pred])
print(f"NDCG (normalized discounted cumulative gain) : {ndcg}")


Accuracy: 70.77%
[[1046  311]
 [ 486  884]]
ROC AUC score: 80.45%
Precision: 0.7397489539748954
Recall: 0.6452554744525547
f1: 0.6892787524366472
MAP: 0.6555448840550304
NDCG (normalized discounted cumulative gain) : 0.9411446994877729


### Implementing LGBM with bagging

In [51]:
# LightGBM with bagging
clf = lgb.LGBMClassifier(bagging_freq=5, bagging_fraction=0.7)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# with probabilities
y_pred_proba = clf.predict_proba(X_test)[:, 1]



#### Model Evaluation


In [52]:
# accuracy
print("Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred) * 100.0))
# confusion matrix
print(confusion_matrix(y_test, y_pred))
# roc_auc_score
print("ROC AUC score: %.2f%%" % (roc_auc_score(y_test, y_pred_proba) * 100.0))
# calculate precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")
# calculate recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")
# calculate F1-score
f1 = f1_score(y_test, y_pred)
print(f"f1: {f1}")
# calculate mean average precision (MAP)
map_score = average_precision_score(y_test, y_pred)
print(f"MAP: {map_score}")
# calculate normalized discounted cumulative gain (NDCG)
ndcg = ndcg_score([y_test], [y_pred])
print(f"NDCG (normalized discounted cumulative gain) : {ndcg}")

Accuracy: 80.82%
[[1052  305]
 [ 218 1152]]
ROC AUC score: 88.11%
Precision: 0.790665751544269
Recall: 0.8408759124087591
f1: 0.8149982313406438
MAP: 0.7447931127062244
NDCG (normalized discounted cumulative gain) : 0.9562486282041233


### Implementing LGBM without bagging

In [53]:
# Lightgbm
clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# with probabilities
y_pred_proba = clf.predict_proba(X_test)[:, 1]


#### Model Evaluation


In [54]:
# WE NEED TO ADD THOSE TO THE ENVIRONEMENT 
from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score, ndcg_score
# accuracy
print("Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred) * 100.0))
# confusion matrix
print(confusion_matrix(y_test, y_pred))
# roc_auc_score
print("ROC AUC score: %.2f%%" % (roc_auc_score(y_test, y_pred_proba) * 100.0))
# calculate precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")
# calculate recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")
# calculate F1-score
f1 = f1_score(y_test, y_pred)
print(f"f1: {f1}")
# calculate mean average precision (MAP)
map_score = average_precision_score(y_test, y_pred)
print(f"MAP: {map_score}")
# calculate normalized discounted cumulative gain (NDCG)
ndcg = ndcg_score([y_test], [y_pred])
print(f"NDCG (normalized discounted cumulative gain) : {ndcg}")

Accuracy: 80.60%
[[1050  307]
 [ 222 1148]]
ROC AUC score: 87.93%
Precision: 0.7890034364261168
Recall: 0.8379562043795621
f1: 0.8127433628318583
MAP: 0.7425584656441414
NDCG (normalized discounted cumulative gain) : 0.9558480806076763


## Balance Data Set using Random UnderSampler 

In [55]:
data = df_train.copy()
X = data.drop("interested", axis=1)
y = data["interested"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
print("Original dataset shape {}".format(Counter(y)))
# define the random under sampler with strategy 1 , meaning there will be the same amount of interrested = 1 and 0
rus = RandomUnderSampler(sampling_strategy=1)
# fit and transform the data
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
print("Resampled dataset shape {}".format(Counter(y_resampled)))

Original dataset shape Counter({0: 11267, 1: 4131})
Resampled dataset shape Counter({0: 2767, 1: 2767})


In [56]:
# LightGBM with bagging
clf = lgb.LGBMClassifier(bagging_freq=5, bagging_fraction=0.7)
clf.fit(X_resampled, y_resampled)
y_pred = clf.predict(X_test)

# with probabilities
y_pred_proba = clf.predict_proba(X_test)[:, 1]



In [57]:
# accuracy
print("Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred) * 100.0))
# confusion matrix
print(confusion_matrix(y_test, y_pred))
# roc_auc_score
print("ROC AUC score: %.2f%%" % (roc_auc_score(y_test, y_pred_proba) * 100.0))
# calculate precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")
# calculate recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")
# calculate F1-score
f1 = f1_score(y_test, y_pred)
print(f"f1: {f1}")
# calculate mean average precision (MAP)
map_score = average_precision_score(y_test, y_pred)
print(f"MAP: {map_score}")
# calculate normalized discounted cumulative gain (NDCG)
ndcg = ndcg_score([y_test], [y_pred])
print(f"NDCG (normalized discounted cumulative gain) : {ndcg}")

Accuracy: 63.44%
[[2369 1349]
 [ 509  855]]
ROC AUC score: 69.25%
Precision: 0.3879310344827586
Recall: 0.6268328445747801
f1: 0.4792600896860987
MAP: 0.3433253321829012
NDCG (normalized discounted cumulative gain) : 0.8503260304471508


## Balanced Data Set using weights on LIGHTGBM

In [58]:
data = df_train.copy()
X = data.drop("interested", axis=1)
y = data["interested"]
# Calculate class weights
# split train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)


class_weights = len(y_train) / (2 * np.bincount(y_train))
class_weights={0:class_weights[0],1:class_weights[1]}
clf = lgb.LGBMClassifier(class_weight=class_weights, bagging_freq=5, bagging_fraction=0.7)
clf.fit(X_train, y_train)

# Predict the labels of the test set
y_pred = clf.predict(X_test)
# with probabilities
y_pred_proba = clf.predict_proba(X_test)[:, 1]



In [59]:
# accuracy
print("Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred) * 100.0))
# confusion matrix
print(confusion_matrix(y_test, y_pred))
# roc_auc_score
print("ROC AUC score: %.2f%%" % (roc_auc_score(y_test, y_pred_proba) * 100.0))
# calculate precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")
# calculate recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")
# calculate F1-score
f1 = f1_score(y_test, y_pred)
print(f"f1 : {f1}")
# calculate mean average precision (MAP)
map_score = average_precision_score(y_test, y_pred)
print(f"MAP: {map_score}")
# calculate normalized discounted cumulative gain (NDCG)
ndcg = ndcg_score([y_test], [y_pred])
print(f"NDCG (normalized discounted cumulative gain) : {ndcg}")

Accuracy: 68.28%
[[2711 1007]
 [ 605  759]]
ROC AUC score: 70.35%
Precision: 0.4297848244620612
Recall: 0.5564516129032258
f1 : 0.4849840255591054
MAP: 0.3582020778208628
NDCG (normalized discounted cumulative gain) : 0.85901225098661


#### Data-Set Not Balanced LIGHTGBM

In [60]:
data = df_train.copy()
X = data.drop("interested", axis=1)
y = data["interested"]
# Calculate class weights
# split train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)


class_weights = len(y_train) / (2 * np.bincount(y_train))

clf = lgb.LGBMClassifier(bagging_freq=5, bagging_fraction=0.7)
clf.fit(X_train, y_train)

# Predict the labels of the test set
y_pred = clf.predict(X_test)
# with probabilities
y_pred_proba = clf.predict_proba(X_test)[:, 1]



In [61]:
# accuracy
print("Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred) * 100.0))
# confusion matrix
print(confusion_matrix(y_test, y_pred))
# roc_auc_score
print("ROC AUC score: %.2f%%" % (roc_auc_score(y_test, y_pred_proba) * 100.0))
# calculate precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")
# calculate recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")
# calculate F1-score
f1 = f1_score(y_test, y_pred)
print(f"f1: {f1}")
# calculate mean average precision (MAP)
map_score = average_precision_score(y_test, y_pred)
print(f"MAP: {map_score}")
# calculate normalized discounted cumulative gain (NDCG)
ndcg = ndcg_score([y_test], [y_pred])
print(f"NDCG (normalized discounted cumulative gain) : {ndcg}")

Accuracy: 74.66%
[[3487  231]
 [1057  307]]
ROC AUC score: 71.01%
Precision: 0.570631970260223
Recall: 0.2250733137829912
f1: 0.3228180862250263
MAP: 0.33642300921323914
NDCG (normalized discounted cumulative gain) : 0.8637738190347137
