# Implementation of Everywhere library to create an events recommendation engine


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import os
import everywhere as ev
import lightgbm as lgb # Mac users require cmake & libomp to import lightgbm
from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score, ndcg_score

from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import ClusterCentroids
from collections import Counter
from dotenv import load_dotenv
from everywhere.datasets.load_data import (
    get_users_data,
    get_events_data,
    get_train_data,
    get_user_friends_data,
    get_event_attendees_data,
)
from everywhere.datasets.feature_extractor import (
    get_friends_attendee_nums,
    get_event_attendee_nums,
)
from sklearn.metrics import (
    accuracy_score, 
    confusion_matrix, 
    roc_auc_score
)
from sklearn.ensemble import RandomForestClassifier
import random

In [3]:
print(f"Everywhere version: {ev.__version__}")


Everywhere version: 0.0.0


## Data prep


### Load Datasets


In [4]:
load_dotenv()  # load environment variables


True

In [5]:
df_users = get_users_data()
print(df_users.shape)
df_users.head()


Found users in local 🎉
(38209, 7)


Unnamed: 0,user_id,locale,birthyear,gender,joinedAt,location,timezone
0,3197468391,id_ID,1993,male,2012-10-02T06:40:55.524Z,Medan Indonesia,480.0
1,3537982273,id_ID,1992,male,2012-09-29T18:03:12.111Z,Medan Indonesia,420.0
2,823183725,en_US,1975,male,2012-10-06T03:14:07.149Z,Stratford Ontario,-240.0
3,1872223848,en_US,1991,female,2012-11-04T08:59:43.783Z,Tehran Iran,210.0
4,3429017717,id_ID,1995,female,2012-09-10T16:06:53.132Z,,420.0


In [6]:
df_user_friends = get_user_friends_data()
print(df_user_friends.shape)
df_user_friends.head()


Found user_friends in local 🎉
(38202, 2)


Unnamed: 0,user,friends
0,3197468391,1346449342 3873244116 4226080662 1222907620 54...
1,3537982273,1491560444 395798035 2036380346 899375619 3534...
2,823183725,1484954627 1950387873 1652977611 4185960823 42...
3,1872223848,83361640 723814682 557944478 1724049724 253059...
4,3429017717,4253303705 2130310957 1838389374 3928735761 71...


In [7]:
df_event_attendees = get_event_attendees_data()
print(df_event_attendees.shape)
df_event_attendees.head()


Found event_attendees in local 🎉
(24144, 5)


Unnamed: 0,event,yes,maybe,invited,no
0,1159822043,1975964455 252302513 4226086795 3805886383 142...,2733420590 517546982 1350834692 532087573 5831...,1723091036 3795873583 4109144917 3560622906 31...,3575574655 1077296663
1,686467261,2394228942 2686116898 1056558062 3792942231 41...,1498184352 645689144 3770076778 331335845 4239...,1788073374 733302094 1830571649 676508092 7081...,
2,1186208412,,3320380166 3810793697,1379121209 440668682,1728988561 2950720854
3,2621578336,,,,
4,855842686,2406118796 3550897984 294255260 1125817077 109...,2671721559 1761448345 2356975806 2666669465 10...,1518670705 880919237 2326414227 2673818347 332...,3500235232


In [8]:
df_train = get_train_data()
print(df_train.shape)
df_train.head()


Found train in local 🎉
(15398, 6)


Unnamed: 0,user,event,invited,timestamp,interested,not_interested
0,3044012,1918771225,0,2012-10-02 15:53:05.754000+00:00,0,0
1,3044012,1502284248,0,2012-10-02 15:53:05.754000+00:00,0,0
2,3044012,2529072432,0,2012-10-02 15:53:05.754000+00:00,1,0
3,3044012,3072478280,0,2012-10-02 15:53:05.754000+00:00,0,0
4,3044012,1390707377,0,2012-10-02 15:53:05.754000+00:00,0,0


In [9]:
df_events = get_events_data()
print(df_events.shape)
df_events.head()


Found events in local 🎉
(3137972, 110)


Unnamed: 0,event_id,user_id,start_time,city,state,zip,country,lat,lng,c_1,...,c_92,c_93,c_94,c_95,c_96,c_97,c_98,c_99,c_100,c_other
0,684921758,3647864012,2012-10-31T00:00:00.001Z,,,,,,,2,...,0,1,0,0,0,0,0,0,0,9
1,244999119,3476440521,2012-11-03T00:00:00.001Z,,,,,,,2,...,0,0,0,0,0,0,0,0,0,7
2,3928440935,517514445,2012-11-05T00:00:00.001Z,,,,,,,0,...,0,0,0,0,0,0,0,0,0,12
3,2582345152,781585781,2012-10-30T00:00:00.001Z,,,,,,,1,...,0,0,0,0,0,0,0,0,0,8
4,1051165850,1016098580,2012-09-27T00:00:00.001Z,,,,,,,1,...,0,0,0,0,0,0,0,0,0,9


### Data Engineering


In [10]:
df_train = get_friends_attendee_nums(df_train, df_user_friends, df_event_attendees)
df_train


Unnamed: 0,user,event,invited,timestamp,interested,not_interested,friends,friends_attending,friends_not_attending,friends_maybe_attending,friends_invited
0,3044012,1918771225,0,2012-10-02 15:53:05.754000+00:00,0,0,862,0,3,0,3
1,3044012,1502284248,0,2012-10-02 15:53:05.754000+00:00,0,0,862,0,0,0,1
2,3044012,2529072432,0,2012-10-02 15:53:05.754000+00:00,1,0,862,0,0,0,2
3,3044012,3072478280,0,2012-10-02 15:53:05.754000+00:00,0,0,862,1,0,0,0
4,3044012,1390707377,0,2012-10-02 15:53:05.754000+00:00,0,0,862,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...
15393,4293103086,2750873665,0,2012-12-08 03:59:43.169000+00:00,0,0,480,0,0,1,0
15394,4293103086,4084655790,0,2012-12-08 03:59:43.169000+00:00,0,0,480,1,0,0,1
15395,4293103086,598708806,0,2012-12-08 03:59:43.169000+00:00,0,0,480,1,1,1,3
15396,4293103086,604179853,0,2012-12-08 03:59:43.169000+00:00,0,0,480,0,1,1,0


In [11]:
df_train = get_event_attendee_nums(df_train, df_event_attendees)
print(df_train.shape)
df_train.head()


(15398, 19)


Unnamed: 0,user,event,invited_x,timestamp,interested,not_interested,friends,friends_attending,friends_not_attending,friends_maybe_attending,friends_invited,yes,maybe,invited_y,no,users_yes,users_no,users_maybe,users_invited_count
0,3044012,1918771225,0,2012-10-02 15:53:05.754000+00:00,0,0,862,0,3,0,3,671846165 3400950561 3622121962 2914089126 427...,3777634824 707808605,243449741 2705495661 2053207729 2228554219 229...,3648238803 3898796217 2892558795 2522528816 65...,8,25,2,23
1,1906220044,1918771225,0,2012-10-01 05:08:48.533000+00:00,0,0,930,1,0,0,0,671846165 3400950561 3622121962 2914089126 427...,3777634824 707808605,243449741 2705495661 2053207729 2228554219 229...,3648238803 3898796217 2892558795 2522528816 65...,8,25,2,23
2,3044012,1502284248,0,2012-10-02 15:53:05.754000+00:00,0,0,862,0,0,0,1,2897626992 3265030005 593212332 1566080870 171...,4221735584 821549713 1573381076 939778463 4110...,2823348035 1461073321 1603391812 1511769431 35...,3928943237 1419223904,10,2,6,122
3,1302145719,1502284248,0,2012-09-30 13:31:33.569000+00:00,0,0,2265,0,0,0,6,2897626992 3265030005 593212332 1566080870 171...,4221735584 821549713 1573381076 939778463 4110...,2823348035 1461073321 1603391812 1511769431 35...,3928943237 1419223904,10,2,6,122
4,3194014105,1502284248,0,2012-10-01 17:20:14.536000+00:00,0,0,619,0,0,0,0,2897626992 3265030005 593212332 1566080870 171...,4221735584 821549713 1573381076 939778463 4110...,2823348035 1461073321 1603391812 1511769431 35...,3928943237 1419223904,10,2,6,122


In [12]:
df_events_clean = df_events.drop(
    columns=["user_id", "start_time", "city", "state", "zip", "country", "lat", "lng"]
)
# Merge with events
df_train = pd.merge(
    df_train, df_events_clean, how="inner", left_on="event", right_on="event_id"
)
df_train = df_train.drop(columns=["invited_y", "timestamp", "yes", "maybe", "no"])
df_train


Unnamed: 0,user,event,invited_x,interested,not_interested,friends,friends_attending,friends_not_attending,friends_maybe_attending,friends_invited,...,c_92,c_93,c_94,c_95,c_96,c_97,c_98,c_99,c_100,c_other
0,3044012,1918771225,0,0,0,862,0,3,0,3,...,0,0,0,0,0,0,0,0,0,2
1,1906220044,1918771225,0,0,0,930,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,3044012,1502284248,0,0,0,862,0,0,0,1,...,0,0,0,0,0,0,0,0,0,24
3,1302145719,1502284248,0,0,0,2265,0,0,0,6,...,0,0,0,0,0,0,0,0,0,24
4,3194014105,1502284248,0,0,0,619,0,0,0,0,...,0,0,0,0,0,0,0,0,0,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15393,4293103086,2750873665,0,0,0,480,0,0,1,0,...,0,0,0,0,0,0,0,0,0,80
15394,4293103086,4084655790,0,0,0,480,1,0,0,1,...,0,0,0,0,0,0,0,0,0,80
15395,4293103086,598708806,0,0,0,480,1,1,1,3,...,0,0,0,0,0,0,0,0,0,60
15396,4293103086,604179853,0,0,0,480,0,1,1,0,...,0,0,0,0,0,0,0,0,0,24


# First Aproach With Balancing Datasets

## Balance dataset - Centroid Aproach


In [13]:
data = df_train.copy()
X = data.drop("interested", axis=1)
y = data["interested"]
print("Original dataset shape {}".format(Counter(y)))
cc = ClusterCentroids(random_state=42)
X_res, y_res = cc.fit_resample(X, y)
print("Resampled dataset shape {}".format(Counter(y_res)))


Original dataset shape Counter({0: 11267, 1: 4131})




Resampled dataset shape Counter({0: 4131, 1: 4131})


In [14]:
# shuffle the data
final = pd.merge(X_res, y_res, left_index=True, right_index=True)
final = final.iloc[np.random.permutation(final.index)].reset_index(drop=True)
final


Unnamed: 0,user,event,invited_x,not_interested,friends,friends_attending,friends_not_attending,friends_maybe_attending,friends_invited,users_yes,...,c_93,c_94,c_95,c_96,c_97,c_98,c_99,c_100,c_other,interested
0,2706298619,1106103765,0,0,194,0,0,0,1,80,...,0,0,0,0,0,0,0,1,3,1
1,515371126,3169908764,0,0,1262,0,0,0,0,16,...,0,0,0,0,0,0,0,0,70,0
2,3108480638,207321472,0,0,485,0,0,0,1,41,...,0,0,0,0,0,0,0,0,26,0
3,528289771,2713053749,0,0,1764,0,0,1,107,41,...,0,0,0,0,0,0,0,0,38,1
4,311065206,268233790,0,0,712,0,0,0,10,99,...,0,0,0,0,0,0,0,0,30,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8257,4011226528,518307713,0,0,1195,0,0,0,11,129,...,0,0,0,0,0,0,0,0,89,0
8258,2501052021,1845894044,0,0,2462,0,0,0,1,64,...,0,0,0,0,0,0,0,0,45,1
8259,426298744,3309338938,0,0,687,0,0,0,10,47,...,0,0,0,0,0,0,0,0,73,0
8260,3037662592,1603128544,0,0,173,0,0,0,0,6,...,0,1,0,0,0,0,0,0,10,0


## Model & Evaluations


In [15]:
# split train and test
X = final.drop("interested", axis=1)
y = final["interested"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [16]:
# random forest
rf = RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# with probabilities
y_pred_proba = rf.predict_proba(X_test)[:, 1]


In [17]:
# accuracy
print("Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred) * 100.0))
# confusion matrix
print(confusion_matrix(y_test, y_pred))
# roc_auc_score
print("ROC AUC score: %.2f%%" % (roc_auc_score(y_test, y_pred_proba) * 100.0))

precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")
# calculate recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")
# calculate F1-score
f1 = f1_score(y_test, y_pred)
print(f"f1: {f1}")
# calculate mean average precision (MAP)
map_score = average_precision_score(y_test, y_pred)
print(f"MAP: {map_score}")
# calculate normalized discounted cumulative gain (NDCG)
ndcg = ndcg_score([y_test], [y_pred])
print(f"NDCG (normalized discounted cumulative gain) : {ndcg}")


Accuracy: 74.26%
[[1092  277]
 [ 425  933]]
ROC AUC score: 83.47%
Precision: 0.7710743801652893
Recall: 0.687039764359352
f1: 0.7266355140186916
MAP: 0.6856076786774497
NDCG (normalized discounted cumulative gain) : 0.9477744992609971


### Implementing LGBM with bagging

In [18]:
# LightGBM with bagging
clf = lgb.LGBMClassifier(bagging_freq=5, bagging_fraction=0.7)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# with probabilities
y_pred_proba = clf.predict_proba(X_test)[:, 1]



#### Model Evaluation


In [19]:
# accuracy
print("Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred) * 100.0))
# confusion matrix
print(confusion_matrix(y_test, y_pred))
# roc_auc_score
print("ROC AUC score: %.2f%%" % (roc_auc_score(y_test, y_pred_proba) * 100.0))
# calculate precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")
# calculate recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")
# calculate F1-score
f1 = f1_score(y_test, y_pred)
print(f"f1: {f1}")
# calculate mean average precision (MAP)
map_score = average_precision_score(y_test, y_pred)
print(f"MAP: {map_score}")
# calculate normalized discounted cumulative gain (NDCG)
ndcg = ndcg_score([y_test], [y_pred])
print(f"NDCG (normalized discounted cumulative gain) : {ndcg}")

Accuracy: 81.19%
[[1087  282]
 [ 231 1127]]
ROC AUC score: 88.85%
Precision: 0.7998580553584103
Recall: 0.8298969072164949
f1: 0.8146006505240332
MAP: 0.7485081972012293
NDCG (normalized discounted cumulative gain) : 0.9575181452091592


### Implementing LGBM without bagging

In [20]:
# Lightgbm
clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# with probabilities
y_pred_proba = clf.predict_proba(X_test)[:, 1]


#### Model Evaluation


In [21]:
# WE NEED TO ADD THOSE TO THE ENVIRONEMENT 
# accuracy
print("Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred) * 100.0))
# confusion matrix
print(confusion_matrix(y_test, y_pred))
# roc_auc_score
print("ROC AUC score: %.2f%%" % (roc_auc_score(y_test, y_pred_proba) * 100.0))
# calculate precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")
# calculate recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")
# calculate F1-score
f1 = f1_score(y_test, y_pred)
print(f"f1: {f1}")
# calculate mean average precision (MAP)
map_score = average_precision_score(y_test, y_pred)
print(f"MAP: {map_score}")
# calculate normalized discounted cumulative gain (NDCG)
ndcg = ndcg_score([y_test], [y_pred])
print(f"NDCG (normalized discounted cumulative gain) : {ndcg}")

Accuracy: 80.71%
[[1065  304]
 [ 222 1136]]
ROC AUC score: 89.08%
Precision: 0.7888888888888889
Recall: 0.8365243004418262
f1: 0.8120085775553967
MAP: 0.7413328667181888
NDCG (normalized discounted cumulative gain) : 0.955635926872522


### Try Without Balancing

In [22]:
final= df_train.copy()

In [23]:
# split train and test
X = final.drop("interested", axis=1)
y = final["interested"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

### Implementing LGBM with bagging

In [24]:
# LightGBM with bagging
clf = lgb.LGBMClassifier(bagging_freq=5, bagging_fraction=0.7)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# with probabilities
y_pred_proba = clf.predict_proba(X_test)[:, 1]



In [25]:
# accuracy
print("Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred) * 100.0))
# confusion matrix
print(confusion_matrix(y_test, y_pred))
# roc_auc_score
print("ROC AUC score: %.2f%%" % (roc_auc_score(y_test, y_pred_proba) * 100.0))
# calculate precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")
# calculate recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")
# calculate F1-score
f1 = f1_score(y_test, y_pred)
print(f"f1: {f1}")
# calculate mean average precision (MAP)
map_score = average_precision_score(y_test, y_pred)
print(f"MAP: {map_score}")
# calculate normalized discounted cumulative gain (NDCG)
ndcg = ndcg_score([y_test], [y_pred])
print(f"NDCG (normalized discounted cumulative gain) : {ndcg}")

Accuracy: 74.66%
[[3487  231]
 [1057  307]]
ROC AUC score: 71.01%
Precision: 0.570631970260223
Recall: 0.2250733137829912
f1: 0.3228180862250263
MAP: 0.33642300921323914
NDCG (normalized discounted cumulative gain) : 0.8637738190347137


### Implementing LGBM without bagging

In [26]:
# Lightgbm
clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# with probabilities
y_pred_proba = clf.predict_proba(X_test)[:, 1]


In [27]:
# WE NEED TO ADD THOSE TO THE ENVIRONEMENT 
# accuracy
print("Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred) * 100.0))
# confusion matrix
print(confusion_matrix(y_test, y_pred))
# roc_auc_score
print("ROC AUC score: %.2f%%" % (roc_auc_score(y_test, y_pred_proba) * 100.0))
# calculate precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")
# calculate recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")
# calculate F1-score
f1 = f1_score(y_test, y_pred)
print(f"f1: {f1}")
# calculate mean average precision (MAP)
map_score = average_precision_score(y_test, y_pred)
print(f"MAP: {map_score}")
# calculate normalized discounted cumulative gain (NDCG)
ndcg = ndcg_score([y_test], [y_pred])
print(f"NDCG (normalized discounted cumulative gain) : {ndcg}")

Accuracy: 74.56%
[[3529  189]
 [1104  260]]
ROC AUC score: 71.35%
Precision: 0.579064587973274
Recall: 0.1906158357771261
f1: 0.2868174296745726
MAP: 0.3276161885518618
NDCG (normalized discounted cumulative gain) : 0.8609187614667433


## Balance Data Set using Random UnderSampler 

In [28]:
data = df_train.copy()
X = data.drop("interested", axis=1)
y = data["interested"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
print("Original dataset shape {}".format(Counter(y)))
# define the random under sampler with strategy 1 , meaning there will be the same amount of interrested = 1 and 0
rus = RandomUnderSampler(sampling_strategy=1)
# fit and transform the data
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
print("Resampled dataset shape {}".format(Counter(y_resampled)))

Original dataset shape Counter({0: 11267, 1: 4131})
Resampled dataset shape Counter({0: 2767, 1: 2767})


In [29]:
# LightGBM with bagging
clf = lgb.LGBMClassifier(bagging_freq=5, bagging_fraction=0.7)
clf.fit(X_resampled, y_resampled)
y_pred = clf.predict(X_test)

# with probabilities
y_pred_proba = clf.predict_proba(X_test)[:, 1]



In [30]:
# accuracy
print("Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred) * 100.0))
# confusion matrix
print(confusion_matrix(y_test, y_pred))
# roc_auc_score
print("ROC AUC score: %.2f%%" % (roc_auc_score(y_test, y_pred_proba) * 100.0))
# calculate precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")
# calculate recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")
# calculate F1-score
f1 = f1_score(y_test, y_pred)
print(f"f1: {f1}")
# calculate mean average precision (MAP)
map_score = average_precision_score(y_test, y_pred)
print(f"MAP: {map_score}")
# calculate normalized discounted cumulative gain (NDCG)
ndcg = ndcg_score([y_test], [y_pred])
print(f"NDCG (normalized discounted cumulative gain) : {ndcg}")

Accuracy: 63.20%
[[2335 1383]
 [ 487  877]]
ROC AUC score: 69.96%
Precision: 0.38805309734513277
Recall: 0.6429618768328446
f1: 0.48399558498896256
MAP: 0.3453317617900573
NDCG (normalized discounted cumulative gain) : 0.8507451495617941


## Using weights on LIGHTGBM

In [31]:
data = df_train.copy()
X = data.drop("interested", axis=1)
y = data["interested"]
# Calculate class weights
# split train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)


class_weights = len(y_train) / (2 * np.bincount(y_train))
class_weights={0:class_weights[0],1:class_weights[1]}
clf = lgb.LGBMClassifier(class_weight=class_weights, bagging_freq=5, bagging_fraction=0.7)
clf.fit(X_train, y_train)

# Predict the labels of the test set
y_pred = clf.predict(X_test)
# with probabilities
y_pred_proba = clf.predict_proba(X_test)[:, 1]



In [32]:
# accuracy
print("Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred) * 100.0))
# confusion matrix
print(confusion_matrix(y_test, y_pred))
# roc_auc_score
print("ROC AUC score: %.2f%%" % (roc_auc_score(y_test, y_pred_proba) * 100.0))
# calculate precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")
# calculate recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")
# calculate F1-score
f1 = f1_score(y_test, y_pred)
print(f"f1 : {f1}")
# calculate mean average precision (MAP)
map_score = average_precision_score(y_test, y_pred)
print(f"MAP: {map_score}")
# calculate normalized discounted cumulative gain (NDCG)
ndcg = ndcg_score([y_test], [y_pred])
print(f"NDCG (normalized discounted cumulative gain) : {ndcg}")

Accuracy: 68.28%
[[2711 1007]
 [ 605  759]]
ROC AUC score: 70.35%
Precision: 0.4297848244620612
Recall: 0.5564516129032258
f1 : 0.4849840255591054
MAP: 0.3582020778208628
NDCG (normalized discounted cumulative gain) : 0.85901225098661


### hypertuning

#### So far LightGBM has given the best results with centroid sampling approach, we will grid search the most optimal sample number for undersampler and see if it gives a better result

In [33]:
data = df_train.copy()
X = data.drop("interested", axis=1)
y = data["interested"]
# Calculate class weights
# split train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [35]:
#loading the hp tuning library
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline

model=lgb.LGBMClassifier(random_state=42)
undersampler=RandomUnderSampler(random_state=42)
#Setting hyperparameter values for undersampling and oversampling
us_strategy = [1, {0:5000}, {0:1000}
]
#instantiating the hp tuning class
hp_pipeline = Pipeline([
    ('undersampler', undersampler),
    ('model', model)
])

grid_params = {
    'undersampler__sampling_strategy': us_strategy,
}

#Fitting the hp tuner and obtaining the best parameters and model
clf = GridSearchCV(estimator = hp_pipeline, param_grid = grid_params)
clf.fit(X_train, y_train)

print(clf.best_params_)

{'undersampler__sampling_strategy': {0: 5000}}


In [38]:
data = df_train.copy()
X = data.drop("interested", axis=1)
y = data["interested"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
print("Original dataset shape {}".format(Counter(y)))
# define the random under sampler with strategy 1 , meaning there will be the same amount of interrested = 1 and 0
rus = RandomUnderSampler(sampling_strategy={0:5000})
# fit and transform the data
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
print("Resampled dataset shape {}".format(Counter(y_resampled)))

Original dataset shape Counter({0: 11267, 1: 4131})
Resampled dataset shape Counter({0: 5000, 1: 2767})


In [39]:
# accuracy
print("Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred) * 100.0))
# confusion matrix
print(confusion_matrix(y_test, y_pred))
# roc_auc_score
print("ROC AUC score: %.2f%%" % (roc_auc_score(y_test, y_pred_proba) * 100.0))
# calculate precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")
# calculate recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")
# calculate F1-score
f1 = f1_score(y_test, y_pred)
print(f"f1: {f1}")
# calculate mean average precision (MAP)
map_score = average_precision_score(y_test, y_pred)
print(f"MAP: {map_score}")
# calculate normalized discounted cumulative gain (NDCG)
ndcg = ndcg_score([y_test], [y_pred])
print(f"NDCG (normalized discounted cumulative gain) : {ndcg}")

Accuracy: 68.28%
[[2711 1007]
 [ 605  759]]
ROC AUC score: 70.35%
Precision: 0.4297848244620612
Recall: 0.5564516129032258
f1: 0.4849840255591054
MAP: 0.3582020778208628
NDCG (normalized discounted cumulative gain) : 0.85901225098661


## The best pipeline is data balancing with centroid approach and lightGBM without Bagging

## Output the recommendations

Use (test.csv + X_test) as the future events

In [40]:
test= pd.read_csv("./local_data/test.csv")
print(test.shape)
test.head()

(10237, 4)


Unnamed: 0,user,event,invited,timestamp
0,1776192,2877501688,0,2012-11-30 11:39:01.230000+00:00
1,1776192,3025444328,0,2012-11-30 11:39:01.230000+00:00
2,1776192,4078218285,0,2012-11-30 11:39:01.230000+00:00
3,1776192,1024025121,0,2012-11-30 11:39:01.230000+00:00
4,1776192,2972428928,0,2012-11-30 11:39:21.985000+00:00


In [41]:
futureEvents = X_test.copy()
# add the predictions to the dataframe as probabilities
futureEvents["predictions"] = y_pred_proba
# take the info we are interested in
futureEvents= futureEvents[["user", "event" ,"predictions"]]
#a list of the users that are in that df
userEventList= futureEvents.user.unique()
user = random.choice(userEventList)

In [42]:
#change the users in test to the ones in futureEvents
test.user = test.user.apply(lambda x: x if x in userEventList else random.choice(userEventList))
test.head()

Unnamed: 0,user,event,invited,timestamp
0,1195601618,2877501688,0,2012-11-30 11:39:01.230000+00:00
1,327414596,3025444328,0,2012-11-30 11:39:01.230000+00:00
2,1421630185,4078218285,0,2012-11-30 11:39:01.230000+00:00
3,4089111119,1024025121,0,2012-11-30 11:39:01.230000+00:00
4,2813753172,2972428928,0,2012-11-30 11:39:21.985000+00:00


In [43]:
df_FE = get_friends_attendee_nums(test, df_user_friends, df_event_attendees)
df_FE

Unnamed: 0,user,event,invited,timestamp,friends,friends_attending,friends_not_attending,friends_maybe_attending,friends_invited
0,1195601618,2877501688,0,2012-11-30 11:39:01.230000+00:00,650,0,0,0,0
1,1195601618,673098017,0,2012-11-01 01:33:59.369000+00:00,650,0,0,0,0
2,1195601618,675888033,0,2012-10-27 05:01:14.586000+00:00,650,0,0,0,0
3,1195601618,1716470668,0,2012-11-10 05:31:34.519000+00:00,650,0,0,0,0
4,1195601618,2190369261,0,2012-11-03 13:51:54.449000+00:00,650,0,0,0,0
...,...,...,...,...,...,...,...,...,...
10232,2995967526,2130383503,0,2012-08-13 09:44:28.365000+00:00,93,0,0,0,0
10233,4152805025,3738629021,0,2012-10-15 04:51:27.189000+00:00,39,0,0,0,0
10234,2321074684,2529072432,0,2012-10-23 21:53:15.318000+00:00,125,0,0,0,0
10235,2321074684,2703431985,0,2012-10-29 14:43:04.467000+00:00,125,0,0,0,0


In [44]:
df_FE = get_event_attendee_nums(df_FE, df_event_attendees)
print(df_FE.shape)
df_FE.head()

(10237, 17)


Unnamed: 0,user,event,invited_x,timestamp,friends,friends_attending,friends_not_attending,friends_maybe_attending,friends_invited,yes,maybe,invited_y,no,users_yes,users_no,users_maybe,users_invited_count
0,1195601618,2877501688,0,2012-11-30 11:39:01.230000+00:00,650,0,0,0,0,2532906069 4229871354 1047349682 2489056284 33...,357306026 3230138404 39482604 4061350910 42551...,2808011777 2590224671 3645689275 3559244521 14...,,61,0,19,27
1,1195601618,673098017,0,2012-11-01 01:33:59.369000+00:00,650,0,0,0,0,1020197790 1826497136 708692646 2250861436 211...,783802597 744005121 841254096 258412521 854412...,3728089314 3685208101 955891546 2571609627 222...,1351441753 4085484929 4259259269 3692465869 12...,20,7,6,454
2,770526360,673098017,0,2012-10-31 12:02:18.124000+00:00,552,0,0,0,2,1020197790 1826497136 708692646 2250861436 211...,783802597 744005121 841254096 258412521 854412...,3728089314 3685208101 955891546 2571609627 222...,1351441753 4085484929 4259259269 3692465869 12...,20,7,6,454
3,3308173668,673098017,0,2012-10-30 13:46:39.739000+00:00,610,0,0,0,0,1020197790 1826497136 708692646 2250861436 211...,783802597 744005121 841254096 258412521 854412...,3728089314 3685208101 955891546 2571609627 222...,1351441753 4085484929 4259259269 3692465869 12...,20,7,6,454
4,1227446685,673098017,0,2012-11-01 05:17:24.192000+00:00,53,0,0,0,0,1020197790 1826497136 708692646 2250861436 211...,783802597 744005121 841254096 258412521 854412...,3728089314 3685208101 955891546 2571609627 222...,1351441753 4085484929 4259259269 3692465869 12...,20,7,6,454


In [45]:
df_events_clean = df_events.drop(
    columns=["user_id", "start_time", "city", "state", "zip", "country", "lat", "lng"]
)
# Merge with events
df_FE = pd.merge(
    df_FE, df_events_clean, how="inner", left_on="event", right_on="event_id"
)
df_FE = df_FE.drop(columns=["invited_y", "timestamp", "yes", "maybe", "no"])
df_FE

Unnamed: 0,user,event,invited_x,friends,friends_attending,friends_not_attending,friends_maybe_attending,friends_invited,users_yes,users_no,...,c_92,c_93,c_94,c_95,c_96,c_97,c_98,c_99,c_100,c_other
0,1195601618,2877501688,0,650,0,0,0,0,61,0,...,0,0,0,0,0,0,0,0,0,15
1,1195601618,673098017,0,650,0,0,0,0,20,7,...,0,0,0,0,0,0,0,0,0,30
2,770526360,673098017,0,552,0,0,0,2,20,7,...,0,0,0,0,0,0,0,0,0,30
3,3308173668,673098017,0,610,0,0,0,0,20,7,...,0,0,0,0,0,0,0,0,0,30
4,1227446685,673098017,0,53,0,0,0,0,20,7,...,0,0,0,0,0,0,0,0,0,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10232,1952293169,3766183282,0,1391,0,0,0,0,13,2,...,0,0,0,0,0,0,0,0,0,11
10233,2995967526,1097502776,0,93,0,0,0,0,1,4,...,0,0,0,0,0,0,0,0,0,45
10234,2995967526,2130383503,0,93,0,0,0,0,1,4,...,0,0,0,0,0,0,1,0,0,125
10235,4152805025,3738629021,0,39,0,0,0,0,24,2,...,0,0,0,0,0,0,0,0,0,9


In [46]:
#concat X_test with df_FE
all_future_events = pd.concat([X_test, df_FE], axis=0)
print(all_future_events.shape)

(15319, 115)


In [48]:
#predict the probabilities of all_future_events
clf=clf = lgb.LGBMClassifier()
y_pred_proba = clf.predict_proba(all_future_events)[:, 1]
all_future_events["predictions"] = y_pred_proba
all_future_events.head()

ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
futureEvents= all_future_events[[ "user", "event" ,"predictions"]]
futureEvents.head()

In [None]:
#take a random user and sort the events by the predictions
user = random.choice(userEventList)
futureEvents[futureEvents.user == user].sort_values(by="predictions", ascending=False)

In [None]:
#change the format of the df
df= futureEvents.copy()
df
# new column which will be a tuple of the event and the probability
df["event_prob"] = df.apply(lambda x: (x["event"], x["predictions"]), axis=1)
# group by user and take the list of the events and probabilities
df = df.groupby("user")["event_prob"].apply(list).reset_index()
df