In [4]:
from datetime import datetime
from collections import Counter

# Data management
import pandas as pd

# Data preprocessing and trasformation (ETL)
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, \
    FunctionTransformer, Binarizer, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml, load_iris, make_moons, make_classification


# Math and Stat modules
import numpy as np
from scipy.stats import sem
from random import choice

# Supervised Learning
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, \
    KFold, StratifiedKFold, RepeatedKFold, ShuffleSplit, StratifiedShuffleSplit, learning_curve, validation_curve
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.base import BaseEstimator
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, \
    precision_recall_curve, roc_curve, accuracy_score
from sklearn.dummy import DummyClassifier
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

# Unsupervised Learning

# Visualization
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline

# Network
import networkx as nx

# 1 - Link Prediction

**Definition**

Given a social network denoted as $G=(V,E)$, where a timestamp is associated with each edge $e$, the subgraph $G[t,t']$ is defined as the set of edges whose timestamps fall between $t$ and $t'$. 

Given $G[t_{0}, t'_{0}]$, the task of link prediction involves returning a set of edges not present in $G[t_{0}, t'_{0}]$ but that could be present in $G[t_{1}, t'_{1}]$, where $t'_{0}<t_{1}$

In [None]:
beers_temporal = nx.Graph()

# 2 - Recommendation System

In [5]:
from node2vec import Node2Vec

In [6]:
beer_df = pd.read_csv("beer_reviews.csv")

# change the dtype to datetime
beer_df['review_time'] = pd.to_datetime(beer_df['review_time'], unit="s")

# rename some columns in a more handy way
beer_df = beer_df.rename(columns={'beer_beerid': 'beer_id', 'review_profilename':'user'})

# remove the null values
beer_df = beer_df.dropna()

# sub-sample keeping 2012 only
sub_beer_df = beer_df[beer_df['review_time'].dt.year == 2012].copy()
sub_beer_df.shape

(9343, 13)

In [7]:
# keep only profiles that reviewed more than one beer
users_reviews_count = sub_beer_df['user'].value_counts()
kept_profiles_only = users_reviews_count[users_reviews_count>1].index

sub_beer_df = sub_beer_df[sub_beer_df['user'].isin(kept_profiles_only)].copy()
sub_beer_df.shape

(8646, 13)

In [8]:
# Create an empty graph
user_beer_graph = nx.Graph()

# Add nodes for users and beers
user_beer_graph.add_nodes_from(sub_beer_df['user'], bipartite='user')
user_beer_graph.add_nodes_from(sub_beer_df['beer_name'], bipartite='beer')

# Add edges representing user-beer interactions
for _, row in sub_beer_df.iterrows():
    user_beer_graph.add_edge(row['user'], row['beer_name']) #, rating=row['beer_style'])  # Assuming 'rating' is an attribute in your dataset

nx.bipartite.is_bipartite(user_beer_graph)

True

In [17]:
# generate random walks
node2vec = Node2Vec(user_beer_graph, dimensions=16, 
                    walk_length=12, num_walks=50,
                    p=2, q=1, workers=2)

model = node2vec.fit(window=10, min_count=2)

def recommend(user, num_recommendations=10):
    # Get the embeddings of the input user
    user_embedding = model.wv[str(user)]

    # Find beers that are most similar to the input user
    similar_beers = model.wv.most_similar([user_embedding], topn=num_recommendations+1)

    # Filter out the input user from the recommended list
    similar_beers = [(beer, similarity) for beer, similarity in similar_beers if beer != user]    

    # ensure that the suitable recommended beers haven't been reviewed yet
    # already_reviewed = set(sub_beer_df[sub_beer_df['user']==user]['beer_name'])
    # similar_beers = [(beer, similarity) for beer, similarity in similar_beers if beer not in already_reviewed]

    return similar_beers[:num_recommendations]

recommended_beers = recommend('Knapp85', num_recommendations=10)
print(recommended_beers)

Computing transition probabilities: 100%|██████████| 5254/5254 [00:00<00:00, 10063.16it/s]


[('Hitachino Nest Red Rice Ale', 0.9970855712890625), ('La Rullés Estivale (Bière De Gaume)', 0.9961832165718079), ('Belgian White', 0.9945496320724487), ('Heineken Dark Lager', 0.9943629503250122), ('John Labatt Classic', 0.994253933429718), ('Sinebrychoff Porter', 0.9938508868217468), ('Julius Echter Hefe-Weissbier Hell', 0.9937678575515747), ("He'Brew Rejewvenator (Year Of The Date) 2009", 0.9934571981430054), ('Coffee Stout (Brewmaster Series)', 0.9933536052703857), ("D' Inn'Staade", 0.9932727217674255)]
