In [None]:
import os
from collections import defaultdict
import math
import networkx as nx
import random
from tqdm import tqdm
from zipfile import ZipFile
from urllib.request import urlretrieve
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import pandas as pd
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


step 1: loading the datasets

In [None]:
# # loading the datasets and preprocessing
# # loading movies
# movies_path = '/content/drive/MyDrive/movielens/raw/ml-100k/u.item'
# movies = pd.read_csv(movies_path, sep='|', encoding='ISO-8859-1', header=None)
# genre_columns = ['unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
#                  'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
#                  'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
# column_names = ['movie_id', 'title', 'release_date', 'video_release_date',
#                 'IMDb_URL'] + genre_columns
# movies.columns = column_names
# def concatenate_genres(row):
#     genres = [genre for genre in genre_columns if row[genre] == 1]
#     return '|'.join(genres)
# movies['genres'] = movies.apply(concatenate_genres, axis=1)
# movies = movies[['movie_id', 'title', 'genres']]
# movies.loc[:, 'movie_id'] = movies.loc[:, 'movie_id'].apply(lambda x: f'movie_{x}')

# # loading ratings
# ratings_path = '/content/drive/MyDrive/movielens/raw/ml-100k/u.data'
# ratings = pd.read_csv(ratings_path, sep='\t', names=['user_id', 'movie_id', 'rating', 'timestamp'])
# ratings["rating"] = ratings["rating"].apply(lambda x: float(x))
# ratings["movie_id"] = ratings["movie_id"].apply(lambda x: f"movie_{x}")
# ratings = ratings.sort_values(by=['user_id', 'movie_id'])


# print("Movies data shape:", movies.shape)
# print("Ratings data shape:", ratings.shape)

In [None]:
# Locations of the MovieLens 25M CSV files on the mounted Google Drive.
movies_path = '/content/drive/MyDrive/movielens/raw/ml-25m/movies.csv'
ratings_path = '/content/drive/MyDrive/movielens/raw/ml-25m/ratings.csv'

# Load movies and turn the numeric id into a "movie_<id>" string token.
# Vectorised concatenation avoids calling a Python lambda once per row.
movies = pd.read_csv(movies_path)
movies["movieId"] = "movie_" + movies["movieId"].astype(str)

# Load ratings; ratings become floats and movie ids use the same
# "movie_<id>" token format as in `movies`.
ratings = pd.read_csv(ratings_path)
ratings["rating"] = ratings["rating"].astype(float)
ratings["movieId"] = "movie_" + ratings["movieId"].astype(str)

print("Movies data shape:", movies.shape)
print("Ratings data shape:", ratings.shape)

# Rename to the snake_case column names used by the rest of the notebook.
# Reassignment (instead of inplace=True) keeps the cell easy to re-run.
movies = movies.rename(columns={'movieId': 'movie_id'})
ratings = ratings.rename(columns={'movieId': 'movie_id', 'userId': 'user_id'})

Movies data shape: (62423, 3)
Ratings data shape: (25000095, 4)


In [None]:
# Preview the first rows of the movies table (movie_id, title, genres).
movies.head()

Unnamed: 0,movie_id,title,genres
0,movie_1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,movie_2,Jumanji (1995),Adventure|Children|Fantasy
2,movie_3,Grumpier Old Men (1995),Comedy|Romance
3,movie_4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,movie_5,Father of the Bride Part II (1995),Comedy


In [None]:
# Preview the first rows of the ratings table (user_id, movie_id, rating, timestamp).
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,movie_296,5.0,1147880044
1,1,movie_306,3.5,1147868817
2,1,movie_307,5.0,1147868828
3,1,movie_665,5.0,1147878820
4,1,movie_899,3.5,1147868510


step 1.1: defining utility functions

In [None]:
def get_movie_title_by_id(movieId, movies_df=None):
    """Return the title of the first movie whose ``movie_id`` equals ``movieId``.

    Args:
        movieId: movie identifier to look up (e.g. ``"movie_1"``).
        movies_df: optional DataFrame with ``movie_id`` and ``title`` columns.
            When omitted, the notebook-level ``movies`` DataFrame is used.

    Returns:
        The ``title`` value of the first matching row.

    Raises:
        IndexError: if no row has the requested ``movie_id``.
    """
    table = movies if movies_df is None else movies_df
    matches = table.loc[table["movie_id"] == movieId, "title"]
    if matches.empty:
        # Same exception type as the original list-indexing failure, but with
        # a message that says which id was missing.
        raise IndexError(f"No movie found with movie_id {movieId!r}")
    return matches.iloc[0]

def get_movie_id_by_title(title, movies_df=None):
    """Return the ``movie_id`` of the first movie whose ``title`` equals ``title``.

    Args:
        title: exact movie title to look up (e.g. ``"Toy Story (1995)"``).
        movies_df: optional DataFrame with ``movie_id`` and ``title`` columns.
            When omitted, the notebook-level ``movies`` DataFrame is used.

    Returns:
        The ``movie_id`` value of the first matching row.

    Raises:
        IndexError: if no row has the requested ``title``.
    """
    table = movies if movies_df is None else movies_df
    matches = table.loc[table["title"] == title, "movie_id"]
    if matches.empty:
        # Same exception type as the original list-indexing failure, but with
        # a message that says which title was missing.
        raise IndexError(f"No movie found with title {title!r}")
    return matches.iloc[0]

In [None]:
# Sanity check of the two lookup helpers: id -> title and title -> id
# should round-trip for the same movie.
print(get_movie_title_by_id('movie_1'))
print(get_movie_id_by_title('Toy Story (1995)'))

Toy Story (1995)
movie_1


step 2: creating weighted edges between movies

In [None]:
# Only ratings at or above this threshold count as a "liked" movie.
min_rating = 5
# pair_frequency[(a, b)]: number of users who liked both a and b (a <= b).
pair_frequency = defaultdict(int)
# item_frequency[m]: number of users who liked movie m.
item_frequency = defaultdict(int)

rated_movies = ratings[ratings.rating >= min_rating]
# Materialised as a list of (user_id, frame) tuples; reused by a later cell.
movies_grouped_by_users = list(rated_movies.groupby("user_id"))

progress = tqdm(
    movies_grouped_by_users,
    position=0,
    leave=True,
    desc="Computing movie rating frequencies",
)
for _, user_ratings in progress:
    current_movies = list(user_ratings["movie_id"])

    for position, first in enumerate(current_movies):
        item_frequency[first] += 1
        # Pair the movie with every later movie of the same user; the key is
        # ordered so (a, b) and (b, a) share one counter.
        for second in current_movies[position + 1:]:
            key = (first, second) if first <= second else (second, first)
            pair_frequency[key] += 1

Computing movie rating frequencies: 100%|██████████| 155002/155002 [03:27<00:00, 747.88it/s]


In [None]:
# Number of high-rated movies for each user group computed above.
user_movie_counts = []
for _, user_ratings in movies_grouped_by_users:
    user_movie_counts.append(len(user_ratings["movie_id"]))

print(
    "Average number of high-rated movies per user:",
    sum(user_movie_counts) / len(user_movie_counts),
)

Average number of high-rated movies per user: 23.305983148604533


step 3: creating graph using nodes and edges

In [None]:
# Edges whose weight falls below this threshold are not added to the graph.
min_weight = 30
# Log of the total number of per-movie counts; the constant term of the PMI.
D = math.log(sum(item_frequency.values()))
movies_graph = nx.Graph()

for (x, y), xy_frequency in tqdm(
    pair_frequency.items(), position=0, leave=True, desc="Creating the movie graph"
):
    # Pointwise mutual information of the pair, from co-occurrence and
    # individual counts.
    pmi = math.log(xy_frequency) - math.log(item_frequency[x]) - math.log(item_frequency[y]) + D
    # Edge weight scales PMI by how often the pair actually co-occurs.
    weight = pmi * xy_frequency
    if weight < min_weight:
        continue
    movies_graph.add_edge(x, y, weight=weight)

Creating the movie graph:   0%|          | 35838/35323774 [00:00<01:38, 358353.86it/s]

In [None]:
# Draw the movie graph with a force-directed (spring) layout.
plt.figure(figsize=(10, 10))
pos = nx.spring_layout(movies_graph)
nx.draw_networkx_nodes(movies_graph, pos, node_size=100)
nx.draw_networkx_edges(movies_graph, pos, width=1.0, alpha=0.5)
nx.draw_networkx_labels(movies_graph, pos, font_size=5, font_family="sans-serif")
plt.axis("off")
# Save BEFORE show(): once show() returns, the current figure has normally
# been released, so saving afterwards writes an empty image.
plt.savefig('ml-100k-graph.png')
plt.show()

In [None]:
# Report the size of the graph kept after the min_weight edge filter.
print("Total number of graph nodes:", movies_graph.number_of_nodes())
print("Total number of graph edges:", movies_graph.number_of_edges())

In [None]:
# Degree (number of incident edges) of every node in the movie graph.
degrees = [movies_graph.degree[node] for node in movies_graph.nodes]

average_degree = sum(degrees) / len(degrees)
print("Average node degree:", round(average_degree, 2))

step 3.1: create a dictionary from movies to indices

In [None]:
# Index 0 is the "NA" placeholder token; graph nodes follow from index 1.
vocabulary = ["NA", *movies_graph.nodes]
# Reverse mapping: token -> position in `vocabulary`.
vocabulary_lookup = dict(zip(vocabulary, range(len(vocabulary))))

step 4: implementing biased random walk

In [None]:
def next_step(graph, previous, current, p, q):
    """Sample the next node of a biased random walk.

    Each neighbor of ``current`` gets an unnormalised weight derived from the
    edge's ``"weight"`` attribute:

    * divided by ``p`` when the neighbor is ``previous`` (stepping back);
    * unchanged when the neighbor shares an edge with ``previous``;
    * divided by ``q`` otherwise.

    Args:
        graph: object providing ``neighbors(node)``, ``has_edge(u, v)`` and
            ``graph[u][v]["weight"]`` (e.g. a networkx graph).
        previous: node visited before ``current``, or ``None`` on the first step.
        current: node the walk is currently at.
        p: return parameter.
        q: in-out parameter.

    Returns:
        One neighbor of ``current``, returned as the original node object.

    Raises:
        ValueError: if ``current`` has no neighbors.
    """
    neighbors = list(graph.neighbors(current))
    if not neighbors:
        raise ValueError(f"Node {current!r} has no neighbors to step to")

    weights = []
    for neighbor in neighbors:
        edge_weight = graph[current][neighbor]["weight"]
        if neighbor == previous:
            weights.append(edge_weight / p)
        elif graph.has_edge(neighbor, previous):
            weights.append(edge_weight)
        else:
            weights.append(edge_weight / q)

    weight_sum = sum(weights)
    probabilities = [weight / weight_sum for weight in weights]
    # Sample an index rather than the node itself: passing the node list to
    # np.random.choice would convert nodes into a NumPy array (changing their
    # type, and failing outright for tuple nodes).
    chosen_index = np.random.choice(len(neighbors), p=probabilities)
    return neighbors[chosen_index]

def random_walk(graph, num_walks, num_steps, p, q):
    """Generate biased random walks starting from every node of ``graph``.

    For each of ``num_walks`` iterations the node order is shuffled and one
    walk is started per node. A walk grows through ``next_step`` until it
    holds ``num_steps`` nodes, then its tokens are converted to integer
    indices through the notebook-level ``vocabulary_lookup``.

    Args:
        graph: graph exposing ``nodes()`` plus what ``next_step`` requires.
        num_walks: number of passes over all nodes.
        num_steps: number of nodes in each walk.
        p: return parameter forwarded to ``next_step``.
        q: in-out parameter forwarded to ``next_step``.

    Returns:
        List of walks, each a list of vocabulary indices.
    """
    walks = []
    nodes = list(graph.nodes())
    for walk_iteration in range(num_walks):
        random.shuffle(nodes)
        progress = tqdm(
            nodes,
            position=0,
            leave=True,
            desc=f"Random walks iteration {walk_iteration + 1} of {num_walks}",
        )
        for start in progress:
            path = [start]
            while len(path) < num_steps:
                # There is no previous node on the very first step.
                previous = path[-2] if len(path) > 1 else None
                path.append(next_step(graph, previous, path[-1], p, q))
            walks.append([vocabulary_lookup[token] for token in path])
    return walks

In [None]:
# Return parameter: next_step divides the weight of stepping back to the
# previous node by p.
p = 1
# In-out parameter: next_step divides by q the weight of neighbors that are
# not connected to the previous node.
q = 1
# Number of passes over all graph nodes (one walk per node per pass).
num_walks = 10
# Number of nodes in every generated walk.
num_steps = 7
walks = random_walk(movies_graph, num_walks, num_steps, p, q)
print("\nNumber of walks generated:", len(walks))

step 5: generating positive and negative samples

In [None]:
def skipgram(
    sequence,
    vocabulary_size,
    window_size=4,
    negative_samples=1.0,
    shuffle=True,
    categorical=False,
    sampling_table=None,
    seed=None,
):
    """Build skip-gram (target, context) pairs with negative sampling.

    Positive pairs are all ``[wi, wj]`` where ``wj`` lies within
    ``window_size`` positions of ``wi`` in ``sequence``. Falsy tokens (index
    0) are skipped on both sides. Negative pairs reuse the targets of the
    positive pairs and draw the second element uniformly from
    ``[1, vocabulary_size - 1]``.

    Args:
        sequence: list of integer token indices.
        vocabulary_size: size of the vocabulary; upper bound (exclusive) for
            randomly drawn negative contexts.
        window_size: maximum distance between target and context positions.
        negative_samples: number of negative pairs per positive pair
            (``0`` disables negative sampling).
        shuffle: whether to shuffle pairs and labels (with the same permutation).
        categorical: if true, labels are ``[0, 1]`` (positive) / ``[1, 0]``
            (negative) instead of ``1`` / ``0``.
        sampling_table: optional per-token keep probabilities; a target is
            skipped when ``sampling_table[wi] < random.random()``.
        seed: seed for the shuffle; drawn at random when ``None``.

    Returns:
        ``(couples, labels)``: list of ``[target, context]`` pairs and the
        matching list of labels.
    """
    couples = []
    labels = []
    for i, wi in enumerate(sequence):
        if not wi:
            continue
        if sampling_table is not None:
            if sampling_table[wi] < random.random():
                continue

        window_start = max(0, i - window_size)
        window_end = min(len(sequence), i + window_size + 1)
        for j in range(window_start, window_end):
            if j != i:
                wj = sequence[j]
                if not wj:
                    continue
                couples.append([wi, wj])
                if categorical:
                    labels.append([0, 1])
                else:
                    labels.append(1)

    if negative_samples > 0:
        num_negative_samples = int(len(labels) * negative_samples)
        words = [c[0] for c in couples]
        random.shuffle(words)

        couples += [
            [words[i % len(words)], random.randint(1, vocabulary_size - 1)]
            for i in range(num_negative_samples)
        ]
        if categorical:
            labels += [[1, 0]] * num_negative_samples
        else:
            labels += [0] * num_negative_samples

    if shuffle:
        if seed is None:
            # Integer bound: a float such as 10e6 is rejected by
            # random.randint on Python 3.12+.
            seed = random.randint(0, 10**7)
        # Two private generators with the same seed apply the same permutation
        # to both lists without reseeding the process-wide `random` state.
        random.Random(seed).shuffle(couples)
        random.Random(seed).shuffle(labels)

    return couples, labels

In [None]:
def generate_examples(sequences, window_size, num_negative_samples, vocabulary_size):
    """Turn walks into de-duplicated, weighted skip-gram training examples.

    Every sequence is expanded with ``skipgram``. Each resulting pair is
    stored with its smaller index first, pairs of identical indices are
    dropped, and occurrences of the same ``(target, context, label)`` triple
    are counted. Up to ten sampled pairs are printed for inspection.

    Args:
        sequences: iterable of walks (lists of vocabulary indices).
        window_size: context window passed to ``skipgram``.
        num_negative_samples: negatives per positive, passed to ``skipgram``.
        vocabulary_size: vocabulary size passed to ``skipgram``.

    Returns:
        Four NumPy arrays ``(targets, contexts, labels, weights)`` where
        ``weights`` holds the occurrence count of each triple.
    """
    sample_size = 10
    example_weights = defaultdict(int)
    sample_pairs_labels = []

    progress = tqdm(
        sequences,
        position=0,
        leave=True,
        desc="Generating postive and negative examples",
    )
    for sequence in progress:
        pairs, labels = skipgram(
            sequence,
            vocabulary_size=vocabulary_size,
            window_size=window_size,
            negative_samples=num_negative_samples,
        )

        # Collect a few raw pairs (at most sample_size overall) for printing.
        still_needed = sample_size - len(sample_pairs_labels)
        if still_needed > 0:
            picked = random.sample(range(len(pairs)), min(still_needed, len(pairs)))
            sample_pairs_labels.extend((pairs[i], labels[i]) for i in picked)

        for (first, second), label in zip(pairs, labels):
            if first == second:
                continue
            # Smaller index first, so both orientations share one entry.
            low, high = (first, second) if first < second else (second, first)
            example_weights[(low, high, label)] += 1

    for pair, label in sample_pairs_labels:
        label_type = "Negative" if label != 1 else "Positive"
        print(f"Pair: {pair}, Label: {label} ({label_type})")

    entries = list(example_weights.items())
    targets = [target for (target, _, _), _ in entries]
    contexts = [context for (_, context, _), _ in entries]
    labels = [label for (_, _, label), _ in entries]
    weights = [count for _, count in entries]
    return np.array(targets), np.array(contexts), np.array(labels), np.array(weights)
# Negatives per positive pair (forwarded to skipgram's `negative_samples`).
num_negative_samples = 4
# window_size equals the walk length, so every pair of nodes within a walk
# falls inside the context window.
targets, contexts, labels, weights = generate_examples(
    sequences=walks,
    window_size=num_steps,
    num_negative_samples=num_negative_samples,
    vocabulary_size=len(vocabulary)
)

In [None]:
# Shapes of the generated training arrays (all should share one length).
print(f"Targets shape: {targets.shape}")
print(f"Contexts shape: {contexts.shape}")
print(f"Labels shape: {labels.shape}")
print(f"Weights shape: {weights.shape}")
assert len(targets) == len(contexts) == len(labels), "Lengths of targets, contexts, and labels should be equal"

# Show the titles behind the first few positive examples.
# `targets` / `contexts` hold vocabulary indices (not MovieLens ids), so they
# must be mapped back through `vocabulary` before the title lookup.
for i in range(min(10, len(labels))):
    if labels[i] == 1:
        print("target: ", get_movie_title_by_id(vocabulary[targets[i]]))
        print("context: ", get_movie_title_by_id(vocabulary[contexts[i]]))

# Class balance of the examples.
positive_examples = labels.sum()
negative_examples = len(labels) - positive_examples
print(f"Positive examples: {positive_examples}, Negative examples: {negative_examples}")

# Total weight of a handful of (target, context) pairs, summed over labels.
unique_pairs = set(zip(targets, contexts))
for i, pair in enumerate(unique_pairs):
    if i > 10: break
    print(f"Pair: {pair}, Weight: {weights[(targets == pair[0]) & (contexts == pair[1])].sum()}")

step 6: convert data to a dataloader

In [None]:
from torch.utils.data import Dataset, DataLoader

# Number of examples per training batch produced by the DataLoader.
batch_size = 1024

class SkipGramDataset(Dataset):
    """Map-style dataset over parallel target/context/label/weight sequences.

    Item ``idx`` is a dict with the keys ``"target"``, ``"context"``,
    ``"label"`` and ``"weight"``, each holding element ``idx`` of the
    corresponding sequence.
    """

    def __init__(self, targets, contexts, labels, weights):
        self.targets, self.contexts = targets, contexts
        self.labels, self.weights = labels, weights

    def __len__(self):
        # The dataset length is defined by the targets sequence.
        return len(self.targets)

    def __getitem__(self, idx):
        fields = (
            ("target", self.targets),
            ("context", self.contexts),
            ("label", self.labels),
            ("weight", self.weights),
        )
        return {name: column[idx] for name, column in fields}

def create_dataset(targets, contexts, labels, weights, batch_size, num_workers=0):
    """Wrap the example arrays in a shuffling ``DataLoader``.

    Args:
        targets: sequence of target indices.
        contexts: sequence of context indices.
        labels: sequence of labels.
        weights: sequence of example weights.
        batch_size: number of examples per batch.
        num_workers: number of loader worker processes (``0`` loads in the
            calling process). The argument is now honoured; it was previously
            ignored in favour of a hard-coded value of 2.

    Returns:
        A ``DataLoader`` over a ``SkipGramDataset`` that shuffles every epoch
        and drops the last incomplete batch.
    """
    dataset = SkipGramDataset(targets, contexts, labels, weights)
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=num_workers,
    )

# All four example arrays must be parallel before they are batched together.
assert len(targets) == len(contexts) == len(labels) == len(weights), "Input arrays must have the same length"

# NOTE: despite its name, `dataset` holds the DataLoader returned by
# create_dataset; the underlying Dataset is available as `dataset.dataset`.
dataset = create_dataset(
    targets=targets,
    contexts=contexts,
    labels=labels,
    weights=weights,
    batch_size=batch_size,
)

step 7: training the skip-gram model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init

class SkipGramModel(nn.Module):
    """Skip-gram scorer with a single embedding table shared by targets and contexts.

    Args:
        vocabulary_size: number of rows in the embedding table.
        embedding_dim: size of every embedding vector.
        l2_reg: coefficient applied to the squared embedding weights in
            ``get_regularization_loss``.
    """

    def __init__(self, vocabulary_size, embedding_dim, l2_reg=1e-6):
        super().__init__()
        self.embed_item = nn.Embedding(vocabulary_size, embedding_dim)
        self.l2_reg = l2_reg
        # Replace the default embedding initialisation with Kaiming-normal.
        init.kaiming_normal_(self.embed_item.weight, mode='fan_out')

    def forward(self, target, context):
        """Return one logit per pair: the dot product of the two embeddings."""
        return torch.sum(self.embed_item(target) * self.embed_item(context), dim=1)

    def get_regularization_loss(self):
        """Return ``l2_reg`` times the sum of squared embedding parameters."""
        squared_norm = 0.0
        for weight in self.embed_item.parameters():
            squared_norm = squared_norm + weight.pow(2).sum()
        return squared_norm * self.l2_reg

# One embedding row per vocabulary entry (including the "NA" token at index 0).
vocabulary_size = len(vocabulary)
# Dimensionality of every movie embedding.
embedding_dim = 50
model = SkipGramModel(vocabulary_size, embedding_dim)

In [None]:
# NOTE(review): train_and_evaluate (defined in a later cell) creates its own
# optimizer (lr=0.01) and loss function, and the multi-run loop re-creates
# `model`, so these two module-level objects are not used by that training code.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-6)
loss_function = nn.BCEWithLogitsLoss()

In [None]:
# Number of passes over the training data in each training run.
num_epochs = 11
# losses = []

# for epoch in range(num_epochs):
#     total_loss = 0
#     for data in dataset:
#         target, context = data['target'], data['context']
#         labels = data['label'].float()
#         model.zero_grad()
#         logits = model(target, context)
#         reg_loss = model.get_regularization_loss()
#         loss = loss_function(logits, labels) + reg_loss
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item() * target.size(0)
#     avg_loss = total_loss / len(dataset.dataset)
#     print(f"Epoch {epoch}: Loss = {avg_loss}")
#     losses.append(avg_loss)

def train_and_evaluate(model, dataset, num_epochs):
    """Train ``model`` on the batches of ``dataset`` and return the final epoch's mean loss.

    Every batch is scored with binary cross-entropy on the logits plus the
    model's own L2 regularisation term, and optimised with Adam (lr=0.01,
    weight_decay=1e-6). The per-epoch average loss is printed.

    Args:
        model: module called as ``model(target, context)`` that returns logits
            and exposes ``get_regularization_loss()``.
        dataset: iterable of batches; each batch is a mapping with tensor
            entries ``"target"``, ``"context"`` and ``"label"``. Any other
            entry of the batch (such as ``"weight"``) is not used.
        num_epochs: number of passes over ``dataset``.

    Returns:
        float: average per-example loss of the last epoch.

    Raises:
        ValueError: if an epoch yields no examples (for instance when every
            batch is dropped because the data is smaller than one batch).
        IndexError: if ``num_epochs`` is 0, since no epoch loss exists.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-6)
    loss_function = nn.BCEWithLogitsLoss()
    losses = []
    for epoch in range(num_epochs):
        total_loss = 0
        samples_seen = 0
        for data in dataset:
            target, context = data['target'], data['context']
            labels = data['label'].float()
            model.zero_grad()
            logits = model(target, context)
            reg_loss = model.get_regularization_loss()
            loss = loss_function(logits, labels) + reg_loss
            loss.backward()
            optimizer.step()
            batch_count = target.size(0)
            total_loss += loss.item() * batch_count
            samples_seen += batch_count
        if samples_seen == 0:
            raise ValueError("No training batches were produced; cannot compute an average loss")
        # Average over the examples actually processed: with drop_last=True
        # the loader can skip examples, so len(dataset.dataset) would
        # understate the mean.
        avg_loss = total_loss / samples_seen
        print(f"Epoch {epoch}: Loss = {avg_loss}")
        losses.append(avg_loss)

    return losses[-1]  # Return the final average loss

# Experiment configuration: number of independent training runs.
num_runs = 10
final_losses = []

# Each run trains a freshly initialised model; after the loop `model` refers
# to the model of the last run.
for run_number in range(num_runs):
    model = SkipGramModel(vocabulary_size, embedding_dim)
    final_losses.append(train_and_evaluate(model, dataset, num_epochs))

# Mean and standard error (sample standard deviation / sqrt(n)) over runs.
loss_mean = np.mean(final_losses)
sample_std = np.std(final_losses, ddof=1)
loss_std_error = sample_std / np.sqrt(len(final_losses))

# Tabulate per-run losses followed by the two summary rows.
run_numbers = range(1, num_runs + 1)
results_df = pd.DataFrame({"Run": run_numbers, "Final Loss": final_losses})
results_df.loc['Mean'] = [np.nan, loss_mean]
results_df.loc['Std. Error'] = [np.nan, loss_std_error]

print(results_df)

# Plot every run's final loss with the shared standard error as error bar.
plt.errorbar(run_numbers, final_losses, yerr=[loss_std_error] * num_runs, fmt='o')
plt.title("Final Loss Across Runs")
plt.xlabel("Run")
plt.ylabel("Final Loss")
plt.xticks(run_numbers)  # one tick per run
plt.grid(True)
plt.show()

In [None]:
# NOTE(review): in the visible notebook `losses` is only assigned inside
# train_and_evaluate (as a local) and in the commented-out training loop, so
# on a fresh "Restart & Run All" this cell raises NameError. Expose the
# per-epoch losses from training before relying on this plot.
plt.plot(losses)
plt.ylabel("Loss")
plt.xlabel("Epoch")
plt.title("Training Loss Over Epochs")
plt.show()

In [None]:
# Copy the learned embedding table out of the model as a NumPy array.
# `model` here is whichever model was created last (the final run of the
# multi-run loop); row i corresponds to vocabulary[i].
movie_embeddings = model.embed_item.weight.detach().numpy()
print("Embeddings shape:", movie_embeddings.shape)

In [None]:
# Titles to find similar movies for; each must match a `title` in `movies`
# exactly and the movie must be a node of the graph (i.e. be in `vocabulary`).
query_movies = [
    "Toy Story (1995)",
    "GoldenEye (1995)",
    "Four Rooms (1995)",
]

In [None]:
# Look up each query title: title -> movie id -> vocabulary index -> embedding
# row, and stack the rows into one 2-D array.
query_embeddings = np.array(
    [
        movie_embeddings[vocabulary_lookup[get_movie_id_by_title(movie_title)]]
        for movie_title in query_movies
    ]
)

In [None]:
def normalize_embeddings_torch(embeddings, eps=1e-12):
    """Scale every row of ``embeddings`` to unit L2 norm.

    Args:
        embeddings: 2-D tensor whose rows are embedding vectors.
        eps: lower bound applied to each row norm before dividing, so an
            all-zero row yields zeros instead of NaN.

    Returns:
        Tensor of the same shape with rows divided by their L2 norm.
    """
    norms = torch.norm(embeddings, p=2, dim=1, keepdim=True)
    return embeddings / torch.clamp(norms, min=eps)

# Move both embedding arrays into float tensors.
query_embeddings_tensor = torch.tensor(query_embeddings).float()
movie_embeddings_tensor = torch.tensor(movie_embeddings).float()

# Unit-normalise the rows so that a dot product equals cosine similarity.
normalized_query_embeddings_torch = normalize_embeddings_torch(query_embeddings_tensor)
normalized_movie_embeddings_torch = normalize_embeddings_torch(movie_embeddings_tensor)

# One row of similarities per query, one column per vocabulary entry.
similarities_torch = normalized_query_embeddings_torch @ normalized_movie_embeddings_torch.T

# Vocabulary indices of the k most similar entries for each query.
k = 5
_, indices_torch = similarities_torch.topk(k)

indices_list = indices_torch.tolist()

In [None]:
top_k = 5

# For every query, print its title, an underline of the same length, and the
# titles of its nearest neighbours (index 0 is the "NA" token and is skipped).
for title, similar_tokens in zip(query_movies, indices_list):
    print(title)
    print("-" * len(title))
    for token in similar_tokens:
        if token != 0:
            print(f"- {get_movie_title_by_id(vocabulary[token])}")
    print()


In [None]:
# !pip freeze > requirements.txt