In [None]:
#TASK1
import numpy as np
import pandas as pd
from scipy.stats import mode
import time


def euclid_dist(dataset, centers):
    """Return the Euclidean distance from every sample to every center.

    Entry ``[i][j]`` of the returned array is the L2 norm of
    ``dataset[i] - centers[j]``.
    """
    return np.array(
        [
            [np.linalg.norm(point - centroid) for centroid in centers]
            for point in dataset
        ]
    )


def cos_dist(dataset, centers):
    """Return the cosine distance (1 - cosine similarity) for every pair.

    Args:
        dataset: 2-D array-like with one sample per row.
        centers: 2-D array-like with one center per row.

    Returns:
        Float array whose entry ``[i, j]`` is the cosine distance between
        ``dataset[i]`` and ``centers[j]``.

    A zero-norm sample or center has no direction, so its similarity to
    anything is defined as 0 (distance 1) rather than the NaN that 0/0
    would give. NaN must be avoided because ``np.argmin`` returns the index
    of the first NaN, which would pull every sample into that cluster.
    """
    samples = np.asarray(dataset, dtype=float)
    ctrs = np.asarray(centers, dtype=float)

    dots = samples @ ctrs.T
    # Product of the two norms for every (sample, center) pair.
    denom = np.outer(
        np.linalg.norm(samples, axis=1), np.linalg.norm(ctrs, axis=1)
    )
    # Where the denominator is not positive the pre-filled 0 is kept.
    sim = np.divide(dots, denom, out=np.zeros_like(dots), where=denom > 0)
    return 1 - sim


def jaccard_dist(dataset, centers):
    """Return the generalized Jaccard distance for every sample/center pair.

    The similarity of a pair is sum(element-wise min) / sum(element-wise max);
    when the max-sum is not positive the similarity is taken as 0. The
    distance is one minus that similarity.
    """

    def _pair_distance(point, centroid):
        # Distance for a single (sample, center) pair.
        overlap = np.sum(np.minimum(point, centroid))
        union = np.sum(np.maximum(point, centroid))
        similarity = overlap / union if union > 0 else 0
        return 1 - similarity

    return np.array(
        [
            [_pair_distance(point, centroid) for centroid in centers]
            for point in dataset
        ]
    )


def assign_labels(dataset, centers, distance_fn):
    """Return, for each sample, the index of its nearest center.

    ``distance_fn(dataset, centers)`` must produce a samples-by-centers
    distance matrix; the column holding the row minimum is the label.
    """
    dist_matrix = np.asarray(distance_fn(dataset, centers))
    return dist_matrix.argmin(axis=1)


def recompute_centers(dataset, assignments, num_clusters):
    """Return the mean of the samples assigned to each cluster.

    A cluster with no assigned samples gets an all-zero center with one
    entry per column of ``dataset``.
    """

    def _centroid_of(cluster_idx):
        # Mean of the cluster's members, or zeros when it has none.
        members = dataset[assignments == cluster_idx]
        if len(members) == 0:
            return np.zeros(dataset.shape[1])
        return np.mean(members, axis=0)

    return np.array([_centroid_of(k) for k in range(num_clusters)])


def compute_total_sse(dataset, assignments, centers):
    total = 0
    for idx in range(len(centers)):
        members = dataset[assignments == idx]
        for p in members:
            diff = p - centers[idx]
            sq = diff ** 2
            sse = np.sum(sq)
            total += sse
    return total


def cluster_accuracy(true_labels, pred_assignments, num_clusters):
    """Return clustering accuracy under majority-vote labelling.

    Each cluster is labelled with the most frequent true label among its
    members (the smallest label wins a tie); accuracy is the fraction of
    samples whose true label equals their cluster's label.

    Args:
        true_labels: 1-D array-like of ground-truth labels (any dtype that
            ``np.unique`` can sort, including strings).
        pred_assignments: 1-D array-like of cluster indices, same length.
        num_clusters: Number of cluster indices to consider (0..n-1).

    Returns:
        Accuracy in [0, 1]; 0.0 when there are no samples at all.
    """
    true_labels = np.asarray(true_labels)
    pred_assignments = np.asarray(pred_assignments)
    if true_labels.size == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    correct_count = 0
    for cluster_idx in range(num_clusters):
        cluster_labels = true_labels[pred_assignments == cluster_idx]
        if cluster_labels.size == 0:
            continue
        # The largest count is the number of members carrying the
        # majority label, i.e. the number counted as correct.
        _, counts = np.unique(cluster_labels, return_counts=True)
        correct_count += counts.max()
    return correct_count / len(true_labels)


def run_kmeans(dataset, num_clusters, distance_fn, max_iter=500):
    """Cluster ``dataset`` with k-means under an arbitrary distance function.

    Initial centers are ``num_clusters`` distinct rows of ``dataset`` drawn
    with ``np.random.choice``. Each iteration assigns every sample to its
    nearest center, recomputes the centers and measures the SSE. The loop
    stops when the centers no longer move (``np.allclose``), when the SSE
    increases, or after ``max_iter`` iterations.

    Args:
        dataset: 2-D array with one sample per row.
        num_clusters: Number of clusters.
        distance_fn: Callable ``(dataset, centers) -> distance matrix``.
        max_iter: Maximum number of iterations; must be at least 1.

    Returns:
        Tuple ``(assignments, centers, sse, iterations_run)``.

    Raises:
        ValueError: If ``max_iter`` is smaller than 1.
    """
    if max_iter < 1:
        # With zero iterations there would be nothing to return.
        raise ValueError("max_iter must be at least 1")

    centers = dataset[np.random.choice(len(dataset), num_clusters, replace=False)]
    prev_sse = float("inf")
    prev_assignments = None

    for iteration in range(max_iter):
        assignments = assign_labels(dataset, centers, distance_fn)
        new_centers = recompute_centers(dataset, assignments, num_clusters)
        sse = compute_total_sse(dataset, assignments, new_centers)

        if np.allclose(new_centers, centers):
            print(f"no change in centroid position {iteration+1}")
            break
        if sse > prev_sse:
            print(f"SSE increased {iteration+1}")
            # The update made things worse and new_centers is discarded, so
            # report the assignments and SSE that belong to the centers
            # being returned. prev_sse starts at infinity, so this branch
            # cannot run on the first iteration and prev_assignments is set.
            assignments, sse = prev_assignments, prev_sse
            break

        centers = new_centers
        prev_sse = sse
        prev_assignments = assignments

    return assignments, centers, sse, iteration + 1


if __name__ == "__main__":
    # Samples and ground-truth labels come from headerless CSV files.
    dataset = pd.read_csv("data.csv", header=None).values
    labels_true = pd.read_csv("label.csv", header=None).values.flatten()

    # One cluster per distinct ground-truth label.
    num_clusters = len(np.unique(labels_true))

    # Each run is timed with time.perf_counter(): it is monotonic, so the
    # measured interval cannot be distorted by system clock adjustments.
    print("\nEuclidean K-Means")
    start = time.perf_counter()
    assign_euc, centers_euc, sse_euc, iters_euc = run_kmeans(dataset, num_clusters, euclid_dist)
    time_euc = time.perf_counter() - start
    acc_euc = cluster_accuracy(labels_true, assign_euc, num_clusters)

    print("\nCosine K-Means")
    start = time.perf_counter()
    assign_cos, centers_cos, sse_cos, iters_cos = run_kmeans(dataset, num_clusters, cos_dist)
    time_cos = time.perf_counter() - start
    acc_cos = cluster_accuracy(labels_true, assign_cos, num_clusters)

    print("\nJaccard K-Means")
    start = time.perf_counter()
    assign_jac, centers_jac, sse_jac, iters_jac = run_kmeans(dataset, num_clusters, jaccard_dist)
    time_jac = time.perf_counter() - start
    acc_jac = cluster_accuracy(labels_true, assign_jac, num_clusters)

    # Summary of SSE, accuracy, iteration count and run time per distance.
    print("\nK-Means using Euclidean:")
    print(f"  SSE: {sse_euc:.4f}")
    print(f"  Accuracy: {acc_euc:.4f}")
    print(f"  Iterations: {iters_euc}")
    print(f"  Time: {time_euc:.4f} seconds\n")

    print("K-Means using Cosine:")
    print(f"  SSE: {sse_cos:.4f}")
    print(f"  Accuracy: {acc_cos:.4f}")
    print(f"  Iterations: {iters_cos}")
    print(f"  Time: {time_cos:.4f} seconds\n")

    print("K-Means using Jaccard:")
    print(f"  SSE: {sse_jac:.4f}")
    print(f"  Accuracy: {acc_jac:.4f}")
    print(f"  Iterations: {iters_jac}")
    print(f"  Time: {time_jac:.4f} seconds\n")


In [None]:
conda install -c conda-forge scikit-surprise


In [None]:
#TASK2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import KNNBasic
from surprise.model_selection import cross_validate


# Read the ratings table from CSV into a DataFrame.
ratings_file = "ratings_small.csv"
ratings_data = pd.read_csv(ratings_file)


# Wrap the (userId, movieId, rating) columns in a Surprise dataset, with the
# reader configured for a 0.5-5.0 rating scale.
reader = Reader(rating_scale=(0.5, 5.0))
dataset = Dataset.load_from_df(ratings_data[['userId', 'movieId', 'rating']], reader)


def evaluate_model_with_cv(model, dataset, number_of_folds):
    """Cross-validate ``model`` on ``dataset`` and return the raw results.

    RMSE and MAE are measured over ``number_of_folds`` folds with verbose
    output turned off; the value returned by ``cross_validate`` is passed
    back unchanged.
    """
    return cross_validate(
        model,
        dataset,
        measures=['RMSE', 'MAE'],
        cv=number_of_folds,
        verbose=False,
    )


# SVD with the bias terms disabled, used here as the PMF model
# (judging by the variable name); evaluated with 5-fold cross-validation.
model_pmf = SVD(biased=False)
results_pmf = evaluate_model_with_cv(model_pmf, dataset, 5)


# User-based collaborative filtering; no similarity name is given, so the
# library's default similarity measure is used. 5-fold cross-validation.
sim_options_user = {'user_based': True}
model_user_cf = KNNBasic(sim_options=sim_options_user)
results_user_cf = evaluate_model_with_cv(model_user_cf, dataset, 5)


# Item-based collaborative filtering, otherwise configured the same way.
sim_options_item = {'user_based': False}
model_item_cf = KNNBasic(sim_options=sim_options_item)
results_item_cf = evaluate_model_with_cv(model_item_cf, dataset, 5)


def get_average_result(results):
    """Return ``(mean RMSE, mean MAE)`` over the folds in ``results``.

    ``results`` is a mapping holding the per-fold scores under the keys
    ``'test_rmse'`` and ``'test_mae'``.
    """
    rmse_mean, mae_mean = (
        np.mean(results[key]) for key in ('test_rmse', 'test_mae')
    )
    return rmse_mean, mae_mean


# Fold-averaged RMSE / MAE of the three baseline models.
pmf_rmse, pmf_mae = get_average_result(results_pmf)
user_rmse, user_mae = get_average_result(results_user_cf)
item_rmse, item_mae = get_average_result(results_item_cf)


# One line per model: "<rmse> <mae>".
print(pmf_rmse, pmf_mae)
print(user_rmse, user_mae)
print(item_rmse, item_mae)


# Compare the similarity measures for user-based and item-based CF.
similarity_metrics = ['cosine', 'msd', 'pearson']


user_cf_rmse_values = []
item_cf_rmse_values = []


for similarity in similarity_metrics:
    # User-based CF with the current similarity, 3-fold cross-validation.
    sim_options_user = {'name': similarity, 'user_based': True}
    model_user = KNNBasic(sim_options=sim_options_user)
    result_user = evaluate_model_with_cv(model_user, dataset, 3)
    mean_rmse_user = np.mean(result_user['test_rmse'])
    user_cf_rmse_values.append(mean_rmse_user)

    # Item-based CF with the same similarity and fold count.
    sim_options_item = {'name': similarity, 'user_based': False}
    model_item = KNNBasic(sim_options=sim_options_item)
    result_item = evaluate_model_with_cv(model_item, dataset, 3)
    mean_rmse_item = np.mean(result_item['test_rmse'])
    item_cf_rmse_values.append(mean_rmse_item)


# Plot mean RMSE per similarity measure; the figure is saved before being
# shown.
plt.figure(figsize=(8, 5))
plt.plot(similarity_metrics, user_cf_rmse_values, marker='o', label='User-CF')
plt.plot(similarity_metrics, item_cf_rmse_values, marker='o', label='Item-CF')
plt.xlabel('Similarity Metric')
plt.ylabel('RMSE')
plt.title('Impact of Similarity Metric on RMSE')
plt.legend()
plt.grid(True)
plt.savefig("similarity_metric_impact.png")
plt.show()


# Neighbourhood sizes (k) to compare for the KNN models.
neighbor_cnt = [10, 20, 30, 40, 50]


user_rmse_by_k = []
item_rmse_by_k = []


for neighbor_count in neighbor_cnt:
    # User-based CF, cosine similarity, 5-fold cross-validation.
    sim_options_user = {'name': 'cosine', 'user_based': True}
    model_user = KNNBasic(k=neighbor_count, sim_options=sim_options_user)
    result_user = evaluate_model_with_cv(model_user, dataset, 5)
    mean_rmse_user = np.mean(result_user['test_rmse'])
    user_rmse_by_k.append(mean_rmse_user)

    # Item-based CF with the same similarity and fold count.
    sim_options_item = {'name': 'cosine', 'user_based': False}
    model_item = KNNBasic(k=neighbor_count, sim_options=sim_options_item)
    result_item = evaluate_model_with_cv(model_item, dataset, 5)
    mean_rmse_item = np.mean(result_item['test_rmse'])
    item_rmse_by_k.append(mean_rmse_item)


# Plot mean RMSE against k; the figure is saved before being shown.
plt.figure(figsize=(8, 5))
plt.plot(neighbor_cnt, user_rmse_by_k, marker='o', label='User-CF')
plt.plot(neighbor_cnt, item_rmse_by_k, marker='o', label='Item-CF')
plt.xlabel('Number of Neighbors (k)')
plt.ylabel('RMSE')
plt.title('Impact of k on RMSE')
plt.legend()
plt.grid(True)
plt.savefig("neighbor_impact.png")
plt.show()


# Pick the k with the lowest mean RMSE for each model. min() returns the
# first minimal pair, so on a tie the earliest k in neighbor_cnt wins.
best_rmse_user_cf, best_k_user_cf = min(
    zip(user_rmse_by_k, neighbor_cnt), key=lambda pair: pair[0]
)
best_rmse_item_cf, best_k_item_cf = min(
    zip(item_rmse_by_k, neighbor_cnt), key=lambda pair: pair[0]
)


print("Best k for User-based CF:", best_k_user_cf)
print("Best k for Item-based CF:", best_k_item_cf)
