In [9]:
import numpy as np
import pandas as pd
import skfuzzy as fuzz
from surprise import reader
from surprise.prediction_algorithms import SlopeOne
from surprise import Dataset
import matplotlib.pyplot as plt

In [10]:
df = pd.read_csv("data/ml100k.csv")

In [11]:
df.drop("timestamp", axis=1, inplace=True)

In [12]:
max_rating=5

In [13]:
rating = np.arange(1, max_rating + 1, 1)

diff = (max_rating - 1) / 2

rating_low = fuzz.membership.gaussmf(rating, 1, diff)
rating_mid = fuzz.membership.gaussmf(rating, 1 + diff, diff)
rating_high = fuzz.membership.gaussmf(rating, 1 + 2 * diff, diff)

In [14]:
df["r_low"] = df["rating"].apply(
    lambda x: fuzz.interp_membership(rating, rating_low, x)
)
df["r_mid"] = df["rating"].apply(
    lambda x: fuzz.interp_membership(rating, rating_mid, x)
)
df["r_high"] = df["rating"].apply(
    lambda x: fuzz.interp_membership(rating, rating_high, x)
)


In [15]:
df_user = df[["user", "r_low", "r_mid", "r_high"]].groupby("user").mean()
df_item = df[["item", "r_low", "r_mid", "r_high"]].groupby("item").mean()

In [64]:
def get_estimated_ratings(row):
    user = row["user"]
    item = row["item"]
    current_rating = row["rating"]

    r_low = row["r_low"]
    r_mid = row["r_mid"]
    r_high = row["r_high"]

    user_low = df_user.loc[user]["r_low"]
    user_mid = df_user.loc[user]["r_mid"]
    user_high = df_user.loc[user]["r_high"]
    # print(user_low, user_mid, user_high)
    item_low = df_item.loc[item]["r_low"]
    item_mid = df_item.loc[item]["r_mid"]
    item_high = df_item.loc[item]["r_high"]
    # print(item_low, item_mid, item_high)

    estimated_low = (user_low * item_low)
    estimated_mid = (user_mid * item_mid)
    estimated_high = (user_high * item_high)
    total = estimated_low + estimated_mid + estimated_high
    estimated_low = estimated_low / total
    estimated_mid = estimated_mid / total
    estimated_high = estimated_high / total
    
    # print(estimated_low, estimated_mid, estimated_high)
    r_v = np.array([r_low, r_mid, r_high])
    e_v = np.array([estimated_low, estimated_mid, estimated_high])
    dist = np.linalg.norm(r_v - e_v)
    s = 1/(1+dist)
    # print(s)
    if s < 0.8:
        e_low = np.fmin(estimated_low,rating_low)
        e_mid = np.fmin(estimated_mid,rating_mid)
        e_high = np.fmin(estimated_high,rating_high)

        aggregated = np.fmax(e_low, np.fmax(e_mid, e_high))
        estimated = fuzz.defuzz(rating, aggregated, "mom")
        return estimated
    else:
        return current_rating

In [60]:
df.iloc[250]

user         301.000000
item         401.000000
rating         4.000000
r_low          0.324652
r_mid          0.882497
r_high         0.882497
estimated      3.000000
Name: 250, dtype: float64

In [61]:
get_estimated_ratings(df.iloc[250])

3.0

In [65]:
df["estimated"] = df.apply(get_estimated_ratings, axis=1)

In [66]:
df.estimated.describe()

count    100000.000000
mean          3.128390
std           0.347688
min           1.500000
25%           3.000000
50%           3.000000
75%           3.000000
max           4.500000
Name: estimated, dtype: float64

In [69]:
round = df.estimated.apply(round)

In [70]:
round.value_counts()

3    86358
4    13254
2      388
Name: estimated, dtype: int64

In [44]:
df.rating.value_counts()

4    34174
3    27145
5    21201
2    11370
1     6110
Name: rating, dtype: int64

In [45]:
# df[["user","item","rating","estimated"]].to_csv("ml100k_corrected_my.csv",index=False)

In [46]:
from surprise import Dataset
from surprise import SlopeOne, KNNBasic
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse, mae
from surprise.reader import Reader


In [47]:
def get_accuracy(algo, trainset, testset):
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse_error = rmse(predictions)
    mae_error = mae(predictions)
    print(f"For {algo} RMSE is {rmse_error} and MAE is {mae_error}")
    return algo, rmse_error, mae_error

In [50]:
def run_algos(trainset,testset):
    algorithms = [
        SlopeOne(),
        # KNNBasic(k=60, sim_options={"name": "pearson", "user_based": True}),
        # KNNBasic(k=60, sim_options={"name": "pearson", "user_based": False}),
    ]

    results = []
    for algo in algorithms:
        results.append(get_accuracy(algo, trainset, testset))
    return results

In [51]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[["user", "item", "rating"]], reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
print("Training set size: ", trainset.n_ratings)
print("Test set size: ", len(df) - trainset.n_ratings)


Training set size:  80000
Test set size:  20000


In [52]:
results_base = run_algos(trainset,testset)

RMSE: 0.9423
MAE:  0.7414
For <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x000001FAD1C49E50> RMSE is 0.9422541799329658 and MAE is 0.7413586954857572


In [74]:
df = df.assign(estimated=2)

In [75]:
data = Dataset.load_from_df(df[["user", "item", "estimated"]], reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
print("Training set size: ", trainset.n_ratings)
print("Test set size: ", len(df) - trainset.n_ratings)

Training set size:  80000
Test set size:  20000


In [76]:
results_estimated = run_algos(trainset,testset)

RMSE: 0.0000
MAE:  0.0000
For <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x000001FAD8364F40> RMSE is 0.0 and MAE is 0.0


In [77]:
wang = pd.read_csv("ml100k_corrected_wang.csv")

In [78]:
wang.corrected_rating.value_counts()

3.0    49869
4.0    45025
5.0     3585
2.0     1226
1.0      295
Name: corrected_rating, dtype: int64

In [79]:
wang.rating.value_counts()

4    34174
3    27145
5    21201
2    11370
1     6110
Name: rating, dtype: int64

In [80]:
ts = data.build_full_trainset()

In [81]:
ts.n_ratings

100000

In [82]:
tts = ts.build_testset()

In [84]:
len(tts)

100000