In [20]:
import numpy as np
import pandas as pd
import skfuzzy as fuzz
from surprise import reader
from surprise.prediction_algorithms import SlopeOne
from surprise import Dataset
import matplotlib.pyplot
from sklearn.model_selection import train_test_split, StratifiedKFold
from statistics import NormalDist

In [21]:
# Load the training ratings. Later cells read the `user`, `item` and `rating` columns.
df = pd.read_csv("train.csv")

In [22]:
# Preview the first rows of the freshly loaded training frame.
df.head()

Unnamed: 0,user,item,rating
0,877,381,4.0
1,877,451,4.0
2,877,557,4.0
3,877,692,4.0
4,877,56,5.0


In [23]:
# (rows, columns) of the training frame.
df.shape

(80000, 3)

In [25]:
# df.drop("timestamp",axis=1,inplace=True)

In [26]:
# Two unit-variance normal distributions anchor the fuzzy sets used by fuzzify():
# the "low" one is centred on rating 2, the "high" one on rating 4.
rating_low, rating_high = (NormalDist(mu=centre, sigma=1) for centre in (2, 4))

In [27]:
def fuzzify(rating):
    """Turn a crisp rating into (low, mid, high) membership degrees.

    `low` is the upper-tail probability of `rating_low` at `rating`, `high` is
    the CDF of `rating_high` at `rating`, and `mid` is whatever remains so that
    the three values add up to 1.

    Parameters
    ----------
    rating : a single numeric rating.

    Returns
    -------
    tuple
        (low, mid, high), in that order.
    """
    high = rating_high.cdf(rating)
    low = 1 - rating_low.cdf(rating)
    return low, 1 - low - high, high

In [28]:

# Map every rating to its (low, mid, high) membership tuple returned by fuzzify().
profile = df.rating.apply(fuzzify)

In [29]:
# Three independent, empty accumulators: one per fuzzy membership component.
r_low, r_mid, r_high = [], [], []

In [30]:
# Split the (low, mid, high) tuples in `profile` into one list per component.
# The lists are rebuilt from scratch here instead of being appended to, so that
# re-running this cell cannot grow them past len(df) and break the column
# assignments below with a length mismatch.
r_low = [prof[0] for prof in profile]
r_mid = [prof[1] for prof in profile]
r_high = [prof[2] for prof in profile]

# Attach the three membership degrees to the ratings frame (positional assignment).
df["r_low"] = r_low
df["r_mid"] = r_mid
df["r_high"] = r_high

In [31]:
# Check that the r_low / r_mid / r_high membership columns were added.
df.head()

Unnamed: 0,user,item,rating,r_low,r_mid,r_high
0,877,381,4.0,0.02275,0.47725,0.5
1,877,451,4.0,0.02275,0.47725,0.5
2,877,557,4.0,0.02275,0.47725,0.5
3,877,692,4.0,0.02275,0.47725,0.5
4,877,56,5.0,0.00135,0.157305,0.841345


In [32]:
# Per-user fuzzy profile: mean of each membership component over the user's ratings.
# The result is indexed by `user`.
df_user = df.groupby("user")[["r_low", "r_mid", "r_high"]].mean()

In [33]:
# Per-item fuzzy profile: mean of each membership component over the item's ratings.
# The result is indexed by `item`.
df_item = df.groupby("item")[["r_low", "r_mid", "r_high"]].mean()

In [34]:
# Observed rating range, and the two values one (rounded) third of the range
# in from each end. Example: ratings spanning 1..5 give a = 2 and b = 4.
# get_estimated_ratings() returns a, (a+b)/2 or b as the corrected rating.
rmin, rmax = df.rating.min(), df.rating.max()
third = round((rmax - rmin) / 3)
a, b = rmin + third, rmax - third

In [50]:
def get_estimated_ratings(row, delta=0.6):
    """Return the rating to use for one row: the original or a corrected value.

    The user's and the item's mean fuzzy profiles (from the module-level
    `df_user` / `df_item`) are multiplied component-wise and normalised to sum
    to 1, giving an expected (low, mid, high) profile for this user/item pair.
    The row's own fuzzy profile is compared with it via Euclidean distance and
    converted to a similarity ``s = 1 / (1 + distance)``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide "user", "item", "rating", "r_low", "r_mid" and "r_high".
    delta : float, default 0.6
        Similarity threshold. Rows with ``s < delta`` are replaced, all others
        keep their rating. The default matches the previously hard-coded value.

    Returns
    -------
    The original ``row["rating"]`` when ``s >= delta`` (or when the expected
    profile cannot be normalised). Otherwise the module-level `a`, `(a + b) / 2`
    or `b`, depending on whether the low, mid or high component of the expected
    profile is the largest. Ties go to the earlier component.

    Raises
    ------
    KeyError
        If the row's user or item is missing from `df_user` / `df_item`.
    """
    rating = row["rating"]
    user_row = df_user.loc[row["user"]]
    item_row = df_item.loc[row["item"]]

    # Un-normalised expected profile: product of user and item mean memberships.
    n_low = user_row["r_low"] * item_row["r_low"]
    n_mid = user_row["r_mid"] * item_row["r_mid"]
    n_high = user_row["r_high"] * item_row["r_high"]
    total = n_low + n_mid + n_high
    if total == 0:
        # Nothing to normalise by; keep the rating rather than computing 0/0.
        return rating

    r_v = np.array([row["r_low"], row["r_mid"], row["r_high"]])
    n_v = np.array([n_low, n_mid, n_high]) / total

    dist = np.linalg.norm(r_v - n_v)
    s = 1 / (1 + dist)

    if s < delta:
        # np.argmax returns the first maximum, which preserves the
        # low -> mid -> high precedence on ties.
        return (a, (a + b) / 2, b)[int(np.argmax(n_v))]
    return rating

In [51]:
# Row-wise pass over the frame: each rating is kept or replaced by get_estimated_ratings().
# df_user, df_item, a and b from the cells above must already be defined.
df["corrected_rating"] = df.apply(get_estimated_ratings,axis=1)

In [37]:
# Inspect the new corrected_rating column next to the original rating.
df.head()

Unnamed: 0,user,item,rating,r_low,r_mid,r_high,corrected_rating
0,877,381,4.0,0.02275,0.47725,0.5,4.0
1,877,451,4.0,0.02275,0.47725,0.5,4.0
2,877,557,4.0,0.02275,0.47725,0.5,4.0
3,877,692,4.0,0.02275,0.47725,0.5,4.0
4,877,56,5.0,0.00135,0.157305,0.841345,4.0


In [38]:
# df[["user","item","rating","corrected_rating"]].to_csv("ml100k_corrected_wang.csv",index=False)

In [39]:
from surprise import Dataset
from surprise import SlopeOne, KNNBasic
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse, mae
from surprise.reader import Reader


In [40]:
import numpy as np
import pandas as pd

In [41]:
def get_accuracy(algo, trainset, testset):
    """Fit `algo`, score it on `testset`, print a summary line and return the scores.

    Parameters
    ----------
    algo : object exposing ``fit(trainset)`` and ``test(testset)``.
    trainset : training data passed to ``algo.fit``.
    testset : evaluation data passed to ``algo.test``.

    Returns
    -------
    tuple
        ``(algo, rmse_error, mae_error)``, where `algo` is the same (now fitted) object.
    """
    algo.fit(trainset)
    preds = algo.test(testset)
    # RMSE is evaluated before MAE (tuple elements are evaluated left to right).
    rmse_error, mae_error = rmse(preds), mae(preds)
    print(f"For {algo} RMSE is {rmse_error} and MAE is {mae_error}")
    return algo, rmse_error, mae_error

In [42]:
def run_algos(trainset,testset):
    """Evaluate the fixed set of recommenders on one train/test pair.

    The candidates are, in order: SlopeOne, then KNNBasic (k=60, Pearson
    similarity) with ``user_based=True`` and with ``user_based=False``.

    Returns
    -------
    list
        One ``get_accuracy`` result, ``(algo, rmse, mae)``, per candidate, in the order above.
    """
    candidates = [SlopeOne()] + [
        KNNBasic(k=60, sim_options={"name": "pearson", "user_based": by_user})
        for by_user in (True, False)
    ]
    return [get_accuracy(model, trainset, testset) for model in candidates]

In [43]:
# df = pd.read_csv("ml100k_corrected_wang.csv")

In [44]:
# Held-out ratings used for evaluation; the next cell reads its user, item and rating columns.
test_df = pd.read_csv("test.csv")

In [45]:
# Ratings are declared to be on a 1-5 scale.
# NOTE: this rebinds `reader`, which the first cell imported as a module
# (`from surprise import reader`); from here on `reader` is the Reader instance.
reader = Reader(rating_scale=(1, 5))
# Baseline training data uses the original, uncorrected `rating` column.
data = Dataset.load_from_df(df[["user", "item", "rating"]], reader)
test_data = Dataset.load_from_df(test_df[["user", "item", "rating"]], reader)
# Train on every row of the training frame (no internal split).
trainset = data.build_full_trainset()
# The test frame is routed through a full trainset and back out via build_testset()
# to get the structure passed to algo.test(). Presumably one entry per test rating
# (the printed sizes are consistent with that); confirm against the Surprise docs.
testset = test_data.build_full_trainset().build_testset()
print("Training set size: ", trainset.n_ratings)
print("Test set size: ", len(testset))


Training set size:  80000
Test set size:  20000


In [46]:
# Baseline run: train on the original ratings, evaluate on the held-out test set.
results_base = run_algos(trainset,testset)

RMSE: 0.9423
MAE:  0.7414
For <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x000001A83FCB9DF0> RMSE is 0.9422541799329658 and MAE is 0.7413586954857572
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.0135
MAE:  0.8027
For <surprise.prediction_algorithms.knns.KNNBasic object at 0x000001A83FCB9FD0> RMSE is 1.0134516193372387 and MAE is 0.8026670964157997
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.0337
MAE:  0.8261
For <surprise.prediction_algorithms.knns.KNNBasic object at 0x000001A83FCB9E20> RMSE is 1.0337113290316722 and MAE is 0.826057756099029


In [28]:
# Collect the baseline metrics into a column-oriented dict (one list per column).
# The dict is re-created here, so this cell is the reset point for the results table.
results_dict = {"algo": [], "rmse": [], "mae": [],"type":[]}
for algo_, rmse_, mae_ in results_base:
    # type(...).__name__ gives the bare class name ("SlopeOne", "KNNBasic") directly,
    # instead of slicing it out of the default "<module.Class object at 0x...>" repr.
    # NOTE(review): both KNNBasic variants (user_based True / False) get the same label.
    results_dict["algo"].append(type(algo_).__name__)
    results_dict["rmse"].append(rmse_)
    results_dict["mae"].append(mae_)
    results_dict["type"].append("base")

In [48]:
# Same pipeline as the baseline, but the training ratings come from `corrected_rating`.
# `data` and `trainset` are rebound here. `testset` is reused unchanged, so evaluation
# is still against the original ratings of the test file.
data = Dataset.load_from_df(df[["user", "item", "corrected_rating"]], reader)
trainset = data.build_full_trainset()
print("Training set size: ", trainset.n_ratings)
print("Test set size: ", len(testset))

Training set size:  80000
Test set size:  20000


In [49]:
# Corrected-ratings run: same algorithms and test set, trained on corrected_rating.
results_estimated = run_algos(trainset,testset)

RMSE: 0.9759
MAE:  0.7773
For <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x000001A83DC992E0> RMSE is 0.9758576070636537 and MAE is 0.7773165710152085
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.0339
MAE:  0.8244
For <surprise.prediction_algorithms.knns.KNNBasic object at 0x000001A83A37AA90> RMSE is 1.0338962521731319 and MAE is 0.82438617399556
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.0506
MAE:  0.8390
For <surprise.prediction_algorithms.knns.KNNBasic object at 0x000001A83B4415E0> RMSE is 1.0505762631952522 and MAE is 0.8390204207758583


In [31]:
# Drop any "estimated" rows left behind by an earlier run of this cell, so that
# re-running it replaces them instead of duplicating them in the results table.
kept_rows = [i for i, kind in enumerate(results_dict["type"]) if kind != "estimated"]
for column in results_dict:
    results_dict[column] = [results_dict[column][i] for i in kept_rows]

# Append the metrics obtained when training on the corrected ratings.
for algo_, rmse_, mae_ in results_estimated:
    # Bare class name ("SlopeOne", "KNNBasic"), taken from the type rather than
    # parsed out of the object's repr string.
    results_dict["algo"].append(type(algo_).__name__)
    results_dict["rmse"].append(rmse_)
    results_dict["mae"].append(mae_)
    results_dict["type"].append("estimated")

In [32]:
# Long-format table: one row per (algorithm, training-data type) with its RMSE and MAE.
results_df = pd.DataFrame(results_dict)

In [33]:
# NOTE(review): the table recorded below this cell does not match the run_algos output
# recorded earlier in the notebook. Its "estimated" rows show RMSE of about 0.38-0.48,
# while the printed RMSEs for the corrected-ratings run are about 0.98-1.05. The execution
# counts of the result-collection cells are also lower than those of the run_algos cells.
# The saved table looks stale; Restart & Run All before relying on it.
results_df

Unnamed: 0,algo,rmse,mae,type
0,SlopeOne,0.942254,0.741359,base
1,KNNBasic,1.013456,0.80267,base
2,KNNBasic,1.033711,0.826058,base
3,SlopeOne,0.382929,0.3081,estimated
4,KNNBasic,0.479708,0.372185,estimated
5,KNNBasic,0.472036,0.378978,estimated


In [34]:
# Persist the comparison table, without the index column.
results_df.to_csv("results_wang.csv",index=False)