In [15]:
import pandas as pd
import networkx as nx
import numpy as np
from pathlib import Path
from collections import defaultdict
import os

In [16]:
def createGraph(input_data):
    """Build a directed user -> product rating graph from a ratings CSV.

    Only the ``UserId``, ``ProductId`` and ``Score`` columns of ``input_data``
    are read. Every row becomes one edge from ``'u' + UserId`` to
    ``'p' + ProductId`` carrying a ``weight`` attribute obtained by looking
    the score up in the table below (1..5 -> -1.0..1.0). A score that is not
    a key of that table yields a NaN weight, because ``Series.map`` returns
    NaN for unmatched values.

    Parameters
    ----------
    input_data : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    networkx.DiGraph
        Graph whose edges carry a single ``weight`` attribute.
    """
    score_to_weight = {1: -1.0,
                       2: -0.5,
                       3: 0,
                       4: 0.5,
                       5: 1.0, }

    ratings = pd.read_csv(input_data, usecols=['UserId', 'ProductId', 'Score'])

    # The 'u' / 'p' prefixes keep user and product ids from colliding and let
    # callers tell the two node kinds apart by their first character.
    edge_table = pd.DataFrame({
        'source': 'u' + ratings['UserId'].astype(str),
        'target': 'p' + ratings['ProductId'].astype(str),
        'weight': ratings['Score'].map(score_to_weight),
    })

    return nx.from_pandas_edgelist(
        edge_table,
        source='source',
        target='target',
        edge_attr=True,
        create_using=nx.DiGraph(),
    )

In [17]:
def iterate(input_data, c1, c2, c3, c4, c5, c6, c7, max_epochs=100, tol=0.01,
            result_dir='./data/intermediate/rev2_results/fairness/amazon/'):
    """Iteratively score users, products and ratings, then save a user ranking.

    The rating graph is built with ``createGraph(input_data)``. Each user node
    gets a ``fairness`` value, each product node a ``goodness`` value and each
    edge a ``fairness`` (rating reliability) value, all initialised to 1.
    Every epoch updates, in this order, product goodness, rating reliability
    and user fairness. The loop ends after ``max_epochs`` epochs, when all
    three summed absolute changes drop below ``tol``, or when any of them
    becomes NaN.

    Afterwards each user gets the score
    ``(fairness - mean_fairness) * log(out_degree + 1)``. Users are written,
    highest score first, to
    ``fng-sorted-users-{c1}-{c2}-{c3}-{c4}-{c5}-{c6}-{c7}.csv`` inside
    ``result_dir`` (no header; columns: user id, score, fairness, out-degree).

    Parameters
    ----------
    input_data : str or path-like
        Ratings CSV forwarded to ``createGraph``.
    c1 : number
        alpha1, smoothing weight of the mean user fairness.
    c3 : number
        beta1, smoothing weight of the mean product goodness.
    c5, c6 : number
        gamma1 and gamma2, weights of user fairness and rating agreement in
        the reliability update. The update divides by ``gamma1 + gamma2``, so
        their sum must not be zero.
    c2, c4, c7 : number
        alpha2, beta2 and gamma3. They are not used in any update here; they
        only appear in the log line and in the output file name.
    max_epochs : int, optional
        Upper bound on the number of epochs (default 100).
    tol : float, optional
        Convergence threshold applied to each summed change (default 0.01).
    result_dir : str or path-like, optional
        Directory for the output CSV; created if missing.

    Returns
    -------
    None
    """
    alpha1, alpha2 = c1, c2
    beta1, beta2 = c3, c4
    gamma1, gamma2, gamma3 = c5, c6, c7

    G = createGraph(input_data)

    nodes = G.nodes()
    edges = G.edges(data=True)

    print(f"{input_data} has {len(nodes)} nodes and {len(edges)} edges")

    # Node kinds are told apart by the prefix createGraph puts on the ids.
    # The lists are built once and reused by every epoch.
    user_names = [node for node in nodes if node.startswith('u')]
    product_names = [node for node in nodes if node.startswith('p')]

    ##### Initialization Fairness, Goodness, Reliability
    # G.nodes[...] is used throughout: the old G.node alias was removed in
    # NetworkX 2.4.
    for user in user_names:
        G.nodes[user]["fairness"] = 1
    for product in product_names:
        G.nodes[product]["goodness"] = 1
    for user, product, _ in edges:
        G[user][product]["fairness"] = 1

    dp = 0
    du = 0
    dr = 0

    for epoch in range(max_epochs):
        print('-----------------')
        print('Epoch number %d with du = %f, dp = %f, dr = %f, for (%d,%d,%d,%d,%d,%d,%d)' % (
            epoch, du, dp, dr, alpha1, alpha2, beta1, beta2, gamma1, gamma2, gamma3))
        # A NaN change means the values have degenerated; further epochs
        # cannot recover, so stop.
        if np.isnan(du) or np.isnan(dp) or np.isnan(dr):
            break
        dp = 0
        du = 0
        dr = 0

        ########################## Updating goodness of product ##########################
        print('Updating goodness of product')

        # Mean goodness over all products (the median would be an alternative).
        mean_gvals = np.mean([G.nodes[product]["goodness"] for product in product_names])

        for product in product_names:
            inedges = G.in_edges(product, data=True)
            ftotal = len(inedges)
            # Sum over incoming ratings of reliability * rating weight.
            gtotal = np.sum([data['fairness'] * data['weight'] for _, _, data in inedges])
            if ftotal > 0:
                # Cold-start smoothing: beta1 pseudo-ratings at the mean goodness.
                x = (gtotal + beta1 * mean_gvals) / (ftotal + beta1)
            else:
                x = 0.0
            x = np.clip(x, -1, 1)  # keep goodness in [-1, 1]
            dp += abs(G.nodes[product]['goodness'] - x)
            G.nodes[product]['goodness'] = x

        ########################## Updating reliability of ratings #######################
        print('Updating reliability of ratings')

        for user, product, data in edges:
            user_fairness = G.nodes[user]['fairness']
            # 1 when the rating equals the product's goodness, 0 when they are
            # at opposite ends of [-1, 1].
            rating_distance = 1 - (abs(data['weight'] - G.nodes[product]['goodness']) / 2.0)
            x = (gamma1 * user_fairness + gamma2 * rating_distance) / (gamma1 + gamma2)
            x = np.clip(x, 0, 1)  # keep reliability in [0, 1]

            dr += abs(data['fairness'] - x)
            G[user][product]['fairness'] = x

        ########################## Updating fairness of users ############################
        print('Updating fairness of users')

        # Mean fairness over all users (the median would be an alternative).
        mean_fvals = np.mean([G.nodes[user]['fairness'] for user in user_names])

        for user in user_names:
            outedges = G.out_edges(user, data=True)
            rating_fairness_sum = np.sum([data['fairness'] for _, _, data in outedges])
            # Cold-start smoothing: alpha1 pseudo-ratings at the mean fairness.
            x = (rating_fairness_sum + alpha1 * mean_fvals) / (len(outedges) + alpha1)
            x = np.clip(x, 0, 1)  # keep fairness in [0, 1]
            du += abs(G.nodes[user]['fairness'] - x)
            G.nodes[user]['fairness'] = x

        if du < tol and dp < tol and dr < tol:
            break

    # SAVE THE RESULT
    result_dir = Path(result_dir)
    result_dir.mkdir(parents=True, exist_ok=True)

    currentfvals = [G.nodes[user]['fairness'] for user in user_names]
    mean_fvals = np.mean(currentfvals)
    print(len(currentfvals), mean_fvals)

    all_node_vals = []
    for user in user_names:
        f = G.nodes[user]['fairness']
        out_degree = G.out_degree(user)
        all_node_vals.append([user, (f - mean_fvals) * np.log(out_degree + 1), f, out_degree])

    # Ascending sort followed by a reversal (not reverse=True) so that the
    # relative order of tied rows stays the same as before.
    all_node_vals_sorted = sorted(
        all_node_vals,
        key=lambda x: (float(x[1]), float(x[2]), -1 * float(x[3])))[::-1]

    # Values are stringified on purpose: a NaN is then written as the literal
    # text "nan" instead of an empty field.
    rows_list = [
        {'userId': str(sl[0]),
         'x': str(sl[1]),
         'userFairness': str(sl[2]),
         'userOutEdge': str(sl[3])}
        for sl in all_node_vals_sorted
    ]

    df_result = pd.DataFrame(rows_list, columns=['userId', 'x', 'userFairness', 'userOutEdge'])
    df_result.to_csv(
        result_dir / f'fng-sorted-users-{alpha1}-{alpha2}-{beta1}-{beta2}-{gamma1}-{gamma2}-{gamma3}.csv',
        header=False, index=False)

In [18]:
### run only one
# Single run: gamma1 = gamma2 = 1, every other weight set to 0.
input_data = 'data/amazon_network.csv'
iterate(input_data, c1=0, c2=0, c3=0, c4=0, c5=1, c6=1, c7=0)

data/demo/amazon_network.csv has 330317 nodes and 560804 edges
-----------------
Epoch number 0 with du = 0.000000, dp = 0.000000, dr = 0.000000, for (0,0,0,0,1,1,0)
Updating goodness of product
Updating reliability of ratings
Updating fairness of users
-----------------
Epoch number 1 with du = 25864.046562, dp = 30966.507499, dr = 58400.584293, for (0,0,0,0,1,1,0)
Updating goodness of product
Updating reliability of ratings
Updating fairness of users
-----------------
Epoch number 2 with du = 13375.723547, dp = 861.721543, dr = 30194.030413, for (0,0,0,0,1,1,0)
Updating goodness of product
Updating reliability of ratings
Updating fairness of users
-----------------
Epoch number 3 with du = 7208.313851, dp = 1208.663597, dr = 16246.640456, for (0,0,0,0,1,1,0)
Updating goodness of product
Updating reliability of ratings
Updating fairness of users
-----------------
Epoch number 4 with du = 4004.482613, dp = 834.217455, dr = 8985.326284, for (0,0,0,0,1,1,0)
Updating goodness of product
U

In [None]:
### run all
input_data = 'data/amazon_network.csv'
para = range(3)

# alpha2, beta2 and gamma3 belong to the behavior property component
# (BIRDNEST: Bayesian Inference for Ratings-Fraud Detection), which is not
# added yet, so they stay fixed at 0.
alpha2 = beta2 = gamma3 = 0

# Every (alpha1, beta1, gamma1, gamma2) combination, alpha1 varying slowest.
# Combinations with all gamma weights equal to zero are left out: the
# reliability update in iterate() divides by gamma1 + gamma2.
param_grid = [
    (a1, b1, g1, g2)
    for a1 in para
    for b1 in para
    for g1 in para
    for g2 in para
    if g1 != 0 or g2 != 0 or gamma3 != 0
]

for alpha1, beta1, gamma1, gamma2 in param_grid:
    iterate(input_data, alpha1, alpha2, beta1, beta2, gamma1, gamma2, gamma3)

In [None]:
# for cal mean score (x) after run all
# Average every user's score over all parameter combinations and store the
# users sorted by that mean score (ascending).

# Directory holding the per-run files written by iterate(); defined here so
# the cell does not depend on a variable left over from an earlier session.
result_dir = Path("data/intermediate/rev2_results/fairness/amazon")
mean_scores_path = result_dir / "amazon-mean-scores.csv"

# read all the scores with different parameter combinations
# Only the per-run result files are read. This skips stray directory entries
# (e.g. notebook checkpoints) and the aggregate file written below, which
# would otherwise be re-read as input when the cell is run a second time.
scores = defaultdict(list)
fnames = sorted(
    fname for fname in os.listdir(result_dir)
    if fname.startswith("fng-sorted-users-") and fname.endswith(".csv")
)
for fname in fnames:
    print('Load:', fname)
    with open(result_dir / fname, "r") as result_file:
        for line in result_file:
            # Columns: user id, score, fairness, out-degree.
            fields = line.strip().split(",")
            # Rows from runs that degenerated to NaN carry no information.
            if fields[1] == "nan" or fields[2] == "nan":
                continue
            scores[fields[0]].append(float(fields[1]))

# combine scores for each node
uniscores = {user: np.mean(user_scores) for user, user_scores in scores.items()}

# sort all nodes based on their scores and store it
sortedlist = sorted(uniscores.items(), key=lambda x: x[1])

with open(mean_scores_path, "w") as fw:
    for sl in sortedlist:
        fw.write("%s, %f\n" % (sl[0], sl[1]))