In [15]:
import pandas as pd

# Collect (level-1, level-2) label pairs from the three item CSV files
# (item1.csv .. item3.csv, GBK-encoded) into a two-column DataFrame.
list_item = []
for i in range(3):
    filepath = "/data/fjsdata/nursereport/item" + str(i + 1) + ".csv"
    with open(filepath, "r", encoding="gbk") as f:
        for line in f:
            arr = line.split(",")
            # Only rows that have a non-empty second field are kept.
            if len(arr) > 1 and arr[1] != "":
                # Strip just the line terminator. The previous `[:-1]` slice
                # removed the last character unconditionally, which chopped a
                # real character whenever the second field was not the last
                # one on the line (3+ columns) or the final line of the file
                # had no trailing newline.
                lel1, lel2 = arr[0], arr[1].rstrip("\r\n")
                list_item.append([lel1, lel2])
df_item = pd.DataFrame(list_item, columns=['L1', 'L2'])
print(df_item['L1'].unique())

['Ⅰ级护理' 'Ⅱ级护理' '一级护理' '特级护理' 'Ⅲ级护理' '二级护理' '' '病危' '手术' '出院' '入院' '护理级别'
 '转入' '转出' '病情' '术后护理' '体温Q4H' '护理记录单(续页)' '产科护理记录(续页)' '护理记录(续页)' '体温图'
 '长期医嘱' '临时医嘱']


In [33]:
'''
Created on 2019.6.19
@author: Jason.F
@summary:
SVDTrain.py:Training the SVD model.
Dependencies: python3.x, numpy, pandas, surprise, sklearn. You can install them with pip.
Input: One record per line in the format: userid, itemid, rating, where the rating denotes the behavior records of a nurse on an item.
       The datatype of userid and itemid is int, ranging from zero to the max of users and items. The datatype of rating is float or int, such as 2.5 or 3.
Output: The learned SVD model, which can recommend top-k items to a nurse based on collaborative filtering.
Usage: python SVDTrain.py --dataPath /data/fjsdata/nursereport/ui.rating --modelPath /data/fjsdata/nursereport/svd.model
'''
import pandas as pd
import numpy as np
import surprise as sp
import time
import argparse
import math
import os
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

def parse_args():
    """Parse the command-line options of the training script.

    Options:
        --dataPath:  path of the training ratings file.
        --modelPath: path where the trained model is saved.

    Returns:
        argparse.Namespace with the attributes ``dataPath`` and ``modelPath``.
    """
    parser = argparse.ArgumentParser(description="Run SVD.")
    parser.add_argument('--dataPath', nargs='?', default='/data/fjsdata/nursereport/ui.rating',
                        help='Data path of training file.')
    parser.add_argument('--modelPath', nargs='?', default='/data/fjsdata/nursereport/svd.model',
                        help='Data path of saving model.')
    # Previously `parse_args(args=[])` was used, which silently ignored every
    # flag given on the command line. `parse_known_args` honours the real
    # arguments while still tolerating unrelated ones injected by the host
    # process (e.g. the "-f <kernel.json>" a notebook kernel passes).
    args, _unknown = parser.parse_known_args()
    return args

def load_data(filepath):
    """Read a tab-separated ratings file and return it as a DataFrame.

    Each non-blank line must hold ``userid<TAB>itemid<TAB>rating``. The user
    and item ids are parsed as ints; the rating is parsed as a float so that
    both integer ratings ("3") and fractional ones ("2.5") are accepted.

    Args:
        filepath: path of the ratings file.

    Returns:
        pandas.DataFrame with the columns ``u``, ``i`` and ``r``, where ``r``
        is rescaled to ``(x - min + 1) / (max - min + 1)``. The ``+ 1`` keeps
        the smallest rating strictly above 0, so the values lie in (0, 1].

    Raises:
        ValueError / IndexError: if a non-blank line is malformed.
    """
    list_rating = []
    with open(filepath, "r") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing empty line at end of file).
            if not line.strip():
                continue
            arr = line.split("\t")
            user, item, rating = int(arr[0]), int(arr[1]), float(arr[2])
            list_rating.append([user, item, rating])
    df_rating = pd.DataFrame(list_rating, columns=['u', 'i', 'r'])
    # Min-max style rescaling, shifted by one so the minimum maps above zero.
    num_max = df_rating['r'].max()
    num_min = df_rating['r'].min()
    df_rating['r'] = (df_rating['r'] - num_min + 1) * 1.0 / (num_max - num_min + 1)
    return df_rating

def calc_dcg(items):
    """Return the discounted cumulative gain of a ranked list of relevances.

    The entry at 1-based rank ``r`` with relevance ``rel`` contributes
    ``(2 ** rel - 1) / log2(1 + r)``. The same function yields the ideal DCG
    when it is fed relevances already sorted in descending order.

    Args:
        items: iterable of numeric relevance scores in ranked order.

    Returns:
        The accumulated DCG (the int 0 for an empty input).
    """
    total = 0
    for rank, relevance in enumerate(items, start=1):
        gain = math.pow(2, relevance) - 1
        discount = math.log(1 + rank, 2)
        total += gain / discount
    return total

def index_at_k(predictions, k, threshold=0.1):
    """Compute per-user precision@k, recall@k and NDCG@k.

    Args:
        predictions: iterable of 5-field records unpacked as
            ``(uid, iid, true_rating, estimated_rating, details)``.
        k: number of top-ranked items (by estimated rating) to evaluate.
        threshold: a rating strictly greater than this counts as relevant
            (true rating) or recommended (estimated rating).

    Returns:
        Three dicts keyed by uid: ``(precisions, recalls, ndcgs)``.
        Precision is 1 when nothing in the top k is recommended, and recall
        is 1 when the user has no relevant item. NDCG is 0.0 when the ideal
        DCG is zero (every true rating is 0), instead of raising
        ZeroDivisionError.
    """
    # Group (estimated, true) rating pairs by user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    ndcgs = dict()
    for uid, user_ratings in user_est_true.items():
        # Rank the user's items by estimated rating, best first.
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        top_k = user_ratings[:k]

        # Number of relevant items among all of the user's items.
        n_rel = sum((true_r > threshold) for (_, true_r) in user_ratings)
        # Number of recommended items in the top k.
        n_rec_k = sum((est > threshold) for (est, _) in top_k)
        # Number of items in the top k that are both relevant and recommended.
        n_rel_and_rec_k = sum(((true_r > threshold) and (est > threshold))
                              for (est, true_r) in top_k)

        # Precision@K: proportion of recommended items that are relevant.
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        # Recall@K: proportion of relevant items that are recommended.
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

        # DCG of the true ratings in predicted order versus the ideal order
        # (the user's k largest true ratings, descending).
        dcg = calc_dcg([true_r for (_, true_r) in top_k])
        ideal_k = sorted((true_r for (_, true_r) in user_ratings), reverse=True)[:k]
        idcg = calc_dcg(ideal_k)
        # Guard the division: an all-zero relevance list has an ideal DCG of 0.
        ndcgs[uid] = dcg * 1.0 / idcg if idcg != 0 else 0.0
    return precisions, recalls, ndcgs
      
if __name__ == '__main__':
    # 1. Resolve the data/model paths and load the normalized ratings.
    args = parse_args()
    dataPath = args.dataPath
    modelPath = args.modelPath
    df_rating = load_data(dataPath)
    print('Dataset has loaded and its shape is:%d rows and %d columns'%(df_rating.shape[0],df_rating.shape[1]))

    # 2. Convert to Surprise's data format.
    # The columns must correspond to user id, item id and ratings (in that order).
    reader = sp.Reader(rating_scale=(0, 1))
    spdata = sp.Dataset.load_from_df(df_rating[['u', 'i', 'r']], reader)
    # NOTE: there is no hold-out split here. The model is fitted on the full
    # trainset and the "testset" is built from that same trainset, so every
    # metric printed below is an in-sample (training) figure.
    trainset = spdata.build_full_trainset()
    testset = trainset.build_testset()

    # 3. Train the model and predict ratings for the testset.
    st = time.time()
    algo = sp.SVD()
    algo.fit(trainset)
    predictions = algo.test(testset)
    et = time.time()
    print('Model has trained successfully in %s seconds!'%(et - st))

    # 4. Measure the model.
    # verbose=False: accuracy.rmse prints the RMSE itself by default, which
    # duplicated the line printed here.
    print("RMSE:%0.8f" % (sp.accuracy.rmse(predictions, verbose=False)))
    print("%3s%20s%20s%20s" % ('K','Precisions','Recalls','NDCG'))
    for k in [5, 10, 15, 20]:  # cut-off ranks for the top-k metrics
        precisions, recalls, ndcgs = index_at_k(predictions, k=k)
        # Average each per-user metric over all users.
        precision = sum(prec for prec in precisions.values()) / len(precisions)
        recall = sum(rec for rec in recalls.values()) / len(recalls)
        ndcg = sum(ndcg for ndcg in ndcgs.values()) / len(ndcgs)
        print("%3s%20.8f%20.8f%20.8f" % (k, precision, recall, ndcg))

    # 5. Save the predictions and the fitted algorithm.
    file_name = os.path.expanduser(modelPath)
    sp.dump.dump(file_name, predictions=predictions, algo=algo)
    print("The model has saved successfully in the path:%s" % file_name)

Dataset has loaded and its shape is:994169 rows and 3 columns
Model has trained successfully in 59.2490177154541 seconds!
RMSE: 0.1752
RMSE:0.17521663
  K          Precisions             Recalls                NDCG
  5          1.00000000          0.07833852          0.87703531
 10          1.00000000          0.15667703          0.87319517
 15          1.00000000          0.23501555          0.87449026
 20          1.00000000          0.31260468          0.87931140
The model has saved successfully in the path:/data/fjsdata/nursereport/svd.model


In [92]:
'''
Created on 2019.6.19
@author: Jason.F
@summary:
SVDRecommend.py: Recommend TopK items to user.
Dependencies: python3.x, numpy, pandas, surprise, sklearn. You can install them with pip.
Input: urdList, the format is one uid per line. the datatype is int.
Output: urdList, topk items for specific userid. The format of every line is:uid [itemid1,itemid2,...,itemidk]
Usage: python SVDRecommend.py --TopK 5 --modelPath /data/fjsdata/nursereport/svd.model --uidPath /data/fjsdata/nursereport/uid.list
'''
import pandas as pd
import numpy as np
import surprise as sp
import time
import argparse
import math
import os
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

def parse_args():
    """Parse the command-line options of the recommendation script.

    Options:
        --TopK:      number of items to recommend per user (int, default 10).
        --modelPath: path of the saved model to load.
        --uidPath:   path of the file listing the uids to recommend for.

    Returns:
        argparse.Namespace with the attributes ``TopK``, ``modelPath`` and
        ``uidPath``.
    """
    parser = argparse.ArgumentParser(description="Run SVD.")
    # type=int: without it a value given on the command line arrives as a
    # string and cannot be used as a slice bound for the top-k cut.
    parser.add_argument('--TopK', nargs='?', type=int, default=10, help='Recommend K items')
    parser.add_argument('--modelPath', nargs='?', default='/data/fjsdata/nursereport/svd.model',
                        help='Data path of saving model.')
    parser.add_argument('--uidPath', nargs='?', default='/data/fjsdata/nursereport/uid.list',
                        help='The uid will be recommended.')
    # Previously `parse_args(args=[])` was used, which silently ignored every
    # flag given on the command line. `parse_known_args` honours the real
    # arguments while still tolerating unrelated ones injected by the host
    # process (e.g. the "-f <kernel.json>" a notebook kernel passes).
    args, _unknown = parser.parse_known_args()
    return args

def get_top_n(predictions, n=10):
    '''Build the top-N recommendation list of every user.

    Args:
        predictions: iterable of 5-field records unpacked as
            (uid, iid, true rating, estimated rating, details).
        n(int): how many items to keep per user. Default is 10.
    Returns:
        A defaultdict(list) mapping each uid to a list of at most n tuples
        (iid, estimated rating), ordered by estimated rating, highest first.
    '''
    # Bucket the (item, estimate) pairs by user.
    per_user = defaultdict(list)
    for prediction in predictions:
        uid, iid, _true_r, est, _details = prediction
        per_user[uid].append((iid, est))

    # Rank every bucket by estimate (descending) and keep its first n entries.
    for uid in per_user:
        ranked = sorted(per_user[uid], key=lambda pair: pair[1], reverse=True)
        per_user[uid] = ranked[:n]

    return per_user

def load_data(filepath):
    """Read the list of user ids, one per line, from a text file.

    Blank or whitespace-only lines are skipped, because the caller converts
    every entry with ``int()``, which fails on an empty string. Only the first
    whitespace-separated token of a line is kept: the script writes its
    results back to this same file as ``uid [item,item,...]``, so this keeps
    the file readable on a subsequent run.

    Args:
        filepath: path of the uid list file.

    Returns:
        list of uid strings, in file order.
    """
    list_uid = []
    with open(filepath, "r") as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue
            list_uid.append(fields[0])
    return list_uid

def write_data(filepath, iidList):
    """Write the recommendations to a text file, one user per line.

    Args:
        filepath: path of the output file (truncated if it already exists).
        iidList: iterable of pairs whose first element is the uid and whose
            second element is an iterable of recommended item ids.

    Each line has the form ``uid [item1,item2,...]``.
    """
    with open(filepath, "w") as out:
        for entry in iidList:
            uid_text = str(entry[0])
            items_text = ",".join(map(str, entry[1]))
            out.write(uid_text + " [" + items_text + "]" + '\n')

if __name__ == '__main__':
    # 1. Read the run parameters: TopK, modelPath and uidPath.
    args = parse_args()
    topK = args.TopK
    modelPath = args.modelPath
    uidPath = args.uidPath

    # 2. Restore the dumped predictions and algorithm.
    predictions, algo = sp.dump.load(modelPath)
    print("The model has loaded successfully from the path:%s" % modelPath)

    # 3. Rank the stored predictions and keep the top-k items per user.
    top_n = get_top_n(predictions, n=topK)

    # 4. Look up the recommendation list of every requested uid; a uid that
    #    has no stored predictions gets an empty list.
    uidList = load_data(uidPath)
    iidList = []
    for uid in uidList:
        uid_int = int(uid)
        user_ratings = top_n.get(uid_int)
        if user_ratings is None:
            iid_rec = [uid_int, []]
        else:
            iid_rec = [uid_int, [iid for (iid, _) in user_ratings]]
        iidList.append(iid_rec)

    # 5. Output the results of the recommendation.
    # NOTE: the results are written to uidPath, i.e. the same file the uids
    # were read from, so the input list is overwritten.
    write_data(uidPath, iidList)
    print("Complete recommendation.")

The model has loaded successfully from the path:/data/fjsdata/nursereport/svd.model
Complete recommendation.
