### DM end2 problem 3 

#### Import libraries

In [1]:
import numpy as np
import pandas as pd

#### Parameters  

In [2]:
# Path of the input CSV file read by the notebook
csv_in = 'dm-end2-3.csv'
# A user in the DB is compared with the target user only when both have
# scored at least this many of the same items
min_common_items = 4
# An item is recommended only when at least this many users evaluated it
min_eval_users = 3

# Raise the pandas display limits so that more rows and columns are shown
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

#### Read CSV file  

In [3]:
# Load the score table; the first line of the CSV is used as the header row.
df = pd.read_csv(csv_in, delimiter=',', skiprows=0, header=0)
print(df.shape)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only adds a stray "None" to the cell output.
df.info()
display(df.head())

(5000, 100)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 100 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ebi             1546 non-null   float64
 1   anago           1535 non-null   float64
 2   maguro          1406 non-null   float64
 3   ika             1457 non-null   float64
 4   uni             1380 non-null   float64
 5   tako            1447 non-null   float64
 6   ikura           1412 non-null   float64
 7   tamago          1353 non-null   float64
 8   toro            1321 non-null   float64
 9   amaebi          1244 non-null   float64
 10  hotategai       1297 non-null   float64
 11  tai             1213 non-null   float64
 12  akagai          1194 non-null   float64
 13  hamachi         1118 non-null   float64
 14  awabi           1112 non-null   float64
 15  samon           1077 non-null   float64
 16  kazunoko        1012 non-null   float64
 17  shako           1016

None

Unnamed: 0,ebi,anago,maguro,ika,uni,tako,ikura,tamago,toro,amaebi,hotategai,tai,akagai,hamachi,awabi,samon,kazunoko,shako,saba,chu_toro,hirame,aji,kani,kohada,torigai,unagi,tekka_maki,kanpachi,mirugai,kappa_maki,geso,katsuo,iwashi,hokkigai,shimaaji,kanimiso,engawa,negi_toro,nattou_maki,sayori,takuwan_maki,botanebi,tobiko,inari,mentaiko,sarada,suzuki,tarabagani,ume_shiso_maki,komochi_konbu,tarako,sazae,aoyagi,toro_samon,sanma,hamo,nasu,shirauo,nattou,ankimo,kanpyo_maki,negi_toro_maki,gyusashi,hamaguri,basashi,fugu,tsubugai,ana_kyu_maki,hiragai,okura,ume_maki,sarada_maki,mentaiko_maki,buri,shiso_maki,ika_nattou,zuke,himo,kaiware,kurumaebi,mekabu,kue,sawara,sasami,kujira,kamo,himo_kyu_maki,tobiuo,ishigakidai,mamakari,hoya,battera,kyabia,karasumi,uni_kurage,karei,hiramasa,namako,shishamo,kaki
0,,0.0,,4.0,2.0,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,4.0,,2.0,,,,,,,0.0,,,,,,,0.0,,,,,,,,,,,,,2.0,,,,,,,,,,,,
1,,,,,,,0.0,,1.0,,,,0.0,,,,,0.0,,,,1.0,2.0,,0.0,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,
2,,3.0,4.0,,,,3.0,,,,,,,4.0,,4.0,,,,,,,,,,3.0,,,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,1.0,1.0,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,
3,4.0,,,3.0,4.0,1.0,,,4.0,3.0,,,,,,,,,,,,,,,3.0,,,,,2.0,,,,,,,,,,,2.0,,,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,1.0,,,,,4.0,,,,0.0,,,,,,1.0,,,,,,,,,,,1.0,0.0,,,,,,,,,,,,,,,,4.0,,,3.0,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0


#### Return Series of similarity between each user and the target user (not sorted)  

In [4]:
def get_sim_ser_by_pearson(df_users, ser_target):
    """Correlate every row of ``df_users`` with ``ser_target``.

    Parameters
    ----------
    df_users : pandas.DataFrame
        One row per user; columns are matched against the index labels
        of ``ser_target``.
    ser_target : pandas.Series
        Scores of the target user, indexed by item name.

    Returns
    -------
    pandas.Series
        Row-wise correlation (``DataFrame.corrwith`` with ``axis=1``),
        indexed like ``df_users``, with rows whose correlation is NaN
        removed. The result is not sorted.
    """
    row_corr = df_users.corrwith(ser_target, axis=1)
    is_defined = row_corr.notna()
    return row_corr[is_defined]

In [5]:
def get_sim_ser_by_pearson2(df_users, ser_target):
    """Row-wise correlation with ``ser_target``, rescaled by ``(r + 1) / 2``.

    Parameters
    ----------
    df_users : pandas.DataFrame
        One row per user; columns are matched against the index labels
        of ``ser_target``.
    ser_target : pandas.Series
        Scores of the target user, indexed by item name.

    Returns
    -------
    pandas.Series
        For each row whose correlation is not NaN, the value
        ``(correlation + 1) / 2``. The result is not sorted.
    """
    row_corr = df_users.corrwith(ser_target, axis=1)
    defined_corr = row_corr[row_corr.notna()]
    # Shift by one and halve the correlation
    return defined_corr.add(1).div(2)

#### Prediction of scores for the target user  
(for items with no score)  
based on deviation from average  

In [6]:
def predict_scores(df_sim, ser_sim, ser_target, min_users=None):
    """Predict the target user's scores for items the target has not scored.

    For each candidate item the prediction is::

        mean(ser_target) + sum(sim_u * (score_u - mean_u)) / sum(|sim_u|)

    where the sums run over the users ``u`` in ``df_sim`` who scored the
    item, ``mean_u`` is the mean of all scores of user ``u`` in ``df_sim``
    and ``sim_u`` is that user's entry in ``ser_sim``.

    Parameters
    ----------
    df_sim : pandas.DataFrame
        Scores of the similar users (rows: users, columns: items).
        The frame is not modified.
    ser_sim : pandas.Series
        Similarity of each user to the target; must be indexed like
        ``df_sim``.
    ser_target : pandas.Series
        Scores of the target user, indexed by item name. Items present in
        this index are never predicted.
    min_users : int, optional
        Minimum number of users who must have scored an item for it to be
        predicted. Defaults to the notebook-level ``min_eval_users``.

    Returns
    -------
    pandas.Series
        Predicted scores indexed by item name, sorted in descending order.
        Items whose contributing similarities sum to zero in absolute value
        are skipped, because the weighted average is undefined for them.
    """
    if min_users is None:
        # Fall back to the threshold defined in the parameters cell.
        min_users = min_eval_users

    user_means = df_sim.mean(axis=1)
    target_mean = ser_target.mean()

    predictions = {}
    for item in df_sim.columns:
        if item in ser_target.index:
            continue
        item_scores = df_sim[item]
        rated = item_scores.notnull()
        if rated.sum() < min_users:
            continue
        # Build a new Series instead of using `-=`: an in-place operation on
        # a column of df_sim can write the centred values back into the
        # caller's frame.
        deviations = (item_scores - user_means)[rated]
        weights = ser_sim[rated]
        weight_total = np.abs(weights).sum()
        if weight_total == 0:
            # 0/0 would give NaN (or inf); no meaningful prediction exists.
            continue
        predictions[item] = target_mean + (deviations * weights).sum() / weight_total

    # dtype=float keeps the result numeric even when nothing was predicted.
    return pd.Series(predictions, dtype=float).sort_values(ascending=False)

#### Function for user-based collaborative filtering.  

arguments: the DataFrame of scores (`df`) and a dictionary of scores for the target user  

ex)
get_recomm_by_user_sim(df, {'maguro':1, 'ika':1, 'uni':3,
                            'awabi':4, 'hirame':4, 'aoyagi':4})  
-> returns a pandas Series of predicted scores indexed by item and sorted in descending order, such as akagai 2.9835603009918303, mirugai 2.945676429588114, ...

In [7]:
def get_recomm_by_user_sim(df, target_dic):
    """User-based collaborative filtering for a single target user.

    Parameters
    ----------
    df : pandas.DataFrame
        Score table (rows: users, columns: items). Every key of
        ``target_dic`` must be a column of ``df``.
    target_dic : dict
        Scores given by the target user, keyed by item name.

    Returns
    -------
    pandas.Series
        Output of ``predict_scores``: predicted scores for items the target
        user has not scored, sorted in descending order.
    """
    target_scores = pd.Series(target_dic)

    # Restrict to the target's items and keep only users who scored at
    # least `min_common_items` of them.
    scores_on_target_items = df[target_scores.index]
    n_shared = scores_on_target_items.notnull().sum(axis=1)
    has_enough_overlap = n_shared >= min_common_items
    candidates = scores_on_target_items[has_enough_overlap]

    # Similarity of every remaining user to the target.
    similarities = get_sim_ser_by_pearson(candidates, target_scores)

    # Full score rows of the users that received a similarity value.
    neighbours = df.loc[similarities.index]
    return predict_scores(neighbours, similarities, target_scores)

#### Do recommendation  

Number of items calculated: 46
Recommendation:
ankimo     4.176725
kohada     3.716200
mirugai    3.598203
shako      3.581244
akagai     3.539989
dtype: float64
Number of items calculated: 73
Recommendation:
toro_samon    3.993705
okura         3.607180
negi_toro     3.338903
tarabagani    3.312345
kanpachi      3.228670
dtype: float64

In [8]:
# Scores of the target user are passed as an item -> score dictionary.
recomm = get_recomm_by_user_sim(
    df,
    target_dic={
        'maguro': 1,
        'ika': 1,
        'uni': 3,
        'awabi': 4,
        'hirame': 4,
        'aoyagi': 4,
    },
)
# Report how many items received a prediction, then the top of the ranking.
print('Number of items calculated:', len(recomm))
print('Recommendation:')
print(recomm.head())

Number of items calculated: 46
Recommendation:
ankimo     4.176725
kohada     3.716200
mirugai    3.598203
shako      3.581244
akagai     3.539989
dtype: float64
