In [191]:
import pandas as pd
import numpy as np
import os

### file read

In [192]:
filepath = './data/'
trainfile = 'train.dat'
actorfile = 'movie_actors.dat'
directorfile =  'movie_directors.dat'
genrafile = 'movie_genres.dat'
movie_tagfile = 'movie_tags.dat'
tagfile =  'tags.dat'
testfile = 'test.dat'
user_taggedfile = 'user_taggedmovies.dat'

In [193]:
import codecs
def read_data(filepath, filename):
    data = []
    with codecs.open(os.path.join(filepath,filename),'r',encoding = "ISO-8859-1") as ff:
        for line in ff:
            line_data = line.split('\t')
            line_data[-1]=line_data[-1][:-2]
            data.append(line_data)
    return data

df_train = pd.read_csv(os.path.join(filepath,trainfile), sep=" ",engine='python')
df_test = pd.read_csv(os.path.join(filepath,testfile), sep=" ",engine='python')
df_actor = pd.DataFrame.from_records(read_data(filepath, actorfile)[1:],columns=read_data(filepath, actorfile)[0])
df_director = pd.DataFrame.from_records(read_data(filepath, directorfile)[1:],
                                        columns=read_data(filepath, directorfile)[0])
df_genre = pd.read_csv(os.path.join(filepath,genrafile), sep="\t",engine='python')
df_movie_tag = pd.read_csv(os.path.join(filepath,movie_tagfile), sep="\t",engine='python')
df_tag = pd.read_csv(os.path.join(filepath,tagfile), sep="\t",engine='python')
df_test = pd.read_csv(os.path.join(filepath,testfile), sep=" ",engine='python')
df_user_tag = pd.read_csv(os.path.join(filepath,user_taggedfile), sep=" ",engine='python')

### save and load output

In [276]:
import pickle

def save_output(items, item,name):
    if item =="output":
        output = open(os.path.join(filepath,name+'output.dat'), 'w')
        output.writelines( "%.1f\n" % item for item in items )
        output.close()
        
    if item =="dataframe":
        with open(os.path.join(filepath, name+".pickle"),'wb') as trf:
            pickle.dump(items, trf)

def load_items(name):
    with open(os.path.join(filepath, name+".pickle"),'rb') as trf:
        train_df = pickle.load(trf)
    return train_df

In [195]:
train_cols = df_train.shape[0]

### Insert genre, tag information into dataframe

In [196]:
df_data = pd.concat([df_train,df_test])
genre_dict  = dict(zip(df_genre['movieID'],df_genre['genre']))
mtag_dict = dict(zip(df_movie_tag['movieID'],df_movie_tag['tagID']))
utag_dict  = dict(zip(zip(df_user_tag['userID'],df_user_tag['movieID']),df_user_tag['tagID']))
df_data['genre']=df_data.apply(lambda row: genre_dict[row['movieID']], axis=1)
df_data['mtag']=df_data.apply(lambda row: mtag_dict[row['movieID']] if row['movieID'] in mtag_dict
                              else -1, axis=1)
df_data['utag']=df_data.apply(lambda row: utag_dict[(row['userID'], row['movieID'])] 
                              if (row['userID'], row['movieID']) in utag_dict else row['mtag'] , axis=1)

### User and movie Count and sum

In [197]:
df_data['ucount'] = df_data.groupby('userID')['userID'].transform('count')
df_data['mcount'] = df_data.groupby('movieID')['movieID'].transform('count')
df_data['utot'] = df_data.groupby('userID')['rating'].transform('sum')
df_data['mtot'] = df_data.groupby('movieID')['rating'].transform('sum')

In [198]:
df_data[:train_cols]

Unnamed: 0,userID,movieID,rating,genre,mtag,utag,ucount,mcount,utot,mtot
0,75,3,1.0,Romance,13668,13668,46,201,145.5,510.5
1,75,32,4.5,Thriller,15912,15912,46,978,145.5,3551.5
2,75,110,4.0,War,16309,16309,46,1047,145.5,3609.5
3,75,163,4.0,Thriller,15870,15870,46,397,145.5,1210.5
4,75,165,4.5,Thriller,14046,14046,46,668,145.5,2029.5
...,...,...,...,...,...,...,...,...,...,...
641694,71534,42900,4.0,Thriller,15102,15102,142,16,531.0,56.5
641695,71534,44555,4.0,Drama,11508,11508,142,241,531.0,885.5
641696,71534,46578,4.0,Drama,14458,14458,142,618,531.0,2244.0
641697,71534,61075,5.0,Romance,2924,2924,142,12,531.0,40.5


## KNN based approach

### Functions to find top-k similar users/movies using KNN

In [199]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import multiprocessing
from functools import partial

def run_knn(df,primary,secondary):
    r_df = df.pivot(index = primary, columns =secondary, values = 'rating').fillna(0)
    csr_r = csr_matrix(r_df.values)
    model_knn = NearestNeighbors(metric='cosine',algorithm ='auto')
    model_knn.fit(csr_r)
    return r_df, model_knn

def sim_rating(name,i,df, r_df,id_to_x, sim_dict,primary,secondary):
    namet =  name+str(i)
    if name =='su':
        df[namet]=df.apply(lambda row: r_df.loc[id_to_x[sim_dict[row[primary]][i]],
                                                     row[secondary]] if row[secondary] in r_df.columns
                                 else 0,axis=1)
    else:
        df[namet]=df.apply(lambda row: r_df.loc[id_to_x[sim_dict[row[primary]][i]],
                                                     row[secondary]] if row[primary] in r_df.index
                                 else 0,axis=1)        
    print("{} done".format(namet))
    return df


def attribute_based_rating(df,train_cols, primary, secondary, name,neighbors=50):
    df_data = df[:train_cols]
    r_df, model_knn = run_knn(df_data.copy(),primary,secondary)
    sim_attr={}
    id_to_attr = dict(zip(range(len(r_df)),r_df.index))
    for i in r_df.index:
        distances,indices = model_knn.kneighbors(r_df.loc[i,:].values.reshape(1,-1), neighbors)
        sim_attr[i]=indices[0]

    for i in range(1,neighbors):    
        df_data = sim_rating(name,i,df, r_df,id_to_attr, sim_attr,primary,secondary)
    return df_data, r_df

In [200]:
##user-based similarity
df_data, udf = attribute_based_rating(df_data.copy(),train_cols, 'userID', 'movieID','su')
##movie-based similarity
# df_data, rdf = attribute_based_rating(df_data.copy(),train_cols, 'movieID', 'userID','sm')
# save_output(df_data, 'dataframe')

su1 done
su2 done
su3 done
su4 done
su5 done
su6 done
su7 done
su8 done
su9 done
su10 done
su11 done
su12 done
su13 done
su14 done
su15 done
su16 done
su17 done
su18 done
su19 done
su20 done
su21 done
su22 done
su23 done
su24 done
su25 done
su26 done
su27 done
su28 done
su29 done
su30 done
su31 done
su32 done
su33 done
su34 done
su35 done
su36 done
su37 done
su38 done
su39 done
su40 done
su41 done
su42 done
su43 done
su44 done
su45 done
su46 done
su47 done
su48 done
su49 done


### Save dataframe

In [202]:
save_output(df_data, 'dataframe','k-50')

### Load dataframe

In [261]:
df_data = load_items('k-50')

### User, movie and side information (tag, genre) based aggregated rating feature

In [262]:
df_data['uavg']=df_data['utot']/df_data['ucount']
df_data['mavg']=df_data['mtot']/df_data['mcount']

# df_data['genavg'] = df_data.groupby('genre')['rating'].transform('sum')/df_data.groupby(
#     'genre')['rating'].transform('count')
df_data['utagavg'] = df_data.groupby('utag')['rating'].transform('sum')/df_data.groupby(
   'utag')['rating'].transform('count')
df_data['mtagavg'] = df_data.groupby('mtag')['rating'].transform('sum')/df_data.groupby(
    'mtag')['rating'].transform('count')
df_data['utagavg']=df_data.apply(lambda row: row['uavg'] if pd.isna(row['utagavg']) else row['utagavg'],axis=1)
df_data['mtagavg']=df_data.apply(lambda row: row['mavg'] if pd.isna(row['mtagavg']) else row['mtagavg'],axis=1)
# save_output(df_data, 'dataframe')

### Avg top-k (eg. 50) similar user rating column

In [263]:
df_data['sum-50']=df_data.iloc[:,10:].sum(axis=1)/50
df_data = df_data.drop(columns = df_data.columns[10:59])

In [272]:
df_data

Unnamed: 0,userID,movieID,rating,genre,mtag,utag,ucount,mcount,utot,mtot,uavg,mavg,utagavg,mtagavg,sum-50
0,75,3,1.0,Romance,13668,13668,46,201,145.5,510.5,3.163043,2.539801,2.852273,2.851955,0.368141
1,75,32,4.5,Thriller,15912,15912,46,978,145.5,3551.5,3.163043,3.631391,3.970517,3.979054,3.414880
2,75,110,4.0,War,16309,16309,46,1047,145.5,3609.5,3.163043,3.447469,3.824447,3.837864,3.445456
3,75,163,4.0,Thriller,15870,15870,46,397,145.5,1210.5,3.163043,3.049118,3.193428,3.202416,1.562160
4,75,165,4.5,Thriller,14046,14046,46,668,145.5,2029.5,3.163043,3.038174,3.337345,3.341847,2.577608
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71294,71534,2208,,Thriller,7713,7713,142,57,531.0,198.0,3.739437,3.473684,3.269643,3.327138,0.446198
71295,71534,2997,,Fantasy,11945,11945,142,864,531.0,3021.5,3.739437,3.497106,3.823755,3.835793,2.707922
71296,71534,4306,,Romance,14562,14562,142,1178,531.0,4064.5,3.739437,3.450340,3.594996,3.601115,2.107718
71297,71534,7132,,Romance,7867,7867,142,92,531.0,336.5,3.739437,3.657609,3.960739,3.960046,0.576357


### train-test data

In [265]:
train_df = df_data[:train_cols].copy()
test_df = df_data[train_cols:].copy()

### K nearest Regressor based prediction

In [280]:
from sklearn.neighbors import KNeighborsRegressor
X = train_df.iloc[:,10:].values
y = train_df['rating'].values
neigh = KNeighborsRegressor(n_neighbors=50)
neigh.fit(X, y)
pred = neigh.predict(test_df.iloc[:,10:].values)

In [281]:
pred

array([3.23, 3.74, 3.08, ..., 4.35, 4.44, 4.19])

### Save output

In [282]:
save_output(pred, 'output','output-k(10)-regressor')

### Feaature - rating correlation

In [266]:
for i,c in enumerate(train_df.columns[4:]):
    print(4+i,c,train_df[c].corr(train_df['rating']))

4 mtag 0.19837299551164753
5 utag 0.176397900659125
6 ucount -0.10250593420650579
7 mcount 0.18063436804712685
8 utot -0.044094588089830744
9 mtot 0.22228566733598384
10 uavg 0.38789782663956357
11 mavg 0.48661838143163166
12 utagavg 0.4178795403777499
13 mtagavg 0.41074522451791895
14 sum-50 0.28407886874928134


## SVD based Approach

In [119]:
def create_matrix(df_data,value):
    r_df = df_data.pivot(index = 'userID', columns ='movieID', values = value).fillna(0)
    users = list(r_df.index)
    users_index = {users[i]: i for i in range(len(users))}
    movies = list(r_df.columns)
    movies_index = {movies[i]: i for i in range(len(movies))}
    return r_df, users_index, movies_index

In [120]:
from scipy.sparse.linalg import svds
from scipy.linalg import sqrtm

def svd(train, k):
    r_mat = train.values
    user_ratings_mean = np.mean(r_mat, axis = 1)
    r_norm = r_mat - user_ratings_mean.reshape(-1, 1)
    
    U, s, V = svds(r_norm, k)
    s = np.diag(s)

    # we take only the k most significant features
    s=s[0:k,0:k]
    U=U[:,0:k]
    V=V[0:k,:]
    s_root=sqrtm(s)
    Usk=np.dot(U,s_root)
    skV=np.dot(s_root,V)
    UsV = np.dot(Usk, skV)
    UsV = UsV + user_ratings_mean.reshape(-1, 1)
    print("svd done")
    return UsV

In [121]:
def rmse(true, pred):
    # this will be used towards the end
    x = true - pred
    return sum([xi*xi for xi in x])/len(x)

In [189]:
# df_data = pd.concat([df_train,df_test])
# to test the performance over a different number of features

no_of_features = [50]
utilMat, users_index, items_index = create_matrix(df_data.copy(),'mfeat')
for f in no_of_features: 
    svdout = svd(utilMat, k=f)
    pred = [] #to store the predicted ratings
    for _,row in df_test.iterrows():
        user = row['userID']
        item = row['movieID']
        u_index = users_index[user]
        if item in items_index:
            i_index = items_index[item]
            pred_rating = svdout[u_index, i_index]
        else:
            pred_rating = np.mean(svdout[u_index, :])
        pred.append(pred_rating)

# print(rmse(df_test['rating'], pred))

svd done


In [190]:
save_output(pred, 'output')