# Imports

In [205]:
import pickle
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity, pairwise_distances
from surprise import Reader, Dataset, SVD, SVDpp, NormalPredictor, accuracy
from surprise.model_selection import cross_validate, GridSearchCV, KFold
# Kept after the surprise import on purpose: GridSearchCV must keep resolving to scikit-learn's class.
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler

# Load Dataframes

In [191]:
# Load the preprocessed inputs; the first column of every CSV is used as the index.
movies = pd.read_csv('../data/preprocessed/movies.csv',index_col=0)
movies_reduced = pd.read_csv('../data/preprocessed/movies_reduced.csv',index_col=0)
df_ratings = pd.read_csv('../data/preprocessed/df_ratings.csv',index_col=0)
train_data = pd.read_csv('../data/preprocessed/train_data.csv',index_col=0)
test_data = pd.read_csv('../data/preprocessed/test_data.csv',index_col=0)
sim_meta_input = pd.read_csv('../data/preprocessed/sim_meta_input.csv',index_col=0)
sim_meta_input_test = pd.read_csv('../data/preprocessed/sim_meta_input_test.csv',index_col=0)
sim_meta_input_all = pd.read_csv('../data/preprocessed/sim_meta_input_all.csv',index_col=0)
sim_meta_input_test_all = pd.read_csv('../data/preprocessed/sim_meta_input_test_all.csv',index_col=0)
svd_meta_input = pd.read_csv('../data/preprocessed/svd_meta_input.csv',index_col=0)
svd_meta_input_test = pd.read_csv('../data/preprocessed/svd_meta_input_test.csv',index_col=0)

# Context manager closes the file handle once the model is unpickled.
# NOTE: unpickling executes arbitrary code; only load model files created by this project.
with open('svd_model', 'rb') as model_file:
    svd = pickle.load(model_file)

## Get Recommendation based on movieID

In [137]:
# Bag-of-words model over the 'all_in_all' text column: unigrams and bigrams,
# English stop words removed.
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(movies_reduced['all_in_all'])
# Pairwise cosine similarity between all rows of movies_reduced.
cosine_sim = cosine_similarity(count_matrix, count_matrix)
# Lookup movieID -> row label of movies_reduced. The functions below use that label
# as a row position in cosine_sim, which assumes a 0..n-1 index — TODO confirm.
indices = pd.Series(movies_reduced.index, index=movies_reduced['movieID'])

In [138]:
def get_recommendations(movieID, min_score=0.35):
    """Return the movies most similar to ``movieID``, best match first.

    Similarities are read from the ``cosine_sim`` matrix; ``indices`` maps the
    movie id to its row in that matrix.

    Parameters
    ----------
    movieID
        Id of the query movie; must be a key of ``indices``.
    min_score : float, default 0.35
        Only movies with a similarity strictly above this value are returned
        (keeps the list short).

    Returns
    -------
    pandas.DataFrame
        Column ``score`` plus the columns of ``movies_reduced`` except
        ``index`` and ``all_in_all``.
    """
    idx = indices[movieID]
    # sorted() is stable, so movies with equal similarity keep their row order.
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Remove the query movie by its row position instead of assuming it is ranked
    # first: a movie with identical metadata ties at the top score and may sort
    # ahead of it, in which case the query movie itself used to be returned.
    ranked = [pair for pair in ranked if pair[0] != idx]
    scores = pd.DataFrame(ranked, columns=['index', 'score'])
    movies_with_sim_score = scores.merge(movies_reduced, on='index').drop(['index', 'all_in_all'], axis=1)
    return movies_with_sim_score[movies_with_sim_score['score'] > min_score]

In [41]:
def get_recommendations_low_score(movieID, min_score=0.3):
    """Return the movies most similar to ``movieID`` using a lower cut-off.

    Similarities are read from the ``cosine_sim`` matrix; ``indices`` maps the
    movie id to its row in that matrix.

    Parameters
    ----------
    movieID
        Id of the query movie; must be a key of ``indices``.
    min_score : float, default 0.3
        Only movies with a similarity strictly above this value are returned.

    Returns
    -------
    pandas.DataFrame
        Column ``score`` plus the columns of ``movies_reduced`` except
        ``index`` and ``all_in_all``, best match first.
    """
    idx = indices[movieID]
    # sorted() is stable, so movies with equal similarity keep their row order.
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query movie by row position; dropping "the first entry" removed
    # the wrong movie whenever another movie tied with it at the top score.
    ranked = [pair for pair in ranked if pair[0] != idx]
    scores = pd.DataFrame(ranked, columns=['index', 'score'])
    movies_with_sim_score = scores.merge(movies_reduced, on='index').drop(['index', 'all_in_all'], axis=1)
    return movies_with_sim_score[movies_with_sim_score['score'] > min_score]

In [52]:
def get_recommendations_all(movieID):
    """Return every other movie with its similarity to ``movieID``, best match first.

    Unlike the thresholded variants no minimum score is applied, so movies
    with a similarity of 0 are included.

    Parameters
    ----------
    movieID
        Id of the query movie; must be a key of ``indices``.

    Returns
    -------
    pandas.DataFrame
        Column ``score`` plus the columns of ``movies_reduced`` except
        ``index`` and ``all_in_all``.
    """
    idx = indices[movieID]
    # sorted() is stable, so movies with equal similarity keep their row order.
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query movie by row position; dropping "the first entry" removed
    # the wrong movie whenever another movie tied with it at the top score.
    ranked = [pair for pair in ranked if pair[0] != idx]
    scores = pd.DataFrame(ranked, columns=['index', 'score'])
    return scores.merge(movies_reduced, on='index').drop(['index', 'all_in_all'], axis=1)

In [103]:
def get_relevant_movies(user_id, movieID):
    """Return the movies similar to ``movieID`` that ``user_id`` has rated.

    The result is the output of ``get_recommendations`` joined with the
    user's rows of ``df_ratings``. If ``movieID`` is not listed in
    ``movies_reduced`` the mean rating of that movie is returned instead
    (a scalar, not a DataFrame).
    """
    known_movie = movieID in movies_reduced['movieID'].values
    if not known_movie:
        # Unknown movie: fall back to its average rating.
        return df_ratings[df_ratings['movieID'] == movieID]['rating'].mean()

    ratings_of_user = df_ratings[df_ratings['user_id'] == user_id]
    return get_recommendations(movieID).merge(ratings_of_user, on='movieID')

In [139]:
# Example: movies similar to movieID 1 (similarity above the default cut-off).
get_recommendations(1)

Unnamed: 0,score,movieID,title
0,1.0,3114,Toy Story
1,0.786456,4929,Toy Story
2,0.566982,2355,A Bug's Life
3,0.553041,45517,Cars
4,0.433574,2294,Antz
5,0.400222,32031,"I, Robot"
6,0.395132,588,Aladdin
7,0.394665,673,Space Jam
8,0.387097,4306,Shrek
9,0.387097,47124,The Ant Bully


In [140]:
# Example: movies similar to movieID 1210 that user 1264 has rated.
get_relevant_movies(1264, 1210)

Unnamed: 0,score,movieID,title,user_id,rating
0,0.611347,1196,Star Wars: Episode V - The Empire Strikes Back,1264,5.0
1,0.427394,260,Star Wars,1264,5.0
2,0.416828,2628,Star Wars: Episode I - The Phantom Menace,1264,4.0
3,0.405405,5378,Star Wars: Episode II - Attack of the Clones,1264,3.0
4,0.359447,33493,Star Wars: Episode III - Revenge of the Sith,1264,4.0


## Rating prediction - surprise library

### Matrix Factorization-based algorithm (SVD) https://surprise.readthedocs.io/en/stable/matrix_factorization.html

In [185]:
# Ratings range from 0.5 to 5.
reader = Reader(rating_scale=(0.5, 5))
# Full (unsplit) surprise trainset built from the training ratings.
trainset = Dataset.load_from_df(
    train_data[['user_id', 'movieID', 'rating']], reader
).build_full_trainset()

# Targets and test inputs, kept as one-/two-column DataFrames.
y_train, y_test = train_data[['rating']], test_data[['rating']]
x_test = test_data[['user_id', 'movieID']]

## Search for similar movies of a user

In [151]:
# Old method, not used. It redefines the identically named function from the
# recommendation section above.
def get_relevant_movies(user_id, movieID):
    """Return the user's rated movies that are similar to ``movieID``.

    Falls back to the mean rating of ``movieID`` (a scalar) when the movie
    is not listed in ``movies_reduced``.
    """
    if movieID not in movies_reduced['movieID'].values:
        # Unknown movie: return its average rating instead.
        ratings_of_movie = df_ratings[df_ratings['movieID'] == movieID]
        return ratings_of_movie['rating'].mean()

    similar = get_recommendations(movieID)
    rated_by_user = df_ratings[df_ratings['user_id'] == user_id]
    return similar.merge(rated_by_user, on='movieID')

In [152]:
# Look up the metadata row of movieID 1210.
movies_reduced[movies_reduced['movieID']==1210]

Unnamed: 0,index,movieID,title,imdbID,all_in_all
1097,1097,1210,Star Wars: Episode VI - Return of the Jedi,86190,Star Wars: Episode VI - Return of the Jedi 861...


In [None]:
# Example: movies similar to movieID 1 that user 1264 has rated.
get_relevant_movies(1264, 1)
#print(get_relevant_movies(1264, 114709))

In [35]:
def calculate_rating(user_id, movieID, no_movieID, no_rel_movies):
    """Predict a rating from the user's ratings of similar movies.

    The prediction is the similarity-weighted average of the ratings the
    user gave to the movies returned by ``get_recommendations``.

    Returns a tuple ``(rating, no_movieID, no_rel_movies)``. The two counters
    are passed through and incremented when the mean rating of the movie is
    used as fallback: ``no_movieID`` if the movie is not in
    ``movies_reduced``, ``no_rel_movies`` if the user rated no similar movie.
    """
    # Movie unknown to the similarity model -> average rating of the movie.
    if movieID not in movies_reduced['movieID'].values:
        fallback = df_ratings[df_ratings['movieID'] == movieID]['rating'].mean()
        return fallback, no_movieID + 1, no_rel_movies

    user_ratings = df_ratings[df_ratings['user_id'] == user_id]
    rated_similar = get_recommendations(movieID).merge(user_ratings, on='movieID')

    # No similar movie rated by this user -> average rating of the movie.
    if rated_similar.empty:
        fallback = df_ratings[df_ratings['movieID'] == movieID]['rating'].mean()
        return fallback, no_movieID, no_rel_movies + 1

    total_score = rated_similar['score'].sum()
    prediction = 0
    for stars, sim in zip(rated_similar['rating'], rated_similar['score']):
        prediction += stars * (sim / total_score)
    return prediction, no_movieID, no_rel_movies

In [36]:
def calculate_rating_three(user_id, movieID, no_movieID, no_rel_movies):
    """Predict a rating from similar movies, weighting by similarity cubed.

    Same procedure as the linear variant, but each rating is weighted with
    ``score ** 3`` so that very similar movies dominate.

    Returns ``(rating, no_movieID, no_rel_movies)``. The counters are
    incremented when the mean rating of the movie is used as fallback
    (movie not in ``movies_reduced`` / no similar movie rated by the user).
    """
    # Movie unknown to the similarity model -> average rating of the movie.
    if movieID not in movies_reduced['movieID'].values:
        fallback = df_ratings[df_ratings['movieID'] == movieID]['rating'].mean()
        return fallback, no_movieID + 1, no_rel_movies

    user_ratings = df_ratings[df_ratings['user_id'] == user_id]
    rated_similar = get_recommendations(movieID).merge(user_ratings, on='movieID')

    # No similar movie rated by this user -> average rating of the movie.
    if rated_similar.empty:
        fallback = df_ratings[df_ratings['movieID'] == movieID]['rating'].mean()
        return fallback, no_movieID, no_rel_movies + 1

    total_cubed = rated_similar['score'].pow(3).sum()
    prediction = 0
    for stars, sim in zip(rated_similar['rating'], rated_similar['score']):
        prediction += stars * (pow(sim, 3) / total_cubed)
    return prediction, no_movieID, no_rel_movies

In [37]:
def calculate_rating_without_mean(user_id, movieID, no_movieID, no_rel_movies):
    """Predict a rating from similar movies; return -1 when that is impossible.

    Returns ``(rating, no_movieID, no_rel_movies)``. ``rating`` is the
    similarity-weighted average of the user's ratings of similar movies, or
    the sentinel ``-1`` when the movie is not in ``movies_reduced``
    (``no_movieID`` incremented) or the user rated no similar movie
    (``no_rel_movies`` incremented).
    """
    # Movie unknown to the similarity model -> sentinel.
    if movieID not in movies_reduced['movieID'].values:
        return -1, no_movieID + 1, no_rel_movies

    user_ratings = df_ratings[df_ratings['user_id'] == user_id]
    rated_similar = get_recommendations(movieID).merge(user_ratings, on='movieID')

    # No similar movie rated by this user -> sentinel.
    if rated_similar.empty:
        return -1, no_movieID, no_rel_movies + 1

    total_score = rated_similar['score'].sum()
    prediction = 0
    for stars, sim in zip(rated_similar['rating'], rated_similar['score']):
        prediction += stars * (sim / total_score)
    return prediction, no_movieID, no_rel_movies

In [148]:
def calculate_rating_with_score(user_id, movieID, no_movieID, no_rel_movies):
    """Similarity-based rating plus mean similarity, linear and cubed.

    Returns ``(rating, mean_score, rating3, mean_score3, no_movieID,
    no_rel_movies)`` where ``rating`` weights the user's ratings of similar
    movies by ``score`` and ``rating3`` by ``score ** 3``; ``mean_score`` and
    ``mean_score3`` are the means of those weights.

    When the movie is not in ``movies_reduced`` or the user rated no similar
    movie, both ratings are ``-1`` and both scores ``0``; the matching
    counter is incremented.
    """
    # Movie unknown to the similarity model -> sentinel rating, score 0.
    if movieID not in movies_reduced['movieID'].values:
        return -1, 0, -1, 0, no_movieID + 1, no_rel_movies

    user_ratings = df_ratings[df_ratings['user_id'] == user_id]
    rated_similar = get_recommendations(movieID).merge(user_ratings, on='movieID')

    # No similar movie rated by this user -> sentinel rating, score 0.
    if rated_similar.empty:
        return -1, 0, -1, 0, no_movieID, no_rel_movies + 1

    sims = rated_similar['score']
    total = sims.sum()
    total3 = sims.pow(3).sum()
    mean_score = sims.mean()
    mean_score3 = sims.pow(3).mean()

    prediction = 0
    prediction3 = 0
    for stars, sim in zip(rated_similar['rating'], sims):
        prediction += stars * (sim / total)
        prediction3 += stars * (pow(sim, 3) / total3)
    return prediction, mean_score, prediction3, mean_score3, no_movieID, no_rel_movies

In [147]:
def calculate_rating_all_with_score(user_id, movieID, no_movieID, no_rel_movies):
    """Like the thresholded variant, but over all movies without a cut-off.

    Uses ``get_recommendations_all``, so every movie the user has rated
    contributes, weighted by ``score`` (``rating``) or ``score ** 3``
    (``rating3``).

    Returns ``(rating, mean_score, rating3, mean_score3, no_movieID,
    no_rel_movies)``. Both ratings are ``-1`` and both scores ``0`` when the
    movie is not in ``movies_reduced`` or the merge with the user's ratings
    is empty. If all similarities sum to 0, both ratings are ``0``.
    """
    # Movie unknown to the similarity model -> sentinel rating, score 0.
    if movieID not in movies_reduced['movieID'].values:
        return -1, 0, -1, 0, no_movieID + 1, no_rel_movies

    user_ratings = df_ratings[df_ratings['user_id'] == user_id]
    rated_similar = get_recommendations_all(movieID).merge(user_ratings, on='movieID')

    # Nothing rated by this user -> sentinel rating, score 0.
    if rated_similar.empty:
        return -1, 0, -1, 0, no_movieID, no_rel_movies + 1

    sims = rated_similar['score']
    total = sims.sum()
    total3 = sims.pow(3).sum()
    mean_score = sims.mean()
    mean_score3 = sims.pow(3).mean()

    prediction = 0
    prediction3 = 0
    # A zero total would divide by zero; both predictions then stay at 0.
    if total != 0:
        for stars, sim in zip(rated_similar['rating'], sims):
            prediction += stars * (sim / total)
            prediction3 += stars * (pow(sim, 3) / total3)
    return prediction, mean_score, prediction3, mean_score3, no_movieID, no_rel_movies

In [322]:
# Compare the similarity-based tuple, the SVD estimate and the true rating
# for one (user, movie) pair.
print('Sim Prediction:', calculate_rating_with_score(1264, 1210, 0, 0))
print('SVD Prediction:', svd.predict(1264, 1210).est)
print('True Rating:', df_ratings.loc[(df_ratings['user_id'] == 1264) & (df_ratings['movieID'] == 1210), 'rating'].iloc[0])

Sim Prediction: (3.384646965752884, 0.06614887630891819, 3.625351320288023, 0.0016639520359948803, 0, 0)
SVD Prediction: 3.9989866840483503
True Rating: 5.0


In [66]:
def combine_rating(svd_est, user_id, movieID, no_movieID, no_rel_movies):
    """Blend an SVD estimate with the similarity-based rating.

    The similarity rating (ratings of similar movies weighted by ``score``)
    is mixed into ``svd_est`` with the mean similarity as its weight:
    ``(svd_est + mean * sim_rating) / (1 + mean)``.

    Returns ``(rating, no_movieID, no_rel_movies)``. ``svd_est`` is returned
    unchanged, and the matching counter incremented, when the movie is not in
    ``movies_reduced`` or the user rated no similar movie.
    """
    # Movie unknown to the similarity model -> keep the SVD estimate.
    if movieID not in movies_reduced['movieID'].values:
        return svd_est, no_movieID + 1, no_rel_movies

    user_ratings = df_ratings[df_ratings['user_id'] == user_id]
    rated_similar = get_recommendations(movieID).merge(user_ratings, on='movieID')

    # No similar movie rated by this user -> keep the SVD estimate.
    if rated_similar.empty:
        return svd_est, no_movieID, no_rel_movies + 1

    sims = rated_similar['score']
    weight_total = sims.sum()
    mean_similarity = sims.mean()

    sim_rating = 0
    for stars, sim in zip(rated_similar['rating'], sims):
        sim_rating += stars * (sim / weight_total)

    blended = (svd_est + mean_similarity * sim_rating) / (1 + mean_similarity)
    return blended, no_movieID, no_rel_movies

In [67]:
def combine_rating_square(svd_est, user_id, movieID, no_movieID, no_rel_movies):
    """Blend an SVD estimate with a similarity rating weighted by ``score ** 2``.

    The blend itself uses the plain mean similarity:
    ``(svd_est + mean * sim_rating) / (1 + mean)``.

    Returns ``(rating, no_movieID, no_rel_movies)``. ``svd_est`` is returned
    unchanged, and the matching counter incremented, when the movie is not in
    ``movies_reduced`` or the user rated no similar movie.
    """
    # Movie unknown to the similarity model -> keep the SVD estimate.
    if movieID not in movies_reduced['movieID'].values:
        return svd_est, no_movieID + 1, no_rel_movies

    user_ratings = df_ratings[df_ratings['user_id'] == user_id]
    rated_similar = get_recommendations(movieID).merge(user_ratings, on='movieID')

    # No similar movie rated by this user -> keep the SVD estimate.
    if rated_similar.empty:
        return svd_est, no_movieID, no_rel_movies + 1

    sims = rated_similar['score']
    weight_total = sims.pow(2).sum()
    mean_similarity = sims.mean()

    sim_rating = 0
    for stars, sim in zip(rated_similar['rating'], sims):
        sim_rating += stars * (pow(sim, 2) / weight_total)

    blended = (svd_est + mean_similarity * sim_rating) / (1 + mean_similarity)
    return blended, no_movieID, no_rel_movies

In [68]:
def combine_rating_three(svd_est, user_id, movieID, no_movieID, no_rel_movies):
    """Blend an SVD estimate with a similarity rating weighted by ``score ** 3``.

    The blend itself uses the plain mean similarity:
    ``(svd_est + mean * sim_rating) / (1 + mean)``.

    Returns ``(rating, no_movieID, no_rel_movies)``. ``svd_est`` is returned
    unchanged, and the matching counter incremented, when the movie is not in
    ``movies_reduced`` or the user rated no similar movie.
    """
    # Movie unknown to the similarity model -> keep the SVD estimate.
    if movieID not in movies_reduced['movieID'].values:
        return svd_est, no_movieID + 1, no_rel_movies

    user_ratings = df_ratings[df_ratings['user_id'] == user_id]
    rated_similar = get_recommendations(movieID).merge(user_ratings, on='movieID')

    # No similar movie rated by this user -> keep the SVD estimate.
    if rated_similar.empty:
        return svd_est, no_movieID, no_rel_movies + 1

    sims = rated_similar['score']
    weight_total = sims.pow(3).sum()
    mean_similarity = sims.mean()

    sim_rating = 0
    for stars, sim in zip(rated_similar['rating'], sims):
        sim_rating += stars * (pow(sim, 3) / weight_total)

    blended = (svd_est + mean_similarity * sim_rating) / (1 + mean_similarity)
    return blended, no_movieID, no_rel_movies

In [305]:
# Evaluate the purely similarity-based predictor (cubed weights) on the test set.
y_pred_sim = []
no_movieID = 0
no_rel_movies = 0
for user_id, movieID in zip(test_data['user_id'], test_data['movieID']):
    # The two counters are passed in and come back updated.
    sim_rating, no_movieID, no_rel_movies = calculate_rating_three(user_id, movieID, no_movieID, no_rel_movies)
    y_pred_sim.append(sim_rating)
    
# RMSE plus the share of test rows that fell back to the movie's mean rating.
print('RMSE nur über Similarity: {:.4f}'.format(sqrt(mean_squared_error(y_test, y_pred_sim))))
print('Prozent der Filme für die keine ähnlichen gefunden werden konnten: {:.2f}%'.format((no_rel_movies/len(test_data))*100))
print('Prozent der Filme für die keine movieID vorhanden ist: {:.2f}%'.format((no_movieID/len(test_data))*100))

RMSE nur über Similarity: 0.9167
Prozent der Filme für die keine ähnlichen gefunden werden konnten: 52.23%
Prozent der Filme für die keine movieID vorhanden ist: 0.00%


## Test hybrid model

In [146]:
# Compare plain SVD with the three hybrid blends (linear, squared and cubed
# similarity weights) on the test set.
y_pred_svd = []
y_pred_hybrid = []
no_movieID = 0
no_rel_movies = 0
y_pred_hybrid2 = []
y_pred_hybrid3 = []
# x only receives the counters returned by the squared/cubed variants; they are discarded.
x=0

for user_id, movieID in zip(test_data['user_id'], test_data['movieID']):
    pred = svd.predict(user_id, movieID)
    # Only the linear variant updates the fallback counters reported below.
    combined_rating, no_movieID, no_rel_movies = combine_rating(pred.est, user_id, movieID, no_movieID, no_rel_movies)
    combined_rating2, x, x = combine_rating_square(pred.est, user_id, movieID, no_movieID, no_rel_movies)
    combined_rating3, x, x = combine_rating_three(pred.est, user_id, movieID, no_movieID, no_rel_movies)
    y_pred_svd.append(pred.est)
    y_pred_hybrid.append(combined_rating)
    y_pred_hybrid2.append(combined_rating2)
    y_pred_hybrid3.append(combined_rating3)
    
print('RMSE nur SVD: {:.4f}'.format(sqrt(mean_squared_error(y_test, y_pred_svd))))
#print('RMSE nur über Similarity: {:.4f}'.format(sqrt(mean_squared_error(y_test, y_pred_sim))))
print('RMSE hybrides Modell: {:.4f}'.format(sqrt(mean_squared_error(y_test, y_pred_hybrid))))
print('RMSE hybrides Modell quadriert: {:.4f}'.format(sqrt(mean_squared_error(y_test, y_pred_hybrid2))))
print('RMSE hybrides Modell hoch 3: {:.4f}'.format(sqrt(mean_squared_error(y_test, y_pred_hybrid3))))
print('\nProzent der Filme für die keine ähnlichen gefunden werden konnten: {:.2f}%'.format((no_rel_movies/len(test_data))*100))
print('Prozent der Filme für die keine movieID vorhanden ist: {:.2f}%'.format((no_movieID/len(test_data))*100))

RMSE nur SVD: 0.7529
RMSE hybrides Modell: 0.7521
RMSE hybrides Modell quadriert: 0.7514
RMSE hybrides Modell hoch 3: 0.7510

Prozent der Filme für die keine ähnlichen gefunden werden konnten: 47.74%
Prozent der Filme für die keine movieID vorhanden ist: 0.00%


In [78]:
# Side-by-side table of true ratings and the three predictions; the index is
# taken from y_test, the lists are matched by position.
result = pd.DataFrame({'true': y_test['rating'], 'svd_pred': y_pred_svd, 'sim_pred':y_pred_sim, 'hybrid_pred': y_pred_hybrid})
result

Unnamed: 0,true,svd_pred,sim_pred,hybrid_pred
323552,5.0,4.519411,3.780894,4.519411
28381,3.0,4.080826,3.000000,3.540413
75610,4.5,3.931461,3.637838,3.931461
284365,1.0,2.471049,2.000000,2.471049
333270,4.5,3.592356,3.974026,3.592356
...,...,...,...,...
380854,5.0,3.101078,2.962963,3.101078
185742,3.5,3.030143,4.335665,3.379315
430675,1.0,2.893375,2.958633,2.893375
728556,4.5,3.873509,4.000000,3.908403


# Build different Meta Models

In [172]:
# Get mean rating for each movie
# NOTE(review): test_data is split off df_ratings, so this mean also contains the
# hold-out ratings — confirm whether it should be computed from train_data only.
mean_rating = df_ratings.groupby(['movieID']).mean()

# Look the mean up per row instead of merging on movieID and restoring the row
# order through a second merge on rowID. rowID (user_id + '000' + movieID) is not
# a unique key: user 1 / movie 10002 and user 10001 / movie 2 both give 100010002,
# so the rowID merge can duplicate rows and shift every feature after them.
# The lookup keeps one output row per input row, in the original order and index.
# The mean is still exposed as column 'rating_y', which the meta-model cells read.
mean_train = svd_meta_input.assign(rating_y=svd_meta_input['movieID'].map(mean_rating['rating']))
mean_test = test_data.assign(rating_y=test_data['movieID'].map(mean_rating['rating']))

## SVD_Prediction, Sim_prediction (missing movies with -1), Sim_score, Mean_rating

In [192]:
# Meta Model 1 features: SVD prediction, similarity rating (cubed weights, -1 if
# unavailable), mean similarity score and the movie's mean rating.
# The Series are aligned on their index when the frame is built.
meta_train_default_score_mean = pd.DataFrame({
    'svd_prediction': svd_meta_input['predicted_rating'],
    'similarity_rating': sim_meta_input['sim_pred3'],
    'score_train': sim_meta_input['sim_score'],
    'mean_train': mean_train['rating_y'],
})
meta_test_default_score_mean = pd.DataFrame({
    'svd_prediction_test': svd_meta_input_test['0'],
    'similarity_rating_test': sim_meta_input_test['sim_pred3'],
    'score_test': sim_meta_input_test['sim_score'],
    'mean_test': mean_test['rating_y'],
})

#meta_train_default_score_mean.to_csv('../data/preprocessed/meta_train_default_score_mean.csv')
#meta_test_default_score_mean.to_csv('../data/preprocessed/meta_test_default_score_mean.csv')

In [196]:
# Meta Model 1: k-nearest-neighbour regression on the four meta features.
kn = KNeighborsRegressor(n_neighbors = 765)
# The train and test frames use different column names ('svd_prediction' vs
# 'svd_prediction_test', ...). scikit-learn >= 1.2 refuses to predict on a frame
# whose feature names differ from those seen in fit, so plain arrays are passed;
# the column order is the same in both frames.
kn.fit(meta_train_default_score_mean.values, y_train)
y_pred_kn = kn.predict(meta_test_default_score_mean.values)
print('RMSE on Meta Model 1 with Nearest Neighbor: {:.4f}'.format(sqrt(mean_squared_error(y_test, y_pred_kn))))

RMSE on Meta Model 1 with Nearest Neighbor: 0.7428


## SVD_Prediction, Sim_prediction (missing movies with SVD_Prediction), Mean_rating

In [197]:
# Work on copies so the Meta Model 1 feature frames stay untouched.
meta_train_svd_mean = meta_train_default_score_mean.copy()
meta_test_svd_mean = meta_test_default_score_mean.copy()

In [198]:
# Start from the Meta Model 1 features without the similarity score. Deriving the
# frame here keeps the cell re-runnable; the former in-place drop raised a KeyError
# on a second run.
meta_train_svd_mean = meta_train_default_score_mean.drop(columns=['score_train'])

# -1 marks rows without a similarity-based rating; they get the SVD estimate instead.
# A positional boolean mask replaces the hand-kept row counter used with .at, which
# silently appended new rows whenever the index was not exactly 0..n-1.
# NOTE(review): svd.predict is applied to training rows here, while 'svd_prediction'
# holds the per-fold predictions — confirm which one the fill value should be.
missing_train = (meta_train_svd_mean['similarity_rating'] == -1).values
if missing_train.any():
    meta_train_svd_mean.loc[missing_train, 'similarity_rating'] = [
        svd.predict(user_id, movieID).est
        for user_id, movieID in zip(svd_meta_input.loc[missing_train, 'user_id'],
                                    svd_meta_input.loc[missing_train, 'movieID'])
    ]

In [199]:
# Start from the Meta Model 1 test features without the similarity score. Deriving
# the frame here keeps the cell re-runnable; the former in-place drop raised a
# KeyError on a second run.
meta_test_svd_mean = meta_test_default_score_mean.drop(columns=['score_test'])

# -1 marks rows without a similarity-based rating; they get the SVD estimate instead.
# A positional boolean mask replaces the hand-kept row counter used with .at, which
# silently appended new rows whenever the index was not exactly 0..n-1.
missing_test = (meta_test_svd_mean['similarity_rating_test'] == -1).values
if missing_test.any():
    meta_test_svd_mean.loc[missing_test, 'similarity_rating_test'] = [
        svd.predict(user_id, movieID).est
        for user_id, movieID in zip(test_data.loc[missing_test, 'user_id'],
                                    test_data.loc[missing_test, 'movieID'])
    ]

In [200]:
# Meta Model 2: same regressor, similarity rating back-filled with the SVD estimate.
kn2 = KNeighborsRegressor(n_neighbors = 765)
# Train and test frames use different column names; scikit-learn >= 1.2 refuses to
# predict on feature names that differ from fit, so plain arrays are passed
# (the column order is the same in both frames).
kn2.fit(meta_train_svd_mean.values, y_train)
y_pred_kn2 = kn2.predict(meta_test_svd_mean.values)
print('RMSE on Meta Model 2 with Nearest Neighbor: {:.4f}'.format(sqrt(mean_squared_error(y_test, y_pred_kn2))))

RMSE on Meta Model 2 with Nearest Neighbor: 0.7573


In [None]:
# Persist the Meta Model 2 feature frames next to the other preprocessed inputs.
meta_train_svd_mean.to_csv(
    '../data/preprocessed/meta_train_svd_mean.csv')
meta_test_svd_mean.to_csv(
    '../data/preprocessed/meta_test_svd_mean.csv')

## SVD_Prediction, Sim_prediction over all movies, Sim_score, Mean_rating

In [218]:
# Meta Model 3 features: like Meta Model 1, but the similarity rating and the
# (cubed) score come from the variant without a similarity cut-off.
meta_train_all_score_mean = pd.DataFrame({
    'svd_prediction': svd_meta_input['predicted_rating'],
    'similarity_rating': sim_meta_input_all['sim_pred3'],
    'score_train': sim_meta_input_all['sim_score3'],
    'mean_train': mean_train['rating_y'],
})
meta_test_all_score_mean = pd.DataFrame({
    'svd_prediction_test': svd_meta_input_test['0'],
    'similarity_rating_test': sim_meta_input_test_all['sim_pred3'],
    'score_test': sim_meta_input_test_all['sim_score3'],
    'mean_test': mean_test['rating_y'],
})

#meta_train_all_score_mean.to_csv('../data/preprocessed/meta_train_all_score_mean.csv')
#meta_test_all_score_mean.to_csv('../data/preprocessed/meta_test_all_score_mean.csv')

# Rescale the score column to the rating range; the scaler is fitted on the
# training scores only and then applied to the test scores.
scaler = MinMaxScaler(feature_range = (0.5,5))
meta_train_all_score_mean['score_train'] = scaler.fit_transform(meta_train_all_score_mean[['score_train']])
meta_test_all_score_mean['score_test'] = scaler.transform(meta_test_all_score_mean[['score_test']])

In [221]:
# Inspect the Meta Model 3 training features.
meta_train_all_score_mean

Unnamed: 0,svd_prediction,similarity_rating,score_train,mean_train
0,3.931927,3.631255,0.000489,4.005611
1,4.058450,4.031102,0.001957,3.759036
2,4.272626,4.317795,0.002181,4.098712
3,3.387400,3.741454,0.004418,3.099467
4,4.344191,3.623161,0.005589,3.860738
...,...,...,...,...
804684,2.586866,2.890961,0.000841,3.170431
804685,4.689347,4.214916,0.003393,4.098074
804686,3.532893,4.062191,0.002198,3.471545
804687,3.604540,3.482007,0.001847,3.975000


In [219]:
# Meta Model 3: k-nearest-neighbour regression on the features without cut-off.
kn3 = KNeighborsRegressor(n_neighbors = 765)
# Train and test frames use different column names; scikit-learn >= 1.2 refuses to
# predict on feature names that differ from fit, so plain arrays are passed
# (the column order is the same in both frames).
kn3.fit(meta_train_all_score_mean.values, y_train)
y_pred_kn3 = kn3.predict(meta_test_all_score_mean.values)
print('RMSE on Meta Model 3 with Nearest Neighbor: {:.4f}'.format(sqrt(mean_squared_error(y_test, y_pred_kn3))))

RMSE on Meta Model 3 with Nearest Neighbor: 0.7412


In [223]:
# Persist Meta Model 3; the context manager flushes and closes the file.
with open('meta_model_7412', 'wb') as model_file:
    pickle.dump(kn3, model_file)

In [None]:
# Randomized hyper-parameter search for a random-forest meta model.
# Needs numpy (np) from the import cell; it was previously never imported.
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 1.0 (all features) is what 'auto' meant for regressors; 'auto' itself was
# removed in scikit-learn 1.3.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators, 'max_features': max_features,
               'max_depth': max_depth, 'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf, 'bootstrap': bootstrap}
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(rf, param_distributions = random_grid, n_iter = 100,
                               cv = 3, random_state=42, n_jobs = -1, verbose = 2)
# Plain arrays: the train and test frames use different column names, which
# scikit-learn >= 1.2 rejects at predict time. The target is flattened because
# the forest expects a 1-d y rather than a one-column frame.
rf_random.fit(meta_train_all_score_mean.values, y_train.values.ravel())
y_pred_random = rf_random.predict(meta_test_all_score_mean.values)
print('RMSE on Meta Model with best Random Forest: {:.4f}'.format(sqrt(mean_squared_error(y_test, y_pred_random))))
# The label used to say "Nearest Neighbor" although these are the forest's parameters.
print('Best parameters for Random Forest:', rf_random.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


# Not needed (only run once)

In [25]:
# Get 1% global hold-out set for testing
train_data, test_data = train_test_split(df_ratings, test_size=0.01, random_state = 42)

# reset_index without inplace returns fresh frames; adding the rowID column to
# the split slices themselves triggered pandas' SettingWithCopyWarning.
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

# rowID = user_id, the literal '000', movieID, concatenated and read as an integer.
# NOTE(review): this key is not unique (user 1 / movie 10002 and user 10001 /
# movie 2 both give 100010002). It is left unchanged because other cells and the
# saved CSVs build and join on the same scheme.
train_data['rowID'] = (train_data['user_id'].astype(str) + '000' +
                                   train_data['movieID'].astype(str)).astype(int)
test_data['rowID'] = (test_data['user_id'].astype(str) + '000' +
                                   test_data['movieID'].astype(str)).astype(int)

#train_data.to_csv('../data/preprocessed/train_data.csv')
#test_data.to_csv('../data/preprocessed/test_data.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [26]:
# SVD estimates for every test row, using the already loaded `svd` model
# (the fit and pickle.dump lines are commented out).
# TODO
#svd = SVD()
#svd.fit(trainset)
y_pred_svd = []

for user_id, movieID in zip(test_data['user_id'], test_data['movieID']):
    pred = svd.predict(user_id, movieID)
    y_pred_svd.append(pred.est)
    
print('RMSE nur SVD: {:.4f}'.format(sqrt(mean_squared_error(y_test, y_pred_svd))))
#pickle.dump(svd, open('svd_model', 'wb'))

RMSE nur SVD: 0.7529


In [189]:
# Store the test predictions; the single unnamed column is written as '0',
# which is the name the meta-model cells read back.
y_pred_svd = pd.DataFrame(y_pred_svd)
y_pred_svd.to_csv('../data/preprocessed/svd_meta_input_test.csv')

In [27]:
#### Get input for Meta Model
# Out-of-fold SVD predictions for the training rows: in each of the 20 folds the
# model is fitted on the other folds and predicts the held-out one.
# NOTE(review): neither KFold nor SVD gets a random_state here, so the folds and
# predictions presumably differ between runs — confirm.
kf = KFold(n_splits=20)
svd_fold = SVD()
y_pred_svd = []
user_id_arr = []
movieID_arr = []
train_data_fold = Dataset.load_from_df(train_data[['user_id', 'movieID', 'rating']], reader)

for trainset, testset in kf.split(train_data_fold):
    y_test_svd = []
    svd_fold.fit(trainset)
    predictions_svd = svd_fold.test(testset)
    # Keep user and movie ids so the predictions can be matched to train_data later.
    for pred in predictions_svd:
        y_pred_svd.append(pred.est)
        user_id_arr.append(pred.uid)
        movieID_arr.append(pred.iid)
    
    # Monitoring only: RMSE of this fold's model on the global test set.
    for user_id, movieID in zip(test_data['user_id'], test_data['movieID']):
        pred = svd_fold.predict(user_id, movieID)
        y_test_svd.append(pred.est)
    print('SVD RMSE on Testdata: {:.4f}'.format(sqrt(mean_squared_error(y_test, y_test_svd))))

SVD RMSE on Testdata: 0.7520
SVD RMSE on Testdata: 0.7512
SVD RMSE on Testdata: 0.7588
SVD RMSE on Testdata: 0.7585
SVD RMSE on Testdata: 0.7548
SVD RMSE on Testdata: 0.7548
SVD RMSE on Testdata: 0.7579
SVD RMSE on Testdata: 0.7558
SVD RMSE on Testdata: 0.7555
SVD RMSE on Testdata: 0.7543
SVD RMSE on Testdata: 0.7534
SVD RMSE on Testdata: 0.7533
SVD RMSE on Testdata: 0.7550
SVD RMSE on Testdata: 0.7525
SVD RMSE on Testdata: 0.7579
SVD RMSE on Testdata: 0.7532
SVD RMSE on Testdata: 0.7551
SVD RMSE on Testdata: 0.7549
SVD RMSE on Testdata: 0.7513
SVD RMSE on Testdata: 0.7572


In [28]:
#### Bring the predictions back into the row order of train_data by joining on rowID
# NOTE(review): rowID (user_id + '000' + movieID) is not a unique key, so this
# merge can produce extra rows when two pairs share a rowID — confirm.
sorting_data = pd.DataFrame({'user_id':user_id_arr, 'movieID':movieID_arr, 'predicted_rating':y_pred_svd})
sorting_data['rowID'] = (sorting_data['user_id'].astype(str) + '000' + sorting_data['movieID'].astype(str)).astype(int)
sorting_data.drop(['user_id', 'movieID'], axis=1, inplace=True)
df_sorted = train_data.merge(sorting_data, on = 'rowID')
df_sorted.to_csv('../data/preprocessed/svd_meta_input.csv')

## Get sim rating and score for train_data and test_data 

In [159]:
# Similarity rating and mean score (linear and cubed) for every training row.
y_pred_sim_train_with_score = []
y_pred_sim_train_with_score3 = []
score_arr = []
score_arr3 = []
no_movieID = 0
no_rel_movies = 0
data = len(train_data)
index = 0

for user_id, movieID in zip(train_data['user_id'], train_data['movieID']):
    sim_rating, score, sim_rating3, score3, no_movieID, no_rel_movies = calculate_rating_with_score(user_id, movieID, no_movieID, no_rel_movies)
    y_pred_sim_train_with_score.append(sim_rating)
    y_pred_sim_train_with_score3.append(sim_rating3)
    score_arr.append(score)
    score_arr3.append(score3)
    index += 1
    # Progress output every 5000 rows.
    if index%5000 == 0:
        print('Fortschritt: {:.2f}%'.format((index/data)*100))

# Share of rows for which no similarity rating could be computed.
print('\nProzent der Filme für die keine ähnlichen gefunden werden konnten: {:.2f}%'.format((no_rel_movies/data)*100))
print('Prozent der Filme für die keine movieID vorhanden ist: {:.2f}%'.format((no_movieID/data)*100))

Fortschritt: 0.62%
Fortschritt: 1.24%
Fortschritt: 1.86%
Fortschritt: 2.49%
Fortschritt: 3.11%
Fortschritt: 3.73%
Fortschritt: 4.35%
Fortschritt: 4.97%
Fortschritt: 5.59%
Fortschritt: 6.21%
Fortschritt: 6.83%
Fortschritt: 7.46%
Fortschritt: 8.08%
Fortschritt: 8.70%
Fortschritt: 9.32%
Fortschritt: 9.94%
Fortschritt: 10.56%
Fortschritt: 11.18%
Fortschritt: 11.81%
Fortschritt: 12.43%
Fortschritt: 13.05%
Fortschritt: 13.67%
Fortschritt: 14.29%
Fortschritt: 14.91%
Fortschritt: 15.53%
Fortschritt: 16.16%
Fortschritt: 16.78%
Fortschritt: 17.40%
Fortschritt: 18.02%
Fortschritt: 18.64%
Fortschritt: 19.26%
Fortschritt: 19.88%
Fortschritt: 20.50%
Fortschritt: 21.13%
Fortschritt: 21.75%
Fortschritt: 22.37%
Fortschritt: 22.99%
Fortschritt: 23.61%
Fortschritt: 24.23%
Fortschritt: 24.85%
Fortschritt: 25.48%
Fortschritt: 26.10%
Fortschritt: 26.72%
Fortschritt: 27.34%
Fortschritt: 27.96%
Fortschritt: 28.58%
Fortschritt: 29.20%
Fortschritt: 29.83%
Fortschritt: 30.45%
Fortschritt: 31.07%
Fortschritt: 31.

In [160]:
# Collect the training similarity features in one frame (one row per training row).
sim_meta_input = pd.DataFrame({
    'sim_pred': y_pred_sim_train_with_score,
    'sim_pred3': y_pred_sim_train_with_score3,
    'sim_score': score_arr,
    'sim_score3': score_arr3,
})
#sim_meta_input.to_csv('../data/preprocessed/sim_meta_input.csv')

In [161]:
# Result lists for the test split: two predicted ratings and two scores per row.
y_pred_sim_test_with_score, y_pred_sim_test_with_score3 = [], []
score_arr_test, score_arr3_test = [], []

# Counters handed to calculate_rating_with_score on every call and replaced by
# the values it returns; they feed the two percentages printed after the loop.
no_movieID = no_rel_movies = 0

for user_id, movieID in zip(test_data['user_id'], test_data['movieID']):
    # The helper returns two prediction/score pairs plus the updated counters.
    # What distinguishes the "3" pair is defined by the helper -- TODO confirm.
    (sim_rating, score, sim_rating3, score3,
     no_movieID, no_rel_movies) = calculate_rating_with_score(
        user_id, movieID, no_movieID, no_rel_movies)

    y_pred_sim_test_with_score.append(sim_rating)
    y_pred_sim_test_with_score3.append(sim_rating3)
    score_arr_test.append(score)
    score_arr3_test.append(score3)

# Summary: share of test rows without similar movies / without a movieID.
share_no_similar = (no_rel_movies/len(test_data))*100
print('Prozent der Filme für die keine ähnlichen gefunden werden konnten: {:.2f}%'.format(share_no_similar))
share_no_movie_id = (no_movieID/len(test_data))*100
print('Prozent der Filme für die keine movieID vorhanden ist: {:.2f}%'.format(share_no_movie_id))

Prozent der Filme für die keine ähnlichen gefunden werden konnten: 47.74%
Prozent der Filme für die keine movieID vorhanden ist: 0.00%


In [162]:
# Assemble the per-row test predictions and scores into one DataFrame,
# one column per result list (column order follows the dict order).
sim_meta_input_test = pd.DataFrame({
    'sim_pred': y_pred_sim_test_with_score,
    'sim_pred3': y_pred_sim_test_with_score3,
    'sim_score': score_arr_test,
    'sim_score3': score_arr3_test,
})
# Writing the frame to disk is disabled; uncomment the next line to export it.
#sim_meta_input_test.to_csv('../data/preprocessed/sim_meta_input_test.csv')

## Get sim rating and score for train_data and test_data for all

In [164]:
# Result lists for the training split, filled by calculate_rating_all_with_score:
# two predicted ratings and two scores per row.
y_pred_sim_all_train_with_score, y_pred_sim_all_train_with_score3 = [], []
all_score_arr, all_score_arr3 = [], []

# Counters handed to the helper on every call and replaced by the values it
# returns; they feed the two percentages printed after the loop.
no_movieID = no_rel_movies = 0
data = len(train_data)  # number of rows in train_data, denominator for all percentages
index = 0               # running row count; stays 0 if train_data has no rows

for index, (user_id, movieID) in enumerate(
        zip(train_data['user_id'], train_data['movieID']), start=1):
    # The helper returns two prediction/score pairs plus the updated counters.
    # What distinguishes the "3" pair is defined by the helper -- TODO confirm.
    (sim_rating, score, sim_rating3, score3,
     no_movieID, no_rel_movies) = calculate_rating_all_with_score(
        user_id, movieID, no_movieID, no_rel_movies)

    y_pred_sim_all_train_with_score.append(sim_rating)
    y_pred_sim_all_train_with_score3.append(sim_rating3)
    all_score_arr.append(score)
    all_score_arr3.append(score3)

    # Print a progress line after every 5000 processed rows.
    if not index % 5000:
        print('Fortschritt: {:.2f}%'.format((index/data)*100))

# Summary: share of rows without similar movies / without a movieID.
share_no_similar = (no_rel_movies/data)*100
print('\nProzent der Filme für die keine ähnlichen gefunden werden konnten: {:.2f}%'.format(share_no_similar))
share_no_movie_id = (no_movieID/data)*100
print('Prozent der Filme für die keine movieID vorhanden ist: {:.2f}%'.format(share_no_movie_id))

Fortschritt: 0.62%
Fortschritt: 1.24%
Fortschritt: 1.86%
Fortschritt: 2.49%
Fortschritt: 3.11%
Fortschritt: 3.73%
Fortschritt: 4.35%
Fortschritt: 4.97%
Fortschritt: 5.59%
Fortschritt: 6.21%
Fortschritt: 6.83%
Fortschritt: 7.46%
Fortschritt: 8.08%
Fortschritt: 8.70%
Fortschritt: 9.32%
Fortschritt: 9.94%
Fortschritt: 10.56%
Fortschritt: 11.18%
Fortschritt: 11.81%
Fortschritt: 12.43%
Fortschritt: 13.05%
Fortschritt: 13.67%
Fortschritt: 14.29%
Fortschritt: 14.91%
Fortschritt: 15.53%
Fortschritt: 16.16%
Fortschritt: 16.78%
Fortschritt: 17.40%
Fortschritt: 18.02%
Fortschritt: 18.64%
Fortschritt: 19.26%
Fortschritt: 19.88%
Fortschritt: 20.50%
Fortschritt: 21.13%
Fortschritt: 21.75%
Fortschritt: 22.37%
Fortschritt: 22.99%
Fortschritt: 23.61%
Fortschritt: 24.23%
Fortschritt: 24.85%
Fortschritt: 25.48%
Fortschritt: 26.10%
Fortschritt: 26.72%
Fortschritt: 27.34%
Fortschritt: 27.96%
Fortschritt: 28.58%
Fortschritt: 29.20%
Fortschritt: 29.83%
Fortschritt: 30.45%
Fortschritt: 31.07%
Fortschritt: 31.

In [165]:
# Assemble the per-row training results of calculate_rating_all_with_score into
# one DataFrame, one column per result list (column order follows the dict order).
sim_meta_input_all = pd.DataFrame({
    'sim_pred': y_pred_sim_all_train_with_score,
    'sim_pred3': y_pred_sim_all_train_with_score3,
    'sim_score': all_score_arr,
    'sim_score3': all_score_arr3,
})
# Writing the frame to disk is disabled; uncomment the next line to export it.
#sim_meta_input_all.to_csv('../data/preprocessed/sim_meta_input_all.csv')

In [166]:
# Result lists for the test split, filled by calculate_rating_all_with_score:
# two predicted ratings and two scores per row.
y_pred_sim_test_all_with_score, y_pred_sim_test_all_with_score3 = [], []
all_score_arr_test, all_score_arr3_test = [], []

# Counters handed to the helper on every call and replaced by the values it
# returns; they feed the two percentages printed after the loop.
no_movieID = no_rel_movies = 0

for user_id, movieID in zip(test_data['user_id'], test_data['movieID']):
    # The helper returns two prediction/score pairs plus the updated counters.
    # What distinguishes the "3" pair is defined by the helper -- TODO confirm.
    (sim_rating, score, sim_rating3, score3,
     no_movieID, no_rel_movies) = calculate_rating_all_with_score(
        user_id, movieID, no_movieID, no_rel_movies)

    y_pred_sim_test_all_with_score.append(sim_rating)
    y_pred_sim_test_all_with_score3.append(sim_rating3)
    all_score_arr_test.append(score)
    all_score_arr3_test.append(score3)

# Summary: share of test rows without similar movies / without a movieID.
share_no_similar = (no_rel_movies/len(test_data))*100
print('Prozent der Filme für die keine ähnlichen gefunden werden konnten: {:.2f}%'.format(share_no_similar))
share_no_movie_id = (no_movieID/len(test_data))*100
print('Prozent der Filme für die keine movieID vorhanden ist: {:.2f}%'.format(share_no_movie_id))

Prozent der Filme für die keine ähnlichen gefunden werden konnten: 0.00%
Prozent der Filme für die keine movieID vorhanden ist: 0.00%


In [167]:
# Assemble the per-row test results of calculate_rating_all_with_score into one
# DataFrame, one column per result list (column order follows the dict order).
sim_meta_input_test_all = pd.DataFrame({
    'sim_pred': y_pred_sim_test_all_with_score,
    'sim_pred3': y_pred_sim_test_all_with_score3,
    'sim_score': all_score_arr_test,
    'sim_score3': all_score_arr3_test,
})
# Writing the frame to disk is disabled; uncomment the next line to export it.
#sim_meta_input_test_all.to_csv('../data/preprocessed/sim_meta_input_test_all.csv')