In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate, GridSearchCV
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from scipy.sparse import csr_matrix

In [None]:
movies = pd.read_csv('clean_movies.csv')
movies.head()

In [None]:
links_small = pd.read_csv('links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')
links_small.head()

In [None]:

movies['id'] = movies['id'].astype('int')
small_movies = movies[movies['id'].isin(links_small)]
small_movies.shape

In [None]:
overviews = small_movies['overview'].fillna('')
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matirx = tfidf.fit_transform(overviews)
cosine_sim = cosine_similarity(tfidf_matirx, tfidf_matirx)
tfidf_matirx.shape

In [None]:
small_movies = small_movies.reset_index()
titles = small_movies['original_title']
indices = pd.Series(small_movies.index, index=small_movies['original_title'])

In [None]:
def get_movie(title):
    index = indices[title]
    scores = list(enumerate(cosine_sim[index]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    scores = scores[1:51]
    movie_indices = [i[0] for i in scores]
    return titles.iloc[movie_indices]

In [None]:
get_movie('The Godfather').head(10)

In [None]:

reader = Reader(rating_scale=(1,5))
ratings = pd.read_csv('ratings_small.csv')
ratings.head()

In [None]:
ratings2 = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
params = {'n_factors': [25, 50, 100],
         'reg_all': [.02, .05, .1]}
g_s_svd = GridSearchCV(SVD, param_grid=params, n_jobs=-1)
g_s_svd.fit(ratings2)
print("Best Parameters:", g_s_svd.best_params)

In [None]:
svd = SVD(n_factors=25, reg_all=0.05)
cross_validate(svd, ratings2, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
training = ratings2.build_full_trainset()
svd.fit(training)

In [None]:
svd.predict(1,31)

In [None]:
svd.predict(1,1061)

In [None]:
svd.predict(1,1029)

In [None]:
svd.predict(1,1129)

In [None]:
links = pd.read_csv('links_small.csv')[['movieId', 'tmdbId']]
small_movies.rename(columns={'original_title': 'title'}, inplace=True)
links_small_df = links_small.reset_index()
links_small_df.columns = ['movieId', 'id']
combo_set = links_small_df.merge(small_movies[['title', 'id']], on = 'id').set_index('title')
wombo_set = combo_set.set_index('id')
combo_set.head()

In [None]:
def wombo_combo(userId, title):
    index = indices[title]
    tmdbId = combo_set.loc[title]['id']
    movieId = combo_set.loc[title]['movieId']
    scores = list(enumerate(cosine_sim[index]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    scores = scores[1:26]
    movie_indices = [i[0] for i in scores]
    movies = small_movies.iloc[movie_indices][['title', 'id']]
    movies['prediction'] = movies['id'].apply(lambda x: svd.predict(userId, wombo_set.loc[x]['movieId']).est)
    movies = movies.sort_values('prediction', ascending=False)
    return movies.head(10)

In [17]:
wombo_combo(112, 'The Terminator')

Unnamed: 0,title,id,prediction
4420,Just Married,12090,4.429952
1716,Surf Nazis Must Die,28070,4.416248
930,Delicatessen,892,4.097667
6174,Must Love Dogs,11648,4.053477
2832,The Hunger,11654,3.935056
1526,Labyrinth,13597,3.915452
8744,Terminator Genisys,87101,3.898967
7407,The Book of Eli,20504,3.879114
7208,Terminator Salvation,534,3.879114
8297,Stories We Tell,128216,3.879114


In [None]:
wide_ratings = pd.read_csv('ratings_wide.csv')
first_2_rows = wide_ratings.head(2)
first_2_rows = first_2_rows.fillna(0)
wide_ratings.head()

In [None]:
wide_ratings = pd.read_csv('ratings_wide.csv')
first_2_rows = wide_ratings.head(2)
first_2_rows = first_2_rows.fillna(0)
wide_ratings.head()

In [None]:
itemsets = apriori(first_2_rows, min_support=0.05, use_colnames=True)