In [None]:
!pip install python-telegram-bot
!pip install surprise
!pip install lightfm
!pip install fuzzywuzzy[speedup]

# data

In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
%cd /content/drive/My Drive/GitHub/minor-goodbooks-recommender

/content/drive/My Drive/GitHub/minor-goodbooks-recommender


In [1]:
import pandas as pd
import numpy as np
import time

from surprise import Reader
from surprise import Dataset as SurpriseDataset
from surprise import KNNWithMeans, SVD, SVDpp
from lightfm import LightFM
from lightfm.data import Dataset as LightfmDataset

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

pd.options.mode.chained_assignment = None



In [2]:
# Ratings table that every recommender below is fitted on.
ratings = pd.read_csv('data/ratings.csv')
# NOTE(review): ratings_random is loaded but not referenced anywhere else in the visible notebook.
ratings_random = pd.read_csv('data/ratings_random.csv')
# Book lookup table; only the id, title and authors columns are kept.
book_map = pd.read_csv('data/books.csv')[['id', 'title', 'authors']]

In [3]:
def itervalues(df):
    """Yield every row of ``df`` as a NumPy array of its values, in row order.

    The rows are taken straight from the frame's underlying 2-D array. This
    yields the same values as ``row.values`` for each row of
    ``DataFrame.iterrows()`` but skips building a throw-away Series per row,
    which matters for large ratings tables.
    """
    yield from df.values

In [4]:
def setup_lightfm(ratings_data):
    """Build a LightFM dataset from ``ratings_data`` and train a BPR model on it.

    The dataset is fitted on every user id found in ``ratings_data`` plus the
    placeholder id 999999, and on item ids 1..10000.

    Returns
    -------
    tuple
        ``(dataset, model)`` - the fitted LightFM ``Dataset`` and the trained
        ``LightFM`` model.
    """
    # The placeholder id is appended after all real user ids.
    all_user_ids = np.append(ratings_data['user_id'].unique(), 999999)
    all_item_ids = range(1,10001)

    dataset = LightfmDataset()
    dataset.fit(all_user_ids, all_item_ids)

    # Only the weights matrix is used for training; the interactions matrix is discarded.
    _, weights = dataset.build_interactions(itervalues(ratings_data))

    model = LightFM(learning_rate=0.05, loss='bpr', random_state=1)
    model.fit(weights, epochs=10)

    return dataset, model

In [5]:
# Train the base LightFM model once on the full ratings table. The resulting
# `dataset` and `model` globals are reused by recommend_list_lightfm / rec_lightfm.
dataset, model = setup_lightfm(ratings)

In [6]:
def recommend_list(user_ratings, ratings_data, algorithm, verbose = False, remove_rated = True):
    """Return the titles of the ten books with the highest estimated rating for a user.

    ``algorithm`` is fitted on ``ratings_data`` combined with ``user_ratings``
    and then asked for an estimate for every candidate book id.

    Parameters
    ----------
    user_ratings : DataFrame
        Ratings of the target user; the first unique ``user_id`` is the user
        predictions are made for.
    ratings_data : DataFrame
        Ratings of all other users.
    algorithm : Surprise algorithm exposing ``fit`` and ``predict``.
    verbose : bool
        Accepted for call compatibility; not used.
    remove_rated : bool
        When true, books present in ``user_ratings`` are dropped from the result.

    Returns
    -------
    list
        Up to ten book titles, best estimate first.
    """
    reader = Reader(rating_scale=(1, 5))
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    combined = pd.concat([ratings_data, user_ratings])
    data_full = SurpriseDataset.load_from_df(combined, reader).build_full_trainset()

    algorithm.fit(data_full)

    # Loop-invariant, so looked up once instead of once per candidate book.
    target_user = user_ratings.user_id.unique()[0]
    # NOTE(review): candidates stop at id 9900 while setup_lightfm registers
    # item ids 1..10000 - confirm the narrower range is intended.
    candidate_ids = range(1, 9901)
    preds = [algorithm.predict(target_user, book_id).est for book_id in candidate_ids]

    recs = pd.DataFrame({'book_id' : candidate_ids, 'estimated_rating' : preds})

    if remove_rated:
        recs = recs.loc[~recs['book_id'].isin(user_ratings['book_id'])]
    recs = recs.sort_values('estimated_rating', ascending = False).head(10)

    # assumes book_map keeps its default index with book id i stored in row i-1 - TODO confirm
    return [book_map.loc[i-1, 'title'] for i in recs['book_id']]

In [7]:
def recommend_list_lightfm(user_ratings, ratings_data, algorithm, verbose = False, remove_rated = True):
    """Return ten book titles recommended by a LightFM model for the given user ratings.

    Deep copies of the global ``dataset`` and of ``algorithm`` are used, so
    the shared pre-trained objects are not modified. The copy of the model is
    partially re-fitted on ``ratings_data`` combined with ``user_ratings``.

    Parameters
    ----------
    user_ratings : DataFrame
        Ratings of the target user.
    ratings_data : DataFrame
        Ratings of all other users.
    algorithm : LightFM model to copy and update.
    verbose : bool
        Accepted for call compatibility; not used.
    remove_rated : bool
        When true, titles the user already rated are filtered out.

    Returns
    -------
    list or numpy.ndarray
        Ten titles, best score first: a list when ``remove_rated`` is true,
        otherwise a slice of the title array.
    """
    import copy

    dataset_local = copy.deepcopy(dataset)
    algorithm_local = copy.deepcopy(algorithm)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    combined = pd.concat([ratings_data, user_ratings])
    _, new_weights = dataset_local.build_interactions(itervalues(combined))

    algorithm_local.fit_partial(new_weights)
    # Item biases are reset to zero before scoring.
    # NOTE(review): presumably so popular items do not dominate the ranking - confirm.
    algorithm_local.item_biases = np.zeros_like(algorithm_local.item_biases)

    item_labels = book_map['title'].values
    n_users, n_items = new_weights.shape
    # The last matrix row is taken as the target user.
    # NOTE(review): presumably the 999999 placeholder that setup_lightfm registers last - confirm.
    user_id = n_users-1

    scores = algorithm_local.predict(user_id, np.arange(n_items))
    top_items = item_labels[np.argsort(-scores)]

    if not remove_rated:
        return top_items[:10]

    # A set makes the per-title membership test O(1) instead of a scan of the array.
    known_positives = set(item_labels[new_weights.tocsr()[user_id].indices])
    return [title for title in top_items if title not in known_positives][:10]

In [8]:
def fetch_user_ratings_id(user_id):
    """Return the ratings of dataset user ``user_id`` relabelled as user 999999.

    A new frame is returned; the global ``ratings`` table is left untouched.
    The result is empty when ``user_id`` has no rows in ``ratings``.
    """
    # .assign builds a new frame instead of writing into a slice of `ratings`.
    return ratings.loc[ratings['user_id'] == user_id, :].assign(user_id=999999)

In [9]:
def fetch_user_ratings_goodreads(goodreads_id):
    """Scrape a Goodreads member's book list and return their ratings for books in ``book_map``.

    The printable review list is downloaded page by page, book titles and
    star ratings are extracted from the HTML and matched to ``book_map`` by
    exact title.

    Parameters
    ----------
    goodreads_id : Goodreads member id, interpolated into the review-list URL.

    Returns
    -------
    DataFrame
        Columns ``user_id`` (always 999999), ``book_id`` and ``rating``;
        books with a rating of 0 are excluded.

    Raises
    ------
    ValueError
        If the number of books cannot be read from the first page, or if no
        rated book matches a title in ``book_map``.
    """
    import re

    import requests
    from bs4 import BeautifulSoup

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 \
    (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

    def fetch_page(url):
        # A timeout keeps a stalled connection from blocking the caller forever.
        return requests.get(url=url, headers=headers, timeout=30).text

    html_output = fetch_page(f"https://www.goodreads.com/review/list/{goodreads_id}?print=true")

    # A page without the book count is reported as ValueError, the error type
    # this function already uses for "no usable data", rather than IndexError.
    count_match = re.search(r'books on Goodreads \(([\d,]+) books\)', html_output)
    if count_match is None:
        raise ValueError('Could not read the number of books for this Goodreads id')
    num_books = int(count_match.group(1).replace(',', ''))

    # The paging arithmetic assumes 20 books per page; page 1 is already downloaded.
    num_pages = -(-num_books // 20)
    for p in range(2, num_pages + 1):
        html_output += fetch_page(f"https://www.goodreads.com/review/list/{goodreads_id}?page={p}&print=true")

    soup = BeautifulSoup(html_output, 'html.parser')

    # Link titles that are skipped when collecting book titles (navigation and star-label texts).
    bad_stuff_titles = ['Goodreads Home', None, 'My group discussions', 'Messages', 'Friends',
                      'did not like it', 'it was ok', 'liked it', 'really liked it', 'it was amazing',]
    books = [i.get('title') for i in soup.find_all('a')
             if i.get('title') not in bad_stuff_titles]

    stars = [i.get('class')[1] for i in soup.find_all('span')
             if i.get('class') in [['staticStar', 'p0'], ['staticStar', 'p10']] ]

    def groupwise(iterable):
        # Consume the star spans five at a time.
        a = iter(iterable)
        return zip(a, a, a, a, a)

    # A book's rating is the number of 'p10' spans in its group of five.
    book_ratings = [group.count('p10') for group in groupwise(stars)]

    df = pd.merge(pd.DataFrame({'title' : books, 'rating' : book_ratings}), book_map, on='title')
    df = df.drop('title', axis=1).rename({'id' : 'book_id'}, axis=1)
    df['user_id'] = [999999] * len(df)
    df = df.reindex(columns=['user_id', 'book_id', 'rating'])
    df = df.loc[df['rating'] != 0]

    if df.empty:
        raise ValueError('No matching books rated')
    return df

In [10]:
def fancy_title(title):
    """Format a book for a Telegram HTML message: bold title, then authors on the next line.

    The authors are looked up in ``book_map`` by exact title (first match).
    Title and authors are HTML-escaped so that ``&``, ``<`` and ``>`` in the
    data cannot break the message markup.

    Raises
    ------
    IndexError
        If ``title`` is not present in ``book_map``.
    """
    import html

    authors = str(book_map.loc[book_map['title']==title, 'authors'].values[0])
    return '<b>' + html.escape(title, quote=False) + '</b>' + '\n' + html.escape(authors, quote=False)

In [11]:
def fancy_list(reclist):
    """Render every title in ``reclist`` with ``fancy_title``, each followed by a blank line."""
    return ''.join(fancy_title(title) + '\n\n' for title in reclist)

# bot

In [12]:
#import logging
#from typing import Tuple, Dict, Any

from telegram import Update, ReplyKeyboardMarkup
from telegram.ext import (
    Updater,
    CommandHandler,
    MessageHandler,
    Filters,
    ConversationHandler,
    CallbackQueryHandler,
    CallbackContext,
)
from getpass import getpass

In [13]:
# Conversation state identifiers: nine distinct one-character strings, chr(0) .. chr(8).
(
    CHOOSING_SCENARIO,
    SELECTING_ENGINE,
    CHOOSING_USER_ID,
    CHOOSING_GR_ID,
    CHOOSING_CUSTOM,
    TYPING_USER,
    TYPING_GOODREADS,
    TYPING_BOOK,
    SELECTING_RATING,
) = map(chr, range(9))

In [146]:
#import logging
#logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
#                     level=logging.INFO)

In [25]:
def start(update, context):
    """Reset the per-user conversation data and ask which scenario to run.

    Returns the ``CHOOSING_SCENARIO`` state.
    """
    context.user_data.update(user_ratings=None, rated_dict={}, selected_book=None)

    scenario_row = ['Dataset Id', 'GoodReads Id', 'Custom Setup']
    update.message.reply_text(
        text="Choose scenario",
        reply_markup=ReplyKeyboardMarkup([scenario_row], one_time_keyboard=True),
    )

    return CHOOSING_SCENARIO

In [26]:
def recommend(update, context):
    """Ask which recommender engine to use and return the ``SELECTING_ENGINE`` state."""
    engine_row = ['KNN', 'SVD', 'LightFM']
    update.message.reply_text(
        text="Choose recommender engine",
        reply_markup=ReplyKeyboardMarkup([engine_row], one_time_keyboard=True),
    )

    return SELECTING_ENGINE

In [27]:
def ask_user_id(update, context):
    """Prompt for a dataset user id and return the ``TYPING_USER`` state."""
    upper_bound = ratings.user_id.nunique()
    update.message.reply_text(text=f'Type a number between 0 and {upper_bound}')
    return TYPING_USER

def ask_user_id_again(update, context):
    """Report invalid input, repeat the user-id prompt and return ``TYPING_USER``."""
    upper_bound = ratings.user_id.nunique()
    update.message.reply_text(text=f'Wrong input. Type a number between 0 and {upper_bound}')
    return TYPING_USER

def save_user_id(update, context):
    """Validate the typed dataset user id and store that user's ratings.

    The input must be an integer in ``0..ratings.user_id.nunique()`` and the
    id must actually have ratings. Otherwise the user is asked again via
    ``ask_user_id_again``. On success the ratings are saved under
    ``context.user_data['user_ratings']`` and the engine selection step
    (``recommend``) follows.
    """
    try:
        user_id = int(update.message.text)
        if user_id not in range(ratings.user_id.nunique()+1):
            raise ValueError
        user_ratings = fetch_user_ratings_id(user_id)
        # An id inside the accepted range may still have no rows; an empty
        # frame would only fail later inside the recommender.
        if user_ratings.empty:
            raise ValueError
    except ValueError:
        return ask_user_id_again(update, context)

    context.user_data['user_ratings'] = user_ratings
    return recommend(update, context)

In [28]:
def ask_goodreads_id(update, context):
    """Prompt for a GoodReads id and return the ``TYPING_GOODREADS`` state."""
    update.message.reply_text(text='Type your GoodReads Id')
    return TYPING_GOODREADS

def ask_goodreads_id_again(update, context):
    """Tell the user the GoodReads lookup failed and return ``TYPING_GOODREADS``."""
    update.message.reply_text(
        text='User does not exist or has no books from dataset rated. Please try again or choose different scenario by calling /start'
    )
    return TYPING_GOODREADS

def save_goodreads_id(update, context):
    """Fetch the ratings for the typed GoodReads id and move on to engine selection.

    On success the scraped ratings are stored under
    ``context.user_data['user_ratings']`` and ``recommend`` is invoked.
    If the input is not an integer or the scrape yields nothing usable, the
    user is asked again via ``ask_goodreads_id_again``.
    """
    try:
        user_id = int(update.message.text)
        user_ratings = fetch_user_ratings_goodreads(user_id)
    # The scraper signals "no usable data" with ValueError. IndexError is
    # caught too, so a page missing an expected element is treated as a bad
    # id instead of crashing the handler without any reply.
    except (ValueError, IndexError):
        return ask_goodreads_id_again(update, context)

    context.user_data['user_ratings'] = user_ratings
    return recommend(update, context)

In [29]:
def ask_book_rating(update, context):
    """Ask for the next book title (or Finish) and return the ``TYPING_BOOK`` state."""
    finish_keyboard = ReplyKeyboardMarkup([['Finish']], one_time_keyboard=True)
    update.message.reply_text(
        text='Type the book title you want to rate or finish rating',
        reply_markup=finish_keyboard,
    )
    return TYPING_BOOK

def save_selected_book(update, context):
    """Fuzzy-match the typed text against the known titles and remember the best match.

    The best match (``fuzz.ratio`` scorer) is stored under
    ``context.user_data['selected_book']`` and shown via
    ``show_selected_book``. If matching fails, the user is notified and the
    conversation stays in ``TYPING_BOOK`` so another title can be typed.
    """
    book_name = update.message.text
    try:
        selected_book = process.extract(book_name, book_map['title'].values, scorer=fuzz.ratio)[0][0]
    except Exception:
        # Return early: there is no match to store, and falling through would
        # hit an unassigned `selected_book`.
        update.message.reply_text(text='BAD')
        return TYPING_BOOK

    context.user_data['selected_book'] = selected_book
    return show_selected_book(update, context)

def show_selected_book(update, context):
    """Show the matched book and ask for a 1-5 rating; returns ``SELECTING_RATING``."""
    rating_keyboard = ReplyKeyboardMarkup(
        [
            ['1','2','3','4','5'],
            ['Cancel', 'Finish'],
        ],
        one_time_keyboard=True,
    )
    book_text = fancy_title(context.user_data['selected_book'])
    update.message.reply_text(text=book_text, parse_mode = 'HTML')
    update.message.reply_text(text='Now rate it', reply_markup=rating_keyboard)

    return SELECTING_RATING

def save_book_rating(update, context):
    """Record the typed rating for the currently selected book, then ask for the next book."""
    session = context.user_data
    session['rated_dict'][session['selected_book']] = int(update.message.text)

    return ask_book_rating(update, context)

def rating_finished(update, context):
    """Turn the books rated so far into a ratings frame and move on to engine selection.

    The frame, with columns ``user_id`` (always 999999), ``book_id`` and
    ``rating``, is stored under ``context.user_data['user_ratings']``.
    If nothing has been rated yet, the user is told so and asked for a
    book again instead of failing.
    """
    rated = context.user_data['rated_dict']
    if not rated:
        # Unpacking zip(*{}.items()) would fail when Finish is pressed with nothing rated.
        update.message.reply_text(text='You have not rated any books yet')
        return ask_book_rating(update, context)

    books = list(rated.keys())
    book_ratings = list(rated.values())
    book_ids = [book_map.loc[book_map['title']==i, 'id'].values[0] for i in books]
    context.user_data['user_ratings'] = pd.DataFrame(
        {'user_id' : [999999]*len(book_ids), 'book_id' : book_ids, 'rating' : book_ratings})

    return recommend(update, context)

In [30]:
def rec_knn(update, context):
    """Reply with recommendations from a freshly built ``KNNWithMeans`` model.

    A placeholder message is sent first and later overwritten with the
    formatted recommendation list.
    """
    placeholder = update.message.reply_text(text='This takes time')
    engine = KNNWithMeans(k=9, verbose=False)
    user_ratings = context.user_data['user_ratings']
    titles = recommend_list(user_ratings, ratings, engine, verbose = False)

    placeholder.edit_text(text=fancy_list(titles), parse_mode = 'HTML')

In [31]:
def rec_svd(update, context):
    """Reply with recommendations from a freshly built ``SVD`` model.

    A placeholder message is sent first and later overwritten with the
    formatted recommendation list.
    """
    placeholder = update.message.reply_text(text='This takes time')
    engine = SVD(n_factors=20, verbose=False)
    user_ratings = context.user_data['user_ratings']
    titles = recommend_list(user_ratings, ratings, engine, verbose = False)

    placeholder.edit_text(text=fancy_list(titles), parse_mode = 'HTML')

In [32]:
def rec_lightfm(update, context):
    """Reply with recommendations from the pre-trained global LightFM ``model``.

    A placeholder message is sent first and later overwritten with the
    formatted recommendation list.
    """
    placeholder = update.message.reply_text(text='This takes time')
    user_ratings = context.user_data['user_ratings']
    titles = recommend_list_lightfm(user_ratings, ratings, model, verbose = False)

    placeholder.edit_text(text=fancy_list(titles), parse_mode = 'HTML')

In [33]:
# Conversation flow of the bot: /start opens the dialogue, and each state maps
# to the handlers that may consume the user's next message in that state.
convhandler = ConversationHandler(
    entry_points = [CommandHandler('start', start)],
    states = {
        # Scenario chosen from the reply keyboard shown by start().
        CHOOSING_SCENARIO: [MessageHandler(Filters.regex('^Dataset Id$'), ask_user_id),
                            MessageHandler(Filters.regex('^GoodReads Id$'), ask_goodreads_id),
                            MessageHandler(Filters.regex('^Custom Setup$'), ask_book_rating)],
        # Free-text input of a dataset user id / GoodReads id (commands excluded).
        TYPING_USER: [MessageHandler(Filters.text & ~Filters.command, save_user_id)],
        TYPING_GOODREADS: [MessageHandler(Filters.text & ~Filters.command, save_goodreads_id)],
        # Any text except the literal 'Finish' is treated as a book title to match.
        TYPING_BOOK: [MessageHandler(Filters.text & ~Filters.command & ~Filters.regex('^Finish$'), save_selected_book),
                      MessageHandler(Filters.regex('^Finish$'), rating_finished)],
        # A single digit 1-5 rates the selected book; 'Cancel' goes back to title input.
        SELECTING_RATING: [MessageHandler(Filters.regex('^[1-5]$'), save_book_rating),
                           MessageHandler(Filters.regex('^Finish$'), rating_finished),
                           MessageHandler(Filters.regex('^Cancel$'), ask_book_rating)],
        # Engine chosen from the reply keyboard shown by recommend().
        SELECTING_ENGINE: [MessageHandler(Filters.regex('^KNN$'), rec_knn),
                           MessageHandler(Filters.regex('^SVD$'), rec_svd),
                           MessageHandler(Filters.regex('^LightFM$'), rec_lightfm)]
    },
    # /start is also registered as a fallback; start() resets the stored user data.
    fallbacks = [CommandHandler('start', start)]
)

In [31]:
#def main():
#    updater = Updater(token=getpass(), use_context=True)
#    dispatcher = updater.dispatcher
#    dispatcher.add_handler(convhandler)
#    updater.start_polling()

#if __name__ == '__main__':
#    main()

In [34]:
# The bot token is read interactively with getpass, so it is not stored in the notebook.
updater = Updater(token=getpass(), use_context=True)
dispatcher = updater.dispatcher
# Register the conversation defined above, then start polling for updates.
dispatcher.add_handler(convhandler)
updater.start_polling()

········


<queue.Queue at 0x1bb341cd490>

# stop

In [35]:
# Stop the updater that was started with start_polling() above.
updater.stop()