In [130]:
!pip install python-telegram-bot



In [131]:
!pip install surprise



In [132]:
!pip install lightfm



In [133]:
!pip install fuzzywuzzy[speedup]



# data

In [1]:
# Colab-only: mount Google Drive so the project folder used below is reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd /content/drive/My Drive/GitHub/minor-goodbooks-recommender

/content/drive/My Drive/GitHub/minor-goodbooks-recommender


In [3]:
import pandas as pd
import numpy as np
import time

from surprise import Reader
from surprise import Dataset as SurpriseDataset
from surprise import KNNWithMeans, SVD, SVDpp
from lightfm import LightFM
from lightfm.data import Dataset as LightfmDataset

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [4]:
# `ratings` is the table individual dataset users are looked up in;
# `ratings_random` is the table the recommenders are trained on.
ratings = pd.read_csv('data/ratings.csv')
ratings_random = pd.read_csv('data/ratings_random.csv')
# Book id <-> title lookup; only these two columns of books.csv are kept.
book_map = pd.read_csv('data/books.csv')[['id', 'title']]

In [5]:
def itervalues(df):
  """Lazily yield each row of ``df`` as an array of its values, in row order."""
  yield from (row.values for _index, row in df.iterrows())

In [6]:
def setup_lightfm(ratings_data):
  """Create the LightFM id mappings and fit a BPR model on ``ratings_data``.

  The user mapping covers every distinct ``user_id`` in ``ratings_data`` plus
  the extra id 999999 (appended last); the item mapping covers ids 1..10000.

  Returns:
    ``(dataset, model)``: the fitted ``LightfmDataset`` and the trained ``LightFM`` model.
  """
  known_users = np.append(ratings_data['user_id'].unique(), 999999)
  item_ids = range(1,10001)

  lfm_dataset = LightfmDataset()
  lfm_dataset.fit(known_users, item_ids)

  # build_interactions returns (interactions, weights); only the weights matrix is trained on.
  rating_weights = lfm_dataset.build_interactions(itervalues(ratings_data))[1]

  lfm_model = LightFM(learning_rate=0.05, loss='bpr')
  lfm_model.fit(rating_weights, epochs=10)

  return lfm_dataset, lfm_model

In [7]:
# Module-level LightFM mapping and model; recommend_list_lightfm copies `dataset`
# and rec_lightfm passes `model` in as the algorithm.
dataset, model = setup_lightfm(ratings_random)

In [8]:
def recommend_list(user_ratings, ratings_data, algorithm, verbose = False, remove_rated = True):
  """Fit a Surprise algorithm on all ratings and return the user's top-10 titles.

  Args:
    user_ratings: DataFrame with ``user_id`` and ``book_id`` columns (plus the
      rating) for the user to recommend for; the first distinct ``user_id`` is
      the one predictions are made for.
    ratings_data: DataFrame of training ratings with the same columns.
    algorithm: Surprise algorithm instance; it is (re)fitted in place.
    verbose: Unused; kept so existing callers keep working.
    remove_rated: When true, books present in ``user_ratings`` are excluded.

  Returns:
    List of up to 10 book titles, highest estimated rating first.

  Raises:
    IndexError: if ``user_ratings`` has no rows.
  """
  reader = Reader(rating_scale=(1, 5))
  # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
  combined = pd.concat([ratings_data, user_ratings])
  data_full = SurpriseDataset.load_from_df(combined, reader).build_full_trainset()

  algorithm.fit(data_full)

  # Loop-invariant: look the target user up once instead of per prediction.
  target_user = user_ratings.user_id.unique()[0]
  book_ids = range(1, 9901)
  preds = [algorithm.predict(target_user, book_id).est for book_id in book_ids]

  recs = pd.DataFrame({'book_id' : book_ids, 'estimated_rating' : preds})
  if remove_rated:
    recs = recs.loc[~recs['book_id'].isin(user_ratings['book_id'])]
  recs = recs.sort_values('estimated_rating', ascending = False).head(10)

  # NOTE(review): assumes the book with id k sits at index label k-1 of book_map - confirm.
  return [book_map.loc[i-1, 'title'] for i in recs['book_id']]

In [9]:
def recommend_list_lightfm(user_ratings, ratings_data, algorithm, verbose = False, remove_rated = True):
  """Update a copy of a LightFM model with the user's ratings and return top-10 titles.

  Args:
    user_ratings: DataFrame of the user's ratings, in the same column layout
      as ``ratings_data``.
    ratings_data: DataFrame of training ratings.
    algorithm: Fitted LightFM model; it is deep-copied, never modified.
    verbose: Unused; kept so existing callers keep working.
    remove_rated: When true, titles the user already rated are excluded.

  Returns:
    The 10 best-scoring titles: a list when ``remove_rated`` is true,
    otherwise a slice of the title array.
  """
  import copy

  # Work on copies so the module-level dataset and the passed-in model stay untouched.
  dataset_local = copy.deepcopy(dataset)
  algorithm_local = copy.deepcopy(algorithm)

  # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
  combined = pd.concat([ratings_data, user_ratings])
  # Only the weights matrix of the (interactions, weights) pair is used.
  _, new_weights = dataset_local.build_interactions(itervalues(combined))

  algorithm_local.fit_partial(new_weights)
  # Item biases are zeroed before scoring.
  algorithm_local.item_biases = np.zeros_like(algorithm_local.item_biases)

  item_labels = book_map['title'].values
  n_users, n_items = new_weights.shape
  # NOTE(review): takes the last row of the matrix as the target user; this
  # presumably relies on setup_lightfm registering id 999999 last - confirm.
  user_id = n_users-1

  # A set makes the "already rated" check O(1) per title instead of scanning an array.
  known_positives = set(item_labels[new_weights.tocsr()[user_id].indices])
  scores = algorithm_local.predict(user_id, np.arange(n_items))
  top_items = item_labels[np.argsort(-scores)]

  if remove_rated:
    return [title for title in top_items if title not in known_positives][:10]
  return top_items[:10]

In [10]:
def fetch_user_ratings_id(user_id):
  """Return one dataset user's ratings, relabelled with the placeholder user id 999999.

  The rows are copied before the relabel, so the module-level ``ratings``
  frame is not written to and pandas' global chained-assignment option no
  longer has to be switched off.

  Args:
    user_id: Value matched against the ``user_id`` column of ``ratings``.

  Returns:
    DataFrame of the matching rows (possibly empty) with ``user_id`` set to 999999.
  """
  user_ratings = ratings.loc[ratings['user_id']==user_id, :].copy()
  user_ratings['user_id'] = 999999
  return user_ratings

In [11]:
def fetch_user_ratings_goodreads(goodreads_id):
  """Scrape a Goodreads review list and return its ratings for books in ``book_map``.

  Downloads pages 1-10 of the user's printable review list, extracts link
  titles and star classes from the HTML, and joins the result with
  ``book_map`` on the title.

  Args:
    goodreads_id: Identifier interpolated into the review-list URL.

  Returns:
    DataFrame with columns ``user_id`` (always 999999), ``book_id`` and
    ``rating``; rows with rating 0 are dropped.

  Raises:
    ValueError: if no rated book matches a title in ``book_map``.
  """
  import requests

  url = f"https://www.goodreads.com/review/list/{goodreads_id}?print=true"
  headers = {
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 \
      (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }

  page = requests.get(url=url, headers = headers).text

  # Pages 2-10 are always requested and appended to the same HTML string.
  for i in range(2, 11):
    url = f"https://www.goodreads.com/review/list/{goodreads_id}?page={i}&print=true"
    page += requests.get(url=url, headers = headers).text

  from bs4 import BeautifulSoup
  soup = BeautifulSoup(page, 'html.parser')

  # `title` attributes of <a> tags that are not book titles and must be filtered out.
  bad_stuff_titles = ['Goodreads Home', None, 'My group discussions', 'Messages', 'Friends', 
                      'did not like it', 'it was ok', 'liked it', 'really liked it', 'it was amazing',]
  titles = [i.get('title') for i in soup.find_all('a')]
  books = [i for i in titles if i not in bad_stuff_titles]
  
  # `class` lists of <span> tags that are not star markers (None = span without a class).
  bad_stuff_ratings = [None,
                      ['headerPersonalNav__icon', 'headerPersonalNav__icon--notifications'],
                      ['headerPersonalNav__icon', 'headerPersonalNav__icon--discussions'],
                      ['headerPersonalNav__icon', 'headerPersonalNav__icon--inbox'],
                      ['headerPersonalNav__icon', 'headerPersonalNav__icon--friendRequests'],
                      ['controlGroup', 'uitext'],
                      ['bookMeta'],
                      ['greyText'],
                      ['greyText', 'smallText'],
                      ['greyText'],
                      ['', 'staticStars', 'notranslate'],
                      ['darkGreyText'], 
                      ['date_started_value'], 
                      ['date_read_value'],
                      ['previous_page', 'disabled'],
                      ['next_page', 'disabled'],]
  rates = [i.get('class') for i in soup.find_all('span')]
  # Keep the second class token of every remaining span; it is compared with 'p10' below.
  stars = [i[1] for i in rates if i not in bad_stuff_ratings]

  def groupwise(iterable):
    # Zipping one iterator with itself five times yields consecutive, non-overlapping groups of five.
    a = iter(iterable)
    return zip(a, a, a, a, a)

  # Rating = number of 'p10' tokens in each group of five (presumably the filled-star class - confirm).
  # This local name shadows the module-level `ratings` inside this function only.
  ratings = [(s1, s2, s3, s4, s5).count('p10') for (s1, s2, s3, s4, s5) in groupwise(stars)]

  # pd.merge's default inner join keeps only titles that also appear in book_map.
  df = pd.merge(pd.DataFrame({'title' : books, 'rating' : ratings}), book_map, on='title')
  df = df.drop('title', axis=1).rename({'id' : 'book_id'}, axis=1)
  df['user_id'] = [999999] * len(df)
  df = df.reindex(columns=['user_id', 'book_id', 'rating'])
  # Drop rows whose group of five contained no 'p10' token.
  df = df.loc[df['rating'] != 0]

  if df.empty:
    raise ValueError('No matching books rated')
  else: return df

# bot

In [64]:
#import logging
#from typing import Tuple, Dict, Any

from telegram import Update, ReplyKeyboardMarkup
from telegram.ext import (
    Updater,
    CommandHandler,
    MessageHandler,
    Filters,
    ConversationHandler,
    CallbackQueryHandler,
    CallbackContext,
)

In [65]:
# Conversation state keys: each name is bound to a distinct one-character
# string, chr(0) through chr(10).
# Scenario selection; only CHOOSING_SCENARIO is referenced by the handlers in this notebook.
CHOOSING_SCENARIO, CHOOSING_USER_ID, CHOOSING_GR_ID, CHOOSING_CUSTOM = map(chr, range(4))
# Not referenced by any handler visible in this notebook.
SAVING_USER_ID, SELECTING_GENDER = map(chr, range(4, 6))
# Choosing the recommender engine, and typing a dataset / GoodReads id.
SELECTING_ENGINE, TYPING_USER, TYPING_GOODREADS = map(chr, range(6, 9))
# Custom setup: typing a book title, then picking its rating.
TYPING_BOOK, SELECTING_RATING = map(chr, range(9, 11))
# Shortcut for ConversationHandler.END
END = ConversationHandler.END

In [66]:
import os
from telegram.ext import Updater
from getpass import getpass

# The bot token is a credential and must never be stored in the notebook.
# It is read from the TELEGRAM_BOT_TOKEN environment variable, falling back to
# an interactive prompt that does not echo the value.
# NOTE(review): the token that used to be hard-coded here has been published
# with the notebook and should be revoked and reissued.
updater = Updater(
    token=os.environ.get('TELEGRAM_BOT_TOKEN') or getpass('Telegram bot token: '),
    use_context=True,
)

In [67]:
#import logging
#logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
#                     level=logging.INFO)

In [68]:
def start(update, context):
    """Handle /start: clear stored ratings and offer the three input scenarios.

    Returns:
        CHOOSING_SCENARIO, the state in which the scenario buttons are handled.
    """
    context.user_data['user_ratings'] = None

    scenario_keyboard = ReplyKeyboardMarkup(
        [['Dataset Id', 'GoodReads Id', 'Custom Setup']],
        one_time_keyboard=True,
    )
    update.message.reply_text(text="Choose scenario", reply_markup=scenario_keyboard)

    # Start the custom-setup scenario from an empty title -> rating mapping.
    context.user_data['rated_dict'] = {}

    return CHOOSING_SCENARIO

In [69]:
def recommend(update, context):
  """Offer the three recommender engines and move to SELECTING_ENGINE."""
  engine_keyboard = ReplyKeyboardMarkup([['KNN', 'SVD', 'LightFM']], one_time_keyboard=True)
  update.message.reply_text(text="Choose recommender engine", reply_markup=engine_keyboard)
  return SELECTING_ENGINE

In [70]:
def ask_user_id(update, context):
  """Prompt for a dataset user id and move to TYPING_USER."""
  upper_bound = ratings.user_id.nunique()
  update.message.reply_text(text=f'Type a number between 0 and {upper_bound}')
  return TYPING_USER

def ask_user_id_again(update, context):
  """Report invalid input, repeat the user-id prompt and stay in TYPING_USER."""
  upper_bound = ratings.user_id.nunique()
  update.message.reply_text(text=f'Wrong input. Type a number between 0 and {upper_bound}')
  return TYPING_USER

def save_user_id(update, context):
  """Validate the typed dataset user id, store that user's ratings and go on to engine choice.

  The text must parse as an integer within 0..number of distinct users, and
  the user must actually have rows in ``ratings``; otherwise the prompt is
  repeated.

  Returns:
    The state returned by ``recommend`` on success, or by
    ``ask_user_id_again`` on invalid input.
  """
  try:
    user_id = int(update.message.text)
    if user_id not in range(ratings.user_id.nunique()+1):
      raise ValueError
    user_ratings = fetch_user_ratings_id(user_id)
    if user_ratings.empty:
      # An id inside the accepted range is not guaranteed to have any rows;
      # an empty frame would make recommend_list fail when it reads the first user_id.
      raise ValueError
  except ValueError:
    return ask_user_id_again(update, context)

  context.user_data['user_ratings'] = user_ratings
  return recommend(update, context)

In [71]:
def ask_goodreads_id(update, context):
  """Prompt for a GoodReads id and move to TYPING_GOODREADS."""
  update.message.reply_text(text='Type your GoodReads Id')
  return TYPING_GOODREADS

def ask_goodreads_id_again(update, context):
  """Explain that the GoodReads lookup failed and stay in TYPING_GOODREADS."""
  update.message.reply_text(
      text='User does not exist or has no books from dataset rated. Please try again or choose different scenario by calling /start')
  return TYPING_GOODREADS

def save_goodreads_id(update, context):
  """Fetch ratings for the typed GoodReads id and go on to engine choice.

  A ValueError (non-numeric input, or one raised while fetching or in the
  follow-up prompt) leads to the retry message instead.
  """
  try:
    goodreads_id = int(update.message.text)
    context.user_data['user_ratings'] = fetch_user_ratings_goodreads(goodreads_id)
    next_state = recommend(update, context)
  except ValueError:
    next_state = ask_goodreads_id_again(update, context)
  return next_state

In [72]:
def ask_book_rating(update, context):
  """Ask for the next book title (or Finish) and move to TYPING_BOOK."""
  finish_keyboard = ReplyKeyboardMarkup([['Finish']], one_time_keyboard=True)
  update.message.reply_text(
      text='Type the book title you want to rate or finish rating',
      reply_markup=finish_keyboard)
  return TYPING_BOOK

def save_selected_book(update, context):
  """Fuzzy-match the typed text against the known titles and ask for a rating.

  The best match by ``fuzz.ratio`` is stored in
  ``context.user_data['selected_book']``. If no match can be produced, the
  user is told so and asked for a title again, rather than continuing with
  an unassigned ``selected_book``.

  Returns:
    The state returned by ``show_selected_book`` on success, or by
    ``ask_book_rating`` when matching failed.
  """
  book_name = update.message.text
  try:
    selected_book = process.extract(book_name, book_map['title'].values, scorer=fuzz.ratio)[0][0]
  except Exception:
    # Best-effort as before, but stop here: there is no match to store or show.
    update.message.reply_text(text='BAD')
    return ask_book_rating(update, context)

  context.user_data['selected_book'] = selected_book
  return show_selected_book(update, context)

def show_selected_book(update, context):
  """Echo the matched title, show the rating keyboard and move to SELECTING_RATING."""
  rating_keyboard = ReplyKeyboardMarkup(
      [['1','2','3','4','5'], ['Cancel', 'Finish']],
      one_time_keyboard=True)
  message = update.message
  message.reply_text(text=context.user_data['selected_book'])
  message.reply_text(text='Now rate it', reply_markup=rating_keyboard)
  return SELECTING_RATING

def save_book_rating(update, context):
  """Record the typed rating for the currently selected book, then ask for the next title."""
  user_data = context.user_data
  user_data['rated_dict'][user_data['selected_book']] = int(update.message.text)
  return ask_book_rating(update, context)

def rating_finished(update, context):
  """Turn the collected title -> rating mapping into a ratings frame and go on to engine choice.

  If nothing has been rated yet, the user is asked to rate a book first;
  unpacking ``zip(*...)`` of an empty mapping would otherwise raise.

  Returns:
    The state returned by ``recommend``, or by ``ask_book_rating`` when the
    mapping is empty.
  """
  rated = context.user_data['rated_dict']
  if not rated:
    update.message.reply_text(text='Rate at least one book first')
    return ask_book_rating(update, context)

  titles, scores = zip(*rated.items())
  # Titles come from book_map itself (see save_selected_book), so each lookup has a match.
  book_ids = [book_map.loc[book_map['title']==title, 'id'].values[0] for title in titles]
  context.user_data['user_ratings'] = pd.DataFrame({'user_id' : [999999]*len(book_ids), 'book_id' : book_ids, 'rating' : scores})

  return recommend(update, context)

In [73]:
def warn_knn(update, context):
  """Warn that the computation is slow, then run the KNN recommender."""
  update.message.reply_text(text='This takes time')
  return rec_knn(update, context)

def rec_knn(update, context):
  """Reply with a numbered list of KNNWithMeans (k=9) recommendations for the stored ratings."""
  knn = KNNWithMeans(k=9, verbose=False)
  reclist = recommend_list(context.user_data['user_ratings'], ratings_random, knn, verbose = False)

  # One "<rank>. <title>" line per recommendation, ranks starting at 1.
  numbered = ''.join(f'{rank}. {title}\n' for rank, title in enumerate(reclist, start=1))
  update.message.reply_text(text=numbered)

In [74]:
def warn_svd(update, context):
  """Warn that the computation is slow, then run the SVD recommender."""
  update.message.reply_text(text='This takes time')
  return rec_svd(update, context)

def rec_svd(update, context):
  """Reply with a numbered list of SVD (20 factors) recommendations for the stored ratings."""
  svd = SVD(n_factors=20, verbose=False)
  reclist = recommend_list(context.user_data['user_ratings'], ratings_random, svd, verbose = False)

  # One "<rank>. <title>" line per recommendation, ranks starting at 1.
  numbered = ''.join(f'{rank}. {title}\n' for rank, title in enumerate(reclist, start=1))
  update.message.reply_text(text=numbered)

In [75]:
def warn_lightfm(update, context):
  """Warn that the computation is slow, then run the LightFM recommender."""
  update.message.reply_text(text='This takes time')
  return rec_lightfm(update, context)

def rec_lightfm(update, context):
  """Reply with a numbered list of LightFM recommendations for the stored ratings."""
  reclist = recommend_list_lightfm(context.user_data['user_ratings'], ratings_random, model, verbose = False)

  # One "<rank>. <title>" line per recommendation, ranks starting at 1.
  numbered = ''.join(f'{rank}. {title}\n' for rank, title in enumerate(reclist, start=1))
  update.message.reply_text(text=numbered)

In [76]:
# The updater's dispatcher is what the conversation handler gets registered on.
dispatcher = updater.dispatcher

In [77]:
# Single conversation driving the whole bot. /start is both the entry point
# and the fallback, so it restarts the flow from any state.
dispatcher.add_handler(ConversationHandler(
  entry_points = [CommandHandler('start', start)],
  states = {
      # The three buttons offered by start().
      CHOOSING_SCENARIO: [MessageHandler(Filters.regex('^Dataset Id$'), ask_user_id),
                          MessageHandler(Filters.regex('^GoodReads Id$'), ask_goodreads_id),
                          MessageHandler(Filters.regex('^Custom Setup$'), ask_book_rating)],
      # Any non-command text is treated as the typed id.
      TYPING_USER: [MessageHandler(Filters.text & ~Filters.command, save_user_id)],
      TYPING_GOODREADS: [MessageHandler(Filters.text & ~Filters.command, save_goodreads_id)],
      # 'Finish' is excluded from the title handler so it reaches rating_finished.
      TYPING_BOOK: [MessageHandler(Filters.text & ~Filters.command & ~Filters.regex('^Finish$'), save_selected_book),
                    MessageHandler(Filters.regex('^Finish$'), rating_finished)],
      # A single digit 1-5 is a rating; Cancel goes back to asking for a title.
      SELECTING_RATING: [MessageHandler(Filters.regex('^[1-5]$'), save_book_rating),
                         MessageHandler(Filters.regex('^Finish$'), rating_finished),
                         MessageHandler(Filters.regex('^Cancel$'), ask_book_rating)],
      # The three buttons offered by recommend().
      SELECTING_ENGINE: [MessageHandler(Filters.regex('^KNN$'), warn_knn),
                         MessageHandler(Filters.regex('^SVD$'), warn_svd),
                         MessageHandler(Filters.regex('^LightFM$'), warn_lightfm)]
            },
  fallbacks = [CommandHandler('start', start)]))

In [78]:
# Start receiving updates; call updater.stop() to shut the bot down.
updater.start_polling()

<queue.Queue at 0x7fb812f03a10>

# Scratch experiments

In [63]:
# Stop the updater that was started with start_polling().
updater.stop()

In [45]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [None]:
process.extract('the hobbit', book_map['title'].values, scorer=fuzz.ratio)[0][0]

'The Hobbit'

In [None]:
choices = book_map['title'].values
process.extract(book_name, choices, scorer=fuzz.ratio)[0][0]

'The Hobbit'

In [None]:
book_name = '1984'
selected_book = process.extract(book_name, book_map['title'].values, scorer=fuzz.ratio)[0][0]
selected_book

str

In [None]:
selected_book

'The Hobbit'

In [None]:
  entry_points = [CommandHandler('start', start)],
  states = {
      CHOOSING_SCENARIO: [MessageHandler(Filters.regex('^Dataset Id$'), ask_user_id),
                          MessageHandler(Filters.regex('^GoodReads Id$'), ask_goodreads_id),
                          MessageHandler(Filters.regex('^Custom Setup$'), ask_book_rating)],
      TYPING_USER: [MessageHandler(Filters.text & ~Filters.command, save_user_id)],
      TYPING_GOODREADS: [MessageHandler(Filters.text & ~Filters.command, save_goodreads_id)],
      TYPING_BOOK: [MessageHandler(Filters.text & ~Filters.command, save_selected_book),
                    MessageHandler(Filters.regex('^Finish$') & ~Filters.command, rating_finished)],
      SELECTING_RATING: [MessageHandler(Filters.regex('^[1-5]$'), save_book_rating),
                         MessageHandler(Filters.regex('^Finish$'), rating_finished),
                         MessageHandler(Filters.regex('^Cancel$'), ask_book_rating)],
      SELECTING_ENGINE: [MessageHandler(Filters.regex('^KNN$'), warn_knn),
                         MessageHandler(Filters.regex('^SVD$'), warn_svd),
                         MessageHandler(Filters.regex('^LightFM$'), warn_lightfm)]
            },
  fallbacks = [CommandHandler('start', start)]))

In [None]:
fetch_user_ratings_goodreads(88851923)

Unnamed: 0,user_id,book_id,rating
1,999999,5,2
15,999999,576,3
22,999999,14,1
23,999999,155,1
24,999999,19,3
25,999999,54,1
27,999999,255,5
28,999999,8,2
29,999999,6055,4
30,999999,357,5


In [None]:
[book_map.loc[i-1, 'title'] for i in fetch_user_ratings_goodreads(88851923)['book_id']]

['The Great Gatsby',
 'Candide',
 'Animal Farm',
 'The Two Towers (The Lord of the Rings, #2)',
 'The Fellowship of the Ring (The Lord of the Rings, #1)',
 "The Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy, #1)",
 'Atlas Shrugged',
 'The Catcher in the Rye',
 'Three Comrades',
 'All Quiet on the Western Front',
 'Stoner']

In [None]:
recommend_list_lightfm(fetch_user_ratings_goodreads(88851923), ratings_random, model)

["Harry Potter and the Sorcerer's Stone (Harry Potter, #1)",
 'The Hunger Games (The Hunger Games, #1)',
 'To Kill a Mockingbird',
 'Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)',
 'Harry Potter and the Order of the Phoenix (Harry Potter, #5)',
 'The Hobbit',
 'Harry Potter and the Chamber of Secrets (Harry Potter, #2)',
 'The Great Gatsby',
 'Harry Potter and the Goblet of Fire (Harry Potter, #4)',
 'Harry Potter and the Half-Blood Prince (Harry Potter, #6)']

In [None]:
fetch_user_ratings_goodreads(88851923)

Unnamed: 0,user_id,book_id,rating
1,999999,5,2
15,999999,576,3
22,999999,14,1
23,999999,155,1
24,999999,19,3
25,999999,54,1
27,999999,255,5
28,999999,8,2
29,999999,6055,4
30,999999,357,5
