In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import os
from matplotlib import pyplot as plt
import json
from ortools.sat.python import cp_model
from collections import Counter
import math
from sklearn.preprocessing import StandardScaler
tqdm.pandas()

In [2]:
data_dir = './data/'
books = pd.read_csv(os.path.join(data_dir, 'books.csv'))
books.shape, books.columns

((10000, 23),
 Index(['id', 'book_id', 'best_book_id', 'work_id', 'books_count', 'isbn',
        'isbn13', 'authors', 'original_publication_year', 'original_title',
        'title', 'language_code', 'average_rating', 'ratings_count',
        'work_ratings_count', 'work_text_reviews_count', 'ratings_1',
        'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5', 'image_url',
        'small_image_url'],
       dtype='object'))

In [3]:
books.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [4]:
language_codes = books[books['language_code'].notna()]['language_code'].unique()
language_codes

array(['eng', 'en-US', 'en-CA', 'spa', 'en-GB', 'fre', 'nl', 'ara', 'por',
       'ger', 'nor', 'jpn', 'en', 'vie', 'ind', 'pol', 'tur', 'dan',
       'fil', 'ita', 'per', 'swe', 'rum', 'mul', 'rus'], dtype=object)

In [5]:
for lc in language_codes:
    books['lc {}'.format(lc)] = books.apply(lambda row: 1.0 if row['language_code'] == lc else 0.0, axis=1)

In [6]:
books.columns

Index(['id', 'book_id', 'best_book_id', 'work_id', 'books_count', 'isbn',
       'isbn13', 'authors', 'original_publication_year', 'original_title',
       'title', 'language_code', 'average_rating', 'ratings_count',
       'work_ratings_count', 'work_text_reviews_count', 'ratings_1',
       'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5', 'image_url',
       'small_image_url', 'lc eng', 'lc en-US', 'lc en-CA', 'lc spa',
       'lc en-GB', 'lc fre', 'lc nl', 'lc ara', 'lc por', 'lc ger', 'lc nor',
       'lc jpn', 'lc en', 'lc vie', 'lc ind', 'lc pol', 'lc tur', 'lc dan',
       'lc fil', 'lc ita', 'lc per', 'lc swe', 'lc rum', 'lc mul', 'lc rus'],
      dtype='object')

In [7]:
filter_books = books[[
    'id',
    'original_publication_year',
    'average_rating',
    'ratings_count',
    'work_ratings_count',
    'work_text_reviews_count',
    'ratings_1',
    'ratings_2',
    'ratings_3',
    'ratings_4',
    'ratings_5', 'lc eng', 'lc en-US', 'lc en-CA', 'lc spa',
    'lc en-GB', 'lc fre', 'lc nl', 'lc ara', 'lc por', 'lc ger', 'lc nor',
    'lc jpn', 'lc en', 'lc vie', 'lc ind', 'lc pol', 'lc tur', 'lc dan',
    'lc fil', 'lc ita', 'lc per', 'lc swe', 'lc rum', 'lc mul', 'lc rus'
]]

In [8]:
filter_books

Unnamed: 0,id,original_publication_year,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,...,lc pol,lc tur,lc dan,lc fil,lc ita,lc per,lc swe,lc rum,lc mul,lc rus
0,1,2008.0,4.34,4780653,4942365,155254,66715,127936,560092,1481305,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1997.0,4.44,4602479,4800065,75867,75504,101676,455024,1156318,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,2005.0,3.57,3866839,3916824,95009,456191,436802,793319,875073,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,1960.0,4.25,3198671,3340896,72586,60427,117415,446835,1001952,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,1925.0,3.89,2683664,2773745,51992,86236,197621,606158,936012,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,2010.0,4.09,17204,18856,1180,105,575,3538,7860,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,9997,1990.0,4.25,12582,12952,395,303,551,1737,3389,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,9998,1977.0,4.35,9421,10733,374,11,111,1191,4240,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,9999,2011.0,3.65,11279,11994,1988,275,1002,3765,4577,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
scaler = StandardScaler()
filter_books[filter_books.columns[1:]] = scaler.fit_transform(filter_books[filter_books.columns[1:]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [10]:
filter_books

Unnamed: 0,id,original_publication_year,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,...,lc pol,lc tur,lc dan,lc fil,lc ita,lc per,lc swe,lc rum,lc mul,lc rus
0,1,0.170495,1.327789,30.036789,29.098997,24.874634,9.851855,12.846534,19.219327,28.405975,...,-0.024502,-0.010001,-0.017323,-0.014144,-0.014144,-0.026467,-0.010001,-0.010001,-0.010001,-0.010001
1,2,0.098397,1.720847,28.904534,28.250940,11.911526,11.176438,10.143953,15.538545,22.088775,...,-0.024502,-0.010001,-0.017323,-0.014144,-0.014144,-0.026467,-0.010001,-0.010001,-0.010001,-0.010001
2,3,0.150832,-1.698765,24.229710,22.987143,15.037225,68.549476,44.633867,27.389823,16.621846,...,-0.024502,-0.010001,-0.017323,-0.014144,-0.014144,-0.026467,-0.010001,-0.010001,-0.010001,-0.010001
3,4,-0.144116,0.974035,19.983656,19.554820,11.375772,8.904196,11.763752,15.251665,19.088160,...,-0.024502,-0.010001,-0.017323,-0.014144,-0.014144,-0.026467,-0.010001,-0.010001,-0.010001,-0.010001
4,5,-0.373521,-0.440977,16.710904,16.174804,8.012977,12.793850,20.018253,20.833128,17.806397,...,-0.024502,-0.010001,-0.017323,-0.014144,-0.014144,-0.026467,-0.010001,-0.010001,-0.010001,-0.010001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,0.183604,0.345141,-0.233838,-0.243340,-0.284117,-0.186886,-0.260984,-0.278083,-0.235314,...,-0.024502,-0.010001,-0.017323,-0.014144,-0.014144,-0.026467,-0.010001,-0.010001,-0.010001,-0.010001
9996,9997,0.052516,0.974035,-0.263210,-0.278526,-0.412300,-0.157045,-0.263454,-0.341177,-0.322223,...,-0.024502,-0.010001,-0.017323,-0.014144,-0.014144,-0.026467,-0.010001,-0.010001,-0.010001,-0.010001
9997,9998,-0.032691,1.367094,-0.283297,-0.291750,-0.415729,-0.201052,-0.308737,-0.360304,-0.305681,...,-0.024502,-0.010001,-0.017323,-0.014144,-0.014144,-0.026467,-0.010001,-0.010001,-0.010001,-0.010001
9998,9999,0.190159,-1.384318,-0.271490,-0.284235,-0.152179,-0.161265,-0.217039,-0.270131,-0.299130,...,-0.024502,-0.010001,-0.017323,-0.014144,-0.014144,-0.026467,-0.010001,-0.010001,-0.010001,-0.010001


In [18]:
filter_books['original_publication_year'] = filter_books['original_publication_year'].fillna(filter_books['original_publication_year'].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_books['original_publication_year'] = filter_books['original_publication_year'].fillna(filter_books['original_publication_year'].mean())


In [20]:
filter_books.to_csv('./data/encode_books.csv', index=False)

In [21]:
np.save('./data/encode_books.npy', filter_books[filter_books.columns[1:]].values)