# Display Sample Records

In [222]:
import gzip
import json
import re
import os
import sys
import numpy as np
import pandas as pd

**Specify your directory here:**

In [111]:
DIR = '/Users/spartan/Projects/Deep-Learning-Based-Group-Recommender-System/data/raw'

**This function shows how to load datasets**

In [112]:
def load_data(file_name, head = 500):
    count = 0
    data = []
    with gzip.open(file_name) as fin:
        for l in fin:
            d = json.loads(l)
            count += 1
            data.append(d)
            
            # break if reaches the 100th line
            if (head is not None) and (count > head):
                break
    return data

**Load and display sample records of books**

In [113]:
full_poetry_books = load_data(os.path.join(DIR, 'goodreads_books_poetry.json.gz'))

In [115]:
print(' == sample record (books) ==')
display(np.random.choice(poetry_books))

 == sample record (books) ==


{'isbn': '0393065243',
 'text_reviews_count': '3',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '21', 'name': 'to-read'},
  {'count': '9', 'name': 'poetry'},
  {'count': '2', 'name': 'currently-reading'},
  {'count': '2', 'name': 'jewish'},
  {'count': '2', 'name': 'israel'},
  {'count': '1', 'name': 'read-2017'},
  {'count': '1', 'name': 'to-read-2'},
  {'count': '1', 'name': 'favorites'},
  {'count': '1', 'name': 'books-you-should-read'},
  {'count': '1', 'name': 'foreign-literature-in-translation'},
  {'count': '1', 'name': 'translations'},
  {'count': '1', 'name': 'unsorted'},
  {'count': '1', 'name': 'to-be-purchased'},
  {'count': '1', 'name': 'to-buy'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '3.93',
 'kindle_asin': '',
 'similar_books': [],
 'description': 'In poems about fathers and daughters, men and women, kings and their subjects, the precarious position of women and the plight of Palestinians under the Occupation

**Load and display sample records of user-book interactions (shelves)**

In [97]:
interactions = load_data(os.path.join(DIR, 'goodreads_interactions_poetry.json.gz'))

In [106]:
print(' == sample record (interaction) ==')
display(np.random.choice(interactions))

 == sample record (interaction) ==


{'user_id': '90181ceb30afe97805078c900f5ec53b',
 'book_id': '6585485',
 'review_id': '51dffcb515eb0d52781654d248140f61',
 'is_read': True,
 'rating': 5,
 'review_text_incomplete': '',
 'date_added': 'Sat Jul 14 02:30:51 -0700 2012',
 'date_updated': 'Sat Jul 14 02:31:12 -0700 2012',
 'read_at': 'Sat Jul 14 02:31:12 -0700 2012',
 'started_at': ''}

## Data Sampling

#### Since dataset is too large to handle in limited resource, we will sample 200k samples. 

In [116]:
# Load Book
full_book_df = pd.read_json(os.path.join(DIR, 'goodreads_books_poetry.json.gz'), lines=True,)

# load Interactions
full_interactions_df = pd.read_json(os.path.join(DIR, 'goodreads_interactions_poetry.json.gz'), lines=True)

# Load Reviews
full_reviews_df = pd.read_json(os.path.join(DIR, 'goodreads_reviews_poetry.json.gz'), lines=True)

In [117]:
SAMPLE_SIZE = 200000

In [118]:
book_genere_info = pd.read_json('/Users/spartan/Downloads/goodreads_book_genres_initial.json', lines=True)

In [119]:
total_genres = set()

def get_total_genres(genres):
    for genre in genres:
        total_genres.add(genre)
book_genere_info['genres'].apply(lambda x: get_total_genres(x.keys()))

# user_interest = np.random.choice(list(total_genres), SAMPLE_SIZE)


0          None
1          None
2          None
3          None
4          None
           ... 
2360650    None
2360651    None
2360652    None
2360653    None
2360654    None
Name: genres, Length: 2360655, dtype: object

In [120]:
interactions_df_subset = interactions_df[:SAMPLE_SIZE]
interactions_df_subset.drop(['date_updated','read_at','started_at'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interactions_df_subset.drop(['date_updated','read_at','started_at'], axis=1, inplace=True)


In [125]:
t = interactions_df_subset.user_id.value_counts()
# Generate features for each user
user_interest = np.random.choice(list(total_genres), len(t))
user_age = np.random.randint(18, 100, len(t))

In [148]:
user_df = pd.DataFrame(data = {"general_interest": user_interest, "Age" : user_age}, index = t.index)

In [157]:
user_df.reset_index(inplace=True)

In [163]:
user_df.rename(columns={'index':'user_id'}, inplace=True)

In [183]:
user_df.head()

Unnamed: 0,user_id,general_interest,Age
0,55bea8421fbd1f96790bcaa6302b5aab,"mystery, thriller, crime",40
1,3f0d11375b87707eac2137d68564a0b7,"history, historical fiction, biography",72
2,fba91ff84f33aab3a03852afca1f9942,"comics, graphic",92
3,2a51426b2df337ff8eae7846217ad083,children,98
4,cbcbba3acce70042b61df6fcb2dd3781,"mystery, thriller, crime",96


In [169]:
user_df.to_csv('/Users/spartan/Projects/Deep-Learning-Based-Group-Recommender-System/data/interim/users_data.csv', index=False)

In [122]:
full_book_df = full_book_df[['book_id','title','ratings_count','country_code','is_ebook','average_rating']]

In [82]:
full_reviews_df

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,3ca7375dba942a760e53b726c472a7dd,402128,28423ff309bc896c071a8d9df4a10e8a,5,I have three younger siblings and we grew up w...,Tue Jun 12 08:59:04 -0700 2012,Fri Jun 15 11:41:12 -0700 2012,,,0,0
1,0ef32090550901ead25cb0ea21c4d36b,92270,2db1180992e2b0b1631a3ac5644bde84,5,This is my favorite collection of poetry.,Mon Apr 14 18:42:40 -0700 2014,Mon Apr 14 18:43:05 -0700 2014,Wed Jan 01 00:00:00 -0800 1997,,0,0
2,0ef32090550901ead25cb0ea21c4d36b,908708,bca57fa40e92c9261b00b03dbebd96fe,4,"He's so disturbing. So very, very disturbing.",Tue Apr 22 13:58:10 -0700 2008,Tue Apr 22 13:58:33 -0700 2008,,,0,0
3,d37b46b2190ed7c518259f29b47a9b36,253264,cb1ebc02d8b2aff15735d513877463ce,5,I just reread this play for a class I am takin...,Wed Sep 27 19:08:08 -0700 2017,Sat Sep 30 06:39:45 -0700 2017,Wed Sep 27 00:00:00 -0700 2017,Tue Sep 26 00:00:00 -0700 2017,1,0
4,af157d0205b8a901dee6d4a2aed7e6ad,70885,8dca128b8e869048a7442c18659dbece,5,"Cuanto mas leo, mas me gusta. Su poesia es env...",Thu Jun 18 20:00:03 -0700 2015,Thu Jun 18 20:01:29 -0700 2015,Thu Jun 18 00:00:00 -0700 2015,Tue Jun 16 00:00:00 -0700 2015,0,0
...,...,...,...,...,...,...,...,...,...,...,...
154550,ec8f7af656d6e448eea5f47ae504e706,15997,4d85ed12430e6d742f27e7e250c5a7d8,5,"When Eve finds Adam, he drops the wreath and i...",Mon Mar 17 09:35:01 -0700 2008,Fri Aug 16 12:00:56 -0700 2013,Wed Feb 08 00:00:00 -0800 1995,,0,0
154551,c1485b8d8ea2a606548077a09a488f74,4134951,d73ba08be95a72ec85b52fabf609638d,2,sh`rhyy khml `shqnh w Hsy z dydgh ykh mrd. bry...,Wed Aug 13 12:06:35 -0700 2008,Thu Nov 27 09:48:30 -0800 2014,Sat Mar 14 01:01:47 -0700 2009,,1,0
154552,c1485b8d8ea2a606548077a09a488f74,627166,e9b11541d1be53d2b8c41fc81e255fb0,3,nZr ahry nwysndh y st khh dr Hwzh dbyt dstny m...,Mon Nov 19 21:38:30 -0800 2007,Sun Jan 05 07:07:58 -0800 2014,,,4,0
154553,8ba77e3c745ebddccc6306fc3c6bb25e,9404584,844d9609e96ec611166ef20bf270233f,5,Emily Dickinson left a large cache of poetry -...,Tue Sep 04 14:58:29 -0700 2012,Tue Dec 24 03:26:39 -0800 2013,Fri Sep 21 10:09:53 -0700 2012,Tue Sep 04 14:58:29 -0700 2012,4,1


In [174]:
df1 = full_book_df[full_book_df['book_id'].isin(interactions_df_subset.book_id)]

In [178]:
df1.to_csv('/Users/spartan/Projects/Deep-Learning-Based-Group-Recommender-System/data/interim/books_data.csv', index=False)

In [180]:
interactions_df_subset['date_added'] = pd.to_datetime(interactions_df_subset['date_added'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interactions_df_subset['date_added'] = pd.to_datetime(interactions_df_subset['date_added'])


In [182]:
interactions_df_subset.to_csv('/Users/spartan/Projects/Deep-Learning-Based-Group-Recommender-System/data/interim/interactions_data.csv', index=False)

## convert user_id and book book_id to integer

In [206]:
interactions_df = pd.read_csv('/Users/spartan/Projects/Deep-Learning-Based-Group-Recommender-System/data/interim/interactions_data.csv')

In [207]:
u_ids = interactions_df.user_id

In [208]:
i_ids = interactions_df.book_id

In [209]:
user_dict={}
for index, value in np.ndenumerate(u_ids.unique()):
    user_dict[value] = index[0]+1

In [210]:
len(user_dict)

20627

In [211]:
item_dict={}
for index, value in np.ndenumerate(i_ids.unique()):
    item_dict[value] = index[0]+1

In [212]:
item_dict

{1384: 1,
 1376: 2,
 30119: 3,
 24769928: 4,
 240007: 5,
 23513349: 6,
 17805212: 7,
 10843755: 8,
 2696: 9,
 30118: 10,
 23919: 11,
 1420: 12,
 6295: 13,
 17707772: 14,
 20821284: 15,
 2547: 16,
 18743: 17,
 52820: 18,
 543235: 19,
 1381: 20,
 15997: 21,
 28188156: 22,
 22324396: 23,
 20177773: 24,
 21852315: 25,
 16180680: 26,
 1724365: 27,
 1371: 28,
 131885: 29,
 35606560: 30,
 129636: 31,
 22151696: 32,
 11904233: 33,
 262304: 34,
 15812153: 35,
 95819: 36,
 53022: 37,
 42038: 38,
 27494: 39,
 402128: 40,
 26114416: 41,
 25115796: 42,
 3611540: 43,
 23534: 44,
 20613761: 45,
 92270: 46,
 12907847: 47,
 9583799: 48,
 908708: 49,
 19351: 50,
 12914: 51,
 253264: 52,
 28212625: 53,
 845331: 54,
 70885: 55,
 15829076: 56,
 676: 57,
 1432: 58,
 6798263: 59,
 29875917: 60,
 25937948: 61,
 28116744: 62,
 24611931: 63,
 61049: 64,
 46231: 65,
 7428663: 66,
 1715: 67,
 1519: 68,
 32552: 69,
 1503869: 70,
 195769: 71,
 2325390: 72,
 732562: 73,
 765172: 74,
 16075147: 75,
 13588404: 76,
 78

In [213]:
interactions_df['user_id'] = interactions_df['user_id'].map(user_dict)

In [214]:
interactions_df['book_id'] = interactions_df['book_id'].map(item_dict)

In [216]:
interactions_df.to_csv('/Users/spartan/Projects/Deep-Learning-Based-Group-Recommender-System/data/interim/interactions_data.csv', index=False)

In [217]:
books_df = pd.read_csv('/Users/spartan/Projects/Deep-Learning-Based-Group-Recommender-System/data/interim/books_data.csv')
user_df = pd.read_csv('/Users/spartan/Projects/Deep-Learning-Based-Group-Recommender-System/data/interim/users_data.csv')

In [218]:
books_df['book_id'] = books_df['book_id'].map(item_dict)
user_df['user_id'] = user_df['user_id'].map(user_dict)

In [220]:
user_df.to_csv('/Users/spartan/Projects/Deep-Learning-Based-Group-Recommender-System/data/interim/users_data.csv', index=False)
books_df.to_csv('/Users/spartan/Projects/Deep-Learning-Based-Group-Recommender-System/data/interim/books_data.csv', index=False)

#### process time attribute

In [224]:
interactionms = pd.read_csv('/Users/spartan/Projects/Deep-Learning-Based-Group-Recommender-System/data/interim/interactions_data.csv')

In [226]:
interactionms['timestamp'] = interactionms['date_added'].apply(lambda x: int(pd.Timestamp(x).timestamp()))

In [235]:
interactionms.reset_index(inplace=True)

In [236]:
interactionms.rename(columns={'index':'review_id'}, inplace=True)

In [233]:
interactionms.drop(['review_id'], axis=1, inplace=True)

In [254]:
interactionms.to_csv('/Users/spartan/Projects/Deep-Learning-Based-Group-Recommender-System/data/interim/interactions_data.csv', index=False)

In [245]:
ratings = np.random.randint(1, 6, len(interactionms))

In [247]:
interactionms['ratings'] = ratings



In [253]:
interactionms.drop(['rating'], axis=1, inplace=True)