In [None]:
import json
import os

import pandas as pd
import time

%pylab inline
import matplotlib.pyplot as plt

import sklearn

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import sklearn.linear_model

sys.path.append('../scripts')

import process_goodreads_data as gm

sys.path.append('../../code/scripts')
import dataset_chunking_fxns
from dataset_chunking_fxns import subsample_df_by_groups
import plotting

### This notebook looks at pairs of genres from Goodreads and investigates how the number of samples from each genre in the training set affects the overall accuracy of a learned model.

Not all genre pairs exhibit a tradeoff in which population accuracy is maximized by a combination of data from both genres: for some pairs it is advantageous to have data mostly or entirely from one genre.

### To run this notebook, you'll need to download the data files for all genres (see readme).

In [None]:

# Directory that holds the downloaded Goodreads review files.
data_dir_goodreads = '../../data/goodreads'

# For each genre: the review file name, then its full path under data_dir_goodreads.

# history / biography
reviews_fn_history = 'goodreads_reviews_history_biography.json.gz'
fn_history = os.path.join(data_dir_goodreads, reviews_fn_history)

# romance
reviews_romance = 'goodreads_reviews_romance.json.gz'
fn_romance = os.path.join(data_dir_goodreads, reviews_romance)

# poetry (note: this file name has no .gz suffix, unlike the others)
reviews_fn_poetry = 'goodreads_reviews_poetry.json'
fn_poetry = os.path.join(data_dir_goodreads, reviews_fn_poetry)

# fantasy / paranormal (referred to as "scify" throughout the notebook)
reviews_fn_scify = 'goodreads_reviews_fantasy_paranormal.json.gz'
fn_scify = os.path.join(data_dir_goodreads, reviews_fn_scify)

# mystery / thriller / crime
reviews_mystery = 'goodreads_reviews_mystery_thriller_crime.json.gz'
fn_mystery = os.path.join(data_dir_goodreads, reviews_mystery)

# comics / graphic novels
reviews_comics = 'goodreads_reviews_comics_graphic.json.gz'
fn_comics = os.path.join(data_dir_goodreads, reviews_comics)

# children
reviews_children = 'goodreads_reviews_children.json.gz'
fn_children = os.path.join(data_dir_goodreads, reviews_children)

In [None]:
# download as necessary

%time data_poetry = gm.parse_reviews(fn_poetry)
%time data_comics = gm.parse_reviews(fn_comics)
%time data_romance = gm.parse_reviews(fn_romance)
%time data_history = gm.parse_reviews(fn_history)
%time data_scify = gm.parse_reviews(fn_scify)

# not yet donwloaded:
%time data_children = gm.parse_reviews(fn_children)
%time data_scify = gm.parse_reviews(fn_scify)
%time data_mystery = gm.parse_reviews(fn_mystery)

In [None]:
# Map each genre name to its parsed review data. The keys double as the genre
# labels used later to build CSV file names and plot titles.
genre_to_data = {
    'history': data_history,
    'mystery': data_mystery,
    'scify': data_scify,
    'comics': data_comics,
    'romance': data_romance,
    # Key was previously misspelled 'childen'.
    # NOTE(review): CSV file names are derived from these keys, so files saved
    # under the old spelling will no longer match -- confirm before relying on them.
    'children': data_children,
    'poetry': data_poetry,
}

# Genre names in dict insertion order; pair indices (i, j) below refer to this list.
genres = list(genre_to_data.keys())

In [None]:
# MISC functions
def fit_lr(X, y, seed, weights=None):
    """Fit an L2-penalized multinomial logistic regression and return it.

    Parameters
    ----------
    X : features, passed unchanged to ``LogisticRegression.fit``.
    y : labels, passed unchanged to ``LogisticRegression.fit``.
    seed : used as the estimator's ``random_state``.
    weights : optional per-sample weights, forwarded as ``sample_weight``.

    Returns
    -------
    The fitted ``sklearn.linear_model.LogisticRegression`` instance.
    """
    # random_state used to be pinned to 0, which silently ignored `seed`.
    clf = sklearn.linear_model.LogisticRegression(random_state=seed,
                                                  penalty='l2',
                                                  C=1,
                                                  solver='lbfgs',
                                                  multi_class='multinomial',
                                                  max_iter=400)
    clf.fit(X, y, sample_weight=weights)
    return clf


In [None]:
# Display the list of genre names (the keys of genre_to_data) for reference.
genres

In [None]:
# Aggregate the review data for every genre pair and save it (if not already done).

num_features = 2000
n_per_genre = 20000
num_books_to_consider = 100

# Fraction of the training subset allotted to the first genre of a pair.
alphas = np.array([0.02,0.05,0.1,0.3,0.5,0.7,0.9,0.95,0.98])

# Total subset size; presumably two nested 80% splits of n_per_genre -- TODO confirm.
total_size = int(0.8*0.8*n_per_genre)

# Row 0: samples from the first genre (truncated to whole numbers);
# row 1: the remainder, so every column sums to total_size.
subset_group_sizes = np.zeros((2,len(alphas)))
subset_group_sizes[0, :] = (total_size * alphas).astype(int)
subset_group_sizes[1, :] = total_size - subset_group_sizes[0, :]

# keep self from accidentally redoing
instantiate_data = True
if instantiate_data:
    # data_by_genre_pairs[i][j] holds the aggregated data for pair (i, j) when
    # i < j, and an empty list placeholder otherwise.
    data_by_genre_pairs = []

    for i, genre_1 in enumerate(genres):
        pair_row = []
        for j, genre_2 in enumerate(genres):
            print(i,j)
            if i < j:
                genre_pair = [genre_1, genre_2]
                print(genre_pair)
                data_by_genre = [genre_to_data[name] for name in genre_pair]

                csv_filename = 'goodreads_{0}_{1}.csv'.format(*genre_pair)
                data_these_genres = gm.aggregate_reviews(genre_pair,
                                                         data_by_genre,
                                                         csv_filename,
                                                         k=num_books_to_consider,
                                                         n_per_genre=n_per_genre)
                pair_row.append(data_these_genres)
            else:
                pair_row.append([])
        data_by_genre_pairs.append(pair_row)
           

In [None]:
import train_fxns_nonimage as m

# Train a logistic regression for every genre pair (i < j) over the grid of
# per-genre subset sizes in subset_group_sizes.
lr_model_kwargs = {'penalty': 'l2', 'C':1.0, 'solver': 'lbfgs', 'max_iter':400}

# results[i][j] holds the output of cv_subset_and_train for pair (i, j) when
# i < j, and an empty list placeholder otherwise.
results = [[] for i in range(len(genres))]
for i, genre_1 in enumerate(genres):
    for j, genre_2 in enumerate(genres):
        if i >= j:
            results[i].append([])

        else:
            print(genre_1, genre_2)
            data_both = data_by_genre_pairs[i][j]
            X_this, vectorizer = gm.tfidf_features(list(data_both['review_text']),
                                                   max_features=num_features,
                                                   use_stopwords=False)
            data_both = data_both[['genre', 'fold', 'rating', 'review_text', 'cv_fold_0', 'cv_fold_1',
                                   'cv_fold_2', 'cv_fold_3', 'cv_fold_4']]

            # Densify the feature matrix once and reuse it (it used to be
            # converted twice per pair).
            X_dense = X_this.toarray()

            # Explicit copy so the column assignment below does not write onto a
            # slice of data_both (avoids pandas' SettingWithCopyWarning).
            data_both_no_reviews = data_both[['genre', 'fold', 'rating','cv_fold_0', 'cv_fold_1',
                                              'cv_fold_2', 'cv_fold_3', 'cv_fold_4']].copy()

            # add x_ids_column: position of each row in the feature matrix
            data_both_no_reviews['X_idxs'] = np.arange(X_dense.shape[0])

            r = m.cv_subset_and_train(data_both_no_reviews,
                                      X_dense,
                                      group_key='genre',
                                      label_key='rating',
                                      subset_sizes=subset_group_sizes,
                                      pred_fxn = m.fit_logistic_regression,
                                      model_kwargs = lr_model_kwargs,
                                      num_seeds = 2)

            results[i].append(r)
            

In [None]:
# plot
from importlib import reload
reload(plotting)

# One panel per genre pair (i < j), drawn at ax[i, j-1]. That needs
# len(genres) - 1 rows and columns, so the grid is sized from the genre list
# instead of a hard-coded 6. squeeze=False keeps `ax` two-dimensional.
n_panels = len(genres) - 1
fig, ax = plt.subplots(n_panels, n_panels, figsize=(24,30), squeeze=False)
acc_key = 'mae'

# Per-column fraction of each genre in the training subset; identical for every
# pair, so it is computed once outside the loops.
subset_fracs = subset_group_sizes / subset_group_sizes.sum(axis=0)

for i, genre_1 in enumerate(genres):
    for j, genre_2 in enumerate(genres):
        if i >= j:
            continue

        genre_pair = [genre_1, genre_2]

        # group index -> genre name, as passed to plot_by_group
        genre_id_dict = {}
        for k in range(2):
            genre_id_dict[k] = genre_pair[k]

        cv_accs_by_group, cv_accs_total = results[i][j]
        plotting.plot_by_group(cv_accs_by_group,
                               cv_accs_total,
                               subset_fracs,
                               acc_key,
                               genre_id_dict,
                               label_append = ' (ERM)',
                               range_type='minmax',
                               title='{0} : {1}'.format(genre_1, genre_2),
                               ax=ax[i,j-1],
                               ylim=None,
                               legend=False)
            

# subset and make u plot from the training data

In [None]:
# Further explore a single pair (history and scify) with more samples per genre.
# NOTE(review): the CSV file name is the same one the pairwise loop builds for
# this pair with a smaller n_per_genre -- confirm how aggregate_reviews treats
# an existing file.

genre_pair = ['history', 'scify']
n_per_genre = 50000
num_books_to_consider = 100

csv_filename = 'goodreads_{0}_{1}.csv'.format(*genre_pair)
data_by_genre = [genre_to_data[genre] for genre in genre_pair]

data_these_genres = gm.aggregate_reviews(
    genre_pair,
    data_by_genre,
    csv_filename,
    k=num_books_to_consider,
    n_per_genre=n_per_genre,
)

In [None]:
def fit_ridge(X,y,seed, weights=None):
    """Fit a ridge regression (alpha=0.1, SVD solver) and return the fitted model.

    ``X`` and ``y`` are passed unchanged to ``Ridge.fit`` and ``weights`` is
    forwarded as ``sample_weight``. ``seed`` is accepted but not used here.
    """
    ridge_model = sklearn.linear_model.Ridge(solver='svd', alpha=0.1)
    ridge_model.fit(X, y, sample_weight=weights)
    return ridge_model

In [None]:
def fit_rfr(X, y, seed, weights=None):
    """Fit a random forest regressor (400 trees, max depth 10) and return it.

    Parameters
    ----------
    X : features, passed unchanged to ``RandomForestRegressor.fit``.
    y : targets, passed unchanged to ``RandomForestRegressor.fit``.
    seed : used as the forest's ``random_state``.
    weights : optional per-sample weights, forwarded as ``sample_weight``.

    Returns
    -------
    The fitted ``RandomForestRegressor`` instance.
    """
    # n_jobs=-1 uses all available cores; the previous hard-coded 64 was tied
    # to one particular machine.
    clf = RandomForestRegressor(max_depth=10, n_estimators=400, random_state=seed,
                                n_jobs=-1)

    clf.fit(X, y, sample_weight=weights)
    return clf

In [None]:
# Train random forest regressors on the single genre pair aggregated above,
# over the same grid of group fractions but with the larger n_per_genre.
# Relies on `num_features` and the module alias `m` defined in earlier cells.
data_both = data_these_genres
alphas = np.array([0.02,0.05,0.1,0.3,0.5,0.7,0.9,0.95,0.98])
total_size = int(0.8*0.8*n_per_genre)

# Row 0: samples from the first genre; row 1: the remainder of total_size.
subset_group_sizes = np.zeros((2,len(alphas)))

for i, alpha in enumerate(alphas):
    subset_group_sizes[0,i] = (total_size * alpha).astype(int)
    subset_group_sizes[1,i] = total_size - subset_group_sizes[0,i]


X_this, vectorizer = gm.tfidf_features(list(data_both['review_text']),
                                       max_features=num_features,
                                       use_stopwords=False)

data_both = data_both[['genre', 'fold', 'rating', 'review_text', 'cv_fold_0', 'cv_fold_1',
                       'cv_fold_2', 'cv_fold_3', 'cv_fold_4']]

# Densify the feature matrix once and reuse it (it used to be converted twice).
X_dense = X_this.toarray()

# Explicit copy so the column assignment below does not write onto a slice of
# data_both (avoids pandas' SettingWithCopyWarning).
data_both_no_reviews = data_both[['genre', 'fold', 'rating','cv_fold_0', 'cv_fold_1',
                                  'cv_fold_2', 'cv_fold_3', 'cv_fold_4']].copy()

# add x_ids_column: position of each row in the feature matrix
data_both_no_reviews['X_idxs'] = np.arange(X_dense.shape[0])

pred_fxn = fit_rfr
t1 = time.time()
r = m.cv_subset_and_train(data_both_no_reviews,
                          X_dense,
                          group_key='genre',
                          label_key='rating',
                          subset_sizes=subset_group_sizes,
                          pred_fxn = pred_fxn,
                          num_seeds = 2)

t2 = time.time()
print('took {0:.1f} seconds'.format(t2-t1))

In [None]:
# Plot per-genre and overall results for the single pair trained above.
# `acc_key` is the metric name set in the earlier grid-plot cell.
cv_accs_by_group, cv_accs_total = r
#cv_accs_by_group_IS, cv_accs_total_IS = r_IS

# Fraction of each genre in the training subset, per column of subset_group_sizes.
column_totals = subset_group_sizes.sum(axis=0)
subset_fracs = subset_group_sizes / column_totals

# group index -> genre name
genre_id_dict = {k: genre_pair[k] for k in range(2)}

fig, ax = plt.subplots()
plot_options = dict(label_append=' (ERM)',
                    range_type='minmax',
                    title='{0} : {1}'.format(genre_pair[0], genre_pair[1]),
                    ax=ax,
                    ylim=None,
                    legend=True)
plotting.plot_by_group(cv_accs_by_group,
                       cv_accs_total,
                       subset_fracs,
                       acc_key,
                       genre_id_dict,
                       **plot_options)

#plt.savefig('{0}_{1}.jpg'.format(genre_pair[0], genre_pair[1]))
