In [68]:
import os
import torch
import numpy as np
import pandas as pd
from joblib import load
import transformers as ppb
from collections import namedtuple
from typing import List, Dict, Tuple
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import cosine_similarity
import pickle
from joblib import dump, load

In [69]:
#import tensorflow as tf
# Get the GPU device name.
#device_name = tf.test.gpu_device_name()
# The device name should look like the following:
#if device_name == '/device:GPU:0':
#    print('Found GPU at: {}'.format(device_name))
#else:
#    raise SystemError('GPU device not found')

# If there's a GPU available...
#if torch.cuda.is_available():      
#    device = torch.device("cuda")
#    print('There are %d GPU(s) available.' % torch.cuda.device_count())
#    print('We will use the GPU:', torch.cuda.get_device_name(0))
#else:
#print('No GPU available, using the CPU instead.')
#device = torch.device("cpu")

In [70]:
#import bert model 
bert_model = "distilbert-base-uncased"
tokenizer = ppb.DistilBertTokenizer.from_pretrained(bert_model)
model = ppb.DistilBertModel.from_pretrained(bert_model)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
df = pd.read_csv('final_df.csv')

In [57]:
#select fiction genres
df = pd.read_csv('final_df.csv')
df_fiction = df[['book','genres_Fiction']]

pos_sample = df[df['genres_Fiction'] == True].sample(2500)
neg_sample = df[df['genres_Fiction'] == False].sample(2500)
goodreads_for_tokenizer = pd.concat([pos_sample, neg_sample])

#drop unnecessary columns
goodreads_for_tokenizer = goodreads_for_tokenizer[['book','genres_Fiction']].reset_index(drop = True)
goodreads_for_tokenizer.head(2)

Unnamed: 0,book,genres_Fiction
0,The Pearl/The Red Pony THE PEARLBased on an ol...,True
1,Rachel's Holiday Meet Rachel Walsh. She has a ...,True


In [25]:
#tokenize
def tokenize_book(input_sentence):
    input_ids = torch.tensor(tokenizer.encode(input_sentence, add_special_tokens=True)).unsqueeze(0)
    outputs = model(input_ids)
    featurized_text = outputs[0][:,0,:].detach().numpy()
    return featurized_text

#to pad:
def padder(tokenized):
    max_len = 0
    for i in tokenized.values:
        if len(i) > max_len:
            max_len = len(i)
    return np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [27]:
#tokenize and pad:
tokenized = goodreads_for_tokenizer['book'].apply((lambda x: tokenizer.encode(x[:512], add_special_tokens=True)))
padded = padder(tokenized)


(5000, 510)

In [63]:
batch_size = 32

tokenized = goodreads_for_tokenizer['book'].apply((lambda x: tokenizer.encode(x[:512], add_special_tokens=True)))
padded = padder(tokenized)

# Create the DataLoaders for our training
train_dataloader = DataLoader(
            padded,  # The training samples.
            sampler = SequentialSampler(padded), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

feature = []

with torch.no_grad(): 
  for d in train_dataloader:
    input_ids = torch.tensor(np.array(d))
    attention_mask = np.where(d != 0, 1, 0)
    attention_mask = torch.tensor(attention_mask) 

    last_hidden_states = model(input_ids,attention_mask=attention_mask) 
    feature.extend(last_hidden_states[0][:,0,:].numpy())
del d

labels = goodreads_for_tokenizer['genres_Fiction']

In [66]:
#train logistic regression based on fiction genres
train_features, test_features, train_labels, test_labels = train_test_split(train_x, train_y)
lr_clf = LogisticRegression(max_iter=5000)
lr_clf.fit(train_features, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=5000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [67]:
lr_clf.score(test_features, test_labels)

0.868

In [72]:
#f1 score model:
f1_score(lr_clf.predict(test_features), test_labels)

0.8687350835322195

In [71]:
#check all genres
genres = ['Literature','Fiction','Classics','Nonfiction','Historical','Novels','Fantasy','Childrens','Cultural','Mystery',
         'History','Biography','Religion','European Literature','Thriller','Romance','Humor','Contemporary','Young Adult',
         'Philosophy']

In [74]:
#wrap up codes and apply on all genres
batch_size = 32
for genre in genres:
    #sample 2500 datapoints and balance the dataset
    pos_sample = df[df['genres_{}'.format(genre)] == True].sample(2500)
    neg_sample = df[df['genres_{}'.format(genre)] == False].sample(2500)
    goodreads_for_tokenizer = pd.concat([pos_sample, neg_sample])

    #drop unnecessary columns
    goodreads_for_tokenizer = goodreads_for_tokenizer[['book','genres_{}'.format(genre)]].reset_index(drop = True)

    #tokenize and pad:
    tokenized = goodreads_for_tokenizer['book'].apply((lambda x: tokenizer.encode(x[:512], add_special_tokens=True)))
    padded = padder(tokenized)

    # Create the DataLoaders for our training 
    train_dataloader = DataLoader(
            padded,  # The training samples.
            sampler = SequentialSampler(padded), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

    feature = []

    with torch.no_grad(): 
      for d in train_dataloader:
        input_ids = torch.tensor(np.array(d))
        attention_mask = np.where(d != 0, 1, 0)
        attention_mask = torch.tensor(attention_mask) 

        last_hidden_states = model(input_ids,attention_mask=attention_mask) 
        feature.extend(last_hidden_states[0][:,0,:].numpy())
      del d

    #genre-specific labels:
    labels = goodreads_for_tokenizer['genres_{}'.format(genre)]
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels)
    lr_clf = LogisticRegression(max_iter=5000)
    lr_clf.fit(train_features, train_labels) #fit the model
    #return a score:
    print(genre + " Accuracy Score= " + str(lr_clf.score(test_features, test_labels)))
    print(genre + " F1 Score= " + str(f1_score(lr_clf.predict(test_features), test_labels)))

ValueError: ignored