# Author Attribution Using Stylometry

## Setup

In [None]:
%pip install transformers[torch]
%pip install 'accelerate>=0.26.0'
%pip install nltk
%pip install numpy
%pip install pandas
%pip install matplotlib
%pip install seaborn
%pip install scikit-learn
%pip install torch

In [2]:
import os
import re
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
import torch
from transformers import AutoTokenizer, AutoModel, BertForSequenceClassification, Trainer, TrainingArguments

### Download NLTK Resources

In [None]:
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

## Data Loader

### Get File IDs

In [None]:
def get_gutenberg_fileids():
    """Get the list of file IDs from the Gutenberg corpus"""
    file_ids = gutenberg.fileids()
    
    return file_ids

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

#### Test get_gutenberg_fileids()

In [19]:
get_gutenberg_fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

### Extract Authors

In [21]:
def extract_authors(file_ids):
    """Extract author names from file IDs"""
    authors = {}
    for file_id in file_ids:
        author = file_id.split('-')[0]
        authors[file_id] = author
    return authors

#### Test extract_authors()

In [20]:
extract_authors(get_gutenberg_fileids())

{'austen-emma.txt': 'austen',
 'austen-persuasion.txt': 'austen',
 'austen-sense.txt': 'austen',
 'bible-kjv.txt': 'bible',
 'blake-poems.txt': 'blake',
 'bryant-stories.txt': 'bryant',
 'burgess-busterbrown.txt': 'burgess',
 'carroll-alice.txt': 'carroll',
 'chesterton-ball.txt': 'chesterton',
 'chesterton-brown.txt': 'chesterton',
 'chesterton-thursday.txt': 'chesterton',
 'edgeworth-parents.txt': 'edgeworth',
 'melville-moby_dick.txt': 'melville',
 'milton-paradise.txt': 'milton',
 'shakespeare-caesar.txt': 'shakespeare',
 'shakespeare-hamlet.txt': 'shakespeare',
 'shakespeare-macbeth.txt': 'shakespeare',
 'whitman-leaves.txt': 'whitman'}

### Get Text Samples

In [7]:
def get_samples(file_ids, chunk_size=2000, overlap=0):
    """
    Extract text samples from the corpus.
    
    Args:
        chunk_size: Size of each text sample in words
        overlap: Number of words to overlap between consecutive chunks
        
    Returns:
        List of (text, author) tuples
    """
    samples = []
    
    for file_id in file_ids:
        text = gutenberg.raw(file_id)
        
        # Clean text by removing metadata (usually at the beginning and end)
        # This is a simple approach - might need refinement for specific corpora
        lines = text.split('\n')
        start_idx = 0
        end_idx = len(lines)
        
        # Find start of main text (skip headers)
        for i, line in enumerate(lines):
            if "*** START OF" in line:
                start_idx = i + 1
                break
                
        # Find end of main text (skip footers)
        for i, line in enumerate(lines[::-1]):
            if "*** END OF" in line:
                end_idx = len(lines) - i - 1
                break
                
        clean_text = ' '.join(lines[start_idx:end_idx])
        
        # Split into chunks
        words = word_tokenize(clean_text)
        step = chunk_size - overlap
        
        for i in range(0, len(words) - chunk_size + 1, step):
            chunk = ' '.join(words[i:i+chunk_size])
            samples.append((chunk, file_id))
            
    return samples

#### Test get_samples()

In [24]:
file_ids = ['blake-poems.txt', 'shakespeare-macbeth.txt']
samples = get_samples(file_ids, chunk_size=2000, overlap=100)
print(f"Samples: {samples[:2]}")
print(f"Extracted {len(samples)} samples from {len(file_ids)} files.")

Samples: [("[ Poems by William Blake 1789 ] SONGS OF INNOCENCE AND OF EXPERIENCE and THE BOOK of THEL SONGS OF INNOCENCE INTRODUCTION Piping down the valleys wild , Piping songs of pleasant glee , On a cloud I saw a child , And he laughing said to me : `` Pipe a song about a Lamb ! '' So I piped with merry cheer . `` Piper , pipe that song again ; '' So I piped : he wept to hear . `` Drop thy pipe , thy happy pipe ; Sing thy songs of happy cheer : ! '' So I sang the same again , While he wept with joy to hear . `` Piper , sit thee down and write In a book , that all may read . '' So he vanish 'd from my sight ; And I pluck 'd a hollow reed , And I made a rural pen , And I stain 'd the water clear , And I wrote my happy songs Every child may joy to hear . THE SHEPHERD How sweet is the Shepherd 's sweet lot ! From the morn to the evening he stays ; He shall follow his sheep all the day , And his tongue shall be filled with praise . For he hears the lambs ' innocent call , And he hears th

## Style Feature Extractor

### Character Features

In [None]:
def extract_character_features(text):
    """Extract character-level features"""
    features = {}
    
    # Average word length
    words = word_tokenize(text.lower())
    if words:
        features['avg_word_length'] = sum(len(word) for word in words) / len(words)
    else:
        features['avg_word_length'] = 0
        
    # Punctuation frequency
    punct_count = sum(1 for char in text if char in ".,;:!?-\"'()[]{}")
    features['punct_freq'] = punct_count / len(text) if len(text) > 0 else 0
    
    # Uppercase frequency
    uppercase_count = sum(1 for char in text if char.isupper())
    features['uppercase_freq'] = uppercase_count / len(text) if len(text) > 0 else 0
    
    # Digit frequency
    digit_count = sum(1 for char in text if char.isdigit())
    features['digit_freq'] = digit_count / len(text) if len(text) > 0 else 0
    
    # Character n-grams (bigrams and trigrams)
    char_bigrams = [text[i:i+2] for i in range(len(text)-1)]
    char_trigrams = [text[i:i+3] for i in range(len(text)-2)]
    
    # Top 5 character bigrams and trigrams
    if char_bigrams:
        bigram_counts = Counter(char_bigrams)
        for i, (bigram, _) in enumerate(bigram_counts.most_common(5)):
            features[f'top_char_bigram_{i+1}'] = bigram
            
    if char_trigrams:
        trigram_counts = Counter(char_trigrams)
        for i, (trigram, _) in enumerate(trigram_counts.most_common(5)):
            features[f'top_char_trigram_{i+1}'] = trigram

    return features

#### Test extract_character_features()

In [26]:
file_ids = ['blake-poems.txt', 'shakespeare-macbeth.txt']
samples = get_samples(file_ids, chunk_size=2000, overlap=100)
texts, authors = zip(*samples)
features = [extract_character_features(text) for text in texts]
print(f"Features: {features[:2]}")
print(f"Extracted features for {len(features)} samples.")

Features: [{'avg_word_length': 3.4495, 'punct_freq': 0.04472915261856597, 'uppercase_freq': 0.06316026073274893, 'digit_freq': 0.0004495392222971454, 'top_char_bigram_1': 'e ', 'top_char_bigram_2': 'he', 'top_char_bigram_3': ' ,', 'top_char_bigram_4': ', ', 'top_char_bigram_5': 'd ', 'top_char_trigram_1': ' , ', 'top_char_trigram_2': ' th', 'top_char_trigram_3': 'the', 'top_char_trigram_4': 'nd ', 'top_char_trigram_5': 'he '}, {'avg_word_length': 3.564, 'punct_freq': 0.04360688068368577, 'uppercase_freq': 0.055878163690150105, 'digit_freq': 0.0, 'top_char_bigram_1': 'e ', 'top_char_bigram_2': 'he', 'top_char_bigram_3': ' ,', 'top_char_bigram_4': ', ', 'top_char_bigram_5': 'th', 'top_char_trigram_1': ' , ', 'top_char_trigram_2': ' th', 'top_char_trigram_3': 'the', 'top_char_trigram_4': 'he ', 'top_char_trigram_5': 'nd '}]
Extracted features for 15 samples.


### Word Features

In [9]:
def extract_word_features(text):
    """Extract word-level features"""
    features = {}
    words = word_tokenize(text.lower())
    
    if not words:
        return {'vocab_richness': 0, 'avg_sentence_length': 0}
    
    # Vocabulary richness (type-token ratio)
    features['vocab_richness'] = len(set(words)) / len(words)
    
    # Average sentence length
    sentences = sent_tokenize(text)
    if sentences:
        words_per_sentence = [len(word_tokenize(s)) for s in sentences]
        features['avg_sentence_length'] = sum(words_per_sentence) / len(sentences)
    else:
        features['avg_sentence_length'] = 0
        
    # Function word usage (common words like 'the', 'and', 'of', etc.)
    function_words = ['the', 'and', 'of', 'to', 'a', 'in', 'that', 'is', 'was', 'for']
    for word in function_words:
        features[f'freq_{word}'] = words.count(word) / len(words)
        
    return features

# extract_word_features("This is a test string. It contains some punctuation, numbers 123, and uppercase letters. The quick brown fox jumps over the lazy dog.")

#### Test extract_word_features()

In [27]:
file_ids = ['blake-poems.txt', 'shakespeare-macbeth.txt']
samples = get_samples(file_ids, chunk_size=2000, overlap=100)
texts, authors = zip(*samples)
features = [extract_word_features(text) for text in texts]
print(f"Features: {features[:2]}")
print(f"Extracted features for {len(features)} samples.")

Features: [{'vocab_richness': 0.2765, 'avg_sentence_length': 25.641025641025642, 'freq_the': 0.047, 'freq_and': 0.0465, 'freq_of': 0.015, 'freq_to': 0.0115, 'freq_a': 0.0155, 'freq_in': 0.013, 'freq_that': 0.0045, 'freq_is': 0.0065, 'freq_was': 0.0065, 'freq_for': 0.0065}, {'vocab_richness': 0.3085, 'avg_sentence_length': 20.61855670103093, 'freq_the': 0.0475, 'freq_and': 0.0375, 'freq_of': 0.015, 'freq_to': 0.0105, 'freq_a': 0.0125, 'freq_in': 0.014, 'freq_that': 0.005, 'freq_is': 0.0075, 'freq_was': 0.0005, 'freq_for': 0.0055}]
Extracted features for 15 samples.


### Syntax Features

In [10]:
def extract_syntax_features(text):
    """Extract syntax-level features"""
    features = {}
    
    # POS tag frequencies
    words = word_tokenize(text)
    if not words:
        return {'noun_freq': 0, 'verb_freq': 0, 'adj_freq': 0, 'adv_freq': 0}
        
    pos_tags = pos_tag(words)
    pos_counts = Counter(tag for _, tag in pos_tags)
    
    # Calculate frequencies of main POS categories
    total_tags = len(pos_tags)
    features['noun_freq'] = sum(pos_counts[tag] for tag in pos_counts if tag.startswith('NN')) / total_tags
    features['verb_freq'] = sum(pos_counts[tag] for tag in pos_counts if tag.startswith('VB')) / total_tags
    features['adj_freq'] = sum(pos_counts[tag] for tag in pos_counts if tag.startswith('JJ')) / total_tags
    features['adv_freq'] = sum(pos_counts[tag] for tag in pos_counts if tag.startswith('RB')) / total_tags
    
    return features

# extract_syntax_features("This is a test string. It contains some punctuation, numbers 123, and uppercase letters. The quick brown fox jumps over the lazy dog.")

#### Test extract_syntax_features()

In [28]:
file_ids = ['blake-poems.txt', 'shakespeare-macbeth.txt']
samples = get_samples(file_ids, chunk_size=2000, overlap=100)
texts, authors = zip(*samples)
features = [extract_syntax_features(text) for text in texts]
print(f"Features: {features[:2]}")
print(f"Extracted features for {len(features)} samples.")

Features: [{'noun_freq': 0.268, 'verb_freq': 0.1175, 'adj_freq': 0.0675, 'adv_freq': 0.029}, {'noun_freq': 0.281, 'verb_freq': 0.1385, 'adj_freq': 0.046, 'adv_freq': 0.0365}]
Extracted features for 15 samples.


### All Features

In [None]:
def extract_all_features(text):
    """Extract all stylometric features from text"""
    character_features = extract_character_features(text)
    word_features = extract_word_features(text)
    syntax_features = extract_syntax_features(text)
    
    # Combine all features
    all_features = {**character_features, **word_features, **syntax_features}
        
    return all_features

#### Test extract_all_features()

In [29]:
file_ids = ['blake-poems.txt', 'shakespeare-macbeth.txt']
samples = get_samples(file_ids, chunk_size=2000, overlap=100)
texts, authors = zip(*samples)
features = [extract_all_features(text) for text in texts]
print(f"Features: {features[:2]}")
print(f"Extracted features for {len(features)} samples.")

Features: [{'avg_word_length': 3.4495, 'punct_freq': 0.04472915261856597, 'uppercase_freq': 0.06316026073274893, 'digit_freq': 0.0004495392222971454, 'top_char_bigram_1': 'e ', 'top_char_bigram_2': 'he', 'top_char_bigram_3': ' ,', 'top_char_bigram_4': ', ', 'top_char_bigram_5': 'd ', 'top_char_trigram_1': ' , ', 'top_char_trigram_2': ' th', 'top_char_trigram_3': 'the', 'top_char_trigram_4': 'nd ', 'top_char_trigram_5': 'he ', 'vocab_richness': 0.2765, 'avg_sentence_length': 25.641025641025642, 'freq_the': 0.047, 'freq_and': 0.0465, 'freq_of': 0.015, 'freq_to': 0.0115, 'freq_a': 0.0155, 'freq_in': 0.013, 'freq_that': 0.0045, 'freq_is': 0.0065, 'freq_was': 0.0065, 'freq_for': 0.0065, 'noun_freq': 0.268, 'verb_freq': 0.1175, 'adj_freq': 0.0675, 'adv_freq': 0.029}, {'avg_word_length': 3.564, 'punct_freq': 0.04360688068368577, 'uppercase_freq': 0.055878163690150105, 'digit_freq': 0.0, 'top_char_bigram_1': 'e ', 'top_char_bigram_2': 'he', 'top_char_bigram_3': ' ,', 'top_char_bigram_4': ', ',

## Transformer Feature Extractor

In [33]:
def extract_embeddings(samples, max_length=512):
    """
    Extract embeddings from transformer model.
    
    Args:
        samples: List of text samples
        max_length: Maximum token length for the model
        
    Returns:
        Numpy array of embeddings
    """
    model_name='bert-base-uncased'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    
    embeddings = []
    
    for sample in samples:
        # Tokenize and prepare input
        inputs = tokenizer(sample, return_tensors="pt", padding="max_length", truncation=True, max_length=max_length)
        
        # Get model outputs
        with torch.no_grad():
            outputs = model(**inputs)
            
        # Use the [CLS] token embedding (first token) as the text representation
        # Shape: [batch_size, hidden_size]
        embedding = outputs.last_hidden_state[:, 0, :].numpy()
        embeddings.append(embedding.flatten())
        
    return np.array(embeddings)

#### Test extract_embeddings()

In [34]:
samples = get_samples(['blake-poems.txt', 'shakespeare-macbeth.txt'], chunk_size=50, overlap=10) # 7min
embeddings = extract_embeddings(samples)
print("Embeddings shape: ", embeddings.shape)

Embeddings shape:  (761, 1536)


## Author Attributor

### Feature Based
Author attribution pipeline using feature-based stylometry.

In [16]:
def prepare_data(samples):
    """
    Prepare data for training.
    
    Args:
        samples: List of (text, author) tuples
        
    Returns:
        feature_matrix: Feature matrix
        labels: Target labels
        feature_names: List of feature names
        author_to_idx: Mapping from author names to indices
    """
    texts, authors = zip(*samples)
    
    # Create label mapping
    unique_authors = sorted(set(authors))
    author_to_idx = {author: i for i, author in enumerate(unique_authors)}
    labels = np.array([author_to_idx[author] for author in authors])
    
    # Extract features
    features = [extract_all_features(text) for text in texts]
    feature_names = list(features[0].keys())
    # feature_matrix = np.array([[feat[name] for name in feature_names] for feat in features])
    feature_matrix = np.array([[float(feat.get(name, 0)) if str(feat.get(name, 0)).replace('.', '', 1).isdigit() else 0 for name in feature_names] for feat in features])

    # return feature_matrix, labels, author_to_idx
    return feature_matrix, labels, feature_names, author_to_idx

#### Test prepare_data()

In [35]:
samples = get_samples(['blake-poems.txt', 'shakespeare-macbeth.txt'], chunk_size=50, overlap=10)
feature_matrix, labels, feature_names, author_to_idx = prepare_data(samples)
print(f"Feature matrix shape: {feature_matrix.shape}")
print(f"Labels shape: {labels.shape}")
print(f"Feature names: {feature_names[:5]}")
print(f"Author to index mapping: {author_to_idx}")

Feature matrix shape: (761, 30)
Labels shape: (761,)
Feature names: ['avg_word_length', 'punct_freq', 'uppercase_freq', 'digit_freq', 'top_char_bigram_1']
Author to index mapping: {'blake-poems.txt': 0, 'shakespeare-macbeth.txt': 1}


In [17]:
def train(feature_matrix, labels):
    """
    Train the author attribution model.
    
    Args:
        feature_matrix: Feature matrix
        labels: Target labels
    """
    X_train, X_val, y_train, y_val = train_test_split(feature_matrix, labels, test_size=0.2, random_state=42)
    
    # Try both SVM and Random Forest
    svm_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('svm', SVC(kernel='rbf', probability=True))
    ])
    
    rf_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
    ])
    
    # Find best hyperparameters for SVM
    svm_param_grid = {
        'svm__C': [0.1, 1, 10, 100],
        'svm__gamma': ['scale', 'auto', 0.1, 0.01]
    }
    svm_grid = GridSearchCV(svm_pipeline, svm_param_grid, cv=5, scoring='f1_macro')
    svm_grid.fit(X_train, y_train)
    
    # Find best hyperparameters for Random Forest
    rf_param_grid = {
        'rf__n_estimators': [50, 100],
        'rf__max_depth': [None, 10, 20]
    }
    rf_grid = GridSearchCV(rf_pipeline, rf_param_grid, cv=5, scoring='f1_macro')
    rf_grid.fit(X_train, y_train)
    
    # Compare both models
    print(f"SVM best score: {svm_grid.best_score_}")
    print(f"Random Forest best score: {rf_grid.best_score_}")
    
    # Select the better model
    if svm_grid.best_score_ >= rf_grid.best_score_:
        model = svm_grid.best_estimator_
        print(f"Selected SVM with params: {svm_grid.best_params_}")
    else:
        model = rf_grid.best_estimator_
        print(f"Selected Random Forest with params: {rf_grid.best_params_}")
    
    # Evaluate on validation set
    y_pred = model.predict(X_val)
    print("\nValidation Set Performance:")
    print(classification_report(y_val, y_pred))
    
    return model

#### Test train()

In [36]:
samples = get_samples(['blake-poems.txt', 'shakespeare-macbeth.txt'], chunk_size=50, overlap=10)
feature_matrix, labels, feature_names, author_to_idx = prepare_data(samples)
model = train(feature_matrix, labels)

SVM best score: 0.8300702693644911
Random Forest best score: 0.8273241673068202
Selected SVM with params: {'svm__C': 10, 'svm__gamma': 0.01}

Validation Set Performance:
              precision    recall  f1-score   support

           0       0.81      0.64      0.71        39
           1       0.89      0.95      0.92       114

    accuracy                           0.87       153
   macro avg       0.85      0.79      0.81       153
weighted avg       0.87      0.87      0.86       153



In [37]:
def predict(model, feature_matrix):
    """
    Predict authors for texts.
    
    Args:
        feature_matrix: Feature matrix for texts
        
    Returns:
        Predicted author indices
    """

    return model.predict(feature_matrix)

#### Test predict()

In [38]:
samples = get_samples(['blake-poems.txt', 'shakespeare-macbeth.txt'], chunk_size=50, overlap=10)
feature_matrix, labels, feature_names, author_to_idx = prepare_data(samples)
model = train(feature_matrix, labels)
predictions = predict(model, feature_matrix)
print('Predictions:', predictions)

SVM best score: 0.8300702693644911
Random Forest best score: 0.8273241673068202
Selected SVM with params: {'svm__C': 10, 'svm__gamma': 0.01}

Validation Set Performance:
              precision    recall  f1-score   support

           0       0.81      0.64      0.71        39
           1       0.89      0.95      0.92       114

    accuracy                           0.87       153
   macro avg       0.85      0.79      0.81       153
weighted avg       0.87      0.87      0.86       153

Predictions: [0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 1 1 1 0 0 0 0 1 1 1 1 1 1 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0
 1 0 0 0 0 1 0 0 1 1 1 0 0 0 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0
 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

### Transformer Based

In [41]:
def prepare_data(samples):
    """
    Prepare data for training.
    
    Args:
        samples: List of (text, author) tuples
        
    Returns:
        embeddings: Embeddings from transformer model
        labels: Target labels
        feature_names: List of feature names
        author_to_idx: Mapping from author names to indices
    """
    texts, authors = zip(*samples)
    
    # Create label mapping
    unique_authors = sorted(set(authors))
    author_to_idx = {author: i for i, author in enumerate(unique_authors)}
    labels = np.array([author_to_idx[author] for author in authors])
    # Convert labels to torch tensors
    labels = torch.tensor(labels)
    
    # Extract embeddings
    embeddings = extract_embeddings(texts)
    
    return embeddings, labels, author_to_idx

#### Test prepare_data()

In [42]:
samples = get_samples(['blake-poems.txt', 'shakespeare-macbeth.txt'], chunk_size=50, overlap=10)
embeddings, labels, author_to_idx = prepare_data(samples)
print(f"Embeddings shape: {embeddings.shape}")

Embeddings shape: (761, 768)


In [None]:
def train(embeddings, labels):

    
    return model

#### Test train()

In [None]:
samples = get_samples(['blake-poems.txt', 'shakespeare-macbeth.txt'], chunk_size=50, overlap=10)
embeddings, labels, author_to_idx = prepare_data(samples)
model = train(embeddings, labels)