# Writing Style Authors Selection

Before the generation process, we select ten authors from the dataset. We look for novelists with at least 200k words available.

In [1]:
import os
import pandas as pd
import plotly.express as px
from src import Settings, FileUtils

In [2]:
# Instantiate the project settings object; the cells below read their input and
# output locations from `settings.paths`.
settings = Settings()

In [4]:
# Read the list of all author names (one name per line).
# The context manager closes the file handle immediately instead of leaving it to
# the garbage collector, and blank entries (e.g. produced by a trailing newline)
# are dropped so that no empty author name is handed to the book loader.
with open(settings.paths.ws_all_authors_filepath, 'r', encoding='utf-8') as authors_file:
    authors = [name for name in authors_file.read().split('\n') if name]
authors[:5]

['Abraham Lincoln',
 'Agatha Christie',
 'Albert Einstein',
 'Aldous Huxley',
 'Alexander Pope']

In [7]:
# Build a table with one row per book: author, title and whitespace-separated word count.
# Rows are collected in a plain list and converted to a DataFrame once; growing the
# frame with `df.loc[len(df)] = ...` inserts one row at a time, which is quadratic in
# the number of books.
book_rows = []
for author_name, books in FileUtils.read_books(authors, settings.paths.ws_raw_books_dir).items():
    for book in books:
        words_count = len(book['content'].split())
        book_rows.append((author_name, book["title"], words_count))

df = pd.DataFrame(book_rows, columns=['author', 'book', 'words_count'])

df.head()

Unnamed: 0,author,book,words_count
0,Abraham Lincoln,Lincoln Letters,1065
1,Abraham Lincoln,Lincoln's First Inaugural Address,3626
2,Abraham Lincoln,"Lincoln's Gettysburg Address, given November 1...",299
3,Abraham Lincoln,"Lincoln's Inaugurals, Addresses and Letters (S...",43640
4,Abraham Lincoln,Lincoln's Second Inaugural Address,703


In [8]:
# (number of books, number of columns) of the per-book table.
df.shape

(3037, 3)

In [10]:
# Save the per-book word counts as CSV, without writing the row index as a column.
df.to_csv(settings.paths.ws_all_books_csv_filepath, index=False)

In [11]:
def get_df_grouped_by_author(df, skip_first_row=False):
    """Sum `words_count` per author and rank the authors from most to fewest words.

    Args:
        df: DataFrame with at least the columns `author` and `words_count`.
        skip_first_row: when True, the top-ranked row is left out of the result.
            The remaining rows keep their rank labels, so the index then starts at 1.

    Returns:
        DataFrame with the columns `author` and `words_count`, ordered by
        `words_count` in descending order.
    """
    words_per_author = df.groupby('author')['words_count'].sum()
    ranking = words_per_author.sort_values(ascending=False).reset_index()
    if not skip_first_row:
        return ranking
    # Original author's note: the first row is the sum of all words of all authors,
    # which is why callers can ask for it to be removed.
    return ranking.iloc[1:]

In [12]:
# Rank all authors by their total word count; skip_first_row=True leaves out the
# top-ranked row of that ranking.
df_total_sorted = get_df_grouped_by_author(df, skip_first_row=True)
df_total_sorted.head()

Unnamed: 0,author,words_count
1,Anthony Trollope,9407287
2,Charlotte Mary Yonge,6584920
3,Charles Dickens,5725754
4,R M Ballantyne,5596709
5,James Fenimore Cooper,5426171


In [13]:
# Number of authors shown in a single bar chart.
num_of_authors_per_figure = 30
# Ceiling division: `len // n + 1` requested one extra, empty figure whenever the
# number of authors was an exact multiple of the page size.
num_of_figures = (len(df_total_sorted) + num_of_authors_per_figure - 1) // num_of_authors_per_figure

In [14]:
# One stacked bar chart per page of authors: one bar per author, one segment per book.
for figure_idx in range(num_of_figures):
    page = slice(figure_idx * num_of_authors_per_figure, (figure_idx + 1) * num_of_authors_per_figure)
    authors_on_page = df_total_sorted['author'][page]
    df_part = df[df['author'].isin(authors_on_page)]
    fig = px.bar(df_part, x='author', y='words_count', color='book', barmode='stack')
    # Order the bars by the authors' total word count, largest first.
    fig = fig.update_xaxes(categoryorder='total descending')
    # There is one colour per book, so the legend is hidden.
    fig.update_layout(showlegend=False)
    fig.show()

## Insights

As a reference we can use the "Federalist Papers", analyzed in the study "Inference in an Authorship Problem" by Frederick Mosteller and David L. Wallace. The analyzed texts together account for around 180,000 words.

Choose 10 authors to be analyzed, applying the following criteria:
- A total of at least 200,000 words in the dataset
- Stick to novels

Below is the list of the chosen authors, together with the excluded books that may not fit into the novel category.

In [15]:
# Selected authors. Each key is an author name and each value is the list of that
# author's book titles to exclude (an empty list excludes nothing).
# Format: Author : [Excluded books]
authors_10 = {
    "Joseph Conrad": [],
    "Zane Grey": [],
    "William Henry Hudson": [],
    "Benjamin Disraeli": [
        "Count Alarcos"
    ],
    "Lucy Maud Montgomery": [
        "Lucy Maud Montgomery Short Stories, 1896 to 1901",
        "Lucy Maud Montgomery Short Stories, 1902 to 1903",
        "Lucy Maud Montgomery Short Stories, 1904",
        "Lucy Maud Montgomery Short Stories, 1905 to 1906",
        "Lucy Maud Montgomery Short Stories, 1907 to 1908",
        "Lucy Maud Montgomery Short Stories, 1909 to 1922",
    ],
    "Mark Twain": [
        "Mark Twain's Speeches",
        "Essays on Paul Bourget",
        "Extracts From Adam's Diary",
        "The $30,000 Bequest and Other Stories",
        "Is Shakespeare Dead_",
    ],
    "Lewis Carroll": [
        "A Tangled Tale",
        "Eight or Nine Wise Words about Letter-Writing",
        "Feeding the Mind",
        "Phantasmagoria and Other Poems",
        "Rhyme_ And Reason_",
        "Songs From Alice in Wonderland and Through the Looking-Glass",
        "Symbolic Logic",
        "The Game of Logic",
        "The Hunting of the Snark",
        "Three Sunsets and Other Poems",
    ],
    "Virginia Woolf": [],
    "George Eliot": [
        "How Lisa Loved the King",
        "The Essays of George Eliot"
    ],
    "Howard Pyle": [
        "The Story of Sir Launcelot and His Companions",
        "The Story of the Champions of the Round Table"
    ]
}

In [16]:
def filter_books(df, authors_books):
    """Drop the listed books of the listed authors from a per-book table.

    Args:
        df: DataFrame with at least the columns `author` and `book`.
        authors_books: mapping of author name -> collection of book titles to
            remove for that author. A title is only removed from rows of the
            author it is listed under.

    Returns:
        The rows of `df` that are not excluded, with their original index labels.
        The input frame is not modified.
    """
    remaining = df
    for author_name, titles_to_drop in authors_books.items():
        other_author = remaining['author'] != author_name
        title_is_kept = ~remaining['book'].isin(titles_to_drop)
        # A row survives unless it belongs to this author AND its title is listed.
        remaining = remaining[other_author | title_is_kept]
    return remaining

In [17]:
# Keep only the books of the selected authors, remove the excluded titles, and
# rank the authors by the number of words that remain.
df_10 = df.loc[df['author'].isin(list(authors_10))]
df_10_filtered = filter_books(df_10, authors_books=authors_10)
df_10_total_sorted = get_df_grouped_by_author(df_10_filtered, skip_first_row=False)
df_10_total_sorted.head(10)

Unnamed: 0,author,words_count
0,Mark Twain,2567422
1,Zane Grey,2353362
2,Joseph Conrad,2141653
3,Benjamin Disraeli,1673671
4,George Eliot,1599204
5,William Henry Hudson,1060531
6,Lucy Maud Montgomery,874326
7,Howard Pyle,494769
8,Virginia Woolf,375131
9,Lewis Carroll,202514


In [20]:
# Write the names of the selected authors to a text file, one name per line and
# in the row order of `df_10_total_sorted`. The output directory is created if needed.
selected_authors = df_10_total_sorted['author'].tolist()
authors_text = '\n'.join(selected_authors)
os.makedirs(settings.paths.ws_selected_dir_path, exist_ok=True)
with open(settings.paths.ws_selected_authors_filepath, 'w', encoding='utf-8') as authors_out:
    authors_out.write(authors_text)

In [21]:
# Save the books of the selected authors (after removing the excluded titles) as CSV,
# without writing the row index as a column.
df_10_filtered.to_csv(settings.paths.ws_selected_books_csv_filepath, index=False)