In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
# Seed NumPy's legacy global random number generator so NumPy-based randomness
# is repeatable between runs.
np.random.seed(0)
# Segment sizes (in tokens) to process; each value is substituted into the
# "{}token_segments" directory and CSV file names used further down.
segment_lengths = [1000, 5000, 10000]
# Make `top_dir` the working directory so relative paths resolve against it.
# NOTE(review): `top_dir` is not defined in this cell or anywhere visible in the
# notebook, so this line raises NameError on a fresh kernel. Define it before
# this point. Presumably it is the project root containing the "segments"
# folder and the rank CSVs -- confirm.
os.chdir(top_dir)


In [6]:
# For each segment length, tabulate how often every candidate author received
# each rank for every Bachman title, then write the tables to CSV.
#
# Inputs (all located under `top_dir`, which must already be defined):
#   segments/<n>token_segments/*.txt
#       file names are parsed as author-date-title-segmentnumber.txt
#   bachman_segments_author_candidate_ranks_<n>token_segments.csv
#       needs a 'target index' column plus one rank column per candidate author
# Outputs (written to the working directory, i.e. `top_dir`):
#   predicted_author_candidate_raw_counts_<n>token_segments.csv
#   predicted_author_candidate_proportions_<n>token_segments.csv

# Candidate authors; each one is a column of ranks in the rank CSV.
authors = ('harris', 'king', 'koontz', 'straub')
# Possible ranks are 1..number of candidates (previously a hard-coded range(1, 5)).
candidate_ranks = range(1, len(authors) + 1)

for segment_length in tqdm(segment_lengths):
    print("\n################ RANKS FOR SEGMENT LENGTHS {} ################".format(segment_length))
    print(segment_length)

    # Build the directory once with os.path.join so the same absolute location
    # is used for listing and opening, regardless of OS path separator or of
    # whether `top_dir` ends with a separator.
    segment_dir = os.path.join(top_dir, "segments", "{}token_segments".format(segment_length))

    # Metadata is recovered from the file name; `segment_authors` is kept
    # separate from the candidate tuple `authors` defined above.
    segment_authors, dates, titles, segment_numbers, texts = [], [], [], [], []
    for file in os.listdir(segment_dir):
        if not file.endswith(".txt"):
            continue
        name_parts = file.split('-')
        segment_authors.append(name_parts[0])
        dates.append(name_parts[1])
        titles.append(name_parts[2])
        # Drop the ".txt" extension from the trailing segment number.
        segment_numbers.append(name_parts[3].split(".")[0])
        # NOTE(review): the text is not used by the rank summary below; it is
        # still loaded so `authors_segments` keeps its 'text' column in case
        # other cells rely on it.
        with open(os.path.join(segment_dir, file), encoding='utf8') as f:
            texts.append(f.read())

    authors_segments = pd.DataFrame({
        'author': segment_authors,
        'date': dates,
        'title': titles,
        'segment_number': segment_numbers,
        'text': texts})

    # Convert segment number column to numeric so rows sort numerically
    # rather than lexicographically.
    authors_segments['segment_number'] = pd.to_numeric(authors_segments['segment_number'])
    authors_segments = authors_segments.sort_values(by=['author', 'date', 'segment_number'])
    authors_features = authors_segments[['author', 'date', 'title', 'segment_number']]
    # After reset_index the Bachman segments are numbered 0..n-1 in sorted
    # order; the join below matches 'target index' against that numbering.
    bachman_df = authors_features[authors_features['author'] == 'bachman'].reset_index()

    # The CSVs below are read from / written to the working directory.
    os.chdir(top_dir)
    ranks = pd.read_csv("bachman_segments_author_candidate_ranks_{}token_segments.csv".format(segment_length))
    ranks = ranks.set_index('target index')
    ranks_titles = ranks.join(bachman_df['title'])

    # A missing title means a rank row did not line up with any Bachman
    # segment. Previously this only surfaced later as a ZeroDivisionError.
    if ranks_titles['title'].isna().any():
        raise ValueError(
            "Some 'target index' values in the {}-token rank file have no matching "
            "bachman segment; check that the segment files and rank CSV agree.".format(segment_length))

    # One row per (title, rank) pair, titles in order of first appearance.
    book_titles = ranks_titles['title'].unique()
    title_rank_index = pd.MultiIndex.from_product(
        [book_titles, list(candidate_ranks)], names=["title", "rank"])
    title_author_rank_proportions = pd.DataFrame(columns=authors, index=title_rank_index)
    title_author_rank_counts = pd.DataFrame(columns=authors, index=title_rank_index)
    print("\n################ GETTING COUNTS AND PROPORTIONS OF RANKS ################")

    # For every Bachman book
    for title in book_titles:
        print(title)
        # Select this book's rows once instead of re-filtering the whole frame
        # for every (rank, author) combination.
        title_rows = ranks_titles.loc[ranks_titles['title'] == title]
        # Number of observations for the book (per the original note:
        # number of segments * 1000 iterations per segment).
        n_observations = title_rows.shape[0]
        # For every rank 1..len(authors)
        for rank in candidate_ranks:
            # Number of observations in which each candidate received this rank.
            raw_count_row = [int((title_rows[author] == rank).sum()) for author in authors]
            # Same counts expressed as a share of the book's observations.
            proportion_row = [count / n_observations for count in raw_count_row]
            title_author_rank_counts.loc[(title, rank), :] = raw_count_row
            title_author_rank_proportions.loc[(title, rank), :] = proportion_row

    # Raw counts are summed over titles (one row per rank); proportions are
    # written per (title, rank).
    title_author_rank_counts.groupby('rank').agg('sum').to_csv("predicted_author_candidate_raw_counts_{}token_segments.csv".format(segment_length))
    title_author_rank_proportions.to_csv("predicted_author_candidate_proportions_{}token_segments.csv".format(segment_length))

  0%|          | 0/3 [00:00<?, ?it/s]


################ RANKS FOR SEGMENT LENGTHS 1000 ################
1000

################ GETTING COUNTS AND PROPORTIONS OF RANKS ################
The_Long_Walk
Roadwork
Rage
Blaze
The_Running_Man
Thinner
The_Regulators


 33%|███▎      | 1/3 [04:14<08:29, 254.72s/it]


################ RANKS FOR SEGMENT LENGTHS 5000 ################
5000

################ GETTING COUNTS AND PROPORTIONS OF RANKS ################
The_Long_Walk
Roadwork
Rage
Blaze
The_Running_Man
Thinner
The_Regulators


 67%|██████▋   | 2/3 [05:02<02:12, 132.74s/it]


################ RANKS FOR SEGMENT LENGTHS 10000 ################
10000

################ GETTING COUNTS AND PROPORTIONS OF RANKS ################
The_Long_Walk
Roadwork
Rage
Blaze
The_Running_Man
Thinner
The_Regulators


100%|██████████| 3/3 [05:27<00:00, 109.32s/it]
