In [1]:
import os
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import *
from scipy.sparse import csr_matrix
import numpy as np
from scipy.spatial import distance
import numpy as np
from scipy.spatial.distance import cosine
import random

# Bootstrap settings: how many resampling rounds are run per target segment,
# and which share of the feature columns is drawn in each round.
n_iterations = 10_000
feature_proportion = 0.5

# Seed both the standard-library and the NumPy global generators.
random.seed(0)
np.random.seed(0)

In [2]:
# Build the grid of TfidfVectorizer keyword settings. Every combination of
# IDF weighting, analyzer type and n-gram size produces one settings dict,
# and each dict is later used to derive its own feature space from the corpus.

idf_settings = [True, False]
analyzers = ['word', 'char']
ngram_ranges = range(1,5)

vectorizers_settings = [
    {
        'use_idf': use_idf,
        'max_features': 10000,
        'analyzer': analyzer_kind,
        'min_df': 2,
        'lowercase': True,
        'norm': 'l1',
        # Lower and upper bound are equal: one n-gram size per setting.
        'ngram_range': (ngram_size, ngram_size),
    }
    for use_idf in idf_settings
    for analyzer_kind in analyzers
    for ngram_size in ngram_ranges
]

print(len(vectorizers_settings))
vectorizers_settings

16


[{'use_idf': True,
  'max_features': 10000,
  'analyzer': 'word',
  'min_df': 2,
  'lowercase': True,
  'norm': 'l1',
  'ngram_range': (1, 1)},
 {'use_idf': True,
  'max_features': 10000,
  'analyzer': 'word',
  'min_df': 2,
  'lowercase': True,
  'norm': 'l1',
  'ngram_range': (2, 2)},
 {'use_idf': True,
  'max_features': 10000,
  'analyzer': 'word',
  'min_df': 2,
  'lowercase': True,
  'norm': 'l1',
  'ngram_range': (3, 3)},
 {'use_idf': True,
  'max_features': 10000,
  'analyzer': 'word',
  'min_df': 2,
  'lowercase': True,
  'norm': 'l1',
  'ngram_range': (4, 4)},
 {'use_idf': True,
  'max_features': 10000,
  'analyzer': 'char',
  'min_df': 2,
  'lowercase': True,
  'norm': 'l1',
  'ngram_range': (1, 1)},
 {'use_idf': True,
  'max_features': 10000,
  'analyzer': 'char',
  'min_df': 2,
  'lowercase': True,
  'norm': 'l1',
  'ngram_range': (2, 2)},
 {'use_idf': True,
  'max_features': 10000,
  'analyzer': 'char',
  'min_df': 2,
  'lowercase': True,
  'norm': 'l1',
  'ngram_range': (

In [4]:
# Token lengths of the segment sets to process. Only the 1000-token set is
# enabled; the full list is kept for reference.
# segment_lengths = [1000, 5000, 10000]
segment_lengths = [1000]

for segment_length in segment_lengths:

    print("################ LOADING SEGMENTS ################")
    print(segment_length)

    # Work inside the directory holding the segments of this length.
    # NOTE(review): `top_dir` is not defined in the visible cells - presumably
    # set in an earlier cell; confirm before running on a fresh kernel.
    os.chdir(top_dir + "{}token_segments".format(segment_length))

    # File names are split on '-' into author, date, title and
    # "<segment number>.<extension>".
    txt_names = [entry for entry in os.listdir() if entry.endswith(".txt")]
    name_fields = [entry.split('-') for entry in txt_names]
    authors = [fields[0] for fields in name_fields]
    dates = [fields[1] for fields in name_fields]
    titles = [fields[2] for fields in name_fields]
    segment_numbers = [fields[3].split(".")[0] for fields in name_fields]
    texts = []
    for entry in txt_names:
        with open(entry, encoding='utf8') as handle:
            texts.append(handle.read())

    # One row per segment. The segment number is made numeric so that the
    # rows sort in numeric rather than lexical order.
    authors_segments = (
        pd.DataFrame({
            'author': authors,
            'date': dates,
            'title': titles,
            'segment_number': segment_numbers,
            'text': texts,
        })
        .assign(segment_number=lambda frame: pd.to_numeric(frame.segment_number))
        .sort_values(by=['author', 'date', 'segment_number'])
    )
    print(authors_segments.head())

    # Metadata only (no text); used below to select rows by author.
    metadata_columns = ['author', 'date', 'title', 'segment_number']
    authors_features = authors_segments[metadata_columns]

    # Each vectorizer setting yields one min-max scaled document-term matrix.
    # The matrices are joined column-wise, so the result has one row per
    # segment and the columns of all feature spaces side by side.
    feature_spaces = []

    print("\n################ GETTING SEGMENT FEATURE SPACES ################")
    for vectorizer_setting in tqdm(vectorizers_settings):
        vectorizer = TfidfVectorizer(**vectorizer_setting)
        docterm_matrix = vectorizer.fit_transform(authors_segments.text).toarray()
        scaler = MinMaxScaler()
        scaled_matrix = scaler.fit_transform(docterm_matrix)
        feature_spaces.append(scaled_matrix)

    feature_spaces_array = np.concatenate(feature_spaces, axis=1)
    n_feature_columns = feature_spaces_array.shape[1]

    # Rows of the combined feature matrix, grouped per candidate author.
    author_labels = authors_features['author']
    candidates = {
        name: feature_spaces_array[(author_labels == name).to_numpy()]
        for name in ("koontz", "king", "straub", "harris")
    }

    bachman = feature_spaces_array[(author_labels == 'bachman').to_numpy()]

    num_random_features = int(n_feature_columns * feature_proportion)

    # For every Bachman segment, repeat n_iterations times: draw a random
    # subset of feature columns, then for each candidate author draw one random
    # segment and record its cosine distance to the Bachman segment on that
    # subset. The draws use the global NumPy generator, so their order matters.
    print("\n################ GETTING COSINE DISTANCES ################")
    results = []
    for target_idx, target_row in tqdm(enumerate(bachman), total=len(bachman)):
        for draw in range(n_iterations):
            sampled_columns = np.random.choice(n_feature_columns, num_random_features, replace=False)
            target_vector = target_row[sampled_columns]

            for candidate_name, candidate_rows in candidates.items():
                picked_row = np.random.choice(candidate_rows.shape[0])
                candidate_vector = candidate_rows[picked_row, sampled_columns]
                distance_value = cosine(target_vector, candidate_vector)
                results.append((target_idx, draw, candidate_name, distance_value))

    print("\n################ SAVING DISTANCES ################")
    result_columns = ('target index', 'bootstrap iteration', 'candidate', 'distance')
    df = pd.DataFrame(results, columns = result_columns)
    print(df.head())
    # Both CSV files are written into top_dir.
    os.chdir(top_dir)
    distances_csv = "bachman_segments_candidate_distance_{}tok.csv".format(segment_length)
    df.to_csv(distances_csv, encoding='utf-8')

    # One row per (target segment, iteration), one column per candidate;
    # distances are ranked across the candidates within each row.
    print("\n################ SAVING RANKS ################")
    distance_table = df.pivot(index=['target index', 'bootstrap iteration'], columns='candidate')['distance']
    pivoted_df = distance_table.rank(axis=1)
    ranks_csv = "bachman_segments_candidate_rank_{}tok.csv".format(segment_length)
    pivoted_df.to_csv(ranks_csv, encoding='utf-8')

################ LOADING SEGMENTS ################
1000
     author     date          title  segment_number  \
0   bachman  1966.67  The_Long_Walk               0   
1   bachman  1966.67  The_Long_Walk               1   
12  bachman  1966.67  The_Long_Walk               2   
23  bachman  1966.67  The_Long_Walk               3   
34  bachman  1966.67  The_Long_Walk               4   

                                                 text  
0   part one starting out the long walk chapter 1 ...  
1   hand and waved the tears were flowing now he c...  
12  the major said sweeping them with the blank le...  
23  nt he had forgotten the major s fingers droppe...  
34  in garraty s belly that felt like a sticky bal...  

################ GETTING SEGMENT FEATURE SPACES ################


100%|██████████████████████████████████████████████████████████████████████████████████| 16/16 [02:39<00:00,  9.99s/it]



################ GETTING COSINE DISTANCES ################


100%|█████████████████████████████████████████████████████████████████████████████| 603/603 [10:21:42<00:00, 61.86s/it]



################ SAVING DISTANCES ################
   target index  bootstrap iteration candidate  distance
0             0                    0    koontz  0.692278
1             0                    0      king  0.708261
2             0                    0    straub  0.676044
3             0                    0    harris  0.670621
4             0                    1    koontz  0.716662

################ SAVING RANKS ################
