## Import Libraries

In [36]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import os
import nltk
import re

## Load the Books

In [37]:
book_list = os.listdir("W:\CSE713-Advanced-Syntactic-Pattern-Recognition\Projects\Text Classification for Authorship Attribution\dataset_02_personally_collected_data\data_before_processing")
book_list

['charles_dickens-great_expectations.txt',
 'charles_dickens-oliver_twist.txt',
 'charles_dickens-sketches_of_young_couples.txt',
 'charles_dickens-somebodys_luggage.txt',
 'charles_dickens-the_mystery_of_edwin_drood.txt',
 'f_scott_fitzgerald-the_beautiful_and_damned.txt',
 'f_scott_fitzgerald-the_great_gatsby.txt',
 'f_scott_fitzgerald-this_side_of_paradise.txt',
 'george_eliot-middlemarch.txt',
 'george_eliot-silas_marner.txt',
 'george_eliot-the_mill_on_the_floss.txt',
 'james_joyce-a_portrait_of_the_artist_as_a_young_man.txt',
 'james_joyce-dubliners.txt',
 'james_joyce-ulysses.txt',
 'jane_austen-emma.txt',
 'jane_austen-pride_and_prejudice.txt',
 'jane_austen-sense_and_sensibility.txt',
 'joseph_conrad-heart_of_darkness.txt',
 'joseph_conrad-lord_jim.txt',
 'joseph_conrad-the_rover.txt',
 'mark_twain-adventures_of_huckleberry_finn.txt',
 'mark_twain-the_adventures_of_tom_sawyer_part_1.txt',
 'mark_twain-the_adventures_of_tom_sawyer_part_2.txt',
 'mark_twain-the_adventures_of_tom

In [38]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Creating Book Dataframe

In [53]:
import os
import pandas as pd

def parse_filename(filename):
    """
    Parse the filename to extract the author and book name.
    Assumes filename format is 'name-of-author_book-name.txt'
    """
    # Remove the file extension and split by the first underscore
    author, book_name = filename[:-4].split('-', 1)
    # Replace hyphens with spaces for author and book name
    author = author.replace('_', ' ')
    book_name = book_name.replace('_', ' ')
    return book_name, author

def process_files_for_metadata(directory):
    data = []
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            book_name, author = parse_filename(filename)
            data.append({'Book Name': book_name, 'Author': author})
    return data

def create_metadata_dataframe(data):
    """
    Create a dataframe from the provided metadata list
    """
    return pd.DataFrame(data)

# Change 'your-directory-path' to the path of the directory containing your txt files
directory_path = 'W:\CSE713-Advanced-Syntactic-Pattern-Recognition\Projects\Text Classification for Authorship Attribution\dataset_02_personally_collected_data\data_before_processing'
metadata = process_files_for_metadata(directory_path)
metadata_df = create_metadata_dataframe(metadata)
print(metadata_df.head())
metadata_df.to_csv('../dataset_02_personally_collected_data/data_after_processing/books_df.csv', index=False)  # Save the dataframe to a CSV file


                    Book Name           Author
0          great expectations  charles dickens
1                oliver twist  charles dickens
2   sketches of young couples  charles dickens
3           somebodys luggage  charles dickens
4  the mystery of edwin drood  charles dickens


### Creating Sentence Dataframe

In [54]:
import os
import re
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt')

def extract_author(filename):
    """
    Extract the author's name from the filename.
    Assumes filename format is 'name-of-author_book-name.txt'
    """
    return filename.split('_')[0].replace('-', ' ')

def process_files(directory):
    data = []
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            author = extract_author(filename)
            path = os.path.join(directory, filename)
            with open(path, 'r', encoding='utf-8') as file:
                text = file.read()
                sentences = sent_tokenize(text)
                if len(sentences) > 200:
                    sampled_sentences = sentences[:200]
                else:
                    sampled_sentences = sentences
                for sentence in sampled_sentences:
                    data.append({'Text': sentence, 'Label': author})
    return data

def create_dataframe(data):
    """
    Create a dataframe from the provided data list
    """
    return pd.DataFrame(data)

# Change 'your-directory-path' to the path of the directory containing your txt files
directory_path = '../dataset_02_personally_collected_data\data_before_processing'
data = process_files(directory_path)
df = create_dataframe(data)
print(df.head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                                Text    Label
0                                        ﻿Chapter I.  charles
1  My father’s family name being Pirrip, and my C...  charles
2  So, I called myself Pip, and came to be called...  charles
3  I give Pirrip as my father’s family name, on t...  charles
4           Joe Gargery, who married the blacksmith.  charles


In [47]:
#checking first sentences
df["Text"][1]

'My father’s family name being Pirrip, and my Christian name Philip, my\ninfant tongue could make of both names nothing longer or more explicit\nthan Pip.'

In [46]:
df["Label"].unique()

array(['charles', 'f', 'george', 'james', 'jane', 'joseph', 'mark',
       'oscar', 'virginia', 'william'], dtype=object)

In [48]:
df.to_csv('../dataset_02_personally_collected_data/data_after_processing/', index=False)  # Save the dataframe to a CSV file