In [25]:
import pandas as pd

# Step 1: Load the dataset
file_path = "datasets/project_gutenberg_catalog.csv"  # Replace with the actual path to your CSV file
df_gutenberg = pd.read_csv(file_path)

# Display the first few rows
print("Gutenberg Dataset Loaded:")
print(df_gutenberg.head())

# Display the column names
print("\nFields in the Gutenberg Dataset:")
for col in df_gutenberg.columns:
    print(col)


Gutenberg Dataset Loaded:
   Text#  Type      Issued                                              Title  \
0      1  Text  1971-12-01  The Declaration of Independence of the United ...   
1      2  Text  1972-12-01  The United States Bill of Rights\r\nThe Ten Or...   
2      3  Text  1973-11-01                John F. Kennedy's Inaugural Address   
3      4  Text  1973-11-01  Lincoln's Gettysburg Address\r\nGiven November...   
4      5  Text  1975-12-01                     The United States Constitution   

  Language                                        Authors  \
0       en                   Jefferson, Thomas, 1743-1826   
1       en                                  United States   
2       en  Kennedy, John F. (John Fitzgerald), 1917-1963   
3       en                    Lincoln, Abraham, 1809-1865   
4       en                                  United States   

                                            Subjects      LoCC  \
0  United States -- History -- Revolution, 1775-1...  

In [26]:
# Step 2: Check for missing values
print("\nMissing Values in Each Field:")
missing_values = df_gutenberg.isnull().sum()
print(missing_values)



Missing Values in Each Field:
Text#            0
Type             0
Issued           0
Title            0
Language         0
Authors        170
Subjects        58
LoCC           262
Bookshelves    122
dtype: int64


In [27]:
# Step 3: Remove unnecessary fields
fields_to_keep = ['Issued', 'Title', 'Language', 'Authors']
df_gutenberg_cleaned = df_gutenberg[fields_to_keep]

print("\nPreprocessed Gutenberg Dataset (Preview):")
print(df_gutenberg_cleaned.head())



Preprocessed Gutenberg Dataset (Preview):
       Issued                                              Title Language  \
0  1971-12-01  The Declaration of Independence of the United ...       en   
1  1972-12-01  The United States Bill of Rights\r\nThe Ten Or...       en   
2  1973-11-01                John F. Kennedy's Inaugural Address       en   
3  1973-11-01  Lincoln's Gettysburg Address\r\nGiven November...       en   
4  1975-12-01                     The United States Constitution       en   

                                         Authors  \
0                   Jefferson, Thomas, 1743-1826   
1                                  United States   
2  Kennedy, John F. (John Fitzgerald), 1917-1963   
3                    Lincoln, Abraham, 1809-1865   
4                                  United States   

                                            Subjects  \
0  United States -- History -- Revolution, 1775-1...   
1  Civil rights -- United States -- Sources; Unit...   
2  United Sta

In [28]:
# Step 4: Remove records with any missing values
df_gutenberg_filtered = df_gutenberg_cleaned.dropna()

# Verify the result
print("\nRecords after removing rows with missing values:")
print(f"Remaining records: {len(df_gutenberg_filtered)}")
print(df_gutenberg_filtered.head())


Records after removing rows with missing values:
Remaining records: 74594
       Issued                                              Title Language  \
0  1971-12-01  The Declaration of Independence of the United ...       en   
1  1972-12-01  The United States Bill of Rights\r\nThe Ten Or...       en   
2  1973-11-01                John F. Kennedy's Inaugural Address       en   
3  1973-11-01  Lincoln's Gettysburg Address\r\nGiven November...       en   
4  1975-12-01                     The United States Constitution       en   

                                         Authors  \
0                   Jefferson, Thomas, 1743-1826   
1                                  United States   
2  Kennedy, John F. (John Fitzgerald), 1917-1963   
3                    Lincoln, Abraham, 1809-1865   
4                                  United States   

                                            Subjects  \
0  United States -- History -- Revolution, 1775-1...   
1  Civil rights -- United States -- S

In [29]:
import pandas as pd
import re

def parse_authors(authors):
    if pd.isna(authors):
        return []
    
    authors_list = authors.split(";")
    parsed_authors = []
    
    for author in authors_list:
        author = author.strip()
        
        # First try to match the pattern with years
        match = re.match(r"([^,]+),\s*([^,\d]+),(?:\s+(\d{4}\??)(?:-(\d{4}\??))?)?$", author)
        
        if match:
            # Extract all parts
            surname, name, birth_year, death_year = match.groups()
            
            # Clean the name parts (remove commas and extra spaces)
            surname = surname.strip().replace(',', '')
            name = name.strip().replace(',', '')
            
            # Format the name with name first, then surname (order swapped here)
            author_name = f"{name} {surname}"  # Now name comes first, surname second
            
            # Clean up any multiple spaces
            author_name = ' '.join(author_name.split())
            
            parsed_authors.append({
                "author_name": author_name,
                "year_of_birth": birth_year if birth_year else None,
                "year_of_death": death_year if death_year else None
            })
        else:
            # If no match, just clean the string of commas and extra spaces
            clean_name = ' '.join(author.replace(',', '').split())
            parsed_authors.append({
                "author_name": clean_name,
                "year_of_birth": None,
                "year_of_death": None
            })
    
    return parsed_authors

# Example usage:
test_authors = "Jefferson, Thomas, 1743-1826; Adams, John, 1735-1826"
result = parse_authors(test_authors)
print("Test result:", result)

# Apply to DataFrame
df_gutenberg_filtered['parsed_authors'] = df_gutenberg_filtered['Authors'].apply(parse_authors)
df_gutenberg_filtered.drop(columns=['Authors'], inplace=True)

print("\nUpdated Gutenberg Dataset with parsed authors:")
print(df_gutenberg_filtered.head())


Test result: [{'author_name': 'Thomas Jefferson', 'year_of_birth': '1743', 'year_of_death': '1826'}, {'author_name': 'John Adams', 'year_of_birth': '1735', 'year_of_death': '1826'}]

Updated Gutenberg Dataset with parsed authors:
       Issued                                              Title Language  \
0  1971-12-01  The Declaration of Independence of the United ...       en   
1  1972-12-01  The United States Bill of Rights\r\nThe Ten Or...       en   
2  1973-11-01                John F. Kennedy's Inaugural Address       en   
3  1973-11-01  Lincoln's Gettysburg Address\r\nGiven November...       en   
4  1975-12-01                     The United States Constitution       en   

                                            Subjects  \
0  United States -- History -- Revolution, 1775-1...   
1  Civil rights -- United States -- Sources; Unit...   
2  United States -- Foreign relations -- 1961-196...   
3  Consecration of cemeteries -- Pennsylvania -- ...   
4  United States -- Politic

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_gutenberg_filtered['parsed_authors'] = df_gutenberg_filtered['Authors'].apply(parse_authors)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_gutenberg_filtered.drop(columns=['Authors'], inplace=True)


In [30]:
import langcodes

# Function to convert language codes to full names
def convert_language_codes(lang_codes):
    try:
        # Split the language codes and process each one
        languages = [
            langcodes.get(code.strip()).language_name()
            for code in lang_codes.split(";")
        ]
        # Rejoin the converted language names with a delimiter
        return "; ".join(languages)
    except Exception as e:
        # Return the original value if there's any error
        return lang_codes

# Apply the function to the 'Language' column
df_gutenberg_filtered['Language'] = df_gutenberg_filtered['Language'].apply(convert_language_codes)

# Verify the updated 'Language' column
print(df_gutenberg_filtered['Language'].unique())


['English' 'Latin' 'Spanish' 'German; English' 'German; Latin' 'French'
 'Italian' 'English; French' 'Japanese' 'German' 'English; Chinese'
 'Welsh' 'Bulgarian' 'Portuguese' 'Dutch' 'Greek' 'Chinese' 'Hebrew'
 'English; Spanish' 'Russian' 'English; Hungarian' 'English; Korean'
 'Polish' 'Finnish' 'English; Latin' 'English; Esperanto'
 'English; Middle English' 'Esperanto' 'English; Swedish' 'Sanskrit'
 'Danish' 'English; Old English' 'Swedish' 'Welsh; English'
 'English; Aleut' 'Yiddish' 'Lithuanian'
 'Spanish; North American Indian languages' 'Serbian' 'Norwegian'
 'Catalan; German' 'Romanian' 'English; Nahuatl languages'
 'English; Khasi' 'Czech' 'Filipino' 'English; Italian' 'Catalan'
 'Icelandic' 'German; French' 'Mayan languages'
 'French; North American Indian languages' 'Spanish; Filipino' 'Iloko'
 'Interlingua' 'English; Spanish; Filipino' 'Irish' 'Spanish; Iloko'
 'Friulian' 'Afrikaans' 'English; Gamilaraay' 'Middle English' 'Occitan'
 'French; Italian' 'Neapolitan' 'Hungarian

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_gutenberg_filtered['Language'] = df_gutenberg_filtered['Language'].apply(convert_language_codes)


In [31]:
# Step 6: Renaming the "parsed_authors" columns to "authors"
df_gutenberg_filtered.rename(columns={"parsed_authors": "authors"}, inplace=True)
df_gutenberg_filtered.rename(columns={"Title": "title"}, inplace=True)
df_gutenberg_filtered.rename(columns={"Language": "language"}, inplace=True)
df_gutenberg_filtered.rename(columns={"Issued": "issued"}, inplace=True)
df_gutenberg_filtered.rename(columns={"Subjects": "subjects"}, inplace=True)
df_gutenberg_filtered.rename(columns={"Bookshelves": "bookshelves"}, inplace=True)

# Step 7: Reordering the columns
# Specify the desired column order
new_column_order = [
    "title",           # First column
    "authors",
    "issued",
    "language",
    "subjects",
    "bookshelves"
]

# Reorder the DataFrame columns
df_gutenberg_filtered = df_gutenberg_filtered[new_column_order]

# Verify the updated column order
print("Updated DataFrame columns:", df_gutenberg_filtered.columns)


Updated DataFrame columns: Index(['title', 'authors', 'issued', 'language', 'subjects', 'bookshelves'], dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_gutenberg_filtered.rename(columns={"parsed_authors": "authors"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_gutenberg_filtered.rename(columns={"Title": "title"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_gutenberg_filtered.rename(columns={"Language": "language"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/sta

In [32]:
import json

# Step 8: Save the preprocessed DataFrame to JSON
output_file = "datasets/preprocessed_gutenberg.json"

# Save the restored DataFrame to a JSON file (not JSONL)
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(df_gutenberg_filtered.to_dict(orient='records'), f, ensure_ascii=False, indent=4)

print(f"Preprocessed Gutenberg dataset saved to {output_file}")

Preprocessed Gutenberg dataset saved to datasets/preprocessed_gutenberg.json
