In [25]:
import pyspark
import pandas as pd
import wikipedia
from pyspark.sql import SparkSession

In [26]:
spark = (
    SparkSession
    .builder
    .appName("BookMetaLoader")
    .master("local[1]")               # only one core/thread
    .config("spark.ui.enabled", "false")  
    .getOrCreate()
)

In [27]:
pd.read_csv(r"C:\Users\farho\Downloads\selected_books_with_meta (1).csv")

Unnamed: 0,isbn,title,authors,description,subjects,text
0,0140244824,Songs in ordinary time,Mary McGarry Morris,,"Fiction, Divorced women, Problem families, Rom...",Songs in ordinary time Mary McGarry Morris Fic...
1,014025448X,At home in Mitford,Jan Karon,,"Fiction, Mitford (N.C. : Imaginary place), Epi...","At home in Mitford Jan Karon Fiction, Mitford ..."
2,0307030504,Bunnies' ABC (Little Golden Books),Garth Williams,,"Alphabet, Juvenile literature, English language",Bunnies' ABC (Little Golden Books) Garth Willi...
3,0307125165,My Book of Opposites (A little nugget book),Golden Books,,"Juvenile literature, English language, Synonym...",My Book of Opposites (A little nugget book) Go...
4,0345313860,The Vampire Lestat,Anne Rice,,"nobility, vampire novels, Fiction, Vampires, R...","The Vampire Lestat Anne Rice nobility, vampire..."
...,...,...,...,...,...,...
495,0829413502,The Seeker's Guide to Saints (Seeker Series (C...,Mitch Finley,,"Doctrines, Christian saints, Cult, Doctrinal T...",The Seeker's Guide to Saints (Seeker Series (C...
496,1560255285,Dark harbor,Ved Mehta,,"East Indian Americans, Social life and customs...","Dark harbor Ved Mehta East Indian Americans, S..."
497,0671836129,Glory Game,Janet Dailey,,,Glory Game Janet Dailey
498,0060080817,After,Francine Prose,,"Fiction, Conspiracies, School shootings, High ...","After Francine Prose Fiction, Conspiracies, Sc..."


In [28]:
from pyspark.sql import SparkSession

In [29]:
spark = (
    SparkSession
    .builder
    .appName("BookMetaLoader")
    .master("local[*]")   # use as many threads as you’ve got cores
    .getOrCreate()
)

In [30]:
print(type(spark))


<class 'pyspark.sql.session.SparkSession'>


In [31]:
from pyspark.sql import SparkSession
df_pyspark = (
    spark.read
         .option("header", "true")      # if your file has a header row
         .option("inferSchema", "true") # to auto-detect column types
         .csv(r"C:\Users\farho\Downloads\selected_books_with_meta (1).csv")
)


In [32]:
df_pyspark.show(5)
df_pyspark

+----------+--------------------+-------------------+-----------+--------------------+--------------------+
|      isbn|               title|            authors|description|            subjects|                text|
+----------+--------------------+-------------------+-----------+--------------------+--------------------+
|0140244824|Songs in ordinary...|Mary McGarry Morris|       NULL|Fiction, Divorced...|Songs in ordinary...|
|014025448X|  At home in Mitford|          Jan Karon|       NULL|Fiction, Mitford ...|At home in Mitfor...|
|0307030504|Bunnies' ABC (Lit...|     Garth Williams|       NULL|Alphabet, Juvenil...|Bunnies' ABC (Lit...|
|0307125165|My Book of Opposi...|       Golden Books|       NULL|Juvenile literatu...|My Book of Opposi...|
|0345313860|  The Vampire Lestat|          Anne Rice|       NULL|nobility, vampire...|The Vampire Lesta...|
+----------+--------------------+-------------------+-----------+--------------------+--------------------+
only showing top 5 rows


DataFrame[isbn: string, title: string, authors: string, description: string, subjects: string, text: string]

In [33]:
type(df_pyspark)
df_pyspark.printSchema()

root
 |-- isbn: string (nullable = true)
 |-- title: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- description: string (nullable = true)
 |-- subjects: string (nullable = true)
 |-- text: string (nullable = true)



In [36]:
import pandas as pd
import requests
import time

# --- 0) PARAMETERS ---
INPUT_CSV    = r"C:\Users\farho\Downloads\selected_books_with_meta (1).csv"
OUTPUT_CSV = r"C:\CUNY_MSDS\DATA612\PROJECT5\enriched_books_with_wiki.csv"
N_SENTENCES  = 3      # how many sentences in your summary
DELAY_SEC    = 0.1    # throttle between requests

# --- 1) Load your original data in pandas ---
orig_df = pd.read_csv(INPUT_CSV)

# --- 2) Define a helper that returns an N-sentence summary ---
def get_book_summary(title, sentences=N_SENTENCES):
    URL = "https://en.wikipedia.org/w/api.php"
    params = {
        "action":       "query",
        "prop":         "extracts",
        "exintro":      True,
        "explaintext":  True,
        "exsentences":  sentences,
        "redirects":    1,
        "titles":       title,
        "format":       "json",
    }
    try:
        r = requests.get(URL, params=params, timeout=5)
        r.raise_for_status()
        pages = r.json()["query"]["pages"]
        return next(iter(pages.values())).get("extract", "")
    except Exception:
        return ""

# --- 3) Build summaries for each title ---
summaries = []
for title in orig_df["title"]:
    summaries.append(get_book_summary(title))
    time.sleep(DELAY_SEC)  

# --- 4) Merge back to your DataFrame ---
orig_df["wiki_description"] = summaries

# --- 5) Quick peek ---
print(orig_df[["title","wiki_description"]].head())

# --- 6) Write out one CSV ---
orig_df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
print(f"✅ Written enriched file to {OUTPUT_CSV}")


                                         title  \
0                       Songs in ordinary time   
1                           At home in Mitford   
2           Bunnies' ABC (Little Golden Books)   
3  My Book of Opposites (A little nugget book)   
4                           The Vampire Lestat   

                                    wiki_description  
0                                                     
1                                                     
2                                                     
3                                                     
4  The Vampire Lestat (1985) is a vampire novel b...  
✅ Written enriched file to C:\CUNY_MSDS\DATA612\PROJECT5\enriched_books_with_wiki.csv


In [37]:
import requests
import pandas as pd
import time

# Load your CSV (pandas makes this simple)
df = pd.read_csv(r"C:\Users\farho\Downloads\selected_books_with_meta (1).csv")

def fetch_google_books_description(isbn, api_key=None):
    """
    Returns the 'description' field from Google Books for a given ISBN,
    or an empty string if not found.
    """
    base = "https://www.googleapis.com/books/v1/volumes"
    params = {"q": f"isbn:{isbn}"}
    if api_key:
        params["key"] = api_key
    r = requests.get(base, params=params, timeout=5)
    r.raise_for_status()
    data = r.json()
    items = data.get("items")
    if not items:
        return ""
    # Usually the first item is the match
    info = items[0].get("volumeInfo", {})
    return info.get("description", "")

# Test on the first five ISBNs
for isbn in df["isbn"].head():
    desc = fetch_google_books_description(isbn)
    print(f"{isbn} → {desc[:200]!r}…\n")

# Now batch-fetch for all (throttled so you don’t get rate-limited)
descs = []
for isbn in df["isbn"]:
    try:
        descs.append(fetch_google_books_description(isbn))
    except Exception:
        descs.append("")
    time.sleep(0.1)  # 10 requests/sec

# Add to your DataFrame and write out
df["google_description"] = descs
df.to_csv(
    r"C:\Users\farho\Downloads\books_with_google_blurbs.csv",
    index=False,
    encoding="utf-8"
)
print("✅ Done!  Check books_with_google_blurbs.csv")


0140244824 → "It's the summer of 1960 in Atkinson, Vermont. Maria Fermoyle is a strong but vulnerable divorced woman whose loneliness and ambition for her children make her easy prey for dangerous con man Omar Duva"…

014025448X → "The first novel in #1 New York Times bestselling author Jan Karon’s beloved series set in America’s favorite small town: Mitford. It's easy to feel at home in Mitford. In these high, green hills, the "…

0307030504 → 'Bunnies learn the alphabet.'…

0307125165 → ''…

0345313860 → "#1 New York Times Bestselling author - Surrender to fiction's greatest creature of the night - Book II of the Vampire Chronicles The vampire hero of Anne Rice’s enthralling novel is a creature of the "…

✅ Done!  Check books_with_google_blurbs.csv


In [38]:
import requests
import pandas as pd
import time

# 1) Load your CSV of books
df = pd.read_csv(r"C:\Users\farho\Downloads\selected_books_with_meta (1).csv")

# 2) Define a fetcher using the Open Library “data” endpoint
def fetch_openlibrary_description(isbn):
    """
    Returns the 'description' or first 'excerpt' from Open Library for a given ISBN,
    or empty string if none found.
    """
    url = "https://openlibrary.org/api/books"
    params = {
        "bibkeys": f"ISBN:{isbn}",
        "format": "json",
        "jscmd": "data"
    }
    try:
        resp = requests.get(url, params=params, timeout=5)
        resp.raise_for_status()
        data = resp.json()
        key = f"ISBN:{isbn}"
        info = data.get(key, {})
        # Try the ‘description’ field (could be string or dict)
        desc = info.get("description")
        if isinstance(desc, dict):
            desc = desc.get("value","")
        if isinstance(desc, str) and desc.strip():
            return desc.strip()
        # Fallback: take the first excerpt, if any
        excerpts = info.get("excerpts") or info.get("excerpt")
        if isinstance(excerpts, list) and excerpts:
            first = excerpts[0]
            return first.get("text","").strip()
        if isinstance(excerpts, dict):
            return excerpts.get("text","").strip()
        return ""
    except Exception:
        return ""

# 3) Test on a handful
for isbn in df["isbn"].head():
    print(isbn, "→", fetch_openlibrary_description(isbn)[:200], "…\n")

# 4) Batch‐fetch for all books (with polite throttling)
descs = []
for isbn in df["isbn"]:
    descs.append(fetch_openlibrary_description(isbn))
    time.sleep(0.1)

# 5) Add to your DataFrame & save
df["openlib_description"] = descs
output_path = r"C:\Users\farho\Downloads\books_with_openlib_blurbs.csv"
df.to_csv(output_path, index=False, encoding="utf-8")
print("✅ Done – saved to", output_path)


0140244824 →  …

014025448X →  …

0307030504 →  …

0307125165 →  …

0345313860 → I am the vampire Lestat. …

✅ Done – saved to C:\Users\farho\Downloads\books_with_openlib_blurbs.csv


In [39]:
import requests
import pandas as pd
import time

# 1) Config
NYT_KEY     = "lucRAUjAeldYWSt6n8MQv9cGvbhwqSqr"
INPUT_CSV   = r"C:\Users\farho\Downloads\selected_books_with_meta (1).csv"
OUTPUT_CSV  = r"C:\Users\farho\Downloads\books_with_nyt_reviews.csv"
DELAY_SEC   = 0.1   # polite throttle

# 2) Load your metadata
df = pd.read_csv(INPUT_CSV)

# 3) Fetch the first NYT review summary (if any) for an ISBN
def fetch_nyt_review(isbn, api_key=NYT_KEY):
    url    = "https://api.nytimes.com/svc/books/v3/reviews.json"
    params = {"isbn": isbn, "api-key": api_key}
    try:
        r = requests.get(url, params=params, timeout=5)
        r.raise_for_status()
        results = r.json().get("results", [])
        if not results:
            return ""
        # Take the first review’s summary
        return results[0].get("summary", "")
    except Exception:
        return ""

# 4) Batch‐fetch for all books
nyt_summaries = []
for isbn in df["isbn"]:
    nyt_summaries.append(fetch_nyt_review(isbn))
    time.sleep(DELAY_SEC)

# 5) Attach and save
df["nyt_review_summary"] = nyt_summaries
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
print(f"✅ Written NYT summaries to {OUTPUT_CSV}")


✅ Written NYT summaries to C:\Users\farho\Downloads\books_with_nyt_reviews.csv


In [None]:
print(results)

[('Songs in ordinary time', ''), ('At home in Mitford', ''), ("Bunnies' ABC (Little Golden Books)", ''), ('My Book of Opposites (A little nugget book)', ''), ('The Vampire Lestat', "The Vampire Lestat (1985) is a vampire novel by American writer Anne Rice, the second in her Vampire Chronicles, following Interview with the Vampire (1976). The story is told from the point of view of the vampire Lestat de Lioncourt, while Interview is narrated by Louis de Pointe du Lac.\nThe novel follows Lestat's rise to fame as a 1980's rockstar, his early experiences as a vampire, and his search for meaning and connection in his vampiric existence. It also offers some of Lestat's perspective on the events of Interview with the Vampire, which has been published as an in-universe novel by the journalist, Daniel Molloy.\n\n"), ('A prayer for Owen Meany', ''), ('Rising sun', ''), ('Magic Carpets', ''), ('A Prince Among Men', 'A Prince Among Men is a British sitcom that ran on BBC1 from 15 September 1997 to

In [None]:
import pandas as pd

# A) Read your original CSV into pandas
orig_df = pd.read_csv(r"C:\Users\farho\Downloads\selected_books_with_meta (1).csv")

# B) Build a pandas DataFrame of your scraped results
#    (assuming `results` is your list of (title, wiki_description) tuples)
scrape_df = pd.DataFrame(results, columns=["title","wiki_description"])

# C) Merge on title
merged = orig_df.merge(scrape_df, on="title", how="left")

# D) Save to a single CSV
output_path = r"C:\Users\farho\Downloads\enriched_books_with_wiki.csv"
merged.to_csv(output_path, index=False, encoding="utf-8")

print(f"✅ Written merged file to {output_path}")


✅ Written merged file to C:\Users\farho\Downloads\enriched_books_with_wiki.csv
