Things to get right in this project:
- Folder structure
- Docstrings
- PEP8
- Exception handling

In [3]:
# !python3 -m venv venv

In [4]:
# One-time environment setup, kept commented out so that running the notebook
# does not reinstall anything. Prefer %pip over !pip: %pip installs into the
# environment of the running kernel.
# %pip install pandas
# %pip install sqlalchemy
# %pip install ipython-sql
# %pip install python-dotenv
# %pip install psycopg2



In [5]:
from sqlalchemy import create_engine, text
import os
from dotenv import load_dotenv



load_dotenv()

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")

%load_ext sql

In [6]:
%sql postgresql://postgres:password@localhost/books

In [7]:
import pandas as pd

# Location of the raw books CSV. Set the BOOKS_CSV_PATH environment variable
# (for example in .env) to point at the file on another machine; when it is
# unset, the original author's local path is used so existing runs keep working.
BOOKS_CSV_PATH = os.getenv(
    'BOOKS_CSV_PATH',
    '/Users/bfaris96/Desktop/turing-proj/books_db/data/books_1.Best_Books_Ever.csv',
)
df = pd.read_csv(BOOKS_CSV_PATH)

In [8]:
# Size of the freshly loaded dataset as (rows, columns).
df.shape

(52478, 25)

In [9]:
# Remove rows that are identical in every column; the first occurrence is kept.
df = df.drop_duplicates(subset=None, keep='first')

In [10]:
# Size again, after dropping fully duplicated rows.
df.shape

(52428, 25)

In [11]:
# Show the column labels of the DataFrame.
df.columns

Index(['bookId', 'title', 'series', 'author', 'rating', 'description',
       'language', 'isbn', 'genres', 'characters', 'bookFormat', 'edition',
       'pages', 'publisher', 'publishDate', 'firstPublishDate', 'awards',
       'numRatings', 'ratingsByStars', 'likedPercent', 'setting', 'coverImg',
       'bbeScore', 'bbeVotes', 'price'],
      dtype='object')

In [12]:
# Collect every row whose ISBN occurs on more than one row. keep=False flags
# all copies of a repeated ISBN, not only the later ones.
isbn_seen_multiple_times = df.duplicated(subset=['isbn'], keep=False)
duplicated_rows = df.loc[isbn_seen_multiple_times]


In [13]:
# Keep a single row per bookId; when an id repeats, the first row wins.
df = df.drop_duplicates(subset='bookId', keep='first')

In [14]:
# Rows that share an ISBN with at least one other row, excluding rows whose ISBN
# is '9999999999999' (presumably a placeholder for an unknown ISBN — confirm).
shares_isbn = df.duplicated(subset='isbn', keep=False)
has_other_isbn = df['isbn'] != '9999999999999'
duplicate_isbn = df[shares_isbn & has_other_isbn]
duplicate_isbn

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,...,firstPublishDate,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price
342,7235533-the-way-of-kings,The Way of Kings,The Stormlight Archive #1,Brandon Sanderson (Goodreads Author),4.63,From #1 New York Times bestselling author Bran...,English,9780765326355,"['Fantasy', 'Fiction', 'Epic Fantasy', 'High F...","['Kaladin Stormblessed', 'Dalinar Kholin', 'Sh...",...,,['Locus Award Nominee for Best Fantasy Novel (...,302877,"['221465', '60345', '14100', '4101', '2866']",98.0,['Roshar'],https://i.gr-assets.com/images/S/compressed.ph...,34012,359,22.74
1296,7826803-wolf-hall,Wolf Hall,Thomas Cromwell #1,Hilary Mantel (Goodreads Author),3.88,England in the 1520s is a heartbeat from disas...,English,9780312429980,"['Historical Fiction', 'Fiction', 'Historical'...","['Anne Boleyn', 'Thomas More', 'Catherine of A...",...,04/30/09,"['Booker Prize (2009)', 'Orange Prize Nominee ...",166123,"['62692', '51592', '29341', '13493', '9005']",86.0,"['Putney (United Kingdom)', 'Dover, Kent, Engl...",https://i.gr-assets.com/images/S/compressed.ph...,4109,57,1.5
23159,34704992-edgedancer,Edgedancer,The Stormlight Archive #2.5,Brandon Sanderson (Goodreads Author),4.29,\r\n From #1 New York Times bestselling autho...,,9781250166548,"['Fantasy', 'Fiction', 'Epic Fantasy', 'High F...",['Lift'],...,11/22/16,[],39199,"['17521', '16063', '5009', '535', '71']",98.0,[],https://i.gr-assets.com/images/S/compressed.ph...,99,1,9.81
25628,19380923-wolf-hall,Wolf Hall,Thomas Cromwell #1,Hilary Mantel (Goodreads Author),3.88,"Tudor England. Henry VIII is on the throne, bu...",English,9780312429980,"['Historical Fiction', 'Fiction', 'Historical'...","['Anne Boleyn', 'Thomas More', 'Catherine of A...",...,04/30/09,"['Booker Prize (2009)', 'Orange Prize Nominee ...",166308,"['62767', '51656', '29360', '13511', '9014']",86.0,"['Putney (United Kingdom)', 'Dover, Kent, Engl...",https://i.gr-assets.com/images/S/compressed.ph...,98,1,6.91
27486,34703445-edgedancer,Edgedancer,The Stormlight Archive #2.5,Brandon Sanderson (Goodreads Author),4.29,From #1 New York Times bestselling author Bran...,,9781250166548,"['Fantasy', 'Fiction', 'Epic Fantasy', 'High F...",['Lift'],...,11/22/16,[],39206,"['17526', '16065', '5009', '535', '71']",98.0,[],https://i.gr-assets.com/images/S/compressed.ph...,97,1,9.81
32761,9188338-the-way-of-kings,The Way of Kings,The Stormlight Archive #1,Brandon Sanderson (Goodreads Author),4.63,From #1 New York Times bestselling author Bran...,English,9780765326355,"['Fantasy', 'Fiction', 'Epic Fantasy', 'High F...","['Kaladin Stormblessed', 'Dalinar Kholin', 'Sh...",...,,['Locus Award Nominee for Best Fantasy Novel (...,304529,"['222482', '60729', '14210', '4177', '2931']",98.0,['Roshar'],https://i.gr-assets.com/images/S/compressed.ph...,93,1,22.47


In [15]:
# Rows carrying the ISBN '9999999999999' are kept untouched (presumably a
# placeholder for an unknown ISBN — confirm); every other ISBN is reduced to
# its first row.
placeholder_mask = df['isbn'] == '9999999999999'
placeholder_rows = df[placeholder_mask]
unique_isbn_rows = df[~placeholder_mask].drop_duplicates(subset='isbn')

# Stitch the two parts back together: placeholder rows first, then the rest.
df = pd.concat([placeholder_rows, unique_isbn_rows])

Increase the number of columns pandas displays and the maximum column width:

In [16]:
# Show every column (no limit) and allow up to 250 characters per cell when
# pandas renders a DataFrame.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 250

Force lowercase, strip leading/trailing whitespace, and replace newlines (\n) with spaces in every string cell

In [17]:
def normalize_text(value):
    """Return a cleaned copy of a single cell value.

    Strings are lowercased, stripped of leading/trailing whitespace, and have
    every line break replaced by a single space. Any non-string value
    (numbers, NaN, ...) is returned unchanged.
    """
    if not isinstance(value, str):
        return value
    cleaned = value.lower().strip()
    # The data contains Windows-style '\r\n' line breaks as well as '\n';
    # replace '\r\n' first so it becomes one space and no stray '\r' is left.
    for line_break in ('\r\n', '\r', '\n'):
        cleaned = cleaned.replace(line_break, ' ')
    return cleaned


# NOTE(review): this lowercases every string column, including coverImg URLs,
# whose paths contain uppercase characters — confirm that is intended.
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 (applymap is
# deprecated there); use whichever the installed pandas provides. One pass
# over the frame replaces the previous three.
if hasattr(pd.DataFrame, 'map'):
    df = df.map(normalize_text)
else:
    df = df.applymap(normalize_text)

In [20]:
# pandas.read_csv loads empty fields as NaN by default, so a missing title is
# NaN rather than ''. Check both: NaN (absent) and '' (blank after stripping).
df[df['title'].isna() | (df['title'] == '')]

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,bookFormat,edition,pages,publisher,publishDate,firstPublishDate,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price


The author column needs careful parsing. Split each value into individual names. A name with no parenthesised role after it, or whose role is (Goodreads Author), is an author; the first one stays in author and any further authors go into author2, author3, ... columns. For any other name that carries a role, e.g. John Doe (Something), add a Something column to the df if it does not exist yet and put John Doe into that column.

In [19]:
# TODO(draft): unfinished sketch of author_splitter — it is not runnable yet
# (the loops below have no body). Finish or remove before sharing the notebook.
# def author_splitter(df):
#     for ['authors']
#     if authors is None:
#         return None
#     else:
#         auth_list = authors.split(',')
#         for 


#         return auth_list[0]
    