In [10]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [11]:
gr_dir = r'/home/check/DATA/university/yr3, hk1/DS102 - ML for Statistic/Đồ án/goodread/DS102/DATA/bronze/GOODREADS/'
nyt_dir = r'/home/check/DATA/university/yr3, hk1/DS102 - ML for Statistic/Đồ án/goodread/DS102/DATA/bronze/NYT/'
root_dir = r'/home/check/DATA/university/yr3, hk1/DS102 - ML for Statistic/Đồ án/goodread/DS102/'

In [12]:
def normalize_name(name):
    if pd.isna(name):
        return ''
    name = str(name)
    name = name.lower()
    name = ' '.join(name.split())
    name = name.replace('.', '')
    name = ''.join(c for c in name if c.isalnum() or c.isspace())
    return name.strip()

In [13]:
import pandas as pd
from rapidfuzz import process, fuzz

def fuzzy_merge(df1, df2, key1, key2, threshold=90):

    choices = df2[key2].dropna().astype(str).unique()
    choices_list = choices.tolist()

    def find_best_match(name):
        if pd.isna(name):
            return None, 0
        name = str(name)
        match = process.extractOne(name, choices_list, scorer=fuzz.ratio)
        if match and match[1] >= threshold:
            return match[0]  
        return None

    df1 = df1.copy()
    df1['_matched_key'] = df1[key1].apply(lambda x: find_best_match(x))


    df1_matched = df1.dropna(subset=['_matched_key'])

    result = df1_matched.merge(
        df2,
        left_on='_matched_key',
        right_on=key2,
        how='inner',
        suffixes=('_left', '_right')
    )

    result = result.drop(columns=['_matched_key'])
    return result

## Goodreads

**Ref:**
https://cseweb.ucsd.edu/~jmcauley/datasets/goodreads.html#datasets

### goodreads_books

In [14]:
goodreads_books_cols = [
    'book_id',            
    'title',                 
    'author',              
    'publication_year',     
    'average_rating',        
    'ratings_count',        
    'description',          
]
gr_books_df = pd.read_parquet(gr_dir + 'goodreads_books.parquet', 
                              engine='fastparquet', columns=goodreads_books_cols)

In [15]:
print(gr_books_df.columns)
gr_books_df.head()


Index(['book_id', 'title', 'author', 'publication_year', 'average_rating',
       'ratings_count', 'description'],
      dtype='object')


Unnamed: 0,book_id,title,author,publication_year,average_rating,ratings_count,description
0,5333265,W.C. Fields: A Life on Film,Ronald J. Fields,1984.0,4.0,3,
1,1333909,Good Harbor,Anita Diamant,2001.0,3.23,10,"Anita Diamant's international bestseller ""The ..."
2,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",Barbara Hambly,1987.0,4.03,140,Omnibus book club edition containing the Ladie...
3,6066819,Best Friends Forever,Jennifer Weiner,2009.0,3.49,51184,Addie Downs and Valerie Adler were eight when ...
4,287140,Runic Astrology: Starcraft and Timekeeping in ...,Nigel Pennick,,3.4,15,


In [16]:
print(gr_books_df.isna().sum(),'\n')
print(gr_books_df.shape,'\n')

book_id               0
title                 0
author              537
publication_year      0
average_rating        0
ratings_count         0
description           0
dtype: int64 

(2360655, 7) 



In [17]:
gr_books_df.dropna(axis=0,how='any',inplace=True)

In [18]:
gr_books_df.dtypes

book_id             object
title               object
author              object
publication_year    object
average_rating      object
ratings_count       object
description         object
dtype: object

In [19]:
gr_books_df['publication_year'] = pd.to_numeric(gr_books_df['publication_year'], errors='coerce').astype('Int64')
gr_books_df['average_rating'] = pd.to_numeric(gr_books_df['average_rating'], errors='coerce').astype('float64')
gr_books_df['ratings_count'] = pd.to_numeric(gr_books_df['ratings_count'], errors='coerce').astype('int64')

### goodreads_reviews

In [20]:
goodreads_reviews_cols = [
    'book_id',              
    'rating',               
    'review_text',           
    'n_votes',              
    'date_added',             
]
gr_reviews_df = pd.read_parquet(gr_dir + 'goodreads_reviews_dedup.parquet',
                                 engine='fastparquet',
                                 columns=goodreads_reviews_cols)

In [21]:
print(gr_reviews_df.columns)
gr_reviews_df.head()

Index(['book_id', 'rating', 'review_text', 'n_votes', 'date_added'], dtype='object')


Unnamed: 0,book_id,rating,review_text,n_votes,date_added
0,24375664,5,Mind blowingly cool. Best science fiction I've...,16,Fri Aug 25 13:55:02 -0700 2017
1,18245960,5,This is a special book. It started slow for ab...,28,Sun Jul 30 07:44:10 -0700 2017
2,6392944,3,I haven't read a fun mystery book in a while a...,6,Mon Jul 24 02:48:17 -0700 2017
3,22078596,4,"Fun, fast paced, and disturbing tale of murder...",22,Mon Jul 24 02:33:09 -0700 2017
4,6644782,4,A fun book that gives you a sense of living in...,8,Mon Jul 24 02:28:14 -0700 2017


In [22]:
print(gr_reviews_df.isna().sum())
print(gr_reviews_df.shape)

book_id        0
rating         0
review_text    0
n_votes        0
date_added     0
dtype: int64
(15739967, 5)


### merge goodread

In [23]:
gr = gr_books_df.merge(gr_reviews_df,on='book_id',how='left')

In [24]:
gr.head()

Unnamed: 0,book_id,title,author,publication_year,average_rating,ratings_count,description,rating,review_text,n_votes,date_added
0,5333265,W.C. Fields: A Life on Film,Ronald J. Fields,1984,4.0,3,,4.0,Loved this book about WC Fields written by his...,1.0,Sun Jun 19 12:09:45 -0700 2016
1,1333909,Good Harbor,Anita Diamant,2001,3.23,10,"Anita Diamant's international bestseller ""The ...",2.0,Not much I can say that many reviewers haven't...,0.0,Wed Apr 04 07:47:34 -0700 2012
2,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",Barbara Hambly,1987,4.03,140,Omnibus book club edition containing the Ladie...,3.0,Competent. I'm sure I read the first half year...,1.0,Fri Dec 18 14:06:12 -0800 2009
3,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",Barbara Hambly,1987,4.03,140,Omnibus book club edition containing the Ladie...,4.0,"While these books are excellent overall, I thi...",0.0,Tue Apr 04 07:18:36 -0700 2017
4,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",Barbara Hambly,1987,4.03,140,Omnibus book club edition containing the Ladie...,3.0,A collection of two novels about the mercenary...,0.0,Fri Oct 11 10:08:07 -0700 2013


In [25]:
gr.isna().sum()

book_id                   0
title                     0
author                    0
publication_year    2703801
average_rating            0
ratings_count             0
description               0
rating               280209
review_text          280209
n_votes              280209
date_added           280209
dtype: int64

In [26]:
gr.dropna(subset=['rating','publication_year'],axis=0, inplace=True)

In [27]:
gr.isna().sum()

book_id             0
title               0
author              0
publication_year    0
average_rating      0
ratings_count       0
description         0
rating              0
review_text         0
n_votes             0
date_added          0
dtype: int64

In [28]:
gr.shape

(13134415, 11)

## Books sales and ratings

**Ref:** https://www.kaggle.com/datasets/thedevastator/books-sales-and-ratings

### Sale_data

In [29]:
books_sale_cols = [
    'Book Name',             
    'Author',                 
    'genre',                 
    'gross sales',         
    'publisher revenue',    
    'sale price',           
    'sales rank',            
    'units sold',            
    'Author_Rating',             
]
sale_df = pd.read_csv(root_dir + 'DATA/bronze/Books_Sale.csv',usecols=books_sale_cols)

In [30]:
sale_df.head()

Unnamed: 0,Book Name,Author,Author_Rating,genre,gross sales,publisher revenue,sale price,sales rank,units sold
0,Beowulf,"Unknown, Seamus Heaney",Novice,genre fiction,34160.0,20496.0,4.88,1,7000
1,Batman: Year One,"Frank Miller, David Mazzucchelli, Richmond Lew...",Intermediate,genre fiction,12437.5,7462.5,1.99,2,6250
2,Go Set a Watchman,Harper Lee,Novice,genre fiction,47795.0,28677.0,8.69,3,5500
3,When You Are Engulfed in Flames,David Sedaris,Intermediate,fiction,41250.0,24750.0,7.5,3,5500
4,Daughter of Smoke & Bone,Laini Taylor,Intermediate,genre fiction,37952.5,22771.5,7.99,4,4750


In [31]:
print(sale_df[(sale_df['gross sales'] - sale_df['units sold'] < 0) & (sale_df['sale price'] > 1)][['Book Name','gross sales','units sold','sale price']])

                                        Book Name  gross sales  units sold  \
356                                As I Lay Dying       461.68        6264   
357                  Harold and the Purple Crayon       925.68        6264   
358  Lover Eternal (Black Dagger Brotherhood, #2)      2460.15        6237   
359                                           NaN       690.69        6237   
360                                    Fool Moon        687.70        6210   
..                                            ...          ...         ...   
976                The Power of Positive Thinking       325.91        4240   
978                            The House of Mirth       325.91        4240   
979                                  Fairy Tail 1       434.91        4240   
980              The Slow Regard of Silent Things       434.91        4240   
981                          Twelve Years a Slave       543.91        4240   

     sale price  
356        1.99  
357        3.99  
358      

In [32]:
sale_cols = sale_df.columns
s_new_cols = []
for col in sale_cols:
    proper_col = col.title()
    proper_col = proper_col.replace(' ', '_')
    s_new_cols.append(proper_col)

sale_df.columns = s_new_cols

In [33]:
sale_df.dtypes

Book_Name             object
Author                object
Author_Rating         object
Genre                 object
Gross_Sales          float64
Publisher_Revenue    float64
Sale_Price           float64
Sales_Rank             int64
Units_Sold             int64
dtype: object

In [34]:
sale_df.head()

Unnamed: 0,Book_Name,Author,Author_Rating,Genre,Gross_Sales,Publisher_Revenue,Sale_Price,Sales_Rank,Units_Sold
0,Beowulf,"Unknown, Seamus Heaney",Novice,genre fiction,34160.0,20496.0,4.88,1,7000
1,Batman: Year One,"Frank Miller, David Mazzucchelli, Richmond Lew...",Intermediate,genre fiction,12437.5,7462.5,1.99,2,6250
2,Go Set a Watchman,Harper Lee,Novice,genre fiction,47795.0,28677.0,8.69,3,5500
3,When You Are Engulfed in Flames,David Sedaris,Intermediate,fiction,41250.0,24750.0,7.5,3,5500
4,Daughter of Smoke & Bone,Laini Taylor,Intermediate,genre fiction,37952.5,22771.5,7.99,4,4750


In [35]:
sale_df['Book_Name'] = sale_df['Book_Name'].str.title()

## New York Times BestSellers

**Ref:** https://www.kaggle.com/datasets/sujaykapadnis/new-york-times-bestsellers

### nyt_titles

In [36]:
nyt_titles_cols = [           
    'title',           
    'author',         
    'year',           
    'total_weeks',   
    'first_week',      
    'debut_rank',     
    'best_rank'     
]
nyt_titles_df = pd.read_csv(nyt_dir + 'nyt_titles.tsv',sep='\t',usecols=nyt_titles_cols)

In [37]:
nyt_titles_df.head()

Unnamed: 0,title,author,year,total_weeks,first_week,debut_rank,best_rank
0,"""H"" IS FOR HOMICIDE",Sue Grafton,1991,15,1991-05-05,1,2
1,"""I"" IS FOR INNOCENT",Sue Grafton,1992,11,1992-04-26,14,2
2,''G'' IS FOR GUMSHOE,Sue Grafton,1990,6,1990-05-06,4,8
3,A DOG'S JOURNEY,W. Bruce Cameron,2012,1,2012-05-27,3,14
4,CHANGING FACES,Kimberla Lawson Roby,2006,1,2006-02-19,11,14


In [38]:
nyt_titles_df['title']=nyt_titles_df['title'].str.title()

In [39]:
nyt_titles_df.shape

(7431, 7)

### nyt_full

In [40]:
nyt_full_cols = [
    'title',  
    'year',
    'week',
    'rank'       
]

nyt_full_df = pd.read_csv(nyt_dir + 'nyt_full.tsv',sep='\t',usecols=nyt_full_cols)

In [41]:
nyt_full_df.head()

Unnamed: 0,year,week,rank,title
0,1931,1931-10-12,1,THE TEN COMMANDMENTS
1,1931,1931-10-12,2,FINCHE'S FORTUNE
2,1931,1931-10-12,3,THE GOOD EARTH
3,1931,1931-10-12,4,SHADOWS ON THE ROCK
4,1931,1931-10-12,5,SCARMOUCHE THE KING MAKER


In [42]:
nyt_full_df['title'] = nyt_full_df['title'].str.title()

In [43]:
nyt_full_df.head()

Unnamed: 0,year,week,rank,title
0,1931,1931-10-12,1,The Ten Commandments
1,1931,1931-10-12,2,Finche'S Fortune
2,1931,1931-10-12,3,The Good Earth
3,1931,1931-10-12,4,Shadows On The Rock
4,1931,1931-10-12,5,Scarmouche The King Maker


In [44]:
nyt_full_df.shape

(60386, 4)

In [45]:
nyt_df = nyt_titles_df.merge(nyt_full_df, on='title',how='left')

In [46]:
nyt_df.head()

Unnamed: 0,title,author,year_x,total_weeks,first_week,debut_rank,best_rank,year_y,week,rank
0,"""H"" Is For Homicide",Sue Grafton,1991,15,1991-05-05,1,2,1991,1991-05-05,15
1,"""H"" Is For Homicide",Sue Grafton,1991,15,1991-05-05,1,2,1991,1991-05-12,5
2,"""H"" Is For Homicide",Sue Grafton,1991,15,1991-05-05,1,2,1991,1991-05-19,2
3,"""H"" Is For Homicide",Sue Grafton,1991,15,1991-05-05,1,2,1991,1991-05-26,2
4,"""H"" Is For Homicide",Sue Grafton,1991,15,1991-05-05,1,2,1991,1991-06-02,5


In [47]:
nyt_df.drop_duplicates(inplace=True)

In [48]:
nyt_df.drop(columns=['year_y'], inplace=True)
nyt_df.rename(columns={'year_x': 'year'}, inplace=True)

In [49]:
nyt_df.isna().sum()

title           0
author         10
year            0
total_weeks     0
first_week      0
debut_rank      0
best_rank       0
week            0
rank            0
dtype: int64

In [50]:
nyt_df.shape

(64254, 9)

In [51]:
df = fuzzy_merge(nyt_df,sale_df,'title','Book_Name',threshold=85)
df.head()

Unnamed: 0,title,author,year,total_weeks,first_week,debut_rank,best_rank,week,rank,Book_Name,Author,Author_Rating,Genre,Gross_Sales,Publisher_Revenue,Sale_Price,Sales_Rank,Units_Sold
0,Chasing Harry Winston,Lauren Weisberger,2008,11,2008-06-15,5,4,2008-06-15,6,Chasing Harry Winston,Lauren Weisberger,Novice,genre fiction,558.88,335.328,4.99,1042,4440
1,Chasing Harry Winston,Lauren Weisberger,2008,11,2008-06-15,5,4,2008-06-22,6,Chasing Harry Winston,Lauren Weisberger,Novice,genre fiction,558.88,335.328,4.99,1042,4440
2,Chasing Harry Winston,Lauren Weisberger,2008,11,2008-06-15,5,4,2008-06-29,6,Chasing Harry Winston,Lauren Weisberger,Novice,genre fiction,558.88,335.328,4.99,1042,4440
3,Chasing Harry Winston,Lauren Weisberger,2008,11,2008-06-15,5,4,2008-07-06,4,Chasing Harry Winston,Lauren Weisberger,Novice,genre fiction,558.88,335.328,4.99,1042,4440
4,Chasing Harry Winston,Lauren Weisberger,2008,11,2008-06-15,5,4,2008-07-13,8,Chasing Harry Winston,Lauren Weisberger,Novice,genre fiction,558.88,335.328,4.99,1042,4440


In [52]:
print(df.shape,'\n')
print(df.columns,'\n')


(5211, 18) 

Index(['title', 'author', 'year', 'total_weeks', 'first_week', 'debut_rank',
       'best_rank', 'week', 'rank', 'Book_Name', 'Author', 'Author_Rating',
       'Genre', 'Gross_Sales', 'Publisher_Revenue', 'Sale_Price', 'Sales_Rank',
       'Units_Sold'],
      dtype='object') 



In [53]:
df.drop(columns=['Book_Name'], inplace=True)
df.rename(columns={'title_left':'title'}, inplace=True)

In [54]:
df.isna().sum()

title                0
author               0
year                 0
total_weeks          0
first_week           0
debut_rank           0
best_rank            0
week                 0
rank                 0
Author               0
Author_Rating        0
Genre                0
Gross_Sales          0
Publisher_Revenue    0
Sale_Price           0
Sales_Rank           0
Units_Sold           0
dtype: int64

In [55]:
df = fuzzy_merge(df,gr,'title','title',threshold=85)

In [56]:
print(df.shape,'\n')
print(df.columns,'\n')

(2133244, 28) 

Index(['title_left', 'author_left', 'year', 'total_weeks', 'first_week',
       'debut_rank', 'best_rank', 'week', 'rank', 'Author', 'Author_Rating',
       'Genre', 'Gross_Sales', 'Publisher_Revenue', 'Sale_Price', 'Sales_Rank',
       'Units_Sold', 'book_id', 'title_right', 'author_right',
       'publication_year', 'average_rating', 'ratings_count', 'description',
       'rating', 'review_text', 'n_votes', 'date_added'],
      dtype='object') 



In [57]:
df.drop(columns=['title_right', 'author_right','Author','book_id'], inplace=True)
df.rename(columns={'title_left':'title','author_left':'author'}, inplace=True)

In [58]:
df.isna().sum()

title                0
author               0
year                 0
total_weeks          0
first_week           0
debut_rank           0
best_rank            0
week                 0
rank                 0
Author_Rating        0
Genre                0
Gross_Sales          0
Publisher_Revenue    0
Sale_Price           0
Sales_Rank           0
Units_Sold           0
publication_year     0
average_rating       0
ratings_count        0
description          0
rating               0
review_text          0
n_votes              0
date_added           0
dtype: int64

In [59]:
df.columns

Index(['title', 'author', 'year', 'total_weeks', 'first_week', 'debut_rank',
       'best_rank', 'week', 'rank', 'Author_Rating', 'Genre', 'Gross_Sales',
       'Publisher_Revenue', 'Sale_Price', 'Sales_Rank', 'Units_Sold',
       'publication_year', 'average_rating', 'ratings_count', 'description',
       'rating', 'review_text', 'n_votes', 'date_added'],
      dtype='object')

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2133244 entries, 0 to 2133243
Data columns (total 24 columns):
 #   Column             Dtype  
---  ------             -----  
 0   title              object 
 1   author             object 
 2   year               int64  
 3   total_weeks        int64  
 4   first_week         object 
 5   debut_rank         int64  
 6   best_rank          int64  
 7   week               object 
 8   rank               int64  
 9   Author_Rating      object 
 10  Genre              object 
 11  Gross_Sales        float64
 12  Publisher_Revenue  float64
 13  Sale_Price         float64
 14  Sales_Rank         int64  
 15  Units_Sold         int64  
 16  publication_year   Int64  
 17  average_rating     float64
 18  ratings_count      int64  
 19  description        object 
 20  rating             float64
 21  review_text        object 
 22  n_votes            float64
 23  date_added         object 
dtypes: Int64(1), float64(6), int64(8), object(9)
memor

In [61]:
df['first_week'] = pd.to_datetime(df['first_week'], errors='coerce')
df['week'] = pd.to_datetime(df['week'], errors='coerce')
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')

  df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')
  df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')


In [62]:
df.head(10)

Unnamed: 0,title,author,year,total_weeks,first_week,debut_rank,best_rank,week,rank,Author_Rating,...,Sales_Rank,Units_Sold,publication_year,average_rating,ratings_count,description,rating,review_text,n_votes,date_added
0,Chasing Harry Winston,Lauren Weisberger,2008,11,2008-06-15,5,4,2008-06-15,6,Novice,...,1042,4440,2008,3.32,56,"Meet Emmy, Leigh, and Adriana. Best friends si...",5.0,Great easy read! I loved how it referenced pop...,0.0,2008-10-11 17:42:40-07:00
1,Chasing Harry Winston,Lauren Weisberger,2008,11,2008-06-15,5,4,2008-06-15,6,Novice,...,1042,4440,2008,3.32,56,"Meet Emmy, Leigh, and Adriana. Best friends si...",3.0,"Fun read (or rather, listen). I really liked t...",0.0,2010-01-26 23:48:19-08:00
2,Chasing Harry Winston,Lauren Weisberger,2008,11,2008-06-15,5,4,2008-06-15,6,Novice,...,1042,4440,2008,3.32,56,"Meet Emmy, Leigh, and Adriana. Best friends si...",3.0,o.k.,0.0,2009-07-21 23:35:10-07:00
3,Chasing Harry Winston,Lauren Weisberger,2008,11,2008-06-15,5,4,2008-06-15,6,Novice,...,1042,4440,2008,3.32,56,"Meet Emmy, Leigh, and Adriana. Best friends si...",3.0,Leigh and Emmy were pretty boring all by thems...,0.0,2009-04-15 15:12:10-07:00
4,Chasing Harry Winston,Lauren Weisberger,2008,11,2008-06-15,5,4,2008-06-15,6,Novice,...,1042,4440,2008,3.32,56,"Meet Emmy, Leigh, and Adriana. Best friends si...",3.0,"Better than Everyone Worth Knowing, but still ...",0.0,2009-05-18 20:22:57-07:00
5,Chasing Harry Winston,Lauren Weisberger,2008,11,2008-06-15,5,4,2008-06-15,6,Novice,...,1042,4440,2008,3.32,56,"Meet Emmy, Leigh, and Adriana. Best friends si...",1.0,"I agree with all the other reviewers, this boo...",1.0,2013-02-24 15:42:58-08:00
6,Chasing Harry Winston,Lauren Weisberger,2008,11,2008-06-15,5,4,2008-06-15,6,Novice,...,1042,4440,2008,3.32,64,"The bestselling author of ""The Devil Wears Pra...",5.0,I give this book a 5 star! This is amazing! I ...,0.0,2012-12-30 10:57:14-08:00
7,Chasing Harry Winston,Lauren Weisberger,2008,11,2008-06-15,5,4,2008-06-15,6,Novice,...,1042,4440,2008,3.32,64,"The bestselling author of ""The Devil Wears Pra...",3.0,This is a great 'summer' read book. I thought ...,0.0,2011-04-25 23:16:05-07:00
8,Chasing Harry Winston,Lauren Weisberger,2008,11,2008-06-15,5,4,2008-06-15,6,Novice,...,1042,4440,2008,3.32,64,"The bestselling author of ""The Devil Wears Pra...",4.0,"I was looking for something girly, funny and w...",0.0,2008-05-07 09:18:53-07:00
9,Chasing Harry Winston,Lauren Weisberger,2008,11,2008-06-15,5,4,2008-06-15,6,Novice,...,1042,4440,2008,3.32,64,"The bestselling author of ""The Devil Wears Pra...",0.0,i left this book in taiwan... but for some rea...,0.0,2008-10-02 07:04:43-07:00


In [63]:
print(df[(df['Gross_Sales'] - df['Units_Sold'] < 0) & (df['Sale_Price'] > 1)][['title', 'Units_Sold', 'Gross_Sales', 'Sale_Price']])

                         title  Units_Sold  Gross_Sales  Sale_Price
0        Chasing Harry Winston        4440       558.88        4.99
1        Chasing Harry Winston        4440       558.88        4.99
2        Chasing Harry Winston        4440       558.88        4.99
3        Chasing Harry Winston        4440       558.88        4.99
4        Chasing Harry Winston        4440       558.88        4.99
...                        ...         ...          ...         ...
2133239        Change Of Heart        3915       990.35        6.83
2133240        Change Of Heart        3915       990.35        6.83
2133241        Change Of Heart        3915       990.35        6.83
2133242        Change Of Heart        3915       990.35        6.83
2133243        Change Of Heart        3915       990.35        6.83

[707082 rows x 4 columns]


In [64]:
df.shape

(2133244, 24)

In [None]:
df.to_csv(r'/home/check/DATA/university/yr3, hk1/DS102 - ML for Statistic/Đồ án/goodread/DS102/DATA/silver/silver_1.csv', index = False)