In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

In [2]:
# DIVIDIMOS EL DATASET EN TRAIN Y TEST
df_datos = pd.read_csv('../data/datos.csv')
train, test = train_test_split(df_datos, test_size=0.2, random_state=42)

In [3]:
# ELIMINAMOS REVIEWER_RATING DE TEST
test = test.drop(columns=['reviewer_rating'])

In [4]:
train.head()

Unnamed: 0,rank,book_title,book_price,rating,author,year_of_publication,genre,url,review_title,reviewer,reviewer_rating,review_description,is_verified,date,timestamp,asin
708,81,Just Because,12.69,4.6,Matthew McConaughey,2023,"Picture Books, Childrens, Fiction",amazon.com/Just-Because-Matthew-McConaughey/dp...,A great gift to live by,A. Slater,5,Bought copies for my inherited grandsons and a...,True,16-10-2023,"Reviewed in the United States October 16, 2023",593622030
239,27,"Goodnight, Goodnight Construction Site (Board ...",3.78,4.9,Sherri Duskey Rinker,2011,"Picture Book, Cars and Trucks",amazon.com/Goodnight-Construction-Sherri-Duske...,Book,Margaret Zahalka,5,It was a gift and he does read this every nigh...,True,16-10-2023,"Reviewed in the United States October 16, 2023",1452111731
381,44,Goodnight Moon,5.36,4.9,Margaret Wise Brown,1947,"Childrens, literature",amazon.com/Goodnight-Moon-Margaret-Wise-Brown/...,Adorable Bedtime Story,Robin M.,5,"""Goodnight Moon"" is a classic bedtime story fo...",True,05-10-2023,"Reviewed in the United States October 5, 2023",694003611
792,90,Harry Potter and the Prisoner of Azkaban (Harr...,25.97,4.9,J.K. Rowling,1999,Fantasy,amazon.com/Harry-Potter-Prisoner-Azkaban-MinaL...,Absolutely STUNNING,Mom's opinion,5,Minalima's interactive papercraft is always be...,True,07-10-2023,"Reviewed in the United States October 7, 2023",1338815288
672,78,Israel: A Simple Guide to the Most Misundersto...,15.59,4.7,Noa Tishby,2021,"Nonfiction, History, Politics, Israel, Jewish,...",amazon.com/Israel-Simple-Guide-Misunderstood-C...,An easy and fun to read history of ISRAEL,WALTER P. LESER,5,Today it’s in the news and it’s worth finding ...,True,04-11-2023,"Reviewed in the United States November 4, 2023",1982144947


In [5]:
test.head()

Unnamed: 0,rank,book_title,book_price,rating,author,year_of_publication,genre,url,review_title,reviewer,review_description,is_verified,date,timestamp,asin
280,34,How to Know a Person: The Art of Seeing Others...,19.89,4.5,Alice Walstead,2022,"Thriller, mystery, suspense",amazon.com/How-Know-Person-Seeing-Others/dp/05...,An Important Book on an Important Topic Taking...,J.M. Ryan,David Brooks is good at both reporting and syn...,True,30-10-2023,"Reviewed in the United States October 30, 2023",059323006X
434,50,How to Catch a Dinosaur,4.66,4.8,Adam Wallace,2019,"Picture Books, Dinosaurs, Childrens, Fiction, ...",amazon.com/How-Catch-Dinosaur-Adam-Wallace/dp/...,"A Brilliant, Kids Story of Building Courage",JoeSanch,What a great! I love the storyline and emphasi...,True,10-10-2023,"Reviewed in the United States October 10, 2023",1492680524
39,6,"Fourth Wing (The Empyrean, 1)",16.99,4.8,Rebecca Yarros,2023,Fantasy,amazon.com/Fourth-Wing-Empyrean-Rebecca-Yarros...,Lords of Discipline + Dragonriders of Pern + ....,Greg Barlin,I first came across Fourth Wing when I noticed...,True,17-07-2023,"Reviewed in the United States July 17, 2023",1649374046
417,48,Where's Bluey?: A Search-and-Find Book,6.93,4.8,Penguin Young Readers Licenses (,2022,"Childrens, Storytime,Fiction",amazon.com/Wheres-Bluey-Search-Find-Book/dp/05...,Cute book,"Mele , Thank you , so soft nice material great...",Present for my grand babyThank u,True,24-10-2023,"Reviewed in the United States October 24, 2023",593385691
585,69,"Hello, Baby Animals: A Durable High-Contrast B...",4.73,4.9,"duopress labs, Julissa Mora",2016,"Childrens, Picture Books, Animals",amazon.com/Hello-Baby-Animals-High-Contrast-Bo...,So cute..,Latrice,"My baby loves this little book, I read it to h...",True,11-10-2023,"Reviewed in the United States October 11, 2023",1938093682


In [6]:
# PROCESAMIENTO DE TRAIN
train = train.drop(columns=['rank', 'url', 'timestamp', 'asin', 'author', 'year_of_publication', 'date', 'asin'])

# SEPARAMOS LOS GÉNEROS
train['genre'] = train['genre'].str.split(',')
train = train.explode('genre')
train = pd.get_dummies(train, columns=['genre'], drop_first=False)

# IS_VERIFIED A BINARIO
train['is_verified'] = train['is_verified'].apply(lambda x: 1 if x else 0)

# ESCALADO VARIABLES NUMÉRICAS
scaler = StandardScaler()
train[['book_price', 'reviewer_rating', 'rating']] = scaler.fit_transform(train[['book_price', 'reviewer_rating', 'rating']])
train.head()

Unnamed: 0,book_title,book_price,rating,review_title,reviewer,reviewer_rating,review_description,is_verified,genre_ Action,genre_ Activities,...,genre_Nonfiction,genre_Personal Finance,genre_Picture Book,genre_Picture Books,genre_Romance,genre_Self Help,genre_Self-improvement,genre_Spiritual Warfare,genre_Spirituality,genre_Thriller
708,Just Because,0.071516,-0.554055,A great gift to live by,A. Slater,0.436617,Bought copies for my inherited grandsons and a...,1,False,False,...,False,False,False,True,False,False,False,False,False,False
708,Just Because,0.071516,-0.554055,A great gift to live by,A. Slater,0.436617,Bought copies for my inherited grandsons and a...,1,False,False,...,False,False,False,False,False,False,False,False,False,False
708,Just Because,0.071516,-0.554055,A great gift to live by,A. Slater,0.436617,Bought copies for my inherited grandsons and a...,1,False,False,...,False,False,False,False,False,False,False,False,False,False
239,"Goodnight, Goodnight Construction Site (Board ...",-1.13658,1.358517,Book,Margaret Zahalka,0.436617,It was a gift and he does read this every nigh...,1,False,False,...,False,False,True,False,False,False,False,False,False,False
239,"Goodnight, Goodnight Construction Site (Board ...",-1.13658,1.358517,Book,Margaret Zahalka,0.436617,It was a gift and he does read this every nigh...,1,False,False,...,False,False,False,False,False,False,False,False,False,False


In [7]:
# PROCESAMIENTO DE LAS REVIEWS DE LOS USUARIOS
tfidf_vectorizer_title = TfidfVectorizer(max_features=100)
tfidf_vectorizer_description = TfidfVectorizer(max_features=100)

# APLICAR TF-IDF
tfidf_title = tfidf_vectorizer_title.fit_transform(train['review_title'].fillna(''))
tfidf_description = tfidf_vectorizer_description.fit_transform(train['review_description'].fillna(''))

# CONVERTIR TF-IDF EN DATAFRAMES
tfidf_title_df = pd.DataFrame(tfidf_title.toarray(), columns=tfidf_vectorizer_title.get_feature_names_out(), index=train.index)
tfidf_description_df = pd.DataFrame(tfidf_description.toarray(), columns=tfidf_vectorizer_description.get_feature_names_out(), index=train.index)

# CONTATENARLOS
train = pd.concat([train, tfidf_title_df, tfidf_description_df], axis=1)

# ELIMINAR ORIGINALES
train.drop(columns=['review_title', 'review_description'], inplace=True)

train.head()

Unnamed: 0,book_title,book_price,rating,reviewer,reviewer_rating,is_verified,genre_ Action,genre_ Activities,genre_ Adult,genre_ Adventure,...,were,what,when,which,who,will,with,would,you,your
708,Just Because,0.071516,-0.554055,A. Slater,0.436617,1,False,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.222732,0.0,0.0,0.0,0.0
708,Just Because,0.071516,-0.554055,A. Slater,0.436617,1,False,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.222732,0.0,0.0,0.0,0.0
708,Just Because,0.071516,-0.554055,A. Slater,0.436617,1,False,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.222732,0.0,0.0,0.0,0.0
239,"Goodnight, Goodnight Construction Site (Board ...",-1.13658,1.358517,Margaret Zahalka,0.436617,1,False,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
239,"Goodnight, Goodnight Construction Site (Board ...",-1.13658,1.358517,Margaret Zahalka,0.436617,1,False,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# PROCESAMIENTO DE TEST
test = test.drop(columns=['rank', 'url', 'timestamp', 'asin', 'author', 'year_of_publication', 'date', 'asin'])

# SEPARAMOS LOS GÉNEROS
test['genre'] = test['genre'].str.split(',')
test = test.explode('genre')
test = pd.get_dummies(test, columns=['genre'], drop_first=False)

# NOS ASEGURAMOS DE QUE TEST TENGA LOS MISMOS GÉNEROS QUE TRAIN
missing_genres = set(train.columns) - set(test.columns)
for col in missing_genres:
    if 'genre_' in col:  # Agregar columnas de género faltantes
        test[col] = 0

# IS_VERIFIED A BINARIO
test['is_verified'] = test['is_verified'].apply(lambda x: 1 if x else 0)

# ESCALADO DE VARIABLES NUMÉRICAS 
scaler_test = StandardScaler()
scaler_test.fit(train[['book_price', 'rating']])
test[['book_price', 'rating']] = scaler_test.transform(test[['book_price', 'rating']])

# PROCESAMIENTO DE LAS REVIEWS DE LOS USUARIOS
tfidf_title_test = tfidf_vectorizer_title.transform(test['review_title'].fillna(''))
tfidf_description_test = tfidf_vectorizer_description.transform(test['review_description'].fillna(''))

# CONVERTIR TF-IDF EN DATAFRAMES
tfidf_title_test_df = pd.DataFrame(tfidf_title_test.toarray(), columns=tfidf_vectorizer_title.get_feature_names_out(), index=test.index)
tfidf_description_test_df = pd.DataFrame(tfidf_description_test.toarray(), columns=tfidf_vectorizer_description.get_feature_names_out(), index=test.index)

# CONCATENAR LOS RESULTADOS TF-IDF CON EL DATAFRAME ORIGINAL 'test'
test = pd.concat([test, tfidf_title_test_df, tfidf_description_test_df], axis=1)

# ELIMINAR COLUMNAS ORIGINALES DE TEXTO
test.drop(columns=['review_title', 'review_description'], inplace=True)

# VERIFICAMOS QUE TEST TENGA TODAS LAS COLUMNAS TF-IDF FALTANTES QUE ESTÉN EN TRAIN
missing_tfidf_columns = set(train.columns) - set(test.columns)
for col in missing_tfidf_columns:
    if col.startswith('review_title') or col.startswith('review_description'):  # Agregar columnas TF-IDF faltantes
        test[col] = 0

# VERIFICAMOS QUE TEST TENGA EL MISMO NÚMERO DE COLUMNAS QUE TRAIN
train_columns = set(train.columns) - {'reviewer_rating'}
test_columns = set(test.columns)
assert train_columns == test_columns, "Train y Test no tienen las mismas columnas (excepto 'reviewer_rating')"

test.head()

Unnamed: 0,book_title,book_price,rating,reviewer,is_verified,genre_ Action,genre_ Activities,genre_ Adult,genre_ Adventure,genre_ Alphabet,...,were,what,when,which,who,will,with,would,you,your
280,How to Know a Person: The Art of Seeing Others...,19.89,4.5,J.M. Ryan,1,False,False,False,False,False,...,0.0,0.039245,0.0,0.0,0.0,0.042132,0.030076,0.043924,0.0,0.0
280,How to Know a Person: The Art of Seeing Others...,19.89,4.5,J.M. Ryan,1,False,False,False,False,False,...,0.0,0.039245,0.0,0.0,0.0,0.042132,0.030076,0.043924,0.0,0.0
280,How to Know a Person: The Art of Seeing Others...,19.89,4.5,J.M. Ryan,1,False,False,False,False,False,...,0.0,0.039245,0.0,0.0,0.0,0.042132,0.030076,0.043924,0.0,0.0
434,How to Catch a Dinosaur,4.66,4.8,JoeSanch,1,False,False,False,False,False,...,0.0,0.293592,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
434,How to Catch a Dinosaur,4.66,4.8,JoeSanch,1,False,False,False,False,False,...,0.0,0.293592,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# EXPORTAMOS LOS DF
train.to_csv('../data/train.csv', index=False)
test.to_csv('../data/test.csv', index=False)