In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.preprocessing import StandardScaler

In [2]:
# DIVIDIMOS EL DATASET EN TRAIN Y TEST
train = pd.read_csv('../data/datos.csv')

In [3]:
# CARGAR STOP WORDS
stop_words = stopwords.words('english')

In [4]:
train.head()

Unnamed: 0,rank,book_title,book_price,rating,author,year_of_publication,genre,url,review_title,reviewer,reviewer_rating,review_description,is_verified,date,timestamp,asin
0,2,The Woman in Me,20.93,4.5,Britney Spears,2023,Memoir,amazon.com/Woman-Me-Britney-Spears/dp/16680090...,Unbelievably impressive. Her torn life on paper.,Murderess Marbie,4,I'm only a third way in. Shipped lightening fa...,True,26-10-2023,"Reviewed in the United States October 26, 2023",1668009048
1,2,The Woman in Me,20.93,4.5,Britney Spears,2023,Memoir,amazon.com/Woman-Me-Britney-Spears/dp/16680090...,What a heartbreaking story,L J,5,"""There have been so many times when I was scar...",True,06-11-2023,"Reviewed in the United States November 6, 2023",1668009048
2,2,The Woman in Me,20.93,4.5,Britney Spears,2023,Memoir,amazon.com/Woman-Me-Britney-Spears/dp/16680090...,Britney you are so invincible! You are an insp...,Jamie,5,The media could not be loaded. I personally ha...,True,01-11-2023,"Reviewed in the United States November 1, 2023",1668009048
3,2,The Woman in Me,20.93,4.5,Britney Spears,2023,Memoir,amazon.com/Woman-Me-Britney-Spears/dp/16680090...,"Fast Read, Sad Story",KMG,5,I have been a fan of Britney's music since the...,True,25-10-2023,"Reviewed in the United States October 25, 2023",1668009048
4,2,The Woman in Me,20.93,4.5,Britney Spears,2023,Memoir,amazon.com/Woman-Me-Britney-Spears/dp/16680090...,"Buy it, it’s worth the read!",Stephanie Brown,5,"Whether or not you’re a fan, it’s a great read...",True,01-11-2023,"Reviewed in the United States November 1, 2023",1668009048


In [5]:
# Procesamiento de Train
train = train.drop(columns=['rank', 'url', 'timestamp', 'asin', 'author', 'year_of_publication', 'date', 'asin', 'book_price','is_verified'])

# Separar los géneros
train['genre'] = train['genre'].str.split(',')
train = train.explode('genre')
train = pd.get_dummies(train, columns=['genre'], drop_first=False)

# Procesamiento de las reseñas de los usuarios con TF-IDF ajustado
tfidf_vectorizer_title = TfidfVectorizer(max_features=100, stop_words=stop_words, max_df=0.8, min_df=5)
tfidf_vectorizer_description = TfidfVectorizer(max_features=100, stop_words=stop_words, max_df=0.8, min_df=5)

# Aplicar TF-IDF
tfidf_title = tfidf_vectorizer_title.fit_transform(train['review_title'].fillna(''))
tfidf_description = tfidf_vectorizer_description.fit_transform(train['review_description'].fillna(''))

# Convertir TF-IDF en DataFrames con prefijos
tfidf_title_df = pd.DataFrame(tfidf_title.toarray(), columns=[f'word_title_{word}' for word in tfidf_vectorizer_title.get_feature_names_out()], index=train.index)
tfidf_description_df = pd.DataFrame(tfidf_description.toarray(), columns=[f'word_desc_{word}' for word in tfidf_vectorizer_description.get_feature_names_out()], index=train.index)

# Eliminar columnas de TF-IDF con palabras de longitud menor a 3
tfidf_title_df = tfidf_title_df.loc[:, [col for col in tfidf_title_df.columns if len(col) > len("word_title_") + 2]]
tfidf_description_df = tfidf_description_df.loc[:, [col for col in tfidf_description_df.columns if len(col) > len("word_desc_") + 2]]

# Concatenarlos
train = pd.concat([train, tfidf_title_df, tfidf_description_df], axis=1)

# Eliminar columnas originales
train.drop(columns=['review_title', 'review_description'], inplace=True)

train.head()

Unnamed: 0,book_title,rating,reviewer,reviewer_rating,genre_ Action,genre_ Activities,genre_ Adult,genre_ Adventure,genre_ Alphabet,genre_ Animals,...,word_desc_two,word_desc_want,word_desc_way,word_desc_well,word_desc_work,word_desc_world,word_desc_would,word_desc_writing,word_desc_year,word_desc_years
0,The Woman in Me,4.5,Murderess Marbie,4,False,False,False,False,False,False,...,0.0,0.0,0.299337,0.0,0.092395,0.0,0.291034,0.268176,0.0,0.0
1,The Woman in Me,4.5,L J,5,False,False,False,False,False,False,...,0.0,0.0,0.127401,0.0,0.0,0.0,0.123868,0.0,0.0,0.0
2,The Woman in Me,4.5,Jamie,5,False,False,False,False,False,False,...,0.081837,0.240994,0.0,0.067106,0.083909,0.077308,0.132151,0.0,0.0,0.080612
3,The Woman in Me,4.5,KMG,5,False,False,False,False,False,False,...,0.093898,0.092171,0.23393,0.0,0.0,0.0,0.227441,0.093146,0.0,0.277478
4,The Woman in Me,4.5,Stephanie Brown,5,False,False,False,False,False,False,...,0.0,0.0,0.0,0.251972,0.0,0.290281,0.0,0.0,0.0,0.0


In [6]:
# EXPORTAMOS LOS DF
train.to_csv('../data/train.csv', index=False)