In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


Using TensorFlow backend.


In [2]:
train = pd.read_csv('csv/train.csv')
train = train.dropna(axis="columns")
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4651 entries, 0 to 4650
Data columns (total 10 columns):
title               4651 non-null object
description         4651 non-null object
plot                4651 non-null object
csm_review          4651 non-null object
need_to_know        4651 non-null object
csm_rating          4651 non-null int64
Genre               4651 non-null object
Book type           4651 non-null object
Publication date    4651 non-null object
Last updated        4651 non-null object
dtypes: int64(1), object(9)
memory usage: 363.5+ KB


In [3]:
labelencoder = LabelEncoder()
tokenizer = Tokenizer()
embedding_columns = ['title','description', 'plot', 'csm_review', 'need_to_know']

def label_encoder(df):
    df.loc[:, "genre_cat"] = labelencoder.fit_transform(df.loc[:, "Genre"])
    df.loc[:, "book_type_cat"] = labelencoder.fit_transform(df.loc[:, "Book type"])
    
    return df

def create_embeddings(df):
    for column in embedding_columns:
        tokenizer.fit_on_texts(list(df[column]))
    
    for column in embedding_columns:
        df[column + "_seq"] = tokenizer.texts_to_sequences(df[column])
    
    return df

In [4]:
train = label_encoder(train)
train = create_embeddings(train)

In [5]:
word_index = tokenizer.word_index
print("Found %s unique tokens." % len(word_index))

Found 45360 unique tokens.


In [6]:
X_train = train.loc[:, "genre_cat":"need_to_know_seq"]
y_train = train.loc[:, "csm_rating"]

In [7]:
train.head()

Unnamed: 0,title,description,plot,csm_review,need_to_know,csm_rating,Genre,Book type,Publication date,Last updated,genre_cat,book_type_cat,title_seq,description_seq,plot_seq,csm_review_seq,need_to_know_seq
0,"Bloodhound: Beka Cooper, Book 2",Second book in fantasy series is more mature t...,"BLOODHOUND finds young policewoman -- aka ""Dog...","Like its predecessor, Terrier, Bloodhound is m...",Parents need to know that this fantasy/crime s...,12,Fantasy,Fiction,"April 14, 2009","June 19, 2019",18,0,"[11635, 7237, 4825, 18, 450]","[312, 18, 6, 129, 58, 7, 44, 407, 85, 59]","[11635, 189, 67, 30433, 2849, 300, 7237, 4825,...","[52, 122, 4167, 7926, 11635, 7, 104, 44, 5, 3,...","[29, 37, 4, 34, 9, 16, 129, 1190, 28, 168, 242..."
1,"The School for Good and Evil, Book 1",Fractured fairy tale has plenty of twists for ...,When best friends Sophie and Agatha are stolen...,The School for Good and Evil is no run-of-the-...,Parents need to know that The School for Good ...,8,Fairy Tale,Fiction,"May 14, 2013","October 18, 2017",16,0,"[1, 55, 11, 94, 2, 317, 18, 354]","[4973, 419, 97, 38, 327, 5, 893, 11, 129, 266]","[35, 117, 88, 1469, 2, 7374, 15, 2092, 213, 26...","[1, 55, 11, 94, 2, 317, 7, 96, 538, 5, 1, 4743...","[29, 37, 4, 34, 9, 1, 55, 11, 94, 2, 317, 7, 3..."
2,"Twilight: The Twilight Saga, Book 1",Overlong but engrossing popular vampire romance.,"When her mother gets remarried, Bella Swan mov...",This incredibly long book is really two books....,"Parents need to know that, while very mild by ...",13,Fantasy,Fiction,"October 1, 2005","February 07, 2020",18,0,"[2826, 1, 2826, 1422, 18, 354]","[6144, 13, 2178, 357, 1023, 185]","[35, 10, 136, 157, 17696, 5602, 9792, 655, 6, ...","[16, 2660, 201, 18, 7, 194, 81, 113, 1, 59, 81...","[29, 37, 4, 34, 9, 99, 127, 796, 21, 1, 4398, ..."
3,The Lost Girl,"Engrossing story of an ""echo"" designed to repl...","Eva, a teen living in England, has human-like ...",Sangu Mandanna's debut novel is inspired by Fr...,Parents need to know that The Lost Girl is the...,14,Science Fiction,Fiction,"August 28, 2012","June 19, 2019",38,0,"[1, 295, 77]","[2178, 28, 5, 23, 7629, 2309, 4, 7306, 415, 77]","[4098, 3, 106, 402, 6, 1564, 38, 339, 52, 2926...","[36289, 36290, 895, 98, 7, 853, 21, 5389, 2, 1...","[29, 37, 4, 34, 9, 1, 295, 77, 7, 1, 28, 5, 40..."
4,"Warcross, Book 1",Winning teen girl drives cyberpunk virtual rea...,"As WARCROSS opens, bounty hunter Emika Chen fi...","Cyberpunk fell out of favor for a while, but t...",Parents need to know that Warcross is the firs...,13,Science Fiction,Fiction,"September 12, 2017","December 05, 2019",38,0,"[10191, 18, 354]","[522, 106, 77, 2561, 12765, 3670, 837, 821]","[12, 10191, 868, 8035, 2349, 30440, 14121, 189...","[12765, 3968, 45, 5, 4469, 11, 3, 99, 13, 16, ...","[29, 37, 4, 34, 9, 10191, 7, 1, 59, 627, 5, 3,..."


In [None]:
gensim_word2vec_tr = GensimWord2VecVectorizer(size=50, min_count=3, sg=1, alpha=0.025, iter=10)
xgb = XGBClassifier(learning_rate=0.01, n_estimators=100, n_jobs=-1)
w2v_xgb = Pipeline([
    ('w2v', gensim_word2vec_tr), 
    ('xgb', xgb)
])
w2v_xgb

In [10]:
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train, y_train)

ValueError: DataFrame.dtypes for data must be int, float or bool.
                Did not expect the data types in fields title_seq, description_seq, plot_seq, csm_review_seq, need_to_know_seq

In [6]:
df_y = df.loc[:, "csm_rating"]

Unnamed: 0,genre_cat,book_type_cat,title_seq,description_seq,plot_seq,csm_review_seq,need_to_know_seq
0,32,0,"[1, 520, 1377]","[1151, 823, 26455, 17, 116, 1239]","[1088, 4510, 2, 17712, 8850, 33649, 318, 6, 23...","[1, 520, 1377, 37, 23, 541, 434, 1130, 280, 17...","[28, 38, 4, 34, 9, 1, 520, 1377, 7, 3, 611, 23..."
1,12,0,"[285, 12374]","[6678, 29, 5, 1911, 3526, 348, 6, 6364]","[49, 1, 348, 5, 8441, 84, 1803, 90, 78, 14648,...","[16, 226, 91, 48, 143, 3, 21611, 5, 5888, 139,...","[28, 38, 4, 34, 9, 285, 12374, 7, 1656, 21, 19..."
2,16,0,"[1, 55, 11, 94, 2, 329, 18, 347]","[4571, 436, 97, 37, 317, 5, 886, 11, 128, 264]","[35, 119, 83, 1273, 2, 7623, 15, 2140, 209, 27...","[1, 55, 11, 94, 2, 329, 7, 95, 535, 5, 1, 5098...","[28, 38, 4, 34, 9, 1, 55, 11, 94, 2, 329, 7, 3..."
3,38,0,"[3415, 5, 2115, 1, 2166, 4616, 3900, 18, 347]","[58, 540, 11089, 12, 106, 1070, 6701, 5, 1006,...","[179, 6, 7834, 3415, 5, 2115, 568, 3, 937, 90,...","[350, 1006, 50, 215, 230, 82, 3, 5049, 3539, 2...","[28, 38, 4, 34, 9, 3415, 5, 2115, 1, 2166, 461..."
4,23,0,"[4286, 14337, 6821]","[1723, 98, 568, 7568, 2318, 17, 17581, 1807]","[4286, 14337, 6821, 7, 3, 1342, 98, 27, 516, 5...","[827, 270, 2, 13165, 3161, 16, 399, 98, 7, 311...","[28, 38, 4, 34, 9, 4286, 14337, 6821, 21, 1062..."
