In [2]:
import pandas as pd
import gc
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import lightgbm as lgb

In [3]:
# Load the "autoreload" extension so edited modules can be picked up
# without restarting the kernel.
%load_ext autoreload

# Mode 1: always reload modules marked with "%aimport".
%autoreload 1

import os
import sys

# Add the 'src' directory (one level above the current working directory)
# as a location from which modules can be imported.
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
# Guard the append so that re-running this cell does not pile up duplicate
# entries on sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [27]:
# Mark the project modules for autoreload ("%aimport"), then import the
# helpers defined in them. Both packages are resolved through the 'src'
# directory appended to sys.path earlier in the notebook.
%aimport data.read_data
%aimport models.train_model
from data.read_data import read
from models.train_model import split_train

In [5]:
# Notebook-level configuration.
PATH = '../data/raw'                      # directory handed to read()
MAX_FEATURES_ITEM_DESCRIPTION = 40_000    # max_features for the TF-IDF vectorizer
NAME_MIN_DF = 10                          # not referenced by the cells visible here

# read() hands back the training and test frames as a pair.
train, test = read(PATH)

In [6]:
# Peek at the first five training rows.
train.head(n=5)

Unnamed: 0,ID,review_content,review_title,review_stars,product,Target
0,0,En appelant un acheteur pour demander si l'écr...,La Police s'inscrit en acheteur privé sur Pric...,5,2fbb619e3606f9b7c213e858a109cda771aa2c47ce50d5...,0
1,1,"Alors, là, on a affaire au plus grand Navet ja...",Chef D'Oeuvre Absolu en vue...,5,7b56d9d378d9e999d293f301ac43d044cd7b4786d09afb...,1
2,2,Effet garanti sur la terrase. Ils donnent immé...,Effet garanti sur la terrase. Ils donnent immé...,3,7b37bf5dcb2fafd9229897910318a7dfa11a04ca36893c...,0
3,3,tres bon rapport qualite prix tre pratique en ...,bon produit,4,77d2dbd504b933ab3aaf7cb0cd81c22f7c3549012f4f88...,1
4,4,Ordinateur de bureau trés bien pour quelqu'un ...,Apple Power MAC G4,3,f574512e7d2dd1dd73c7f8f804bf16f14c932c5651a01b...,1


In [7]:
# Keep the labels aside and remember how many rows belong to the training
# set; the row count is used later to split the stacked feature matrix.
y = train['Target']
nrow_train = len(train)

In [8]:
# Stack train on top of test so both go through one feature pipeline.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the labels of `train` and `test` are repeated, which makes any label-based
# access or alignment on `merge` ambiguous. Row order is unchanged, so the
# positional split by nrow_train further down still applies.
merge: pd.DataFrame = pd.concat([train, test], ignore_index=True)

In [9]:
# The separate frames are no longer needed once merged; drop the names and
# ask the garbage collector to reclaim the memory.
del train, test
gc.collect()

19

In [10]:
def handle_missing_inplace(dataset):
    """Replace missing review text with the placeholder string 'missing'.

    Mutates ``dataset`` in place: missing entries in the ``review_content``
    and ``review_title`` columns become ``'missing'``. Other columns are left
    untouched. Returns ``None``.
    """
    # Assign the filled column back instead of calling
    # ``dataset[col].fillna(..., inplace=True)``: that form is chained
    # assignment, which emits a warning on pandas 2.x and does not update
    # ``dataset`` at all when Copy-on-Write is enabled.
    for column in ('review_content', 'review_title'):
        dataset[column] = dataset[column].fillna('missing')

In [11]:
def to_categorical(dataset):
    """Convert the ``review_stars`` column of ``dataset`` to category dtype.

    The converted column is assigned back onto ``dataset`` itself; nothing is
    returned.
    """
    stars = dataset['review_stars']
    dataset['review_stars'] = stars.astype('category')

In [12]:
# Fill missing review text in the merged frame (modifies `merge` in place).
handle_missing_inplace(merge)

In [13]:
# Turn review_stars into a categorical column (modifies `merge` in place).
to_categorical(merge)

In [14]:
# TF-IDF over 1- to 3-grams of the review body, capped at
# MAX_FEATURES_ITEM_DESCRIPTION terms and fitted on train + test together.
#
# No stop-word list is applied: the reviews are French, and scikit-learn's
# only built-in list ('english') would strip French tokens that happen to
# collide with English stop words (e.g. "on") while leaving actual French
# stop words in place. Pass an explicit list of French stop words here if
# filtering is wanted.
tv = TfidfVectorizer(max_features=MAX_FEATURES_ITEM_DESCRIPTION,
                     ngram_range=(1, 3),
                     stop_words=None)
X_description = tv.fit_transform(merge['review_content'])

In [15]:
# One-hot encode the star rating, then wrap the indicator columns in a CSR
# matrix so they can be stacked next to the TF-IDF features.
star_dummies = pd.get_dummies(merge['review_stars'], sparse=True)
X_dummies = csr_matrix(star_dummies.values)

In [16]:
# Full feature matrix in CSR form: rating indicators first, TF-IDF columns after.
sparse_merge = hstack([X_dummies, X_description], format='csr')

In [17]:
# Undo the stacking by position: the leading nrow_train rows are the
# training set, everything after them is the test set.
X, X_test = sparse_merge[:nrow_train], sparse_merge[nrow_train:]


In [19]:
# Wrap the training features and labels in a LightGBM Dataset.
# The trailing comment keeps a disabled max_bin=8192 argument for reference.
d_train = lgb.Dataset(X, label=y)#, max_bin=8192)

In [21]:
# LightGBM settings: binary objective, AUC as the reported metric.
# NOTE(review): with max_depth=3 a tree can hold at most 2**3 = 8 leaves, so
# num_leaves=100 is never reached; learning_rate=0.75 is also unusually high.
# Both are kept as-is so the trained model is unchanged.
params = {
    'learning_rate': 0.75,
    'application': 'binary',
    'max_depth': 3,
    'num_leaves': 100,
    'verbosity': -1,
    'metric': 'auc',
}

# `verbose_eval` is intentionally not passed: no validation set is supplied,
# so there was nothing for it to log, and LightGBM >= 4.0 removed the
# argument from train() (passing it raises TypeError).
model = lgb.train(params, train_set=d_train, num_boost_round=3335)

# Start the blended prediction with the LightGBM scores at weight 0.6.
preds = 0.6 * model.predict(X_test)

In [22]:
# Second model of the blend: ridge regression on the same features.
# Note that `model` is rebound here from the LightGBM booster to the Ridge
# estimator.
model = Ridge(solver="sag", fit_intercept=True, random_state=205).fit(X, y)

# Add the ridge scores at weight 0.4 on top of the existing predictions.
# `preds` is updated in place, so running this cell a second time without
# re-running the previous one adds the ridge term again.
ridge_scores = model.predict(X_test)
preds += 0.4 * ridge_scores