## Neural Network using Keras Regressor

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# NOTE(review): %pylab bulk-imports numpy and matplotlib names into the
# interactive namespace (see the "Populating the interactive namespace"
# message it prints). The visible cells only rely on the explicit np/pd
# imports, so this magic looks removable -- confirm before dropping it.
%pylab
# Select the inline backend so figures render inside the notebook output.
%matplotlib inline

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the dataset from a semicolon-separated, UTF-8 encoded CSV file.
data = pd.read_csv(
    'data_norm_vif_v2.csv',
    sep=';',
    encoding='utf-8',
)

In [3]:
# Display the column labels of the loaded frame.
data.columns

Index(['district_1', 'district_10', 'district_11', 'district_12',
       'district_13', 'district_14', 'district_15', 'district_16',
       'district_17', 'district_18',
       ...
       'WordCloudHighTri', 'WordCloudHighTriTail', 'WCSentAllTriTail',
       'RevLen', 'Points', 'TriLowProb', 'TriMedProb', 'TriHighProb', 'funny',
       'stars'],
      dtype='object', length=116)

In [4]:
# Every column except the target 'stars' is used as a predictor.
# Deleting by position keeps the original semantics: only the first match is
# dropped, and a missing 'stars' column raises ValueError.
features = list(data.columns)
del features[features.index('stars')]

In [5]:
# Split into train and test sets: 25% of the rows are held out, and the
# random_state is fixed so the partition is the same on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    data[features],
    data['stars'],
    test_size=0.25,
    random_state=42,
)

In [6]:
# Shapes of the four pieces of the split, in the order X_train, X_test, y_train, y_test.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((6012, 115), (2004, 115), (6012,), (2004,))

In [7]:
# Baseline error: the MSE obtained by always predicting the mean of the
# target, computed separately for the training and the test targets.
tuple(np.mean((target - np.mean(target)) ** 2) for target in (y_train, y_test))

(0.6546189819850016, 0.6193807494790937)

In [8]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel

In [9]:
# Fit an extra-trees regressor (fixed seed) on the training split and show
# the importance it assigns to each input column.
clf = ExtraTreesRegressor(random_state=42).fit(X_train, y_train)
clf.feature_importances_

array([  2.45193407e-03,   2.80305972e-03,   1.92045612e-03,
         5.33371001e-03,   4.50010405e-03,   2.91813300e-03,
         3.86670832e-03,   2.59856371e-03,   1.08490536e-02,
         8.21617010e-05,   7.90735022e-03,   2.64891039e-03,
         1.80049716e-04,   2.92718761e-03,   5.69007584e-03,
         2.98937207e-03,   4.33310652e-03,   5.04198115e-04,
         2.67524965e-03,   4.88294441e-03,   2.82967030e-06,
         1.44415905e-03,   1.86501152e-04,   7.47217908e-05,
         3.85894261e-05,   6.61209673e-04,   2.32755537e-04,
         4.61279412e-05,   1.07246046e-05,   8.22916716e-05,
         1.47938094e-04,   1.26629594e-04,   2.28527003e-03,
         7.17497709e-04,   2.34068216e-04,   3.83323439e-04,
         1.40264620e-05,   2.45515551e-03,   1.33230593e-03,
         2.05788711e-03,   5.36911290e-04,   1.06754285e-05,
         1.72306201e-03,   1.83710530e-03,   8.03805456e-04,
         4.14970506e-04,   8.04829095e-03,   8.45493798e-03,
         1.72351819e-04,

In [10]:
# Wrap the already-fitted forest (prefit=True, so it is not refitted) in a
# selector, reduce the training matrix to the columns the selector retains,
# and display the resulting shape.
model = SelectFromModel(estimator=clf, prefit=True)
X_new = model.transform(X_train)
np.shape(X_new)

(6012, 17)

In [11]:
# Rank the features by importance (highest first) and show the top 17.
list(reversed(sorted(zip(features, clf.feature_importances_), key=lambda pair: pair[1])))[:17]

[('Points', 0.34925123076683934),
 ('WCSentEng', 0.093214688073218949),
 ('NegCloud', 0.05523048901280446),
 ('RevLen', 0.041244761973345645),
 ('WordCloudHigh', 0.0378046680843835),
 ('WordCloudHighGer', 0.029430737999222901),
 ('WCSentGer', 0.026488612283482028),
 ('review_count', 0.024169742996141403),
 ('funny', 0.02034912677279567),
 ('TriLowProb', 0.01798397883778979),
 ('WordCloudHighTri', 0.016618081592012335),
 ('main_cat_Others', 0.016119848283084708),
 ('main_cat_Beauty & Spas', 0.012474579512593327),
 ('main_cat_Shopping', 0.011442853735780939),
 ('district_17', 0.01084905355970992),
 ('main_cat_Food', 0.0090514815070402259),
 ('TriHighProb', 0.0087954232085630246)]

In [12]:
# Keep the top-ranked (feature, importance) pairs, highest importance first.
# The cut-off is taken from the width of X_new (the matrix produced by the
# SelectFromModel transform) instead of a hard-coded 17, so this list cannot
# silently drift from the selector if the fitted forest changes.
most_imp = sorted(
    zip(features, clf.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)[:X_new.shape[1]]

In [13]:
# Keep only the feature names from the (name, importance) pairs.
cols_sel = [name for name, _importance in most_imp]

In [14]:
# Display the selected feature names.
cols_sel

['Points',
 'WCSentEng',
 'NegCloud',
 'RevLen',
 'WordCloudHigh',
 'WordCloudHighGer',
 'WCSentGer',
 'review_count',
 'funny',
 'TriLowProb',
 'WordCloudHighTri',
 'main_cat_Others',
 'main_cat_Beauty & Spas',
 'main_cat_Shopping',
 'district_17',
 'main_cat_Food',
 'TriHighProb']

In [15]:
# Exclude 'district_17' from the selected features.
# NOTE(review): the reason for dropping this column is not shown in the notebook.
# Rebinding to a filtered copy (instead of list.remove) keeps the cell
# idempotent: re-running it no longer raises ValueError once the name is gone.
cols_sel = [col for col in cols_sel if col != 'district_17']

In [16]:
# Number of selected features currently in cols_sel.
len(cols_sel)

16

In [17]:
# Restrict both splits to the selected feature columns.
X_train2, X_test2 = (frame[cols_sel] for frame in (X_train, X_test))

In [18]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [19]:
# First network: one 16-unit ReLU Dense layer fed by the selected features,
# followed by a single linear output unit.

# Seed NumPy's global RNG so repeated runs of this cell start from the same
# random state, matching the random_state=42 used for the split and the forest.
# NOTE(review): full run-to-run reproducibility on the TensorFlow backend may
# also require seeding TensorFlow itself -- confirm for the installed version.
np.random.seed(42)

model = Sequential()
model.add(Dense(16, kernel_initializer='uniform', activation='relu',
                input_shape=(X_train2.shape[1],)))
model.add(Dense(1, activation='linear'))
# Train on MAE; MSE is tracked as an additional metric.
model.compile(loss='mae', optimizer='adam', metrics=['mse'])

# Stop training once the validation MSE has gone 10 epochs without improving.
early_stop = EarlyStopping(monitor='val_mean_squared_error', patience=10, verbose=1)

# The last 20% of the training rows are held out as the validation set.
model.fit(X_train2.values, y_train.values, epochs=50, validation_split=0.2,
          batch_size=4, callbacks=[early_stop])

# Evaluate on the test split in a single batch; the result is [loss (MAE), MSE].
score = model.evaluate(X_test2.values, y_test.values, batch_size=len(y_test))
score

Train on 4809 samples, validate on 1203 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 00030: early stopping


[0.46380159258842468, 0.37635517120361328]

In [20]:
# Second network: a 16-unit ReLU Dense layer fed by the selected features,
# a 4-unit ReLU hidden layer, and a single linear output unit.

# Seed NumPy's global RNG so repeated runs of this cell start from the same
# random state, matching the random_state=42 used for the split and the forest.
# NOTE(review): full run-to-run reproducibility on the TensorFlow backend may
# also require seeding TensorFlow itself -- confirm for the installed version.
np.random.seed(42)

model = Sequential()
model.add(Dense(16, kernel_initializer='uniform', activation='relu',
                input_shape=(X_train2.shape[1],)))
model.add(Dense(4, kernel_initializer='uniform', activation='relu'))
model.add(Dense(1, activation='linear'))
# Train on MAE; MSE is tracked as an additional metric.
model.compile(loss='mae', optimizer='adam', metrics=['mse'])

# Stop training once the validation MSE has gone 10 epochs without improving.
early_stop = EarlyStopping(monitor='val_mean_squared_error', patience=10, verbose=1)

# The last 20% of the training rows are held out as the validation set.
model.fit(X_train2.values, y_train.values, epochs=50, validation_split=0.2,
          batch_size=4, callbacks=[early_stop])

# Evaluate on the test split in a single batch; the result is [loss (MAE), MSE].
score = model.evaluate(X_test2.values, y_test.values, batch_size=len(y_test))
score

Train on 4809 samples, validate on 1203 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 00027: early stopping


[0.46097105741500854, 0.37981972098350525]

In [22]:
# Third network: a 16-unit ReLU Dense layer fed by the selected features,
# two ReLU hidden layers (8 and 4 units), and a single linear output unit.

# Seed NumPy's global RNG so repeated runs of this cell start from the same
# random state, matching the random_state=42 used for the split and the forest.
# NOTE(review): full run-to-run reproducibility on the TensorFlow backend may
# also require seeding TensorFlow itself -- confirm for the installed version.
np.random.seed(42)

model = Sequential()
model.add(Dense(16, kernel_initializer='uniform', activation='relu',
                input_shape=(X_train2.shape[1],)))
model.add(Dense(8, kernel_initializer='uniform', activation='relu'))
model.add(Dense(4, kernel_initializer='uniform', activation='relu'))
model.add(Dense(1, activation='linear'))
# Train on MAE; MSE is tracked as an additional metric.
model.compile(loss='mae', optimizer='adam', metrics=['mse'])

# Longer budget than the earlier networks: up to 100 epochs, stopping once the
# validation MSE has gone 20 epochs without improving.
early_stop = EarlyStopping(monitor='val_mean_squared_error', patience=20, verbose=1)

# The last 20% of the training rows are held out as the validation set.
model.fit(X_train2.values, y_train.values, epochs=100, validation_split=0.2,
          batch_size=4, callbacks=[early_stop])

# Evaluate on the test split in a single batch; the result is [loss (MAE), MSE].
score = model.evaluate(X_test2.values, y_test.values, batch_size=len(y_test))
score

Train on 4809 samples, validate on 1203 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 00049: early stopping


[0.43825238943099976, 0.33742132782936096]