<a href="https://colab.research.google.com/github/danielecaliari/AML/blob/main/modelli.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from scipy import sparse


In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
root_dir = "/content/drive/MyDrive/"
base_dir = root_dir + 'progetto_AML/'

Mounted at /content/drive


In [3]:

label = pd.read_csv(base_dir + 'label.csv')
label = label['price'].values

Regression without embeddings

In [None]:
from scipy import sparse
train = sparse.load_npz(base_dir + 'train.npz')
test = sparse.load_npz(base_dir + 'test.npz')


In [None]:
label.shape

(1481661,)

In [None]:
def create_mlp(dim):
	# define our MLP network
  model = Sequential()
  model.add(Dense(256, input_dim=dim, activation="relu"))
  model.add(Dense(128, activation="relu"))
  model.add(Dense(64, activation="relu"))
  model.add(Dense(1, activation="linear"))
  return model

Split in training & validation

In [None]:
(train, validation, label_train, label_validation) = train_test_split(train , label , test_size=0.25, random_state=42)

In [None]:
validation

<370416x45811 sparse matrix of type '<class 'numpy.float64'>'
	with 12798615 stored elements in Compressed Sparse Row format>

In [None]:
train.shape[1]

45811

In [None]:
model = create_mlp(train.shape[1])
model.compile(loss="mean_squared_logarithmic_error")
model.fit(x=train, y=label_train, validation_data=(validation, label_validation), epochs=20, batch_size=128)

Prediction

In [None]:
preds = model.predict(test)
# fare inversa log

Word embedding

In [4]:
train_emb = sparse.load_npz(base_dir + 'train_emb.npz')
test_emb = sparse.load_npz(base_dir + 'test_emb.npz')

In [5]:
train = pd.read_csv(base_dir + 'train.tsv', sep='\t')
test = pd.read_csv(base_dir + 'test.tsv', sep='\t')

In [6]:
train = train[train['price']>0].reset_index(drop=True)

In [7]:
train.item_description=train.item_description.astype(str)
test.item_description=test.item_description.astype(str)

In [8]:
train_names = train['name']
train_descriptions = train['item_description']

train_names_test = train['name']
train_descriptions_test = train['item_description']

In [9]:
(train_emb, validation_emb, label_train_emb, label_validation_emb) = train_test_split(train_emb , label , test_size=0.25, shuffle= False)
(train_names, train_names_validation) = train_test_split(train_names, test_size=0.25, shuffle= False)
(train_descriptions, train_descriptions_validation) = train_test_split(train_descriptions, test_size=0.25, shuffle= False)

In [95]:
train_names

0            MLB Cincinnati Reds T Shirt Size XL
1               Razer BlackWidow Chroma Keyboard
2                                 AVA-VIV Blouse
3                          Leather Horse Statues
4                           24K GOLD plated rose
                           ...                  
1111896                    Pink Timberland Boots
1111897              Lose fat weight with acxion
1111898    New Victoria's Secret mesh tights. XS
1111899                          Gap deer outfit
1111900            Women's Miss Me jeans size 28
Name: name, Length: 1111901, dtype: object

In [10]:
names = []
for n in train_names:
  names.append(n)

In [11]:
descriptions = []
for n in train_descriptions:
  descriptions.append(n)

In [12]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(nb_words=100000)
tokenizer.fit_on_texts(names)
sequences = tokenizer.texts_to_sequences(names)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

names_data = pad_sequences(sequences, maxlen=20)




Found 100802 unique tokens.


In [13]:
tokenizer = Tokenizer(nb_words=100000)
tokenizer.fit_on_texts(descriptions)
sequences = tokenizer.texts_to_sequences(descriptions)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

descriptions_data = pad_sequences(sequences, maxlen=20)



Found 178020 unique tokens.


In [14]:
train_emb.shape

(1111245, 5811)

In [15]:
descriptions_data.shape

(1111245, 20)

In [None]:
model = Sequential()
  model.add(Dense(256, input_dim=dim, activation="relu"))
  model.add(Dense(128, activation="relu"))
  model.add(Dense(64, activation="relu"))
  model.add(Dense(1, activation="linear"))

In [53]:
from tensorflow import keras
num_words_name = 100802  # Size of vocabulary obtained when preprocessing text data
num_words_description = 178020  # Size of vocabulary obtained when preprocessing text data


name_input = keras.Input(shape=(None,), name="name")  # Variable-length sequence of ints
description_input = keras.Input(shape=(None,), name="description")  # Variable-length sequence of ints
other_features_input = keras.Input(shape=(train_emb.shape[0],train_emb.shape[1]), name="other_features_input")

# Embed each word in the title into a 64-dimensional vector
name_features = keras.layers.Embedding(num_words_name, 32)(name_input)
# Embed each word in the text into a 64-dimensional vector
description_features = keras.layers.Embedding(num_words_description, 64)(description_input)

other_features_relu = keras.layers.Dense(128, activation="relu")(other_features_input)

# Merge all available features into a single large vector via concatenation
x = keras.layers.concatenate([name_features, description_features, other_features_relu])


# Stick a logistic regression for priority prediction on top of the features
price_pred = keras.layers.Dense(1, activation="linear", name="price")(x)

# Instantiate an end-to-end model predicting both priority and department
model = keras.Model(
    inputs=[name_input, description_input, other_features_input],
    outputs=[price_pred],
)

In [54]:
model.compile(
    optimizer=keras.optimizers.RMSprop(1e-3),
    loss={
        "price": keras.losses.MeanSquaredLogarithmicError,
    },
    loss_weights=[1.0],
)

In [59]:
model.fit(
    {"name": names_data, "description": descriptions_data, "other_features_input": train_emb},
    {"price": label_train_emb},
    validation_data=(train_names_validation, train_descriptions_validation, validation_emb, label_validation_emb),
    epochs=20,
    batch_size=128,
)

ValueError: ignored