In [33]:
import numpy as np
import pandas as pd
import requests
import zipfile
import io
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report

In [55]:
# Location of the zipped Online Retail dataset
url = "https://raw.githubusercontent.com/klaytoncastro/idp-machinelearning/main/resources/online_retail.zip"

# Download the archive and unpack the CSV entirely in memory (nothing is written to disk).
# The timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() reports HTTP errors (404, 5xx, ...) here instead of letting an
# error page reach ZipFile and fail with a misleading BadZipFile.
response = requests.get(url, timeout=60)
response.raise_for_status()
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    with z.open('online_retail_dataset.csv') as f:
        df = pd.read_csv(f)

# Remove cancelled orders (invoice numbers starting with 'C').
# Casting to str first keeps the mask strictly boolean even if some invoice
# numbers were parsed as non-strings or are missing; otherwise `~` would fail.
is_cancelled = df['InvoiceNo'].astype(str).str.startswith('C')
df = df[~is_cancelled].copy()

# Derive hour-of-day and day-of-week features from the invoice timestamp
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df['Hour'] = df['InvoiceDate'].dt.hour
df['DayOfWeek'] = df['InvoiceDate'].dt.dayofweek

# One encoder per categorical column so every fitted mapping stays available
# (e.g. for inverse_transform) after this cell has run
le_stock = LabelEncoder()
le_customer = LabelEncoder()
le_product = LabelEncoder()

# Replace each categorical column with its integer label.
# StockCode is cast to str like the other columns so mixed int/str codes cannot
# break the encoder's sorting step.
df['StockCode'] = le_stock.fit_transform(df['StockCode'].astype(str))
# NOTE(review): astype(str) turns a missing CustomerID into the literal 'nan', so all
# rows without a customer id are encoded as one single customer. Consider dropping
# those rows before building per-customer features.
df['CustomerID'] = le_customer.fit_transform(df['CustomerID'].astype(str))
# Description gets its own encoder; re-fitting le_customer here would overwrite
# the CustomerID mapping fitted on the previous line.
df['Description'] = le_product.fit_transform(df['Description'].astype(str))

# Frequency-encode Country: each country becomes its share of all rows
country_freq = df['Country'].value_counts(normalize=True)
df['Country'] = df['Country'].map(country_freq)

# Rescale the numeric features to the [0, 1] range
scaler = MinMaxScaler()
numeric_features = ['Quantity', 'UnitPrice', 'Hour', 'DayOfWeek']
df[numeric_features] = scaler.fit_transform(df[numeric_features].astype('float32'))

# Build the customer x product purchase matrix: one row per CustomerID, one column
# per label-encoded StockCode, 1.0 where the customer bought the product at least once.
# Assumes `df` is already prepared and that StockCode holds the integer labels
# 0..n-1 produced by the LabelEncoder, so column k corresponds to product k.

# Keep each customer-product pair only once so every cell ends up as 0 or 1
df_unique = df[['CustomerID', 'StockCode']].drop_duplicates()

# Count the pairs directly instead of materialising a dense one-hot matrix with one
# row per pair and summing it afterwards. The result is the same table, but peak
# memory is the size of the final matrix rather than (number of pairs x number of products).
df_one_hot = pd.crosstab(df_unique['CustomerID'], df_unique['StockCode']).astype('float64')
# crosstab labels the column axis 'StockCode'; clear it so columns are plain integers
df_one_hot.columns.name = None

print(df_one_hot)

            0     1     2     3     4     5     6     7     8     9     ...  \
CustomerID                                                              ...   
0            0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
1            0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
2            0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
3            0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
4            0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   

            4049  4050  4051  4052  4053  4054  4055  4056  4057  4058  
CustomerID                                                              
0            0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
1            0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
2            0.0   0.0   1.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
3            0.0   0.0   1.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
4       

In [90]:
# Feature matrix: every product column except column 0 (column 0 is used as the
# target elsewhere in the notebook), binarised to 1 where the value is exactly 1.0
# and 0 otherwise.
X = (df_one_hot.iloc[:, 1:].to_numpy() == 1.0).astype(int)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 1 ... 1 1 1]]


In [84]:
# Target: column 0 of the customer x product matrix, i.e. whether each customer
# bought item 0. Reshaped to a single-column matrix so it has one row per customer.
y = df_one_hot[0].to_numpy().reshape(-1, 1)

In [88]:
# Split into training and test sets (20% held out, fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the neural network (MLP)
model = Sequential()
# Take the input width from the data rather than hard-coding 4058: the number of
# product columns depends on the dataset and on the filtering done upstream, and a
# mismatch makes fit() fail with a shape error.
model.add(Dense(8, input_dim=X_train.shape[1], activation='relu'))  # Input layer
model.add(Dense(4, activation='relu'))  # Hidden layer
model.add(Dense(1, activation='sigmoid'))  # Binary output

# Compile the model
# NOTE(review): the recorded run already reports ~0.98 accuracy in the first epoch,
# which suggests the positive class is rare; accuracy alone may be misleading here.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=100)

# Evaluate on the held-out test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Loss: {loss}, Accuracy: {accuracy}')

# Predict purchase probabilities for every customer (training and test rows alike)
predictions = model.predict(X)
print("Predictions:\n", predictions)

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9800 - loss: 0.4307
Epoch 2/100
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9890 - loss: 0.2535
Epoch 3/100
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9914 - loss: 0.1251
Epoch 4/100
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9909 - loss: 0.0832
Epoch 5/100
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9890 - loss: 0.0518
Epoch 6/100
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9900 - loss: 0.0373
Epoch 7/100
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9926 - loss: 0.0256
Epoch 8/100
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9914 - loss: 0.0227
Epoch 9/100
[1m109/109[0m [32m━━━━━━━━━━━