In [23]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

# Load the data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Define the features and the target variable
features = ['Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'Item_Category', 'Price_Tier', 'Visibility_Bins','Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Age']
target = 'Item_Outlet_Sales'


# Feature Engineering
# Extract Item_Category from Item_Identifier
train_data['Item_Category'] = train_data['Item_Identifier'].apply(lambda x: x[:2])
test_data['Item_Category'] = test_data['Item_Identifier'].apply(lambda x: x[:2])

train_data['Outlet_Age'] = 2013 - train_data['Outlet_Establishment_Year']
test_data['Outlet_Age'] = 2013 - test_data['Outlet_Establishment_Year']
print(train_data['Outlet_Age'] )
# Bin Item_MRP into price tiers
train_data['Price_Tier'] = pd.cut(train_data['Item_MRP'], bins=[0, 50, 100, 150, 200, 250], labels=[1, 2, 3, 4, 5])
test_data['Price_Tier'] = pd.cut(test_data['Item_MRP'], bins=[0, 50, 100, 150, 200, 250], labels=[1, 2, 3, 4, 5])

# Bin Item_Visibility into categories
train_data['Visibility_Bins'] = pd.cut(train_data['Item_Visibility'], bins=[-1, 0.1, 0.2, 0.3, 1], labels=[1, 2, 3, 4])
test_data['Visibility_Bins'] = pd.cut(test_data['Item_Visibility'], bins=[-1, 0.1, 0.2, 0.3, 1], labels=[1, 2, 3, 4])

# Handle missing values
train_data['Item_Weight'].fillna(train_data['Item_Weight'].mean(), inplace=True)
test_data['Item_Weight'].fillna(test_data['Item_Weight'].mean(), inplace=True)

train_data['Outlet_Size'].fillna(train_data['Outlet_Size'].mode()[0], inplace=True)
test_data['Outlet_Size'].fillna(test_data['Outlet_Size'].mode()[0], inplace=True)

# Drop unnecessary columns
train_data.drop(['Item_Identifier', 'Outlet_Establishment_Year'], axis=1, inplace=True)
test_data.drop(['Item_Identifier', 'Outlet_Establishment_Year'], axis=1, inplace=True)

# Define categorical and numerical columns
categorical_cols = ['Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'Item_Category', 'Price_Tier', 'Visibility_Bins']
numerical_cols = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Age']

# label_encoders = {}


# for col in categorical_cols:
#     le = OneHotEncoder()
#     train_data[col] = le.fit_transform(train_data[col])
#     test_data[col] = le.transform(test_data[col])
#     label_encoders[col] = le


# Split the data into training and testing sets
X_train, y_train = train_data[features], train_data[target]
X_test = test_data[features] # Only extract features from test_df
#y_test = test_data[target]
#-----------------------------

# def preprocess_numerical(df, numerical_cols):
#     scaler = StandardScaler()
#     df_scaled = pd.DataFrame(scaler.fit_transform(df[numerical_cols]),
#                              columns=numerical_cols,
#                              index=df.index)
#     return df_scaled, scaler

# def preprocess_categorical(df, categorical_cols):
#     encoder = OneHotEncoder( handle_unknown='ignore')
#     encoded_data = encoder.fit_transform(df[categorical_cols])
#     encoded_df = pd.DataFrame(encoded_data,
#                               columns=encoder.get_feature_names_out(categorical_cols),
#                               index=df.index)
#     return encoded_df, encoder

# def preprocess_data(df, numerical_cols, categorical_cols):
#     # Preprocess numerical data
#     df_numerical, num_scaler = preprocess_numerical(df, numerical_cols)

#     # Preprocess categorical data
#     df_categorical, cat_encoder = preprocess_categorical(df, categorical_cols)

#     # Combine the preprocessed data
#     df_preprocessed = pd.concat([df_numerical, df_categorical], axis=1)

#     return df_preprocessed, num_scaler, cat_encoder


# # Assuming df is your DataFrame
# X_train_preprocessed, X_train_num_scaler, X_train_cat_encoder = preprocess_data(X_train, numerical_cols, categorical_cols)
# X_test_preprocessed, _, _ = preprocess_data(X_test, numerical_cols, categorical_cols)
#----------------------------


# Scale the data using MinMaxScaler
# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# Define the neural network model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu')
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Preprocessing pipeline for base models
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)


X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)


# Train the model
model.fit(X_train_processed, y_train, epochs=100, batch_size=32)

# Make predictions on the test set
y_pred_train = model.predict(X_train_processed) # Predict 'Item_Outlet_Sales'

# You can now further process 'y_pred' (predicted sales)
# e.g., create a new column in 'test_df' with the predictions:
# test_df['Predicted_Item_Outlet_Sales'] = y_pred


# Evaluate the model
#rmse = np.sqrt(np.mean((y_train - y_pred_train) ** 2))
#print('RMSE:', rmse)

0       14
1        4
2       14
3       15
4       26
        ..
8518    26
8519    11
8520     9
8521     4
8522    16
Name: Outlet_Age, Length: 8523, dtype: int64
Epoch 1/100


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Item_Weight'].fillna(train_data['Item_Weight'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['Item_Weight'].fillna(test_data['Item_Weight'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never w

[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 7420727.5000
Epoch 2/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 3241908.2500
Epoch 3/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 2386687.7500
Epoch 4/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 1819641.7500
Epoch 5/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 1256857.2500
Epoch 6/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 1237350.8750
Epoch 7/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1174846.3750
Epoch 8/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1199810.2500
Epoch 9/100
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1157031.1250
Epoch 10/100
[1m267/267[0m [32m━━━━━━

ValueError: operands could not be broadcast together with shapes (8523,) (8523,16) 

In [24]:
y_pred_test = model.predict(X_test_processed) # Predict 'Item_Outlet_Sales'


[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [27]:
test_data.columns

Index(['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type',
       'Item_MRP', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Category', 'Outlet_Age', 'Price_Tier',
       'Visibility_Bins'],
      dtype='object')

In [36]:
# Prepare Submission File
submission = pd.DataFrame({
    'Item_Identifier': pd.read_csv('test.csv')['Item_Identifier'],
    'Outlet_Identifier': pd.read_csv('test.csv')['Outlet_Identifier'],
    'Item_Outlet_Sales': y_pred_test[:,0]
})

submission.to_csv('submission8.csv', index=False)
print("Submission file created successfully!")

Submission file created successfully!


In [32]:
len(y_pred_test)

5681

In [34]:
y_pred_test.shape

(5681, 16)

In [37]:
from google.colab import files
files.download("submission8.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>