# Choosing XGBoost

In [154]:
import pandas as pd

# Load the raw sales dataset.
# NOTE: this is a machine-specific absolute path; point `file_path` at your own
# copy of sales.csv before running the notebook on another machine.
file_path = r'C:\Users\fmrol\Documents\Ironhack\Classes Material\Week 2\Day 5\Project\sales.csv'
data = pd.read_csv(file_path)

# DataFrame.info() prints its summary and returns None, so it is called only for
# its printed output instead of being stored in a variable.
data.info()

# Keep the preview in `data_head` and show it as the cell's result.
data_head = data.head()
data_head


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640840 entries, 0 to 640839
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Unnamed: 0           640840 non-null  int64 
 1   store_ID             640840 non-null  int64 
 2   day_of_week          640840 non-null  int64 
 3   date                 640840 non-null  object
 4   nb_customers_on_day  640840 non-null  int64 
 5   open                 640840 non-null  int64 
 6   promotion            640840 non-null  int64 
 7   state_holiday        640840 non-null  object
 8   school_holiday       640840 non-null  int64 
 9   sales                640840 non-null  int64 
dtypes: int64(8), object(2)
memory usage: 48.9+ MB


(None,
    Unnamed: 0  store_ID  day_of_week        date  nb_customers_on_day  open  \
 0      425390       366            4  2013-04-18                  517     1   
 1      291687       394            6  2015-04-11                  694     1   
 2      411278       807            4  2013-08-29                  970     1   
 3      664714       802            2  2013-05-28                  473     1   
 4      540835       726            4  2013-10-10                 1068     1   
 
    promotion state_holiday  school_holiday  sales  
 0          0             0               0   4422  
 1          0             0               0   8297  
 2          1             0               0   9729  
 3          1             0               0   6513  
 4          1             0               0  10882  )

# Preprocessing phase

### Cleaning

##### 1 Checking for missing values

In [155]:
# Count the nulls in every column and print only the columns that have any
missing_values = data.isna().sum()
has_missing = missing_values > 0
print(missing_values[has_missing])


Series([], dtype: int64)


In [156]:
###### 1.1 Dropping unnecessary column
# 'Unnamed: 0' is presumably a row index left over from the CSV export; it is not used as a feature.
# Rebinding `data` with errors='ignore' keeps this cell safe to re-run: a second
# execution no longer raises KeyError once the column is already gone.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')


In [157]:
# No missing values found

#### 2 Identify Non-Numeric Columns (Categorical Data)

In [158]:

# Names of the columns pandas stored with the generic `object` dtype
object_typed = data.select_dtypes(include='object')
categorical_columns = object_typed.columns
print(categorical_columns)


Index(['date', 'state_holiday'], dtype='object')


In [159]:
# The columns 'date' (e.g. 2013-04-18) and 'state_holiday' are non-numeric, so we will convert them into numeric values.
# Note on 'state_holiday': its 0 values might be stored as strings ("0" instead of 0).

##### 2.1  Converting Non-Numeric Columns (Categorical Data)

###### 2.1.1 For 'Date'

In [160]:
# For 'date': parse the strings once, then derive year, month and day of month
# as separate numeric columns, and recompute the day of the week from the date.
date_parsed = pd.to_datetime(data['date'])
data['date'] = date_parsed

for part in ('year', 'month', 'day'):
    data[part] = getattr(date_parsed.dt, part)

# NOTE(review): this overwrites the 'day_of_week' column that came with the CSV.
# pandas' dt.dayofweek counts Monday as 0, while the loaded sample rows look like
# Monday=1 (2013-04-18, a Thursday, is stored as 4) - confirm that any new data
# scored with the model goes through the same conversion.
data['day_of_week'] = date_parsed.dt.dayofweek

In [161]:
# The raw 'date' column is no longer needed once its parts have been extracted
data = data.drop(columns=['date'])


In [162]:
# Preview the date-derived columns on the first rows
date_part_columns = ['year', 'month', 'day']
print(data[date_part_columns].head())

   year  month  day
0  2013      4   18
1  2015      4   11
2  2013      8   29
3  2013      5   28
4  2013     10   10


###### 2.1.2 For state_holiday

In [163]:
# Applying label encoding to 'state_holiday'

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Cast to str before fitting: if the CSV yields a mix of integer 0 and string
# values in this object column, LabelEncoder cannot sort the mixed types and
# raises. When every value is already a string the cast changes nothing.
data['state_holiday'] = label_encoder.fit_transform(data['state_holiday'].astype(str))

# Check unique values in the state_holiday column
print(data['state_holiday'].unique())

[0 1 3 2]


#### 3 Split the data into training and testing sets.


In [164]:
from sklearn.model_selection import train_test_split

# Target is the 'sales' column; every remaining column is used as a feature
y = data['sales']
X = data.drop(columns=['sales'])

# Hold out 20% of the rows for testing (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each piece to confirm the split sizes
for label, part in (
    ("Training features", X_train),
    ("Testing features", X_test),
    ("Training target", y_train),
    ("Testing target", y_test),
):
    print(f"{label}: {part.shape}")


Training features: (512672, 10)
Testing features: (128168, 10)
Training target: (512672,)
Testing target: (128168,)


# Training the Model (Model chosen XGBoost)

#### 1 Install XGBoost


In [165]:
pip install xgboost


Note: you may need to restart the kernel to use updated packages.


#### 2 Import required Libraries 


In [166]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score


#### 3 Train the XGBoost Model


In [167]:
# Hyperparameters for the XGBoost regressor, gathered in one place
xgb_params = {
    "objective": "reg:squarederror",
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 6,
    "random_state": 42,
}
xgb_model = xgb.XGBRegressor(**xgb_params)

# Train on the training split (the fitted estimator is the cell's output)
xgb_model.fit(X_train, y_train)


#### 4 Make Predictions

In [168]:
# Predict sales for the held-out test features with the fitted model;
# y_pred is compared against y_test in the evaluation step below.
y_pred = xgb_model.predict(X_test)


#### 5 Evaluate the Model Performance

In [169]:
# Score the test-set predictions: mean squared error and R2
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}", f"R2 Score: {r2}", sep="\n")


Mean Squared Error: 1279696.8288832253
R2 Score: 0.9134267251359744


# Testing with Real-Life data