# Week 6 Lab
## Model Training, Testing, and Validation

In [139]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sqlalchemy import create_engine

### `Load Cleaned Data`
#### Connection Variables

In [142]:
host = r'127.0.0.1'
db = r'MSDS610'
user = r'postgres'
pw = r'Pa55w0rd'
port = r'5432'
schema = r'cleaned'

#### Connection

In [145]:
db_conn = create_engine("postgresql://{}:{}@{}:{}/{}".format(user, pw, host, port, db))
table_name = r'sales_transaction'
schema = r'cleaned'

In [147]:
df = pd.read_sql_table(table_name, db_conn, schema)

#### Looking at the Data

In [149]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 536350 entries, 0 to 536349
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Date         536350 non-null  object 
 1   ProductNo    536350 non-null  object 
 2   ProductName  536350 non-null  object 
 3   Price        536350 non-null  float64
 4   Quantity     536350 non-null  int64  
 5   CustomerNo   536350 non-null  float64
 6   Country      536350 non-null  object 
dtypes: float64(2), int64(1), object(4)
memory usage: 28.6+ MB


### Manual Features for Analysis

#### **Average Purchase Interval (Days)**

`Justification:` Customers who buy regularly with a shorter interval between purchases are more likely to return.

In [153]:
# Convert 'Date' to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Total Purchase Frequency (Number of purchases per customer)
df['Total_Purchase_Frequency'] = df.groupby('CustomerNo')['Date'].transform('count')

In [155]:
# Display top 5 results for Total Purchase Frequency
print("Total Purchase Frequency (Top 5 Customers):")
print(df[['CustomerNo', 'Total_Purchase_Frequency']].drop_duplicates().head())

Total Purchase Frequency (Top 5 Customers):
    CustomerNo  Total_Purchase_Frequency
0      17490.0                        85
1      13069.0                       470
20     12433.0                       419
84     13426.0                       158
94     17364.0                       414


#### **Total Revenue Per Customer**

`Justification:` High-spending customers are more likely to be repeat customers.

In [158]:
# Total Revenue Per Customer (Sum of Price * Quantity per customer)
df['Total_Revenue'] = df.groupby('CustomerNo').apply(lambda x: (x['Price'] * x['Quantity']).sum()).reindex(df['CustomerNo'].values).values

In [160]:
# Display top 5 results for Total Revenue Per Customer
print("\nTotal Revenue Per Customer (Top 5 Customers):")
print(df[['CustomerNo', 'Total_Revenue']].drop_duplicates().head())


Total Revenue Per Customer (Top 5 Customers):
    CustomerNo  Total_Revenue
0      17490.0       12716.73
1      13069.0       59185.61
20     12433.0      115454.49
84     13426.0       25603.59
94     17364.0       30012.93


#### Three Way Split

In [203]:
# Selecting relevant features and target variable (repeat customer classification)
# Assuming repeat customer = 1 if Total Purchase Frequency > 1, else 0
df['Repeat_Customer'] = (df['Total_Purchase_Frequency'] > 1).astype(int)

# Define feature set and target variable
features = ['Total_Purchase_Frequency',  'Total_Revenue']
target = 'Repeat_Customer'  # Assuming this is the target variable

# Splitting into Train (70%), Temp (30%)
X_train, X_temp, y_train, y_temp = train_test_split(df[features], df[target], test_size=0.3, random_state=42)

# Splitting Temp into Validation (15%) and Test (15%)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Display dataset sizes
len(X_train), len(X_val), len(X_test)


(375445, 80452, 80453)

In [205]:
X_temp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 160905 entries, 526866 to 417869
Data columns (total 2 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Total_Purchase_Frequency  160905 non-null  int64  
 1   Total_Revenue             160905 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 3.7 MB


In [207]:
X_temp.head()

Unnamed: 0,Total_Purchase_Frequency,Total_Revenue
526866,34,8188.08
250117,445,20477.88
484918,512,23798.21
180513,100,16632.03
172569,51,6952.64


In [209]:
y_temp.info()

<class 'pandas.core.series.Series'>
Index: 160905 entries, 526866 to 417869
Series name: Repeat_Customer
Non-Null Count   Dtype
--------------   -----
160905 non-null  int32
dtypes: int32(1)
memory usage: 1.8 MB


In [211]:
y_temp.head()

526866    1
250117    1
484918    1
180513    1
172569    1
Name: Repeat_Customer, dtype: int32

In [213]:
print(X_test.shape)
X_test.head()

(80453, 2)


Unnamed: 0,Total_Purchase_Frequency,Total_Revenue
170105,5800,873037.9
106183,186,11502.2
1643,298,37680.67
435997,414,30012.93
412638,5800,873037.9


In [215]:
print(X_val.shape)
X_val.head()

(80452, 2)


Unnamed: 0,Total_Purchase_Frequency,Total_Revenue
85382,5093,177814.26
208380,459,60969.38
106957,402,11825.83
79358,578,47573.55
373896,255,25297.87


In [217]:
print(y_test.shape)
y_test.head()

(80453,)


170105    1
106183    1
1643      1
435997    1
412638    1
Name: Repeat_Customer, dtype: int32

In [219]:
print(y_val.shape)
y_val.head()

(80452,)


85382     1
208380    1
106957    1
79358     1
373896    1
Name: Repeat_Customer, dtype: int32

### Building the Model

In [222]:
# Train a Random Forest classifier using training set
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [223]:
pred_X_test = model.predict(X_test)

In [224]:
accuracy_score(y_test,pred_X_test)

1.0

#### I can't get the accuracy to not be 1.0; the chosen dataset may not be the best for this excersise.

### Saving the Model

In [229]:
from joblib import dump

# Save the model using joblib
model_filename = "Week6_model.joblib"
dump(model, model_filename)

['Week6_model.joblib']