In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load data
data = pd.read_csv(r'data\03_int\int_example_customer_simplified_no_transaction_matching.csv')

# Feature Engineering
# Create a feature for time since the last offer received
data['time_since_last_offer_received'] = data.groupby('customer_id')['time_hrs'].diff().fillna(0)

# Create a feature for time since the last offer viewed
data['time_since_last_offer_viewed'] = data.apply(lambda row: row['time_hrs'] - data[(data['customer_id'] == row['customer_id']) & (data['event'] == 'offer viewed') & (data['time_hrs'] < row['time_hrs'])]['time_hrs'].max() if any((data['customer_id'] == row['customer_id']) & (data['event'] == 'offer viewed') & (data['time_hrs'] < row['time_hrs'])) else 0, axis=1)

# Create a feature for time since the last transaction
data['time_since_last_transaction'] = data.apply(lambda row: row['time_hrs'] - data[(data['customer_id'] == row['customer_id']) & (data['event'] == 'transaction') & (data['time_hrs'] < row['time_hrs'])]['time_hrs'].max() if any((data['customer_id'] == row['customer_id']) & (data['event'] == 'transaction') & (data['time_hrs'] < row['time_hrs'])) else 0, axis=1)

# Filter for transactions
transactions = data[data['event'] == 'transaction']

# Drop rows with missing values
transactions = transactions.dropna(subset=['offer_id'])

# Encoding categorical variables
transactions['offer_id'] = transactions['offer_id'].astype('category').cat.codes
transactions['offer_type'] = transactions['offer_type'].astype('category').cat.codes

# Define features and target
X = transactions[['time_since_last_offer_received', 'time_since_last_offer_viewed', 'time_since_last_transaction', 'transaction_amount', 'offer_type']]
y = transactions['offer_id']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))

# Use the model to predict the offer_id for new transactions
new_transactions = pd.DataFrame({'time_since_last_offer_received': [value1], 'time_since_last_offer_viewed': [value2], 'time_since_last_transaction': [value3], 'transaction_amount': [value4], 'offer_type': [value5]})
predicted_offer_id = model.predict(new_transactions)


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.