In [30]:
import pandas as pd
import xgboost as xgb

In [31]:
# Location of the cleaned RetailRocket CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists.
RETAILROCKET_CSV = "C:/Users/amish/OneDrive/Desktop/RekMetric/retailrocket_cleaned.csv"

rocket_df = pd.read_csv(RETAILROCKET_CSV)

In [32]:
# Preview the first five rows to sanity-check the load.
rocket_df.head(n=5)

Unnamed: 0,timestamp,visitorid,itemid,transactionid,is_transaction,0,1,10,100,1000,...,996,997,998,999,available,categoryid,parentid,event_addtocart,event_transaction,event_view
0,1433221332117,257597,355908,,0,,,,,,...,,,,,1.0,,,False,False,True
1,1433224214164,992329,248676,,0,,,,,,...,,,,,,,,False,False,True
2,1433221999827,111016,318965,,0,,,,,,...,,,,,,,,False,False,True
3,1433221955914,483717,253185,,0,,,,,,...,,,,,,,,False,False,True
4,1433221337106,951259,367447,,0,119932.0,,,,,...,,,,,,,,False,False,True


In [33]:
# Remove the transaction identifier from the frame.
# NOTE(review): presumably it is only populated for purchase rows and would
# leak the target — confirm against the raw data.
rocket_df = rocket_df.drop('transactionid', axis='columns')

In [34]:
# Separate the label from the features: `y` keeps the 'is_transaction' column,
# and `rocket_df` is rebound to a frame without it.
target_col = 'is_transaction'
y = rocket_df[target_col]
rocket_df = rocket_df.drop(target_col, axis='columns')

In [35]:
# Display the column labels currently held in rocket_df.
rocket_df.columns

Index(['timestamp', 'visitorid', 'itemid', '0', '1', '10', '100', '1000',
       '1001', '1002',
       ...
       '996', '997', '998', '999', 'available', 'categoryid', 'parentid',
       'event_addtocart', 'event_transaction', 'event_view'],
      dtype='object', length=916)

In [36]:
# Keep the event-type indicator columns aside in `dropped_events`, then
# remove them from the feature frame.
# NOTE(review): 'event_transaction' presumably mirrors the target — confirm.
event_cols = ['event_transaction', 'event_addtocart', 'event_view']
dropped_events = rocket_df[event_cols]
rocket_df = rocket_df.drop(columns=event_cols)

In [37]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. Stratifying on `y` keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    rocket_df,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score, confusion_matrix, make_scorer
import matplotlib.pyplot as plt
import seaborn as sns

# --- Class imbalance -------------------------------------------------------
# Weight the positive class by the negative/positive ratio of the training
# labels. Fail with a clear message instead of dividing by zero when the
# training split contains no positive rows.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
if n_positive == 0:
    raise ValueError("y_train contains no positive (== 1) rows; cannot compute scale_pos_weight")
scale_pos_weight = n_negative / n_positive

# --- Model -----------------------------------------------------------------
# `use_label_encoder` is intentionally not passed: it no longer has any effect
# in recent XGBoost releases and only triggers an "unused parameter" warning.
xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    tree_method='hist',
    scale_pos_weight=scale_pos_weight,
    random_state=42
)

# NOTE(review): X_train still contains 'timestamp', 'visitorid' and 'itemid'
# as plain numeric features — confirm that feeding raw identifiers to the
# model is intended.
pipeline = Pipeline([
    ('var_thresh', VarianceThreshold(threshold=0.01)),  # remove low-variance features
    # NOTE(review): tree splits do not depend on feature scale, so this step
    # is probably unnecessary; it is kept so the fitted pipeline's steps stay the same.
    ('scaler', StandardScaler()),
    ('xgb', xgb_clf)
])

# --- Hyper-parameter search ------------------------------------------------
# Keys are prefixed with the pipeline step name ('xgb__') so the search sets
# them on the classifier inside the pipeline.
param_dist = {
    'xgb__n_estimators': [100, 150],
    'xgb__learning_rate': [0.05, 0.1],
    'xgb__max_depth': [5, 7],
    'xgb__subsample': [0.8, 1.0],
    'xgb__colsample_bytree': [0.8, 1.0],
}

# Use scikit-learn's built-in average-precision scorer. The previous
# `make_scorer(average_precision_score, needs_proba=True)` relies on a keyword
# that was deprecated in scikit-learn 1.4 and removed in 1.6.
scorer = 'average_precision'

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=10,
    scoring=scorer,
    cv=2,
    verbose=2,
    random_state=42,
    n_jobs=1
)

random_search.fit(X_train, y_train)

# Pipeline refitted on all of X_train with the best sampled parameters.
best_pipeline = random_search.best_estimator_

# --- Hold-out evaluation ---------------------------------------------------
y_pred = best_pipeline.predict(X_test)
y_proba = best_pipeline.predict_proba(X_test)[:, 1]  # probability of class 1

print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred))

print("ROC AUC Score:", roc_auc_score(y_test, y_proba))
print("PR AUC Score:", average_precision_score(y_test, y_proba))

# Confusion matrix drawn on an explicit Axes rather than the implicit pyplot state.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=[0, 1], yticklabels=[0, 1], ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()


In [None]:
# Pair every held-out row's original index label with its predicted
# positive-class probability.
scored_rows = pd.DataFrame({'index': X_test.index, 'score': y_proba})

# Attach rocket_df's columns by matching the 'index' column against
# rocket_df's index, then rank from highest to lowest score.
recommendations = (
    scored_rows
    .join(rocket_df, on='index')
    .sort_values(by='score', ascending=False)
)

# Keep the ten highest-scoring rows.
top_n = 10
top_recommendations = recommendations.head(top_n)

# Styled table of the top rows for display in the notebook.
styled_top = (
    top_recommendations[['visitorid', 'itemid', 'score']]
    .style
    .set_caption("🔥 Top Recommendations from X_test")
    .background_gradient(subset=['score'], cmap='YlGnBu')
    .format({'score': '{:.4f}'})
    .bar(subset=['score'], color='#FFA07A')
    .set_properties(**{'text-align': 'center'})
)

styled_top


Unnamed: 0,visitorid,itemid,score
7696,138131,301721,0.9088
11257,503857,460553,0.8924
23164,138131,14932,0.8816
31286,1093035,27812,0.8661
13670,1093035,27812,0.8661
38468,138131,138427,0.8646
9696,72155,390712,0.8635
38335,494335,460553,0.8635
28675,494335,460553,0.8635
24291,494335,460553,0.8635
