In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support as score

In [113]:
# Read the cross-sell dataset from a CSV located relative to the notebook's
# working directory, then preview the first five rows.
complex_data = pd.read_csv(
    filepath_or_buffer='complex_synthetic_cross_sell_dataset_with_customerID.csv'
)
complex_data.head(n=5)

Unnamed: 0,CustomerID,Age,Gender,Number_of_Transactions,Average_Transaction_Value,Last_Purchase_Days_Ago,Product_Category,Total_Spending,Cross_Sell
0,4a17db2f-695a-4101-bace-5c7156774bbe,56,Male,17,29.404205,200,Home,813.210552,1
1,09ab7872-7c12-4ef9-9895-1e20eca88ad6,69,Female,15,37.197897,35,Books,76.460661,1
2,9eb309fa-b7d9-4c43-a2ae-d8439b6d19d3,46,Female,6,138.596,2,Books,547.106804,1
3,5a443c5f-08b6-496d-9463-037e13dfe076,32,Male,16,136.480027,289,Clothing,1834.08433,1
4,178a502d-6d52-477f-a74b-039611e4e238,60,Male,17,25.949395,332,Beauty,1390.707408,1


### Gender and Product Category Mapping

In [114]:
def convert_gender(gender):
    """Encode a gender label as an integer flag.

    Returns 1 when ``gender`` compares equal to the string 'Male' and 0 for
    every other value. The comparison is exact, so differently-cased or
    missing labels also yield 0.
    """
    if gender == 'Male':
        return 1
    return 0

# Integer code assigned to each product category label.
prodcat_mapping = {
    'Electronics': 1,
    'Clothing': 2,
    'Books': 3,
    'Home': 4,
    'Beauty': 5,
}

# Both columns are overwritten in place on `complex_data`, so this cell is
# meant to run once per load: Series.map turns any value that is not a key of
# the mapping into NaN, and convert_gender returns 0 for anything but 'Male'.
complex_data['Gender'] = complex_data['Gender'].apply(convert_gender)
complex_data['Product_Category'] = complex_data['Product_Category'].map(prodcat_mapping)

complex_data

Unnamed: 0,CustomerID,Age,Gender,Number_of_Transactions,Average_Transaction_Value,Last_Purchase_Days_Ago,Product_Category,Total_Spending,Cross_Sell
0,4a17db2f-695a-4101-bace-5c7156774bbe,56,1,17,29.404205,200,4,813.210552,1
1,09ab7872-7c12-4ef9-9895-1e20eca88ad6,69,0,15,37.197897,35,3,76.460661,1
2,9eb309fa-b7d9-4c43-a2ae-d8439b6d19d3,46,0,6,138.596000,2,3,547.106804,1
3,5a443c5f-08b6-496d-9463-037e13dfe076,32,1,16,136.480027,289,2,1834.084330,1
4,178a502d-6d52-477f-a74b-039611e4e238,60,1,17,25.949395,332,5,1390.707408,1
...,...,...,...,...,...,...,...,...,...
4995,e0b11833-073a-48ff-84ff-32fd6a60b50a,24,1,17,188.566985,7,2,59.302000,0
4996,23eb2e4f-bd6e-4389-9286-31ea2a02052f,66,0,5,192.755379,199,4,179.978404,0
4997,ee8cb680-2424-4c6d-b04f-b7081f68ebeb,26,0,12,194.414263,91,2,93.659347,0
4998,c62dbea0-6ac2-4713-b234-dfbe1a984956,53,0,9,132.615493,147,2,1557.667845,1


### Train the Model

In [115]:
# NOTE(review): `data` is not assigned in any cell visible in this notebook
# (only `complex_data` is loaded above) — confirm where it comes from; on a
# fresh kernel this name may be undefined.
# Separate the target column from the features, then hold out 20% of the rows
# for evaluation using a fixed seed.
X = data.drop(columns='Cross_Sell')
y = data['Cross_Sell']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

#### Logistic Regression

In [126]:
from sklearn.linear_model import LogisticRegression

# Fit a seeded logistic regression on the training split.
lr = LogisticRegression(random_state=42)
lr_model = lr.fit(X_train, y_train)

# Score the held-out split, treating class 1 as the positive label.
y_pred = lr_model.predict(X_test)
precision, recall, fscore, support = score(
    y_test, y_pred, pos_label=1, average='binary'
)

# Accuracy is the fraction of test rows whose prediction matches the label;
# every metric is rounded to three decimals for display.
print('Precision: {} / Recall: {} / FScore: {} / Accuracy: {}'.format(
    *(round(metric, 3) for metric in (
        precision, recall, fscore, (y_pred == y_test).sum() / len(y_pred)))))

Precision: 0.591 / Recall: 0.517 / FScore: 0.551 / Accuracy: 0.608


#### Random Forest

In [125]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so its randomised training is repeatable from run to run,
# consistent with the random_state=42 already used for the train/test split
# and the logistic regression. n_jobs=-1 trains the trees on all CPU cores.
rf = RandomForestClassifier(n_estimators=100, max_depth=None, n_jobs=-1, random_state=42)

# fit() returns the estimator itself, so `rf_model` and `rf` are the same object.
rf_model = rf.fit(X_train, y_train)

# Score the held-out split, treating class 1 as the positive label.
y_pred = rf_model.predict(X_test)
precision, recall, fscore, support = score(y_test, y_pred, pos_label=1, average='binary')

# Accuracy is the fraction of test rows whose prediction matches the label.
print('Precision: {} / Recall: {} / FScore: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(fscore, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Precision: 0.542 / Recall: 0.493 / FScore: 0.516 / Accuracy: 0.569


### Save the Model

In [120]:
import joblib

# NOTE(review): `model` is not assigned in any cell visible in this notebook
# (the fitted estimators above are `lr_model` and `rf_model`) — confirm which
# one is meant to be persisted; on a fresh kernel this name may be undefined.
# Serialise the estimator to disk; the call's return value is shown as the cell output.
joblib.dump(value=model, filename='cross_sell_model.pkl')

['cross_sell_model.pkl']

### Make Predictions

In [121]:
# joblib.load unpickles the file, which can execute arbitrary code — only load
# model files that come from a trusted source.
loaded_model = joblib.load(filename='cross_sell_model.pkl')

# Read the new dataset and separate the target column from the features.
new_data = pd.read_csv(filepath_or_buffer='synthetic_cross_sell_dataset_new.csv')
X = new_data.drop(columns='Cross_Sell')
y = new_data['Cross_Sell']

# NOTE(review): only the 20% test portion of the new data is scored below; the
# other 80% is not used in this cell — confirm that this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
new_predictions = loaded_model.predict(X_test)

In [122]:
# Wrap the predicted labels in a single-column DataFrame and preview the
# first five rows.
result = pd.DataFrame(data=new_predictions)
result.head(n=5)

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,1


## Train Model on the Complex Dataset

In [107]:
# Features for the complex dataset: every column except the target.
# CustomerID is a string identifier (UUID-like in the rows displayed above);
# scikit-learn estimators cannot fit on non-numeric columns and an ID carries
# no predictive signal, so it is excluded as well. errors='ignore' keeps the
# cell working when the loaded frame has no CustomerID column.
X = complex_data.drop(columns='Cross_Sell').drop(columns='CustomerID', errors='ignore')
y = complex_data['Cross_Sell']

In [111]:
# Seed the split so the same rows land in train/test on every run, consistent
# with the random_state=42 used by the other splits in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Refit the existing forest on the new training split. fit() returns the
# estimator itself, so this replaces whatever `rf` had learned before.
rf_model = rf.fit(X_train, y_train)

# Score the held-out split, treating class 1 as the positive label.
y_pred = rf_model.predict(X_test)
precision, recall, fscore, support = score(y_test, y_pred, pos_label=1, average='binary')

# Accuracy is the fraction of test rows whose prediction matches the label.
print('Precision: {} / Recall: {} / FScore: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(fscore, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Precision: 0.717 / Recall: 0.805 / FScore: 0.758 / Accuracy: 0.665
