This notebook contains a very early prototype for a LogisticRegression machine learning model.

There is much more that needs to be done to flesh out this solution. We are currently predicting the probability of an individual order item being cancelled, but it may make more sense to predict the probability of the overall order being cancelled.

In [1]:
!pip install sklearn
!pip install pandas



In [2]:
import pandas as pd
from sklearn import cluster
from sklearn.linear_model import LogisticRegression

In [3]:
# Relative path to the Excel workbook that holds the raw retail data.
ONLINE_RETAIL_XLSX = "../data/OnlineRetail.xlsx"

In [4]:
# Load the 'Online Retail' worksheet of the workbook into a DataFrame.
# `sheet_name` is the supported keyword; the older `sheetname` spelling was
# deprecated and later removed from pandas.
df = pd.read_excel(ONLINE_RETAIL_XLSX, sheet_name='Online Retail')

In [5]:
# Display the column labels of the loaded sheet.
df.columns

Index([u'InvoiceNo', u'StockCode', u'Description', u'Quantity', u'InvoiceDate',
       u'UnitPrice', u'CustomerID', u'Country'],
      dtype='object')

In [6]:
# An invoice number that starts with the letter 'C' indicates a cancellation.
# Cast to str first: for entries that are not strings (e.g. invoice numbers
# read in as plain numbers) `.str.startswith` would return NaN instead of False.
# The result is stored as an integer flag: 1 = cancelled, 0 = not cancelled.
df['Cancelled'] = df['InvoiceNo'].astype(str).str.startswith('C').astype(int)

# Preview the first rows only instead of rendering the whole frame.
df.head(10)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Cancelled
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,0
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,0
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,0
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,0
5,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,2010-12-01 08:26:00,7.65,17850.0,United Kingdom,0
6,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,2010-12-01 08:26:00,4.25,17850.0,United Kingdom,0
7,536366,22633,HAND WARMER UNION JACK,6,2010-12-01 08:28:00,1.85,17850.0,United Kingdom,0
8,536366,22632,HAND WARMER RED POLKA DOT,6,2010-12-01 08:28:00,1.85,17850.0,United Kingdom,0
9,536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,2010-12-01 08:34:00,1.69,13047.0,United Kingdom,0


In [7]:
# Class balance: how many rows carry each value of the Cancelled flag.
df.loc[:, 'Cancelled'].value_counts()

0    532621
1      9288
Name: Cancelled, dtype: int64

In [8]:
# Replace every quantity by its absolute value, then summarise the column.
# NOTE(review): presumably cancelled rows carry negative quantities in the raw
# data, so this removes a sign that would otherwise give the label away — confirm.
df['Quantity'] = abs(df['Quantity'])
df['Quantity'].describe()

count    541909.000000
mean         11.340487
std         217.995482
min           1.000000
25%           1.000000
50%           3.000000
75%          10.000000
max       80995.000000
Name: Quantity, dtype: float64

In [9]:
# Replace every unit price by its absolute value, then summarise the column.
df['UnitPrice'] = abs(df['UnitPrice'])
df['UnitPrice'].describe()

count    541909.000000
mean          4.692766
std          96.755927
min           0.000000
25%           1.250000
50%           2.080000
75%           4.130000
max       38970.000000
Name: UnitPrice, dtype: float64

In [10]:
# Remove rows where CustomerID is null.
# `df` is rebound to the filtered result rather than mutated with inplace=True;
# .copy() makes it an independent frame, so later column assignments on `df`
# do not raise SettingWithCopyWarning.
df = df.dropna(subset=['CustomerID']).copy()

In [11]:
# CustomerID is displayed as a float (e.g. 17850.0); store it as an integer.
customer_ids = df['CustomerID'].astype(int)
df['CustomerID'] = customer_ids

In [12]:
# Create the classifier with scikit-learn's default settings.
# LogisticRegression is imported in the import cell at the top of the notebook.
# NOTE(review): the Cancelled classes are heavily imbalanced (see the
# value_counts output), so options such as class_weight may be worth evaluating.
logistic = LogisticRegression()

In [13]:
# Feature matrix (X) and target vector (Y) used to fit the model.
# NOTE(review): CustomerID looks like an identifier rather than a measurement;
# confirm that feeding it to the model as a numeric feature is intended.
feature_columns = ['UnitPrice', 'Quantity', 'CustomerID']
X = df[feature_columns]
Y = df['Cancelled']

In [14]:
# Fit on the DataFrame itself rather than on a bare array: the prediction cells
# pass DataFrames, and recent scikit-learn versions warn when feature names are
# present at predict time but were absent at fit time.
# Casting the target to int gives the estimator a plain integer label vector
# without the round-trip through a Python list.
logistic.fit(X, Y.astype(int))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
# Predict the class of a single row (the row at position 2), kept as a
# one-row DataFrame so the estimator receives 2-D input.
single_row_columns = ['UnitPrice', 'Quantity', 'CustomerID']
test = df.loc[:, single_row_columns].iloc[[2]]
logistic.predict(test)

array([0])

In [26]:
# Predict the class of every row and summarise the predicted labels.
# Note: these are the same rows the model was fitted on, so this is not a
# held-out evaluation.
test = df.loc[:, ['UnitPrice', 'Quantity', 'CustomerID']]
prediction = logistic.predict(test)
prediction_summary = pd.DataFrame(prediction).describe()
prediction_summary

Unnamed: 0,0
count,406829.0
mean,0.000101
std,0.010038
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [30]:
# Predict class probabilities for every row in `test`; the result has one
# column per class. `p_df` gets a fresh default index, not the index of `test`.
prediction = logistic.predict_proba(test)
p_df = pd.DataFrame(data=prediction)
p_df.head(n=5)

Unnamed: 0,0,1
0,0.989178,0.010822
1,0.989154,0.010846
2,0.989171,0.010829
3,0.989154,0.010846
4,0.989154,0.010846
