In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
141,C536379,D,Discount,-1,12/1/2010 9:41,27.50,14527.0,United Kingdom
8963,537159,22112,CHOCOLATE HOT WATER BOTTLE,6,12/5/2010 13:17,4.95,14527.0,United Kingdom
8964,537159,22111,SCOTTIE DOG HOT WATER BOTTLE,1,12/5/2010 13:17,4.95,14527.0,United Kingdom
8965,537159,21479,WHITE SKULL HOT WATER BOTTLE,1,12/5/2010 13:17,3.75,14527.0,United Kingdom
8966,537159,22114,HOT WATER BOTTLE TEA AND SYMPATHY,6,12/5/2010 13:17,3.95,14527.0,United Kingdom
...,...,...,...,...,...,...,...,...
533807,581114,22111,SCOTTIE DOG HOT WATER BOTTLE,1,12/7/2011 12:19,4.95,14527.0,United Kingdom
533808,581114,22835,HOT WATER BOTTLE I AM SO POORLY,2,12/7/2011 12:19,4.95,14527.0,United Kingdom
533809,581114,22114,HOT WATER BOTTLE TEA AND SYMPATHY,6,12/7/2011 12:19,4.25,14527.0,United Kingdom
533810,581114,21479,WHITE SKULL HOT WATER BOTTLE,2,12/7/2011 12:19,4.25,14527.0,United Kingdom


In [3]:
# Print the column dtypes, non-null counts and memory usage of df.
# NOTE(review): in this notebook `df` is first assigned by the pd.read_csv cell
# further down, so on a fresh "Restart & Run All" this cell runs before `df`
# exists — it should sit below the load cell.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [4]:
# Show the first five rows of df.
# NOTE(review): `df` is only assigned by the pd.read_csv cell further down;
# this preview should come after that cell so the notebook runs top to bottom.
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [3]:
# Read the raw transactions from data.csv, decoding the bytes with the
# 'unicode_escape' codec.
df = pd.read_csv(
    'data.csv',
    encoding='unicode_escape',
)

def transform_datatypes_and_nulls(df):
    """Normalise the dtypes of the transaction columns, modifying ``df`` in place.

    * ``InvoiceDate`` is parsed with ``pd.to_datetime``.
    * Missing ``CustomerID`` values are replaced by ``0`` and the column is
      cast to ``int``.
    * ``Quantity`` is cast to ``int``.

    Args:
        df: DataFrame with ``InvoiceDate``, ``CustomerID`` and ``Quantity``
            columns.

    Returns:
        None. The columns of ``df`` are replaced in place.
    """
    df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the df['CustomerID'] selection: that form is chained assignment, which
    # warns on recent pandas and leaves df untouched under Copy-on-Write, so
    # the following int cast would fail on the remaining NaN values.
    df['CustomerID'] = df['CustomerID'].fillna(0).astype(int)
    df['Quantity'] = df['Quantity'].astype(int)

def fill_null_values_in_descripctions(df):
    """Fill missing ``Description`` values from rows with the same ``StockCode``.

    ``df`` is sorted in place by ``StockCode`` and ``InvoiceDate``. Each null
    description is then replaced by the closest earlier description of the
    same stock code, or by the closest later one when no earlier one exists.
    Descriptions are never copied across stock codes, so a stock code with no
    description at all keeps its nulls.

    Args:
        df: DataFrame with ``StockCode``, ``InvoiceDate`` and ``Description``
            columns.

    Returns:
        None. ``df`` is re-ordered and its ``Description`` column replaced
        in place.
    """
    # Sort so that, inside each stock code, rows run in chronological order.
    df.sort_values(by=['StockCode', 'InvoiceDate'], inplace=True)

    # Fill per stock code: a plain column-wide forward fill would carry the
    # last description of one stock code into the leading nulls of the next.
    forward_filled = df.groupby('StockCode', sort=False)['Description'].ffill()
    # Leading nulls of a stock code have nothing before them; take the first
    # later description of that same code instead.
    df['Description'] = forward_filled.groupby(df['StockCode'], sort=False).bfill()
    
# Clean df in place, then put it back into invoice order with a fresh
# 0..n-1 index.
transform_datatypes_and_nulls(df)
fill_null_values_in_descripctions(df)
df = (
    df
    .sort_values(by=['InvoiceDate', 'InvoiceNo'])
    .reset_index(drop=True)
)

# All transactions that weren't normal orders but fee payments, postage, discounts, etc.
def divide_uncommon_transactions(df):
    """Collect the rows of ``df`` that carry one of the special stock codes.

    Args:
        df: DataFrame with a ``StockCode`` column.

    Returns:
        A dict mapping each special code ('M', 'DOT', 'BANK CHARGES',
        'AMAZONFEE', 'B', 'POST', 'D', 'CRUK') to the rows of ``df`` whose
        ``StockCode`` equals that code. A code with no matching rows maps to
        an empty DataFrame.
    """
    special_codes = (
        'M',
        'DOT',
        'BANK CHARGES',
        'AMAZONFEE',
        'B',       # bad debt
        'POST',    # postage
        'D',       # discount
        'CRUK',    # CRUK commissions
    )
    stock_code = df['StockCode']
    return {code: df[stock_code == code] for code in special_codes}

# Split the special stock-code rows out of df, one DataFrame per code.
category_dataframes = divide_uncommon_transactions(df)

# Bind each category's DataFrame to its own name.
(
    df_M,
    df_DOT,
    df_BANK_CHARGES,
    df_AMAZONFEE,
    df_BAD_DEBT,
    df_POSTAGE,
    df_DISCOUNT,
    df_CRUK_COMMISIONS,
) = (
    category_dataframes[code]
    for code in ('M', 'DOT', 'BANK CHARGES', 'AMAZONFEE', 'B', 'POST', 'D', 'CRUK')
)
    

def create_filtered_orders(df):
    """Return the rows of ``df`` whose ``StockCode`` is not a special code.

    Rows with StockCode 'AMAZONFEE', 'BANK CHARGES', 'M', 'DOT', 'B', 'POST',
    'D' or 'CRUK' are dropped; all other rows are kept with their original
    index labels.
    """
    excluded_codes = ["AMAZONFEE", "BANK CHARGES", "M", "DOT", "B", "POST", "D", "CRUK"]
    is_special = df['StockCode'].isin(excluded_codes)
    return df[~is_special]

# checking for all canceled orders (9288 orders)
def cancelled_orders(df):
    """Return the rows of ``df`` whose ``InvoiceNo`` starts with 'C'.

    Args:
        df: DataFrame with a string ``InvoiceNo`` column.

    Returns:
        The matching rows of ``df``, keeping their original index labels.
    """
    # na=False: a missing InvoiceNo counts as "not cancelled" instead of
    # putting a NaN into the mask, which boolean indexing would reject.
    is_cancelled = df['InvoiceNo'].str.startswith('C', na=False)
    # Index the frame that was passed in, not the module-level filtered_df.
    return df[is_cancelled]
    


# Drop the special stock-code rows (fresh 0..n-1 index), then pull out the
# rows whose invoice number starts with 'C'.
filtered_df = create_filtered_orders(df).reset_index(drop=True)
cancelled_orders_and_returns = cancelled_orders(filtered_df)

In [4]:
# Look up invoice C551685 among the cancelled rows.
cancelled_orders_and_returns.loc[
    cancelled_orders_and_returns['InvoiceNo'].eq('C551685')
]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country


In [5]:
# Postage rows ordered from the lowest to the highest UnitPrice.
df_POSTAGE.sort_values(by='UnitPrice')

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
52262,540699,POST,POSTAGE,1000,2011-01-11 09:32:00,0.00,0,United Kingdom
326552,565556,POST,POSTAGE,750,2011-09-05 12:14:00,0.00,0,United Kingdom
205991,554857,POST,POSTAGE,800,2011-05-27 10:08:00,0.00,0,United Kingdom
453998,575505,POST,POSTAGE,800,2011-11-10 10:29:00,0.00,0,United Kingdom
431374,573589,POST,POSTAGE,1,2011-10-31 15:11:00,0.55,0,United Kingdom
...,...,...,...,...,...,...,...,...
235379,C557638,POST,POSTAGE,-1,2011-06-21 16:01:00,545.58,17450,United Kingdom
264976,560187,POST,POSTAGE,1,2011-07-15 12:05:00,550.94,17444,Canada
236402,557754,POST,POSTAGE,1,2011-06-22 13:12:00,700.00,12432,Norway
173277,C551685,POST,POSTAGE,-1,2011-05-03 12:51:00,8142.75,16029,United Kingdom


In [28]:
# Cancelled rows belonging to customer 14527.
cancelled_orders_and_returns.loc[
    cancelled_orders_and_returns['CustomerID'].eq(14527)
]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
148506,C549288,22469,HEART OF WICKER SMALL,-1,2011-04-07 18:06:00,1.65,14527,United Kingdom
148507,C549288,22788,BROCANTE COAT RACK,-1,2011-04-07 18:06:00,9.95,14527,United Kingdom
224637,C556735,22169,FAMILY ALBUM WHITE PICTURE FRAME,-2,2011-06-14 12:01:00,8.5,14527,United Kingdom


In [25]:
# Display the rows with StockCode 'D' that were split out of df.
df_DISCOUNT

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
141,C536379,D,Discount,-1,2010-12-01 09:41:00,27.50,14527,United Kingdom
9038,C537164,D,Discount,-1,2010-12-05 13:21:00,29.29,14527,United Kingdom
14498,C537597,D,Discount,-1,2010-12-07 12:34:00,281.00,15498,United Kingdom
19392,C537857,D,Discount,-1,2010-12-08 16:00:00,267.12,17340,United Kingdom
31134,C538897,D,Discount,-1,2010-12-15 09:14:00,5.76,16422,United Kingdom
...,...,...,...,...,...,...,...,...
479868,C577227,D,Discount,-1,2011-11-18 12:06:00,19.82,14527,United Kingdom
479869,C577227,D,Discount,-1,2011-11-18 12:06:00,16.76,14527,United Kingdom
493613,C578239,D,Discount,-1,2011-11-23 12:29:00,26.33,14912,Italy
516221,C579884,D,Discount,-1,2011-11-30 17:34:00,20.53,14527,United Kingdom


In [10]:
# Summary statistics of the two numeric columns of filtered_df.
quantity_stats, unit_price_stats = (
    filtered_df[column].describe() for column in ('Quantity', 'UnitPrice')
)

# sep="\n" puts each heading on its own line above its table.
print("Quantity Statistics:", quantity_stats, sep="\n")
print("\nUnitPrice Statistics:", unit_price_stats, sep="\n")

Quantity Statistics:
count    539205.000000
mean          9.581654
std         218.573084
min      -80995.000000
25%           1.000000
50%           3.000000
75%          10.000000
max       80995.000000
Name: Quantity, dtype: float64

UnitPrice Statistics:
count    539205.000000
mean          3.287644
std           4.679490
min           0.000000
25%           1.250000
50%           2.080000
75%           4.130000
max         649.500000
Name: UnitPrice, dtype: float64


In [11]:
# Number of rows per StockCode in filtered_df, most frequent first.
filtered_df['StockCode'].value_counts()

85123A    2313
22423     2203
85099B    2159
47566     1727
20725     1639
          ... 
85160a       1
62095B       1
85018C       1
84670        1
21653        1
Name: StockCode, Length: 4062, dtype: int64

In [11]:
# Boolean mask over df: True for rows whose StockCode is none of the special
# codes (same condition create_filtered_orders applies).
mask = ~df['StockCode'].isin(
    ["AMAZONFEE", "BANK CHARGES", "M", "DOT", "B", "POST", "D", "CRUK"]
)

In [9]:
# Display the rows of filtered_df whose InvoiceNo starts with 'C'.
cancelled_orders_and_returns

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
152,C536383,35004C,SET OF 3 COLOURED FLYING DUCKS,-1,2010-12-01 09:49:00,4.65,15311,United Kingdom
233,C536391,21484,CHICK GREY HOT WATER BOTTLE,-12,2010-12-01 10:24:00,3.45,17548,United Kingdom
234,C536391,21980,PACK OF 12 RED RETROSPOT TISSUES,-24,2010-12-01 10:24:00,0.29,17548,United Kingdom
235,C536391,21983,PACK OF 12 BLUE PAISLEY TISSUES,-24,2010-12-01 10:24:00,0.29,17548,United Kingdom
236,C536391,21984,PACK OF 12 PINK PAISLEY TISSUES,-24,2010-12-01 10:24:00,0.29,17548,United Kingdom
...,...,...,...,...,...,...,...,...
537752,C581490,22178,VICTORIAN GLASS HANGING T-LIGHT,-12,2011-12-09 09:57:00,1.95,14397,United Kingdom
537753,C581490,23144,ZINC T-LIGHT HOLDER STARS SMALL,-11,2011-12-09 09:57:00,0.83,14397,United Kingdom
539014,C581568,21258,VICTORIAN SEWING BOX LARGE,-5,2011-12-09 11:57:00,10.95,15311,United Kingdom
539015,C581569,20979,36 PENCILS TUBE RED RETROSPOT,-5,2011-12-09 11:58:00,1.25,17315,United Kingdom


In [12]:
# Rows of customer 15311 invoiced strictly between 09:00 and 09:50 on
# 2010-12-01.
filtered_df.loc[
    (filtered_df['CustomerID'] == 15311)
    & (filtered_df['InvoiceDate'] > '2010-12-01 09:00:00')
    & (filtered_df['InvoiceDate'] < '2010-12-01 09:50:00')
]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
105,536381,15056BL,EDWARDIAN PARASOL BLACK,2,2010-12-01 09:41:00,5.95,15311,United Kingdom
106,536381,15056N,EDWARDIAN PARASOL NATURAL,2,2010-12-01 09:41:00,5.95,15311,United Kingdom
107,536381,21166,COOK WITH WINE METAL SIGN,1,2010-12-01 09:41:00,1.95,15311,United Kingdom
108,536381,21169,YOU'RE CONFUSING ME METAL SIGN,3,2010-12-01 09:41:00,1.69,15311,United Kingdom
109,536381,21175,GIN + TONIC DIET METAL SIGN,2,2010-12-01 09:41:00,2.1,15311,United Kingdom
110,536381,21523,DOORMAT FANCY FONT HOME SWEET HOME,10,2010-12-01 09:41:00,6.75,15311,United Kingdom
111,536381,21533,RETROSPOT LARGE MILK JUG,1,2010-12-01 09:41:00,4.95,15311,United Kingdom
112,536381,21557,SET OF 6 FUNKY BEAKERS,2,2010-12-01 09:41:00,2.95,15311,United Kingdom
113,536381,21672,WHITE SPOT RED CERAMIC DRAWER KNOB,6,2010-12-01 09:41:00,1.25,15311,United Kingdom
114,536381,21731,RED TOADSTOOL LED NIGHT LIGHT,2,2010-12-01 09:41:00,1.65,15311,United Kingdom


In [61]:
# Rows of filtered_df whose InvoiceNo starts with 'C'; entries for which the
# string test yields no result are treated as non-matching.
filtered_df.loc[filtered_df['InvoiceNo'].str.startswith('C', na=False)]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
154,C536383,35004C,SET OF 3 COLOURED FLYING DUCKS,-1,2010-12-01 09:49:00,4.65,15311,United Kingdom
235,C536391,22556,PLASTERS IN TIN CIRCUS PARADE,-12,2010-12-01 10:24:00,1.65,17548,United Kingdom
236,C536391,21984,PACK OF 12 PINK PAISLEY TISSUES,-24,2010-12-01 10:24:00,0.29,17548,United Kingdom
237,C536391,21983,PACK OF 12 BLUE PAISLEY TISSUES,-24,2010-12-01 10:24:00,0.29,17548,United Kingdom
238,C536391,21980,PACK OF 12 RED RETROSPOT TISSUES,-24,2010-12-01 10:24:00,0.29,17548,United Kingdom
...,...,...,...,...,...,...,...,...
540448,C581490,22178,VICTORIAN GLASS HANGING T-LIGHT,-12,2011-12-09 09:57:00,1.95,14397,United Kingdom
540449,C581490,23144,ZINC T-LIGHT HOLDER STARS SMALL,-11,2011-12-09 09:57:00,0.83,14397,United Kingdom
541715,C581568,21258,VICTORIAN SEWING BOX LARGE,-5,2011-12-09 11:57:00,10.95,15311,United Kingdom
541716,C581569,84978,HANGING HEART JAR T-LIGHT HOLDER,-1,2011-12-09 11:58:00,1.25,17315,United Kingdom


In [16]:
# checking for all canceled orders (9288 orders)
def cancelled_orders(dataframe):
    """Return the rows of ``dataframe`` whose ``InvoiceNo`` starts with 'C'.

    Args:
        dataframe: DataFrame with a string ``InvoiceNo`` column.

    Returns:
        The matching rows of ``dataframe``, keeping their original index
        labels.
    """
    # Build the mask from the InvoiceNo column of the argument; the earlier
    # version called .str on the list literal ['InvoiceNo'], which raises
    # AttributeError. na=False treats a missing InvoiceNo as not cancelled.
    is_cancelled = dataframe['InvoiceNo'].str.startswith('C', na=False)
    # Filter the argument (not the global filtered_df) and return the result.
    return dataframe[is_cancelled]
    

In [None]:
# Display filtered_df (df without the special stock-code rows).
filtered_df

In [46]:
# NOTE(review): every definition of `cancelled_orders` in this notebook is a
# function, so on a fresh run this cell shows the function object. The table
# stored below presumably came from an older kernel state in which the name
# was bound to a DataFrame — confirm, and display
# `cancelled_orders_and_returns` instead if that is what was meant.
cancelled_orders

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
141,C536379,D,Discount,-1,2010-12-01 09:41:00,27.50,14527,United Kingdom
154,C536383,35004C,SET OF 3 COLOURED FLYING DUCKS,-1,2010-12-01 09:49:00,4.65,15311,United Kingdom
235,C536391,22556,PLASTERS IN TIN CIRCUS PARADE,-12,2010-12-01 10:24:00,1.65,17548,United Kingdom
236,C536391,21984,PACK OF 12 PINK PAISLEY TISSUES,-24,2010-12-01 10:24:00,0.29,17548,United Kingdom
237,C536391,21983,PACK OF 12 BLUE PAISLEY TISSUES,-24,2010-12-01 10:24:00,0.29,17548,United Kingdom
...,...,...,...,...,...,...,...,...
540449,C581490,23144,ZINC T-LIGHT HOLDER STARS SMALL,-11,2011-12-09 09:57:00,0.83,14397,United Kingdom
541541,C581499,M,Manual,-1,2011-12-09 10:28:00,224.69,15498,United Kingdom
541715,C581568,21258,VICTORIAN SEWING BOX LARGE,-5,2011-12-09 11:57:00,10.95,15311,United Kingdom
541716,C581569,84978,HANGING HEART JAR T-LIGHT HOLDER,-1,2011-12-09 11:58:00,1.25,17315,United Kingdom


In [44]:
# Look up invoice C536379 in filtered_df.
filtered_df.loc[filtered_df['InvoiceNo'].eq('C536379')]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country


In [105]:
# Number of rows (invoice lines) per country in filtered_df, most frequent first.
# NOTE(review): this counts rows, not quantity or revenue, and filtered_df
# still contains the 'C'-prefixed invoices — confirm that row count is the
# intended measure of which countries buy the most.
filtered_df['Country'].value_counts()

United Kingdom          493982
Germany                   9096
France                    8236
EIRE                      8183
Spain                     2468
Netherlands               2330
Belgium                   1971
Switzerland               1969
Portugal                  1475
Australia                 1257
Norway                    1060
Italy                      783
Channel Islands            756
Finland                    653
Cyprus                     619
Unspecified                446
Sweden                     437
Austria                    387
Denmark                    375
Japan                      355
Poland                     336
Israel                     297
USA                        291
Hong Kong                  280
Singapore                  215
Iceland                    182
Canada                     150
Greece                     142
Malta                      123
United Arab Emirates        67
European Community          58
RSA                         57
Lebanon 