# Data Mining Project
### Giorgio Donati, g.donati24@studenti.unipi.it
### Pietro Francaviglia, p.francaviglia1@studenti.unipi.it
#### A.Y. 2021-2022


# Libraries and datasets

In [1]:
import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

In [2]:
# load datasets
original_basket_df = pd.read_csv('datasets/baskets_supermarket.csv')
original_categories_df = pd.read_csv('datasets/item_categories.csv')
original_items_df = pd.read_csv('datasets/items.csv')

In [3]:
# global variables for column names
# existing
ITEM = 'item_id'
CAT = 'item_category_id'
DATE = 'date'
SHOP = 'shop_id'
UPRICE = 'item_price'
CNT = 'item_cnt_day'
USER = 'user_id'
BASKET = 'basket_id'
INAME = 'item_name'
CNAME = 'item_category_name'

# new
TMP = 'temp_id'
TPRICE = 'total_price'
IDCNT = 'item_dist_count'
ICNT = 'item_count'

In [4]:
original_basket_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504087 entries, 0 to 504086
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Unnamed: 0    504087 non-null  int64  
 1   date          352861 non-null  object 
 2   shop_id       469023 non-null  float64
 3   item_id       405085 non-null  float64
 4   item_price    451795 non-null  float64
 5   item_cnt_day  497039 non-null  float64
 6   user_id       433351 non-null  float64
 7   basket_id     437967 non-null  object 
dtypes: float64(5), int64(1), object(2)
memory usage: 30.8+ MB


In [5]:
original_items_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22170 entries, 0 to 22169
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   item_name         22170 non-null  object
 1   item_id           22170 non-null  int64 
 2   item_category_id  22170 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 519.7+ KB


In [6]:
original_categories_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   item_category_name  84 non-null     object
 1   item_category_id    84 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 1.4+ KB


# Data cleaning

## Basket dataset

In [7]:
basket_df = original_basket_df

### Cleaning ID columns

In [8]:
non_floating_attributes = [SHOP, ITEM, CNT, USER]
for col in non_floating_attributes:
    basket_df[col] = basket_df[col].astype('Int64')

basket_df[DATE] = pd.to_datetime(basket_df[DATE])
    
basket_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504087 entries, 0 to 504086
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Unnamed: 0    504087 non-null  int64         
 1   date          352861 non-null  datetime64[ns]
 2   shop_id       469023 non-null  Int64         
 3   item_id       405085 non-null  Int64         
 4   item_price    451795 non-null  float64       
 5   item_cnt_day  497039 non-null  Int64         
 6   user_id       433351 non-null  Int64         
 7   basket_id     437967 non-null  object        
dtypes: Int64(4), datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 32.7+ MB


#### Subset with basket_id

In [9]:
stored_basket_df = basket_df[basket_df[BASKET].notna()]
stored_basket_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 437967 entries, 0 to 504086
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Unnamed: 0    437967 non-null  int64         
 1   date          306631 non-null  datetime64[ns]
 2   shop_id       407650 non-null  Int64         
 3   item_id       352049 non-null  Int64         
 4   item_price    392583 non-null  float64       
 5   item_cnt_day  431863 non-null  Int64         
 6   user_id       376422 non-null  Int64         
 7   basket_id     437967 non-null  object        
dtypes: Int64(4), datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 31.7+ MB


In [10]:
def integrate(int_attr, gr_attr, df):
    df[int_attr] = df.groupby(gr_attr)[int_attr].apply(lambda x: x.fillna(x.mode().iloc[0] if len(x.mode()) == 1 else x))

In [11]:
basket_dependent_attributes = [DATE, SHOP, USER]
for attr in basket_dependent_attributes:
    integrate(attr, BASKET, stored_basket_df)

stored_basket_df = stored_basket_df.dropna(subset=basket_dependent_attributes, thresh=2)
    
stored_basket_df.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[int_attr] = df.groupby(gr_attr)[int_attr].apply(lambda x: x.fillna(x.mode().iloc[0] if len(x.mode()) == 1 else x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[int_attr] = df.groupby(gr_attr)[int_attr].apply(lambda x: x.fillna(x.mode().iloc[0] if len(x.mode()) == 1 else x))


<class 'pandas.core.frame.DataFrame'>
Int64Index: 437708 entries, 0 to 504086
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Unnamed: 0    437708 non-null  int64         
 1   date          435787 non-null  datetime64[ns]
 2   shop_id       437524 non-null  Int64         
 3   item_id       351844 non-null  Int64         
 4   item_price    392353 non-null  float64       
 5   item_cnt_day  431606 non-null  Int64         
 6   user_id       437212 non-null  Int64         
 7   basket_id     437708 non-null  object        
dtypes: Int64(4), datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 31.7+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[int_attr] = df.groupby(gr_attr)[int_attr].apply(lambda x: x.fillna(x.mode().iloc[0] if len(x.mode()) == 1 else x))


#### Subset without basket_id

In [12]:
null_basket_df = basket_df[basket_df[BASKET].isna()]

null_basket_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 66120 entries, 20 to 504077
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Unnamed: 0    66120 non-null  int64         
 1   date          46230 non-null  datetime64[ns]
 2   shop_id       61373 non-null  Int64         
 3   item_id       53036 non-null  Int64         
 4   item_price    59212 non-null  float64       
 5   item_cnt_day  65176 non-null  Int64         
 6   user_id       56929 non-null  Int64         
 7   basket_id     0 non-null      object        
dtypes: Int64(4), datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 4.8+ MB


In [13]:
null_basket_df = null_basket_df.dropna(subset=basket_dependent_attributes)
null_basket_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36987 entries, 61 to 504077
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Unnamed: 0    36987 non-null  int64         
 1   date          36987 non-null  datetime64[ns]
 2   shop_id       36987 non-null  Int64         
 3   item_id       29726 non-null  Int64         
 4   item_price    33150 non-null  float64       
 5   item_cnt_day  36454 non-null  Int64         
 6   user_id       36987 non-null  Int64         
 7   basket_id     0 non-null      object        
dtypes: Int64(4), datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 2.7+ MB


In [14]:
null_basket_df[TMP] = null_basket_df.groupby(basket_dependent_attributes).ngroup().astype(int)
null_basket_df[TMP]

61           0
67           0
105          1
107          1
109          1
          ... 
504036    6737
504038    6737
504051    6737
504063    6736
504077    6738
Name: temp_id, Length: 36987, dtype: int64

In [15]:
new_basket_df = null_basket_df[null_basket_df.duplicated(subset=[TMP], keep=False)]
new_basket_df[BASKET] = 'R' + null_basket_df[TMP].astype(str)
new_basket_df = new_basket_df.drop(TMP, 1)
new_basket_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27748 entries, 61 to 504051
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Unnamed: 0    27748 non-null  int64         
 1   date          27748 non-null  datetime64[ns]
 2   shop_id       27748 non-null  Int64         
 3   item_id       22323 non-null  Int64         
 4   item_price    24897 non-null  float64       
 5   item_cnt_day  27342 non-null  Int64         
 6   user_id       27748 non-null  Int64         
 7   basket_id     27748 non-null  object        
dtypes: Int64(4), datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 2.0+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_basket_df[BASKET] = 'R' + null_basket_df[TMP].astype(str)
  new_basket_df = new_basket_df.drop(TMP, 1)


In [16]:
single_basket_df = null_basket_df.drop_duplicates(subset=[TMP], keep=False)
single_basket_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9239 entries, 121 to 504077
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Unnamed: 0    9239 non-null   int64         
 1   date          9239 non-null   datetime64[ns]
 2   shop_id       9239 non-null   Int64         
 3   item_id       7403 non-null   Int64         
 4   item_price    8253 non-null   float64       
 5   item_cnt_day  9112 non-null   Int64         
 6   user_id       9239 non-null   Int64         
 7   basket_id     0 non-null      object        
 8   temp_id       9239 non-null   int64         
dtypes: Int64(4), datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 757.9+ KB


In [17]:
restoring_basket_df = pd.concat([stored_basket_df, single_basket_df])
restoring_basket_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 446947 entries, 0 to 504077
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Unnamed: 0    446947 non-null  int64         
 1   date          445026 non-null  datetime64[ns]
 2   shop_id       446763 non-null  Int64         
 3   item_id       359247 non-null  Int64         
 4   item_price    400606 non-null  float64       
 5   item_cnt_day  440718 non-null  Int64         
 6   user_id       446451 non-null  Int64         
 7   basket_id     437708 non-null  object        
 8   temp_id       9239 non-null    float64       
dtypes: Int64(4), datetime64[ns](1), float64(2), int64(1), object(1)
memory usage: 35.8+ MB


In [18]:
integrate(BASKET, basket_dependent_attributes, restoring_basket_df)
restoring_basket_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 446947 entries, 0 to 504077
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Unnamed: 0    446947 non-null  int64         
 1   date          445026 non-null  datetime64[ns]
 2   shop_id       446763 non-null  Int64         
 3   item_id       359247 non-null  Int64         
 4   item_price    400606 non-null  float64       
 5   item_cnt_day  440718 non-null  Int64         
 6   user_id       446451 non-null  Int64         
 7   basket_id     443752 non-null  object        
 8   temp_id       9239 non-null    float64       
dtypes: Int64(4), datetime64[ns](1), float64(2), int64(1), object(1)
memory usage: 35.8+ MB


In [19]:
restoring_basket_df = restoring_basket_df.dropna(subset=[BASKET, DATE, SHOP, USER], thresh=3)
restoring_basket_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 444346 entries, 0 to 504077
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Unnamed: 0    444346 non-null  int64         
 1   date          444346 non-null  datetime64[ns]
 2   shop_id       444346 non-null  Int64         
 3   item_id       357166 non-null  Int64         
 4   item_price    398288 non-null  float64       
 5   item_cnt_day  438157 non-null  Int64         
 6   user_id       444346 non-null  Int64         
 7   basket_id     443752 non-null  object        
 8   temp_id       9239 non-null    float64       
dtypes: Int64(4), datetime64[ns](1), float64(2), int64(1), object(1)
memory usage: 35.6+ MB


In [20]:
restoring_basket_df[BASKET] = restoring_basket_df[BASKET].fillna('S' + restoring_basket_df[TMP].astype(str))
restoring_basket_df = restoring_basket_df.drop(TMP, 1)
restoring_basket_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 444346 entries, 0 to 504077
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Unnamed: 0    444346 non-null  int64         
 1   date          444346 non-null  datetime64[ns]
 2   shop_id       444346 non-null  Int64         
 3   item_id       357166 non-null  Int64         
 4   item_price    398288 non-null  float64       
 5   item_cnt_day  438157 non-null  Int64         
 6   user_id       444346 non-null  Int64         
 7   basket_id     444346 non-null  object        
dtypes: Int64(4), datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 32.2+ MB


  restoring_basket_df = restoring_basket_df.drop(TMP, 1)


#### Merging subsets

In [21]:
clean_basket_df = pd.concat([restoring_basket_df, new_basket_df])
clean_basket_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 472094 entries, 0 to 504051
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Unnamed: 0    472094 non-null  int64         
 1   date          472094 non-null  datetime64[ns]
 2   shop_id       472094 non-null  Int64         
 3   item_id       379489 non-null  Int64         
 4   item_price    423185 non-null  float64       
 5   item_cnt_day  465499 non-null  Int64         
 6   user_id       472094 non-null  Int64         
 7   basket_id     472094 non-null  object        
dtypes: Int64(4), datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 34.2+ MB


### Cleaning item columns

In [22]:
clean_basket_df = clean_basket_df.dropna(subset=CNT)
clean_basket_df = clean_basket_df[clean_basket_df[CNT] > 0]
clean_basket_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 464180 entries, 0 to 504051
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Unnamed: 0    464180 non-null  int64         
 1   date          464180 non-null  datetime64[ns]
 2   shop_id       464180 non-null  Int64         
 3   item_id       373163 non-null  Int64         
 4   item_price    416080 non-null  float64       
 5   item_cnt_day  464180 non-null  Int64         
 6   user_id       464180 non-null  Int64         
 7   basket_id     464180 non-null  object        
dtypes: Int64(4), datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 33.6+ MB


In [23]:
clean_basket_df = clean_basket_df.dropna(subset=[ITEM, UPRICE], thresh=1)
clean_basket_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 454723 entries, 0 to 504051
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Unnamed: 0    454723 non-null  int64         
 1   date          454723 non-null  datetime64[ns]
 2   shop_id       454723 non-null  Int64         
 3   item_id       373163 non-null  Int64         
 4   item_price    416080 non-null  float64       
 5   item_cnt_day  454723 non-null  Int64         
 6   user_id       454723 non-null  Int64         
 7   basket_id     454723 non-null  object        
dtypes: Int64(4), datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 33.0+ MB


In [24]:
s = clean_basket_df.loc[clean_basket_df[ITEM].isna(), ITEM].fillna(-clean_basket_df[ITEM].isna().cumsum())
clean_basket_df[ITEM] = clean_basket_df[ITEM].fillna(s)
clean_basket_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 454723 entries, 0 to 504051
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Unnamed: 0    454723 non-null  int64         
 1   date          454723 non-null  datetime64[ns]
 2   shop_id       454723 non-null  Int64         
 3   item_id       454723 non-null  Int64         
 4   item_price    416080 non-null  float64       
 5   item_cnt_day  454723 non-null  Int64         
 6   user_id       454723 non-null  Int64         
 7   basket_id     454723 non-null  object        
dtypes: Int64(4), datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 49.1+ MB


In [25]:
temp = clean_basket_df.groupby(ITEM)[UPRICE].mean()
priced_items_df = pd.merge(original_items_df, temp, on=ITEM, how='outer')
priced_items_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103730 entries, 0 to 103729
Data columns (total 4 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   item_name         22170 non-null   object 
 1   item_id           103730 non-null  Int64  
 2   item_category_id  22170 non-null   float64
 3   item_price        98373 non-null   float64
dtypes: Int64(1), float64(2), object(1)
memory usage: 4.1+ MB


In [26]:
clean_basket_df = clean_basket_df.set_index(ITEM).join(priced_items_df.set_index(ITEM), lsuffix='', rsuffix='_temp').reset_index()
clean_basket_df[UPRICE] = clean_basket_df[UPRICE].fillna(clean_basket_df['item_price_temp'])
clean_basket_df = clean_basket_df.drop([INAME, CAT, 'item_price_temp'], axis=1)
clean_basket_df = clean_basket_df.dropna()
clean_basket_df = clean_basket_df.drop('Unnamed: 0', axis=1)

clean_basket_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 454412 entries, 0 to 454722
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   item_id       454412 non-null  Int64         
 1   date          454412 non-null  datetime64[ns]
 2   shop_id       454412 non-null  Int64         
 3   item_price    454412 non-null  float64       
 4   item_cnt_day  454412 non-null  Int64         
 5   user_id       454412 non-null  Int64         
 6   basket_id     454412 non-null  object        
dtypes: Int64(4), datetime64[ns](1), float64(1), object(1)
memory usage: 29.5+ MB


### Baskets have unique entry for each item

In [29]:
basket_item_count = clean_basket_df[clean_basket_df[ITEM] > 0].groupby([BASKET, ITEM]).size()
basket_item_count[basket_item_count > 1]

basket_id  item_id
R2533      16581      2
dtype: int64

In [30]:
id = basket_item_count[basket_item_count > 1].index
index = clean_basket_df[(clean_basket_df[BASKET] == id[0][0]) & (clean_basket_df[ITEM] == id[0][1])].index
dropIndex = index[0]
keepIndex = index[1]

In [31]:
# add qty of drop to the keeping row and drop the other row
clean_basket_df[QTY][keepIndex] += clean_basket_df[QTY][dropIndex]
clean_basket_df.drop(axis=0, labels=dropIndex, inplace=True)

In [34]:
basket_item_count = clean_basket_df[clean_basket_df[ITEM] > 0].groupby([BASKET, ITEM]).size()
basket_item_count[basket_item_count > 1].empty

True

# Data visualization

In [35]:
total_df = clean_basket_df

### Basket number

In [36]:
b_df = clean_basket_df.groupby([BASKET])[BASKET]
basketsCount = len(b_df)
basketsCount

42252

### Baskets have same shop, user and date

In [37]:
# data integrity
basketShopCount = len(clean_basket_df.groupby([BASKET, SHOP])[[BASKET, SHOP]])
basketUserCount = len(clean_basket_df.groupby([BASKET, USER])[[BASKET, USER]])
basketDateCount = len(clean_basket_df.groupby([BASKET, DATE])[[BASKET, DATE]])

if (basketShopCount == basketsCount and basketUserCount == basketsCount and basketDateCount == basketsCount):
    print(True)

True


## Data visualization

In [542]:
total_df = clean_basket_df

### Record total price

In [38]:
# total price
total_df[TPRICE] = total_df[UPRICE] * total_df[CNT]
total_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 454411 entries, 0 to 454722
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   item_id       454411 non-null  Int64         
 1   date          454411 non-null  datetime64[ns]
 2   shop_id       454411 non-null  Int64         
 3   item_price    454411 non-null  float64       
 4   item_cnt_day  454411 non-null  Int64         
 5   user_id       454411 non-null  Int64         
 6   basket_id     454411 non-null  object        
 7   total_price   454411 non-null  Float64       
dtypes: Float64(1), Int64(4), datetime64[ns](1), float64(1), object(1)
memory usage: 33.4+ MB


### Domains

In [39]:
total_df.min()

item_id                      -81560
date            2013-01-01 00:00:00
shop_id                           0
item_price                      0.1
item_cnt_day                      1
user_id                           1
basket_id                       B11
total_price                     0.1
dtype: object

In [None]:
total_df.mean()

In [None]:
total_df.max()

### Basket table

In [None]:
# make tab for basket (basket price, user, date, shop, basket quantity)
b_df = pd.DataFrame()
b_df[BASKET] = total_df.groupby([BASKET])[[BASKET]].first()
b_df[DATE] = total_df.groupby([BASKET])[[DATE]].first()
b_df[USER] = total_df.groupby([BASKET])[[USER]].first().astype(int)
b_df[SHOP] = total_df.groupby([BASKET])[[SHOP]].first().astype(int)
b_df[TPRICE] = total_df.groupby([BASKET])[[TPRICE]].sum().astype(float)
b_df[ICNT] = total_df.groupby([BASKET])[[CNT]].sum().astype(int)
b_df[IDCNT] = total_df.groupby([BASKET])[[ITEM]].nunique()
b_df.info()

In [None]:
b_df[TPRICE].sort_values().plot()

In [None]:
b_df[ICNT].sort_values().plot()

In [None]:
b_df[IDCNT].sort_values().plot()

### User table

In [None]:
# make tab for user (total spent, item bought, average basket price, basket bought)
u_df = pd.DataFrame()
u_df[USER] = total_df.groupby([USER])[[USER]].first().astype(int)
u_df[TPRICE] = total_df.groupby([USER])[[TPRICE]].sum().astype(float)
u_df[ICNT] = total_df.groupby([USER])[[CNT]].sum().astype(int)
u_df[IDCNT] = total_df.groupby([USER])[[ITEM]].nunique()
u_df.info()

In [None]:
u_df[TPRICE].sort_values().plot(kind='bar')

In [None]:
u_df[BCNT].sort_values().plot(kind='bar')

In [None]:
u_df[BDCNT].sort_values().plot(kind='bar')

In [None]:
# make tab for shop (item sold, average price, total sold)
s_df = pd.DataFrame()
s_df[SHOP] = total_df.groupby([SHOP])[[SHOP]].first().astype(int)
s_df[TPRICE] = total_df.groupby([SHOP])[[TPRICE]].sum().astype(float)
s_df[ICNT] = total_df.groupby([SHOP])[[CNT]].sum().astype(int)
s_df[IDCNT] = total_df.groupby([SHOP])[[ITEM]].nunique()
s_df.info()

In [None]:
s_df[TPRICE].sort_values().plot(kind='bar')

In [None]:
s_df[ICNT].sort_values().plot(kind='bar')

In [None]:
s_df[IDCNT].sort_values().plot(kind='bar')

In [None]:
# make time series to see spendings over time
d_df = pd.DataFrame(index=total_df[DATE].unique())
d_df[TPRICE] = total_df.groupby([DATE])[[TPRICE]].sum().astype(float)
d_df[ICNT] = total_df.groupby([DATE])[[CNT]].sum().astype(int)
d_df[IDCNT] = total_df.groupby([DATE])[[ITEM]].count()
d_df.info()


In [None]:
d_df[IDCNT].plot()


In [None]:
d_df[TPRICE].plot()

In [None]:
d_df[ICNT].plot()

## Spazio query marce

In [None]:
# 170 - triplette con basket id diversi
stored_basket_df.groupby(basket_dependent_attributes)[BASKET].nunique().sort_values()[-170:]

In [None]:
# 3307 - ordini con una sola riga
stored_basket_df.groupby('basket_id').size().sort_values()[:3310]

In [None]:
restoring_basket_df[BASKET].sort_values()

date  date
1     2013    31012
      2014    27752
      2015    16093
2     2013    36189
      2014    28393
      2015    16199
3     2013    34436
      2014    25776
      2015    17327
4     2013    29637
      2014    26710
      2015    17698
5     2013    31019
      2014    27442
      2015    16201
6     2013    32313
      2014    23816
      2015    16398
Name: date, dtype: int64

## Spazio grafici

In [None]:
clean_basket_df['shop_id'].value_counts().plot(kind='bar')

In [None]:
clean_basket_df.plot(x='item_price', y='item_cnt_day', kind='scatter')

In [None]:
clean_basket_df[BASKET].sort_values()

## Spazio cazzeggio

In [None]:
df = pd.DataFrame([[4,9],[4,3],[4,9],[4,3],[4, np.nan],[3,1],[3,np.nan],[2,np.nan],[np.nan,2], [np.nan,3]], columns=['A', 'B'])

In [None]:
df

In [None]:
df['B'] = df.groupby('A')['B'].apply(lambda x: x.fillna(x.mode().iloc[0]) if len(x.mode()) == 1 else x)

In [None]:
df

In [None]:
df['B'] = df.groupby('A')['B'].apply(lambda x: x.replace(0 if len(x.mode()) > 1 else x))

In [None]:
# df.loc[df.groupby('A')['B'].agg(lambda x: len(x.mode())) > 1, 'B'] = 0


In [None]:
b_df

Unnamed: 0_level_0,basket_id,date,user_id,shop_id,total_price,item_count,item_dist_count
basket_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
B11,B11,2013-01-01,1,2,4015.580000,5,5
B12103,B12103,2014-01-01,103,28,44969.813416,50,35
B12107,B12107,2014-01-01,107,30,948.000000,2,2
B1211,B1211,2013-01-01,11,8,3911.000000,9,8
B12111,B12111,2014-01-01,111,35,4144.194268,6,5
...,...,...,...,...,...,...,...
S9916.0,S9916.0,2014-03-12,205,18,666.160000,1,1
S9949.0,S9949.0,2014-03-12,501,34,849.000000,1,1
S9950.0,S9950.0,2014-03-12,54,35,699.000000,1,1
S9964.0,S9964.0,2014-03-12,464,44,899.000000,1,1


In [None]:
total_df[total_df[BASKET] == 'B12103']

Unnamed: 0,item_id,date,shop_id,item_price,item_cnt_day,user_id,basket_id,total_price
328,-1,2014-01-01,28,199.0,1,103,B12103,199.0
329,-1,2014-01-01,28,199.0,1,103,B12103,199.0
330,-1,2014-01-01,28,249.0,1,103,B12103,249.0
331,-1,2014-01-01,28,599.0,2,103,B12103,1198.0
332,-1,2014-01-01,28,349.0,1,103,B12103,349.0
333,-1,2014-01-01,28,299.0,2,103,B12103,598.0
334,-1,2014-01-01,28,899.0,3,103,B12103,2697.0
335,-1,2014-01-01,28,399.0,1,103,B12103,399.0
336,-1,2014-01-01,28,349.5,1,103,B12103,349.5
83545,385,2014-01-01,28,249.583333,1,103,B12103,249.583333


### TDL

- basket id has trailing '.0' (?)

- real distinct item (id) in minor table