# Data Mining Project
### Giorgio Donati, g.donati24@studenti.unipi.it
### Pietro Francaviglia, p.francaviglia1@studenti.unipi.it
#### A.Y. 2021-2022


## Libraries and datasets

In [14]:
import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

In [15]:
# load datasets
basket_df = pd.read_csv('datasets/baskets_supermarket.csv')
categories_df = pd.read_csv('datasets/item_categories.csv')
items_df = pd.read_csv('datasets/items.csv')

## Basket dataset

In [16]:
basket_df.head()

Unnamed: 0.1,Unnamed: 0,date,shop_id,item_id,item_price,item_cnt_day,user_id,basket_id
0,32632,,2.0,,249.0,1.0,1.0,B11
1,33487,01.01.2013,2.0,20424.0,,1.0,1.0,B11
2,32099,01.01.2013,2.0,6916.0,999.5,1.0,1.0,B11
3,33529,01.01.2013,2.0,19718.0,149.0,1.0,1.0,B11
4,31974,,2.0,11828.0,1699.0,1.0,1.0,B11


In [17]:
basket_df.dtypes

Unnamed: 0        int64
date             object
shop_id         float64
item_id         float64
item_price      float64
item_cnt_day    float64
user_id         float64
basket_id        object
dtype: object

In [18]:
basket_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504087 entries, 0 to 504086
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Unnamed: 0    504087 non-null  int64  
 1   date          352861 non-null  object 
 2   shop_id       469023 non-null  float64
 3   item_id       405085 non-null  float64
 4   item_price    451795 non-null  float64
 5   item_cnt_day  497039 non-null  float64
 6   user_id       433351 non-null  float64
 7   basket_id     437967 non-null  object 
dtypes: float64(5), int64(1), object(2)
memory usage: 30.8+ MB


### Cleaning data

In [19]:
non_floating_attributes = ['shop_id', 'item_id', 'item_cnt_day', 'user_id']
for col in non_floating_attributes:
    basket_df[col] = basket_df[col].astype('Int64')

basket_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504087 entries, 0 to 504086
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Unnamed: 0    504087 non-null  int64  
 1   date          352861 non-null  object 
 2   shop_id       469023 non-null  Int64  
 3   item_id       405085 non-null  Int64  
 4   item_price    451795 non-null  float64
 5   item_cnt_day  497039 non-null  Int64  
 6   user_id       433351 non-null  Int64  
 7   basket_id     437967 non-null  object 
dtypes: Int64(4), float64(1), int64(1), object(2)
memory usage: 32.7+ MB


In [20]:
null_basket_df = basket_df[basket_df['basket_id'].isna()]
null_basket_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 66120 entries, 20 to 504077
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    66120 non-null  int64  
 1   date          46230 non-null  object 
 2   shop_id       61373 non-null  Int64  
 3   item_id       53036 non-null  Int64  
 4   item_price    59212 non-null  float64
 5   item_cnt_day  65176 non-null  Int64  
 6   user_id       56929 non-null  Int64  
 7   basket_id     0 non-null      object 
dtypes: Int64(4), float64(1), int64(1), object(2)
memory usage: 4.8+ MB


In [21]:
stored_basket_df = basket_df[basket_df['basket_id'].notna()]
stored_basket_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 437967 entries, 0 to 504086
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Unnamed: 0    437967 non-null  int64  
 1   date          306631 non-null  object 
 2   shop_id       407650 non-null  Int64  
 3   item_id       352049 non-null  Int64  
 4   item_price    392583 non-null  float64
 5   item_cnt_day  431863 non-null  Int64  
 6   user_id       376422 non-null  Int64  
 7   basket_id     437967 non-null  object 
dtypes: Int64(4), float64(1), int64(1), object(2)
memory usage: 31.7+ MB


In [105]:
basket_dependent_attributes = ['date', 'shop_id', 'user_id']
for attr in basket_dependent_attributes:
    stored_basket_df[attr] = stored_basket_df.groupby('basket_id')[attr].apply(lambda x: x.fillna(x.mode().iloc[0] if len(x.mode()) == 1 else x))

stored_basket_df.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stored_basket_df[attr] = stored_basket_df.groupby('basket_id')[attr].apply(lambda x: x.fillna(x.mode().iloc[0] if len(x.mode()) == 1 else x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stored_basket_df[attr] = stored_basket_df.groupby('basket_id')[attr].apply(lambda x: x.fillna(x.mode().iloc[0] if len(x.mode()) == 1 else x))


<class 'pandas.core.frame.DataFrame'>
Int64Index: 437967 entries, 0 to 504086
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Unnamed: 0    437967 non-null  int64  
 1   date          435812 non-null  object 
 2   shop_id       437690 non-null  Int64  
 3   item_id       352049 non-null  Int64  
 4   item_price    392583 non-null  float64
 5   item_cnt_day  431863 non-null  Int64  
 6   user_id       437269 non-null  Int64  
 7   basket_id     437967 non-null  object 
dtypes: Int64(4), float64(1), int64(1), object(2)
memory usage: 31.7+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stored_basket_df[attr] = stored_basket_df.groupby('basket_id')[attr].apply(lambda x: x.fillna(x.mode().iloc[0] if len(x.mode()) == 1 else x))


## Other datasets

In [10]:
categories_df.head()

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [11]:
items_df.head()
items_df.dtypes

item_name           object
item_id              int64
item_category_id     int64
dtype: object

In [12]:
basket_df = basket_df.join(items_df)
basket_df.head()

ValueError: columns overlap but no suffix specified: Index(['item_id'], dtype='object')

In [100]:
df = pd.DataFrame([[4,9],[4,3],[4,9],[4,3],[4, np.nan],[3,1],[3,np.nan],[2,np.nan]], columns=['A', 'B'])

In [101]:
df

Unnamed: 0,A,B
0,4,9.0
1,4,3.0
2,4,9.0
3,4,3.0
4,4,
5,3,1.0
6,3,
7,2,


In [102]:
df['B'] = df.groupby('A')['B'].apply(lambda x: x.fillna(x.mode().iloc[0]) if len(x.mode()) == 1 else x)

In [103]:
df

Unnamed: 0,A,B
0,4,9.0
1,4,3.0
2,4,9.0
3,4,3.0
4,4,
5,3,1.0
6,3,1.0
7,2,


In [96]:
df['B'] = df.groupby('A')['B'].apply(lambda x: x.fillna(x.mode().iloc[0]) if len(x.mode()) == 1 else x)

In [97]:
df


Unnamed: 0,A,B
0,4,
1,4,
2,4,
3,4,"5 1.0 6 1.0 Name: 3, dtype: float64"
4,4,
5,3,
6,3,
7,2,
