# Прогнозирование продаж. Часть 1. Подготовка данных

In [None]:
import numpy as np
import pandas as pd
import copy

### Подготовка датасета для анализа данных

In [None]:
# Load the raw transactions CSV into a DataFrame.
# NOTE(review): the path looks like a Google Colab Drive mount — confirm the
# drive is mounted before running this cell.
df = pd.read_csv('/content/drive/MyDrive/dataset/ecommerce_data.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [None]:
# Inspect column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [None]:
# Normalise the dataset's column names to lower case.
df.columns = df.columns.str.lower()

In [None]:
# Parse the date strings (month/day/year hour:minute) into datetimes and
# strip the time of day, leaving midnight-normalised dates.
df['invoicedate'] = (
    pd.to_datetime(df['invoicedate'], format='%m/%d/%Y %H:%M')
    .dt.normalize()
)

In [None]:
# Preview the frame after renaming the columns and converting the dates.
df.head()

Unnamed: 0,invoiceno,stockcode,description,quantity,invoicedate,unitprice,customerid,country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01,3.39,17850.0,United Kingdom


In [None]:
# Re-check the dtypes: invoicedate should now be a datetime column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   invoiceno    541909 non-null  object        
 1   stockcode    541909 non-null  object        
 2   description  540455 non-null  object        
 3   quantity     541909 non-null  int64         
 4   invoicedate  541909 non-null  datetime64[ns]
 5   unitprice    541909 non-null  float64       
 6   customerid   406829 non-null  float64       
 7   country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [None]:
# Drop rows with a missing description or customer id, and drop returns
# (invoice numbers containing the letter "C", matched case-insensitively).
# na=False: .str.contains yields NaN for any invoice number that is not a
# string (possible when read_csv infers mixed types), and negating a mask
# that contains NaN fails. Such a value has no "C", so it is kept as a sale.
df = df.loc[df['description'].notna()
            & df['customerid'].notna()
            & ~df['invoiceno'].str.contains('C', case=False, na=False)]

In [None]:
# Give every numeric column an explicit dtype: ids and quantity become
# integers, the unit price stays a float.
convert_dict = {
    'invoiceno': int,
    'customerid': int,
    'quantity': int,
    'unitprice': float,
}
df = df.astype(convert_dict)

In [None]:
# Confirm the row count after filtering and the new integer dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 397924 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   invoiceno    397924 non-null  int64         
 1   stockcode    397924 non-null  object        
 2   description  397924 non-null  object        
 3   quantity     397924 non-null  int64         
 4   invoicedate  397924 non-null  datetime64[ns]
 5   unitprice    397924 non-null  float64       
 6   customerid   397924 non-null  int64         
 7   country      397924 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(3), object(3)
memory usage: 27.3+ MB


In [None]:
# Compute the purchase amount for each row (quantity times unit price),
# then drop the two source columns, which are no longer needed.
df['amount'] = df['quantity'].mul(df['unitprice'])
df.drop(columns=['quantity', 'unitprice'], inplace=True)

In [None]:
# Preview the frame with the new amount column.
df.head()

Unnamed: 0,invoiceno,stockcode,description,invoicedate,customerid,country,amount
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,2010-12-01,17850,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,2010-12-01,17850,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,2010-12-01,17850,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,2010-12-01,17850,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,2010-12-01,17850,United Kingdom,20.34


In [None]:
# Count the distinct descriptions per stock code to find products that
# share one code but carry different descriptions.
df_stockcode = df.groupby(by='stockcode')['description'].nunique()

In [None]:
# Turn the per-code counts into a two-column frame, sorted ascending by the
# number of distinct descriptions (worst offenders end up at the bottom).
df_stockcode = df_stockcode.reset_index().sort_values(by='description')

In [None]:
# Show the stock codes with the most distinct descriptions.
df_stockcode.tail()

Unnamed: 0,stockcode,description
2070,23240,3
2074,23244,3
1957,23126,3
2066,23236,4
2026,23196,4


In [None]:
# "Standardise" product descriptions: the first description seen for a stock
# code becomes the single description used across the whole dataset.
# Known weakness: the correct description is not necessarily the first one.
# Build the lookup of "canonical" descriptions, one per stock code.
df_stockcode_first_description = df.groupby(by='stockcode')['description'].first()

In [None]:
# Convert the lookup Series into a frame with stockcode and description
# columns so it can be merged back onto the transactions.
df_stockcode_first_description = df_stockcode_first_description.reset_index()

In [None]:
# Preview the stock code -> canonical description lookup.
df_stockcode_first_description.head()

Unnamed: 0,stockcode,description
0,10002,INFLATABLE POLITICAL GLOBE
1,10080,GROOVY CACTUS INFLATABLE
2,10120,DOGGY RUBBER
3,10123C,HEARTS WRAPPING TAPE
4,10124A,SPOTS ON RED BOOKCOVER TAPE


In [None]:
# Attach the canonical description to every transaction row. Both frames have
# a description column, so the original one gets the "_left" suffix and the
# canonical one gets "_right".
df = df.merge(
    df_stockcode_first_description,
    on='stockcode',
    how='left',
    suffixes=('_left', '_right'),
)

In [None]:
# Preview the merged frame with both description columns side by side.
df.head()

Unnamed: 0,invoiceno,stockcode,description_left,invoicedate,customerid,country,amount,description_right
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,2010-12-01,17850,United Kingdom,15.3,WHITE HANGING HEART T-LIGHT HOLDER
1,536365,71053,WHITE METAL LANTERN,2010-12-01,17850,United Kingdom,20.34,WHITE METAL LANTERN
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,2010-12-01,17850,United Kingdom,22.0,CREAM CUPID HEARTS COAT HANGER
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,2010-12-01,17850,United Kingdom,20.34,KNITTED UNION FLAG HOT WATER BOTTLE
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,2010-12-01,17850,United Kingdom,20.34,RED WOOLLY HOTTIE WHITE HEART.


In [None]:
# Discard the original description and keep the canonical one under the
# plain "description" name.
df = (
    df.drop(columns='description_left')
    .rename(columns={'description_right': 'description'})
)

In [None]:
# Preview the frame with the single, standardised description column.
df.head()

Unnamed: 0,invoiceno,stockcode,invoicedate,customerid,country,amount,description
0,536365,85123A,2010-12-01,17850,United Kingdom,15.3,WHITE HANGING HEART T-LIGHT HOLDER
1,536365,71053,2010-12-01,17850,United Kingdom,20.34,WHITE METAL LANTERN
2,536365,84406B,2010-12-01,17850,United Kingdom,22.0,CREAM CUPID HEARTS COAT HANGER
3,536365,84029G,2010-12-01,17850,United Kingdom,20.34,KNITTED UNION FLAG HOT WATER BOTTLE
4,536365,84029E,2010-12-01,17850,United Kingdom,20.34,RED WOOLLY HOTTIE WHITE HEART.


In [None]:
# Verify the replacement: recount the distinct descriptions per stock code.
df_stockcode = df.groupby(by='stockcode')['description'].nunique()

In [None]:
# Turn the recount into a frame sorted ascending by the number of distinct
# descriptions, so the largest counts are at the bottom.
df_stockcode = df_stockcode.reset_index().sort_values(by='description')

In [None]:
# The largest counts are at the bottom; each should now be 1.
df_stockcode.tail()

Unnamed: 0,stockcode,description
1225,22350,1
1226,22351,1
1227,22352,1
1229,22354,1
3664,POST,1


In [None]:
# Check the frame's dimensions (rows, columns).
df.shape

(397924, 7)

In [None]:
# Collapse repeated lines: rows sharing the same invoice, stock code, date,
# customer, country and description are summed into one amount.
df = df.groupby(
    ['invoiceno', 'stockcode', 'invoicedate', 'customerid', 'country', 'description']
)['amount'].sum()

In [None]:
# Move the grouping keys out of the index and back into regular columns.
df = df.reset_index()

In [None]:
# Check the frame's dimensions (rows, columns) after the aggregation.
df.shape

(387875, 7)

In [None]:
# Take an independent deep copy of the prepared frame for export.
result = df.copy()

In [None]:
# Preview the final prepared dataset.
result.head()

Unnamed: 0,invoiceno,stockcode,invoicedate,customerid,country,description,amount
0,536365,21730,2010-12-01,17850,United Kingdom,GLASS STAR FROSTED T-LIGHT HOLDER,25.5
1,536365,22752,2010-12-01,17850,United Kingdom,SET 7 BABUSHKA NESTING BOXES,15.3
2,536365,71053,2010-12-01,17850,United Kingdom,WHITE METAL LANTERN,20.34
3,536365,84029E,2010-12-01,17850,United Kingdom,RED WOOLLY HOTTIE WHITE HEART.,20.34
4,536365,84029G,2010-12-01,17850,United Kingdom,KNITTED UNION FLAG HOT WATER BOTTLE,20.34


In [None]:
# Write the prepared dataset to a comma-separated file in the working
# directory, without the index column.
result.to_csv('ecommerce_data_result.csv', index=False)