In [1]:
import bz2

In [2]:
import pickle

In [3]:
import pandas as pd

In [4]:
# NOTE(review): placeholder only - `sales` is rebound to the unpickled object
# a couple of cells below, so this empty frame is never actually used.
sales = pd.DataFrame()

In [5]:
# Open the bz2-compressed pickle as a readable binary stream.
archived = bz2.BZ2File("walmart_sales.pkl.bz2", mode="r")

In [6]:
# Unpickle the DataFrame from the bz2 stream, then release the file handle
# even if loading fails (the original never closed it).
# NOTE: pickle can execute arbitrary code while loading - only open archives
# that come from a trusted source.
try:
    sales = pickle.load(archived)
finally:
    archived.close()

In [7]:
# Summarize the loaded frame: index range, column dtypes, non-null counts and memory usage.
sales.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 413119 entries, 0 to 413118
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   store                 413119 non-null  int64         
 1   type                  413119 non-null  object        
 2   department            413119 non-null  int32         
 3   date                  413119 non-null  datetime64[ns]
 4   weekly_sales          413119 non-null  float64       
 5   is_holiday            413119 non-null  bool          
 6   temperature_c         413119 non-null  float64       
 7   fuel_price_usd_per_l  413119 non-null  float64       
 8   unemployment          413119 non-null  float64       
dtypes: bool(1), datetime64[ns](1), float64(4), int32(1), int64(1), object(1)
memory usage: 27.2+ MB


## Mean and median
Summary statistics are exactly what they sound like - they summarize many numbers in one statistic. For example, mean, median, minimum, maximum, and standard deviation are summary statistics. Calculating summary statistics allows you to get a better sense of your data, even if there's a lot of it.

Explore your new DataFrame first by printing the first few rows of the sales DataFrame.

- Print information about the columns in sales.
- Print the mean of the weekly_sales column.
- Print the median of the weekly_sales column.



In [8]:
# Print the head of the sales DataFrame
print(sales.head())

# Show the info about the sales DataFrame.
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() would only append a stray "None" line after the report.
sales.info()

# Print the mean of weekly_sales
print(sales["weekly_sales"].mean())

# Print the median of weekly_sales
print(sales["weekly_sales"].median())

   store type  department       date  weekly_sales  is_holiday  temperature_c  \
0      1    A           1 2010-02-05      24924.50       False       5.727778   
1      1    A           2 2010-02-05      50605.27       False       5.727778   
2      1    A           3 2010-02-05      13740.12       False       5.727778   
3      1    A           4 2010-02-05      39954.04       False       5.727778   
4      1    A           5 2010-02-05      32229.38       False       5.727778   

   fuel_price_usd_per_l  unemployment  
0              0.679451         8.106  
1              0.679451         8.106  
2              0.679451         8.106  
3              0.679451         8.106  
4              0.679451         8.106  
<class 'pandas.core.frame.DataFrame'>
Int64Index: 413119 entries, 0 to 413118
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   store                 413119 non-null  i

## Summarizing dates
Summary statistics can also be calculated on date columns which have values with the data type datetime64. Some summary statistics — like mean — don't make a ton of sense on dates, but others are super helpful, for example minimum and maximum, which allow you to see what time range your data covers.

- Print the maximum of the date column.
- Print the minimum of the date column.

In [9]:
# Latest date covered by the data
print(sales["date"].max())

# Earliest date covered by the data
print(sales["date"].min())

2012-10-26 00:00:00
2010-02-05 00:00:00


In [10]:
# Interquartile range helper, usable with .agg()
def iqr(column):
    """Return the interquartile range (75th minus 25th percentile) of ``column``.

    ``column`` only needs to provide a pandas-style ``.quantile(q)`` method.
    """
    upper_quartile = column.quantile(0.75)
    lower_quartile = column.quantile(0.25)
    return upper_quartile - lower_quartile

# IQR of the temperature_c column on its own
print(sales.temperature_c.agg(iqr))

15.299999999999994


In [11]:
# Print the IQR of temperature_c, fuel_price_usd_per_l, & unemployment.
# Reuses the iqr() helper defined in the previous cell instead of redefining
# an identical copy here.
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg(iqr))

temperature_c           15.300000
fuel_price_usd_per_l     0.211866
unemployment             1.672000
dtype: float64


In [12]:
# Import NumPy (kept from the original cell; the aggregation below no longer needs it)
import numpy as np

# Print IQR and median of temperature_c, fuel_price_usd_per_l, & unemployment.
# iqr() is the helper defined in an earlier cell, so it is not redefined here.
# The median is requested by its string alias: passing the NumPy callable to
# .agg() is deprecated in recent pandas, and the alias yields the same
# "median" row label.
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg([iqr, "median"]))

        temperature_c  fuel_price_usd_per_l  unemployment
iqr             15.30              0.211866         1.672
median          16.75              0.911922         7.852


## Cumulative statistics
Cumulative statistics can also be helpful in tracking summary statistics over time. In this exercise, you'll calculate the cumulative sum and cumulative max of a department's weekly sales, which will allow you to identify what the total sales were so far as well as what the highest weekly sales were so far.

A DataFrame called sales_1_1 has been created for you, which contains the sales data for department 1 of store 1. pandas is loaded as pd.

- Sort the rows of sales_1_1 by the date column in ascending order.
- Get the cumulative sum of weekly_sales and add it as a new column of sales_1_1 called cum_weekly_sales.
- Get the cumulative maximum of weekly_sales, and add it as a column called cum_max_sales.
- Print the date, weekly_sales, cum_weekly_sales, and cum_max_sales columns.

In [15]:
# Rows belonging to department 1 of store 1
sales_1_1 = sales.loc[(sales["store"] == 1) & (sales["department"] == 1)]

In [16]:
# Preview the first rows of the store 1 / department 1 subset.
sales_1_1.head()

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.5,False,5.727778,0.679451,8.106
73,1,A,1,2010-02-12,46039.49,True,3.616667,0.673111,8.106
145,1,A,1,2010-02-19,41595.55,False,4.405556,0.664129,8.106
218,1,A,1,2010-02-26,19403.54,False,8.127778,0.676545,8.106
290,1,A,1,2010-03-05,21827.9,False,8.055556,0.693452,8.106


In [17]:
# Order the rows chronologically (ascending is the default)
sales_1_1 = sales_1_1.sort_values(by="date")

In [19]:
# Running total of weekly_sales, stored as the cum_weekly_sales column
sales_1_1["cum_weekly_sales"] = sales_1_1.weekly_sales.cumsum()

In [21]:
# Preview again to check the newly added cum_weekly_sales column.
sales_1_1.head()

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment,cum_weekly_sales
0,1,A,1,2010-02-05,24924.5,False,5.727778,0.679451,8.106,24924.5
73,1,A,1,2010-02-12,46039.49,True,3.616667,0.673111,8.106,70963.99
145,1,A,1,2010-02-19,41595.55,False,4.405556,0.664129,8.106,112559.54
218,1,A,1,2010-02-26,19403.54,False,8.127778,0.676545,8.106,131963.08
290,1,A,1,2010-03-05,21827.9,False,8.055556,0.693452,8.106,153790.98


In [22]:
# Running maximum of weekly_sales, stored as the cum_max_sales column
sales_1_1["cum_max_sales"] = sales_1_1.weekly_sales.cummax()


In [24]:
# Display the date alongside the raw and cumulative sales columns
sales_1_1.loc[:, ["date", "weekly_sales", "cum_weekly_sales", "cum_max_sales"]]

Unnamed: 0,date,weekly_sales,cum_weekly_sales,cum_max_sales
0,2010-02-05,24924.50,24924.50,24924.50
73,2010-02-12,46039.49,70963.99,46039.49
145,2010-02-19,41595.55,112559.54,46039.49
218,2010-02-26,19403.54,131963.08,46039.49
290,2010-03-05,21827.90,153790.98,46039.49
...,...,...,...,...
9883,2012-09-28,18947.81,3123160.62,57592.12
9956,2012-10-05,21904.47,3145065.09,57592.12
10028,2012-10-12,22764.01,3167829.10,57592.12
10101,2012-10-19,24185.27,3192014.37,57592.12


# Dropping duplicates
vet_visits.drop_duplicates(subset="name")

`name` is the name of the column in which we want the duplicates to be dropped.

It is possible we'll have the same values in `name`, so we need to set more than one criterion:

unique_dogs = vet_visits.drop_duplicates(subset=["name", "breed"])

in this case we'll search for duplicates of name and breed together

`unique_dogs["breed"].value_counts()` will count each entry for us

`unique_dogs["breed"].value_counts(sort=True)` <=== will sort from biggest count on top to smallest


`unique_dogs["breed"].value_counts(normalize=True)` will show the proportions of counts to total




## Dropping duplicates
Removing duplicates is an essential skill to get accurate counts, because often you don't want to count the same thing multiple times. In this exercise, you'll create some new DataFrames using unique values from sales.

- Remove rows of sales with duplicate pairs of store and type and save as store_types and print the head.
- Remove rows of sales with duplicate pairs of store and department and save as store_depts and print the head.
- Subset the rows that are holiday weeks, and drop the duplicate dates, saving as holiday_dates.
- Select the date column of holiday_dates, and print.

In [10]:
# Preview the first rows of the full sales DataFrame.
sales.head()

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.5,False,5.727778,0.679451,8.106
1,1,A,2,2010-02-05,50605.27,False,5.727778,0.679451,8.106
2,1,A,3,2010-02-05,13740.12,False,5.727778,0.679451,8.106
3,1,A,4,2010-02-05,39954.04,False,5.727778,0.679451,8.106
4,1,A,5,2010-02-05,32229.38,False,5.727778,0.679451,8.106


In [8]:
# One row per distinct (store, type) pair - the first occurrence is kept
store_types = sales.drop_duplicates(["store", "type"])
print(store_types.head())

       store type  department       date  weekly_sales  is_holiday  \
0          1    A           1 2010-02-05      24924.50       False   
10244      2    A           1 2010-02-05      35034.06       False   
20482      3    B           1 2010-02-05       6453.58       False   
29518      4    A           1 2010-02-05      38724.42       False   
39790      5    B           1 2010-02-05       9323.89       False   

       temperature_c  fuel_price_usd_per_l  unemployment  
0           5.727778              0.679451         8.106  
10244       4.550000              0.679451         8.324  
20482       7.616667              0.679451         7.368  
29518       6.533333              0.686319         8.623  
39790       4.277778              0.679451         6.566  


In [9]:
# One row per distinct (store, department) pair - the first occurrence is kept
store_depts = sales.drop_duplicates(["store", "department"])
print(store_depts.head())

   store type  department       date  weekly_sales  is_holiday  temperature_c  \
0      1    A           1 2010-02-05      24924.50       False       5.727778   
1      1    A           2 2010-02-05      50605.27       False       5.727778   
2      1    A           3 2010-02-05      13740.12       False       5.727778   
3      1    A           4 2010-02-05      39954.04       False       5.727778   
4      1    A           5 2010-02-05      32229.38       False       5.727778   

   fuel_price_usd_per_l  unemployment  
0              0.679451         8.106  
1              0.679451         8.106  
2              0.679451         8.106  
3              0.679451         8.106  
4              0.679451         8.106  


In [11]:
# Subset the rows that are holiday weeks and drop duplicate dates.
# is_holiday is already a boolean column, so it is used directly as the row
# mask; comparing it with `== True` is redundant.
holiday_dates = sales[sales["is_holiday"]].drop_duplicates(subset="date")

In [12]:
# Show the distinct holiday-week dates
print(holiday_dates.date)

73     2010-02-12
2218   2010-09-10
3014   2010-11-26
3372   2010-12-31
3800   2011-02-11
5940   2011-09-09
6731   2011-11-25
7096   2011-12-30
7527   2012-02-10
9667   2012-09-07
Name: date, dtype: datetime64[ns]


## Counting categorical variables
Counting is a great way to get an overview of your data and to spot curiosities that you might not notice otherwise. In this exercise, you'll count the number of each type of store and the number of each department number using the DataFrames you created in the previous exercise:
```
# Drop duplicate store/type combinations
store_types = sales.drop_duplicates(subset=["store", "type"])

# Drop duplicate store/department combinations
store_depts = sales.drop_duplicates(subset=["store", "department"])
```
The ```store_types``` and ```store_depts``` DataFrames you created in the last exercise are available and pandas is imported as pd.

In [13]:
# How many stores there are of each type
store_counts = store_types.type.value_counts()
print(store_counts)

A    22
B    17
C     6
Name: type, dtype: int64


In [22]:
# Share of stores belonging to each type (counts normalized to sum to 1)
store_props = store_types.type.value_counts(normalize=True)
print(store_props)

A    0.488889
B    0.377778
C    0.133333
Name: type, dtype: float64


In [23]:
# Number of stores carrying each department number, most common first
# (value_counts sorts in descending order by default)
dept_counts_sorted = store_depts.department.value_counts()
print(dept_counts_sorted)

1     45
9     45
4     45
6     45
8     45
      ..
37    20
50    14
43     5
39     5
65     1
Name: department, Length: 81, dtype: int64


In [20]:
# Proportion of each department number, most common first
dept_props_sorted = store_depts.department.value_counts(normalize=True, sort=True)
print(dept_props_sorted)

1     0.013778
9     0.013778
4     0.013778
6     0.013778
8     0.013778
        ...   
37    0.006124
50    0.004287
43    0.001531
39    0.001531
65    0.000306
Name: department, Length: 81, dtype: float64


# Grouped summary statistics
## What percent of sales occurred at each store type?
While .groupby() is useful, you can calculate grouped summary statistics without it.

Walmart distinguishes three types of stores: "supercenters", "discount stores", and "neighborhood markets", encoded in this dataset as type "A", "B", and "C". In this exercise, you'll calculate the total sales made at each store type, without using .groupby(). You can then use these numbers to see what proportion of Walmart's total sales were made at each.
- Calculate the total weekly sales over the whole dataset.
- Subset for type "A" stores, and calculate their total weekly sales.
- Do the same for type "B" and type "C" stores.
- Combine the A/B/C results into a list, and divide by overall sales to get the proportion of sales by type.

In [None]:
# Calc total weekly sales over the whole dataset
# (fills in the exercise placeholders, which would raise NameError if run)
sales_all = sales["weekly_sales"].sum()

In [None]:
# Subset for type A stores, calc total weekly sales
# (fills in the exercise placeholders, which would raise NameError if run)
sales_A = sales[sales["type"] == "A"]["weekly_sales"].sum()

In [None]:
# Subset for type B stores, calc total weekly sales
# (fills in the exercise placeholder, which would raise NameError if run)
sales_B = sales[sales["type"] == "B"]["weekly_sales"].sum()

In [None]:
# Subset for type C stores, calc total weekly sales
# (fills in the exercise placeholder, which would raise NameError if run)
sales_C = sales[sales["type"] == "C"]["weekly_sales"].sum()

In [None]:
# Get proportion for each type: the A/B/C totals divided by the overall total.
# (fills in the exercise placeholders, which would raise NameError if run)
# NOTE(review): dividing a plain list works here only if sales_all is a NumPy
# scalar (as a pandas .sum() result normally is), which broadcasts the
# division element-wise - confirm sales_all comes from the cell above.
sales_propn_by_type = [sales_A, sales_B, sales_C] / sales_all
print(sales_propn_by_type)