In [1]:
# Import Libraries and Dependencies
import pandas as pd

### 1. Combine and Clean the Data
#### Import CSVs

In [2]:
# Read the CSV files into DataFrames.
as2020_df = pd.read_csv('Resources/athletic_sales_2020.csv')
as2021_df = pd.read_csv('Resources/athletic_sales_2021.csv')

In [3]:
# Display the 2020 sales DataFrame
as2020_df.head(3)

Unnamed: 0,retailer,retailer_id,invoice_date,region,state,city,product,price_per_unit,units_sold,total_sales,operating_profit,sales_method
0,Foot Locker,1185732,1/1/20,Northeast,New York,New York,Men's Street Footwear,50,1200,600000,300000.0,In-store
1,Foot Locker,1185732,1/1/20,Northeast,Pennsylvania,Philadelphia,Women's Apparel,68,83,5644,2426.92,Online
2,Foot Locker,1185732,1/1/20,Northeast,Pennsylvania,Philadelphia,Women's Apparel,75,275,206250,61875.0,Outlet


In [4]:
# Display the 2021 sales DataFrame
as2021_df.head(3)

Unnamed: 0,retailer,retailer_id,invoice_date,region,state,city,product,price_per_unit,units_sold,total_sales,operating_profit,sales_method
0,West Gear,1128299,1/1/21,West,California,San Francisco,Men's Athletic Footwear,65,750,487500,121875.0,Outlet
1,West Gear,1128299,1/1/21,West,California,San Francisco,Men's Athletic Footwear,51,233,11883,3208.41,Outlet
2,Kohl's,1189833,1/1/21,Midwest,Montana,Billings,Men's Apparel,50,275,137500,82500.0,Outlet


#### Check the data types of each DataFrame

In [5]:
# Check the 2020 sales data types.
as2020_df.dtypes

retailer             object
retailer_id           int64
invoice_date         object
region               object
state                object
city                 object
product              object
price_per_unit        int64
units_sold            int64
total_sales           int64
operating_profit    float64
sales_method         object
dtype: object

In [6]:
# Check the 2021 sales data types.
as2021_df.dtypes 

retailer             object
retailer_id           int64
invoice_date         object
region               object
state                object
city                 object
product              object
price_per_unit        int64
units_sold            int64
total_sales           int64
operating_profit    float64
sales_method         object
dtype: object

#### Combine the sales data by rows.

In [7]:
# Combine the 2020 and 2021 sales DataFrames on the rows and reset the index.
as_years_df = pd.concat([as2020_df,as2021_df],axis=0, join='inner').reset_index(drop=True)
as_years_df.head(3)

Unnamed: 0,retailer,retailer_id,invoice_date,region,state,city,product,price_per_unit,units_sold,total_sales,operating_profit,sales_method
0,Foot Locker,1185732,1/1/20,Northeast,New York,New York,Men's Street Footwear,50,1200,600000,300000.0,In-store
1,Foot Locker,1185732,1/1/20,Northeast,Pennsylvania,Philadelphia,Women's Apparel,68,83,5644,2426.92,Online
2,Foot Locker,1185732,1/1/20,Northeast,Pennsylvania,Philadelphia,Women's Apparel,75,275,206250,61875.0,Outlet


In [8]:
# Check if any values are null.
print (f'any is null {as_years_df.isnull().any().any()}')
print (f'any is na {as_years_df.isna().any().any()}')
print (f'any is not na {as_years_df.notna().any().any()}')


any is null False
any is na False
any is not na True


In [9]:
# Check the data type of each column
as_years_df.dtypes

retailer             object
retailer_id           int64
invoice_date         object
region               object
state                object
city                 object
product              object
price_per_unit        int64
units_sold            int64
total_sales           int64
operating_profit    float64
sales_method         object
dtype: object

In [10]:
# Convert the "invoice_date" to a datetime datatype 
as_years_df['invoice_date'] = pd.to_datetime(as_years_df['invoice_date'],  format='mixed')
as_years_df.head(3)
# this is to test single year as2021
dt_as2021_df['invoice_date'] = pd.to_datetime(as`_years_df['invoice_date'],  format='mixed')
as_years_df.head(3
                 
                 )

Unnamed: 0,retailer,retailer_id,invoice_date,region,state,city,product,price_per_unit,units_sold,total_sales,operating_profit,sales_method
0,Foot Locker,1185732,2020-01-01,Northeast,New York,New York,Men's Street Footwear,50,1200,600000,300000.0,In-store
1,Foot Locker,1185732,2020-01-01,Northeast,Pennsylvania,Philadelphia,Women's Apparel,68,83,5644,2426.92,Online
2,Foot Locker,1185732,2020-01-01,Northeast,Pennsylvania,Philadelphia,Women's Apparel,75,275,206250,61875.0,Outlet


In [11]:
# Confirm that the "invoice_date" data type has been changed.
as_years_df['invoice_date'].dtypes

dtype('<M8[ns]')

### 2. Determine which Region Sold the Most Products

#### Using `groupby`

In [12]:
# Show the number products sold for region, state, and city.
# Rename the sum to "Total_Products_Sold".
groupby_rsc_df = as_years_df.groupby(['region','state','city']).sum('units_sold').reset_index().rename(columns={'units_sold':'Total_Products_Sold'})
print (f"{(groupby_rsc_df[['region','state','city','Total_Products_Sold']].head(5))},\n")

# Show the top 5 results.
sort_groupby_rsc_df = groupby_rsc_df.sort_values(by=['Total_Products_Sold'], ascending=False, ignore_index=(True))
print(sort_groupby_rsc_df[['region','state','city','Total_Products_Sold']].head(5))

    region     state          city  Total_Products_Sold
0  Midwest  Illinois       Chicago                25407
1  Midwest   Indiana  Indianapolis                26332
2  Midwest      Iowa    Des Moines                23446
3  Midwest    Kansas       Wichita                29463
4  Midwest  Michigan       Detroit                50095,

      region       state           city  Total_Products_Sold
0  Northeast    New York       New York               111954
1      South       Texas        Houston                90322
2       West  California  San Francisco                85478
3       West  California    Los Angeles                76384
4  Southeast     Florida          Miami                73135


In [13]:
# Show the number products sold for region, state, and city.
# Rename the sum to "Total_Products_Sold".

groupby_rsc_df = as_years_df.groupby(['region','state','city']).sum('units_sold').reset_index().rename(columns={'units_sold':'Total_Products_Sold'})
print (f"{(groupby_rsc_df[['region','state','city','Total_Products_Sold']].head(5))},\n")
# Show the top 5 results.
sort_groupby_rsc_df = groupby_rsc_df.sort_values(by=['Total_Products_Sold'], ascending=False, ignore_index=(True))
print(sort_groupby_rsc_df[['region','state','city','Total_Products_Sold']].head(5))

    region     state          city  Total_Products_Sold
0  Midwest  Illinois       Chicago                25407
1  Midwest   Indiana  Indianapolis                26332
2  Midwest      Iowa    Des Moines                23446
3  Midwest    Kansas       Wichita                29463
4  Midwest  Michigan       Detroit                50095,

      region       state           city  Total_Products_Sold
0  Northeast    New York       New York               111954
1      South       Texas        Houston                90322
2       West  California  San Francisco                85478
3       West  California    Los Angeles                76384
4  Southeast     Florida          Miami                73135


#### Using `pivot_table`

In [14]:
# Show the number products sold for region, state, and city.
# Rename the "units_sold" column to "Total_Products_Sold"
pt_rsc_df = as_years_df.pivot_table(index=['region','state','city'],
                                    values='units_sold',
                                    aggfunc='sum').reset_index().rename(columns={'units_sold':'Total_Products_Sold'})                                    
print (f'{pt_rsc_df.head(5)}\n')

# Show the top 5 results.
print (pt_rsc_df.sort_values(by=['Total_Products_Sold'], ascending=False, ignore_index=(True)).head(5))

    region     state          city  Total_Products_Sold
0  Midwest  Illinois       Chicago                25407
1  Midwest   Indiana  Indianapolis                26332
2  Midwest      Iowa    Des Moines                23446
3  Midwest    Kansas       Wichita                29463
4  Midwest  Michigan       Detroit                50095

      region       state           city  Total_Products_Sold
0  Northeast    New York       New York               111954
1      South       Texas        Houston                90322
2       West  California  San Francisco                85478
3       West  California    Los Angeles                76384
4  Southeast     Florida          Miami                73135


### 3. Determine which Region had the Most Sales

#### Using `groupby`

In [15]:
# Show the total sales for the products sold for each region, state, and city.
# Rename the "total_sales" column to "Total Sales"
groupby_rsc_df = as_years_df.groupby(['region','state','city']).sum('total_sales').reset_index().rename(columns={'total_sales':'Total Sales'})
print (f"{groupby_rsc_df[['region','state','city','Total Sales']].head(5)}\n")
# Show the top 5 results.
sort_groupby_rsc_df = groupby_rsc_df.sort_values(by=['Total Sales'], ascending=False, ignore_index=(True))
print (sort_groupby_rsc_df[['region','state','city','Total Sales']].head(5))


    region     state          city  Total Sales
0  Midwest  Illinois       Chicago      9797488
1  Midwest   Indiana  Indianapolis      8836198
2  Midwest      Iowa    Des Moines      7424011
3  Midwest    Kansas       Wichita      9972864
4  Midwest  Michigan       Detroit     18625433

      region           state           city  Total Sales
0  Northeast        New York       New York     39801235
1       West      California  San Francisco     33973228
2  Southeast         Florida          Miami     31600863
3  Southeast  South Carolina     Charleston     29285637
4  Southeast         Florida        Orlando     27682851


#### Using `pivot_table`

In [16]:
# Show the total sales for the products sold for each region, state, and city.
# Optional: Rename the "total_sales" column to "Total Sales"
pt_rsc_df = as_years_df.pivot_table(index=['region','state','city'],
                                    values='total_sales',
                                    aggfunc='sum').reset_index().rename(columns={'total_sales':'Total Sales'})\
                                    .sort_values(by=['Total Sales'], ascending=False, ignore_index=(True))
# Show the top 5 results.
pt_rsc_df.head(5)

Unnamed: 0,region,state,city,Total Sales
0,Northeast,New York,New York,39801235
1,West,California,San Francisco,33973228
2,Southeast,Florida,Miami,31600863
3,Southeast,South Carolina,Charleston,29285637
4,Southeast,Florida,Orlando,27682851


### 4. Determine which Retailer had the Most Sales

#### Using `groupby`

In [17]:
# Show the total sales for the products sold for each retailer, region, state, and city.
# Rename the "total_sales" column to "Total Sales"

groupby_rsc_df = as_years_df.groupby(['retailer','region','state','city']).sum('total_sales').reset_index().rename(columns={'total_sales':'Total Sales'})
print (f"{groupby_rsc_df[['retailer','region','state','city','Total Sales']].head(5)}\n")

# Show the top 5 results.
sort_groupby_rsc_df = groupby_rsc_df.sort_values(by=['Total Sales'], ascending=False, ignore_index=(True))
print (sort_groupby_rsc_df[['retailer','region','state','city','Total Sales']].head(5))


  retailer     region          state        city  Total Sales
0   Amazon    Midwest           Ohio    Columbus     16835873
1   Amazon  Northeast          Maine    Portland      8611395
2   Amazon  Northeast  Massachusetts      Boston      4193590
3   Amazon  Northeast  New Hampshire  Manchester     10077142
4   Amazon  Northeast        Vermont  Burlington     13380463

      retailer     region           state           city  Total Sales
0    West Gear       West      California  San Francisco     32794405
1       Kohl's       West      California    Los Angeles     25127160
2  Foot Locker  Northeast        New York       New York     25008568
3    West Gear       West      Washington        Seattle     24862675
4  Foot Locker  Southeast  South Carolina     Charleston     24822280


#### Using `pivot_table`

In [18]:
# Show the total sales for the products sold for each retailer, region, state, and city.


# Optional: Rename the "total_sales" column to "Total Sales"


# Show the top 5 results.


### 5. Determine which Retailer Sold the Most Women's Athletic Footwear

In [19]:
# Filter the sales data to get the women's athletic footwear sales data.
# get column value for womens footwear
products=as_years_df["product"].unique()
# print each unique product with the product Index
for index, value in enumerate(products):
    print(f"{index} : {value}")
idx=int(input('enter the number for the product : '))
# filter product by womens footwear
# filtered_df = df[df['column_name'].isin(['value1', 'value2', 'value3'])]
as_wfw_df = as_years_df[as_years_df["product"].isin([products[idx]])]
print (f"\n{as_wfw_df[['retailer','region','state','city','product']].tail(3).to_string(index=False)}")




0 : Men's Street Footwear
1 : Women's Apparel
2 : Men's Athletic Footwear
3 : Women's Street Footwear
4 : Women's Athletic Footwear
5 : Men's Apparel

   retailer    region        state         city                   product
     Amazon Northeast        Maine     Portland Women's Athletic Footwear
     Amazon Northeast        Maine     Portland Women's Athletic Footwear
Foot Locker Northeast Pennsylvania Philadelphia Women's Athletic Footwear


### Using `groupby`

In [20]:
# Show the total number of women's athletic footwear sold for each retailer, region, state, and city.
# Rename the "units_sold" column to "Womens_Footwear_Units_Sold"

groupby_rsc_df = as_wfw_df.groupby(['retailer','region','state','city']).sum('units_sold').reset_index().rename(columns={'units_sold':'Womens_Footwear_Units_Sold'})
print (f"{products[idx]}\n{groupby_rsc_df[['retailer','region','state','city','Womens_Footwear_Units_Sold']].head(5).to_string(index=False)}\n")

# Show the top 5 results.

sort_groupby_rsc_df = groupby_rsc_df.sort_values(by=['Womens_Footwear_Units_Sold'], ascending=False, ignore_index=(True))
print (f"{products[idx]}\n{sort_groupby_rsc_df[['retailer','region','state','city','Womens_Footwear_Units_Sold']].head(5).to_string(index=False)}")

Women's Athletic Footwear
retailer    region         state       city  Womens_Footwear_Units_Sold
  Amazon   Midwest          Ohio   Columbus                        5801
  Amazon Northeast         Maine   Portland                        1841
  Amazon Northeast Massachusetts     Boston                        1190
  Amazon Northeast New Hampshire Manchester                        3109
  Amazon Northeast       Vermont Burlington                        4327

Women's Athletic Footwear
     retailer    region          state          city  Womens_Footwear_Units_Sold
    West Gear      West     California San Francisco                       12107
  Foot Locker Northeast       New York      New York                       10996
       Kohl's      West     California   Los Angeles                       10826
  Foot Locker Southeast South Carolina    Charleston                        8814
Sports Direct     South          Texas        Dallas                        8790


#### Using `pivot_table`

In [21]:
# Show the total number of women's athletic footwear sold for each retailer, region, state, and city.
# Rename the "units_sold" column to "Womens_Footwear_Units_Sold"
pt_rsc_df = as_wfw_df.pivot_table(index=['retailer','region','state','city'],
                                    values='units_sold',
                                    aggfunc='sum').reset_index().rename(columns={'units_sold':'Womens_Footwear_Units_Sold'})
print(f"\n{pt_rsc_df.head(5).to_string(index=False)}\n")
# Show the top 5 results.
print(f"Sorted by units sold \n{pt_rsc_df.sort_values(by=['Womens_Footwear_Units_Sold'], ascending=False, ignore_index=(True)).head(5).to_string(index=False)}")


retailer    region         state       city  Womens_Footwear_Units_Sold
  Amazon   Midwest          Ohio   Columbus                        5801
  Amazon Northeast         Maine   Portland                        1841
  Amazon Northeast Massachusetts     Boston                        1190
  Amazon Northeast New Hampshire Manchester                        3109
  Amazon Northeast       Vermont Burlington                        4327

Sorted by units sold 
     retailer    region          state          city  Womens_Footwear_Units_Sold
    West Gear      West     California San Francisco                       12107
  Foot Locker Northeast       New York      New York                       10996
       Kohl's      West     California   Los Angeles                       10826
  Foot Locker Southeast South Carolina    Charleston                        8814
Sports Direct     South          Texas        Dallas                        8790


### 5. Determine the Day with the Most Women's Athletic Footwear Sales

In [26]:
# Create a pivot table with the 'invoice_date' column is the index, and the "total_sales" as the values.
pt_daily_sales_df = as_years_df.pivot_table(index='invoice_date',
                                    values='total_sales').rename(columns={'total_sales':'Total Sales'})

# Optional: Rename the "total_sales" column to "Total Sales"


# Show the table.
pt_daily_sales_df.head(5)

Unnamed: 0_level_0,Total Sales
invoice_date,Unnamed: 1_level_1
2020-01-01,140856.833333
2020-01-02,114901.666667
2020-01-03,105428.833333
2020-01-04,102513.333333
2020-01-05,117971.5


In [33]:
# Resample the pivot table into daily bins, and get the total sales for each day.
bin_daily_sum = pt_daily_sales_df.resample('D').sum()

# Sort the resampled pivot table in ascending order on "Total Sales".
bin_daily_sum.sort_values(by=['Total Sales'] , ascending=True, ignore_index=(False)).head(10)

Unnamed: 0_level_0,Total Sales
invoice_date,Unnamed: 1_level_1
2020-12-31,0.0
2020-10-12,0.0
2020-10-13,0.0
2020-10-14,0.0
2020-10-15,0.0
2020-10-16,0.0
2020-10-11,0.0
2020-02-29,0.0
2020-12-13,13461.0
2020-12-07,15997.0


### 6.  Determine the Week with the Most Women's Athletic Footwear Sales

In [None]:
# Resample the pivot table into weekly bins, and get the total sales for each week.


# Sort the resampled pivot table in ascending order on "Total Sales".
