In [56]:
# Import Libraries and Dependencies
import pandas as pd

### 1. Combine and Clean the Data
#### Import CSVs

In [57]:
# Read the CSV files into DataFrames.
as2020_df = pd.read_csv('Resources/athletic_sales_2020.csv')
as2021_df = pd.read_csv('Resources/athletic_sales_2021.csv')

In [58]:
# Display the 2020 sales DataFrame
columns_2020 = as2020_df.columns.values.tolist()
as2020_df.head(3)

Unnamed: 0,retailer,retailer_id,invoice_date,region,state,city,product,price_per_unit,units_sold,total_sales,operating_profit,sales_method
0,Foot Locker,1185732,1/1/20,Northeast,New York,New York,Men's Street Footwear,50,1200,600000,300000.0,In-store
1,Foot Locker,1185732,1/1/20,Northeast,Pennsylvania,Philadelphia,Women's Apparel,68,83,5644,2426.92,Online
2,Foot Locker,1185732,1/1/20,Northeast,Pennsylvania,Philadelphia,Women's Apparel,75,275,206250,61875.0,Outlet


In [59]:
# Display the 2021 sales DataFrame
columns_2021 = as2020_df.columns.values.tolist()
as2021_df.head(3)

Unnamed: 0,retailer,retailer_id,invoice_date,region,state,city,product,price_per_unit,units_sold,total_sales,operating_profit,sales_method
0,West Gear,1128299,1/1/21,West,California,San Francisco,Men's Athletic Footwear,65,750,487500,121875.0,Outlet
1,West Gear,1128299,1/1/21,West,California,San Francisco,Men's Athletic Footwear,51,233,11883,3208.41,Outlet
2,Kohl's,1189833,1/1/21,Midwest,Montana,Billings,Men's Apparel,50,275,137500,82500.0,Outlet


#### Check the data types of each DataFrame

In [60]:
# Check the 2020 sales data types.
dtypes_2020 = as2020_df.dtypes.values.tolist()
dtypes_2020

[dtype('O'),
 dtype('int64'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('float64'),
 dtype('O')]

In [61]:
# Check the 2021 sales data types.
dtypes_2021 = as2020_df.dtypes.values.tolist()
dtypes_2021

[dtype('O'),
 dtype('int64'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('O'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('float64'),
 dtype('O')]

In [62]:
# my old eyes and memory aren't really good at comparing columns and types so I wrote this to do it for me
if columns_2020 != columns_2021:
    raise ValueError("The two data frames are not compatible. Please verify the columns.")

elif dtypes_2020 != dtypes_2021:
    raise ValueError("The two data frames are not compatible. Please verify the data types.")

else:
    print('The as2020_df and as2021_df data frames have matching columns and data types.')

The as2020_df and as2021_df data frames have matching columns and data types.


#### Combine the sales data by rows.

In [63]:
# Combine the 2020 and 2021 sales DataFrames on the rows and reset the index.
as_years_df = pd.concat([as2020_df,as2021_df],axis=0, join='inner').reset_index(drop=True)
as_years_df.head(3)

Unnamed: 0,retailer,retailer_id,invoice_date,region,state,city,product,price_per_unit,units_sold,total_sales,operating_profit,sales_method
0,Foot Locker,1185732,1/1/20,Northeast,New York,New York,Men's Street Footwear,50,1200,600000,300000.0,In-store
1,Foot Locker,1185732,1/1/20,Northeast,Pennsylvania,Philadelphia,Women's Apparel,68,83,5644,2426.92,Online
2,Foot Locker,1185732,1/1/20,Northeast,Pennsylvania,Philadelphia,Women's Apparel,75,275,206250,61875.0,Outlet


In [64]:
# Check if any values are null.
print (f'any is null {as_years_df.isnull().any().any()}')
print (f'any is na {as_years_df.isna().any().any()}')
print (f'any is not na {as_years_df.notna().any().any()}')


any is null False
any is na False
any is not na True


In [65]:
# Check the data type of each column
# here i want to use the same check as i did earlier. 
dtype_as_years =as_years_df.dtypes.values.tolist()
if dtype_as_years != dtypes_2021:
    raise ValueError("something is wrong with the data types in the new table")

else:
    print('The data types of the new dataframe are the same as the as2020_df and as2021_af\n{as_years_df.dtypes}')

The data types of the new dataframe are the same as the as2020_df and as2021_af
{as_years_df.dtypes}


In [66]:
# Convert the "invoice_date" to a datetime datatype 
as_years_df['invoice_date'] = pd.to_datetime(as_years_df['invoice_date'],  format='mixed')
as_years_df.head(3)

Unnamed: 0,retailer,retailer_id,invoice_date,region,state,city,product,price_per_unit,units_sold,total_sales,operating_profit,sales_method
0,Foot Locker,1185732,2020-01-01,Northeast,New York,New York,Men's Street Footwear,50,1200,600000,300000.0,In-store
1,Foot Locker,1185732,2020-01-01,Northeast,Pennsylvania,Philadelphia,Women's Apparel,68,83,5644,2426.92,Online
2,Foot Locker,1185732,2020-01-01,Northeast,Pennsylvania,Philadelphia,Women's Apparel,75,275,206250,61875.0,Outlet


In [67]:
# Confirm that the "invoice_date" data type has been changed.
as_years_df['invoice_date'].dtypes

dtype('<M8[ns]')

### 2. Determine which Region Sold the Most Products

#### Using `groupby`

In [68]:
# Show the number products sold for region, state, and city.
# Rename the sum to "Total_Products_Sold".
groupby_rsc_df = as_years_df.groupby(['region','state','city']).sum('units_sold').reset_index().rename(columns={'units_sold':'Total_Products_Sold'})
print (f"{(groupby_rsc_df[['region','state','city','Total_Products_Sold']].head(5))},\n")

# Show the top 5 results.
sort_groupby_rsc_df = groupby_rsc_df.sort_values(by=['Total_Products_Sold'], ascending=False, ignore_index=(True))
print(sort_groupby_rsc_df[['region','state','city','Total_Products_Sold']].head(5))

    region     state          city  Total_Products_Sold
0  Midwest  Illinois       Chicago                25407
1  Midwest   Indiana  Indianapolis                26332
2  Midwest      Iowa    Des Moines                23446
3  Midwest    Kansas       Wichita                29463
4  Midwest  Michigan       Detroit                50095,

      region       state           city  Total_Products_Sold
0  Northeast    New York       New York               111954
1      South       Texas        Houston                90322
2       West  California  San Francisco                85478
3       West  California    Los Angeles                76384
4  Southeast     Florida          Miami                73135


In [69]:
# Show the number products sold for region, state, and city.
# Rename the sum to "Total_Products_Sold".

groupby_rsc_df = as_years_df.groupby(['region','state','city']).sum('units_sold').reset_index().rename(columns={'units_sold':'Total_Products_Sold'})
print (f"{(groupby_rsc_df[['region','state','city','Total_Products_Sold']].head(5))},\n")
# Show the top 5 results.
sort_groupby_rsc_df = groupby_rsc_df.sort_values(by=['Total_Products_Sold'], ascending=False, ignore_index=(True))
print(sort_groupby_rsc_df[['region','state','city','Total_Products_Sold']].head(5))

    region     state          city  Total_Products_Sold
0  Midwest  Illinois       Chicago                25407
1  Midwest   Indiana  Indianapolis                26332
2  Midwest      Iowa    Des Moines                23446
3  Midwest    Kansas       Wichita                29463
4  Midwest  Michigan       Detroit                50095,

      region       state           city  Total_Products_Sold
0  Northeast    New York       New York               111954
1      South       Texas        Houston                90322
2       West  California  San Francisco                85478
3       West  California    Los Angeles                76384
4  Southeast     Florida          Miami                73135


#### Using `pivot_table`

In [70]:
# Show the number products sold for region, state, and city.
# Rename the "units_sold" column to "Total_Products_Sold"
pt_rsc_df = as_years_df.pivot_table(index=['region','state','city'],
                                    values='units_sold',
                                    aggfunc='sum').reset_index().rename(columns={'units_sold':'Total_Products_Sold'})                                    
print (f'{pt_rsc_df.head(5)}\n')

# Show the top 5 results.
print (pt_rsc_df.sort_values(by=['Total_Products_Sold'], ascending=False, ignore_index=(True)).head(5))

    region     state          city  Total_Products_Sold
0  Midwest  Illinois       Chicago                25407
1  Midwest   Indiana  Indianapolis                26332
2  Midwest      Iowa    Des Moines                23446
3  Midwest    Kansas       Wichita                29463
4  Midwest  Michigan       Detroit                50095

      region       state           city  Total_Products_Sold
0  Northeast    New York       New York               111954
1      South       Texas        Houston                90322
2       West  California  San Francisco                85478
3       West  California    Los Angeles                76384
4  Southeast     Florida          Miami                73135


### 3. Determine which Region had the Most Sales

#### Using `groupby`

In [71]:
# Show the total sales for the products sold for each region, state, and city.
# Rename the "total_sales" column to "Total Sales"
groupby_rsc_df = as_years_df.groupby(['region','state','city']).sum('total_sales').reset_index().rename(columns={'total_sales':'Total Sales'})
print (f"{groupby_rsc_df[['region','state','city','Total Sales']].head(5)}\n")
# Show the top 5 results.
sort_groupby_rsc_df = groupby_rsc_df.sort_values(by=['Total Sales'], ascending=False, ignore_index=(True))
print (sort_groupby_rsc_df[['region','state','city','Total Sales']].head(5))


    region     state          city  Total Sales
0  Midwest  Illinois       Chicago      9797488
1  Midwest   Indiana  Indianapolis      8836198
2  Midwest      Iowa    Des Moines      7424011
3  Midwest    Kansas       Wichita      9972864
4  Midwest  Michigan       Detroit     18625433

      region           state           city  Total Sales
0  Northeast        New York       New York     39801235
1       West      California  San Francisco     33973228
2  Southeast         Florida          Miami     31600863
3  Southeast  South Carolina     Charleston     29285637
4  Southeast         Florida        Orlando     27682851


#### Using `pivot_table`

In [72]:
# Show the total sales for the products sold for each region, state, and city.
# Optional: Rename the "total_sales" column to "Total Sales"
pt_rsc_df = as_years_df.pivot_table(index=['region','state','city'],
                                    values='total_sales',
                                    aggfunc='sum').reset_index().rename(columns={'total_sales':'Total Sales'})\
                                    .sort_values(by=['Total Sales'], ascending=False, ignore_index=(True))
# Show the top 5 results.
pt_rsc_df.head(5)

Unnamed: 0,region,state,city,Total Sales
0,Northeast,New York,New York,39801235
1,West,California,San Francisco,33973228
2,Southeast,Florida,Miami,31600863
3,Southeast,South Carolina,Charleston,29285637
4,Southeast,Florida,Orlando,27682851


### 4. Determine which Retailer had the Most Sales

#### Using `groupby`

In [73]:
# Show the total sales for the products sold for each retailer, region, state, and city.
# Rename the "total_sales" column to "Total Sales"

groupby_rsc_df = as_years_df.groupby(['retailer','region','state','city']).sum('total_sales').reset_index().rename(columns={'total_sales':'Total Sales'})
print (f"{groupby_rsc_df[['retailer','region','state','city','Total Sales']].head(5)}\n")

# Show the top 5 results.
sort_groupby_rsc_df = groupby_rsc_df.sort_values(by=['Total Sales'], ascending=False, ignore_index=(True))
print (sort_groupby_rsc_df[['retailer','region','state','city','Total Sales']].head(5))


  retailer     region          state        city  Total Sales
0   Amazon    Midwest           Ohio    Columbus     16835873
1   Amazon  Northeast          Maine    Portland      8611395
2   Amazon  Northeast  Massachusetts      Boston      4193590
3   Amazon  Northeast  New Hampshire  Manchester     10077142
4   Amazon  Northeast        Vermont  Burlington     13380463

      retailer     region           state           city  Total Sales
0    West Gear       West      California  San Francisco     32794405
1       Kohl's       West      California    Los Angeles     25127160
2  Foot Locker  Northeast        New York       New York     25008568
3    West Gear       West      Washington        Seattle     24862675
4  Foot Locker  Southeast  South Carolina     Charleston     24822280


#### Using `pivot_table`

In [74]:
# Show the total sales for the products sold for each retailer, region, state, and city.


# Optional: Rename the "total_sales" column to "Total Sales"


# Show the top 5 results.


### 5. Determine which Retailer Sold the Most Women's Athletic Footwear

In [75]:
# Filter the sales data to get the women's athletic footwear sales data.
# get column value for womens footwear
products=as_years_df["product"].unique()
# print each unique product with the product Index
for index, value in enumerate(products):
    print(f"{index} : {value}")
idx=int(input('enter the number for the product : '))
# filter product by womens footwear
# filtered_df = df[df['column_name'].isin(['value1', 'value2', 'value3'])]
as_wfw_df = as_years_df[as_years_df["product"].isin([products[idx]])]
print (f"\n{as_wfw_df[['retailer','region','state','city','product']].tail(3).to_string(index=False)}")




0 : Men's Street Footwear
1 : Women's Apparel
2 : Men's Athletic Footwear
3 : Women's Street Footwear
4 : Women's Athletic Footwear
5 : Men's Apparel

   retailer    region        state         city         product
     Amazon Northeast        Maine     Portland Women's Apparel
Foot Locker Northeast Pennsylvania Philadelphia Women's Apparel
Foot Locker Northeast Pennsylvania Philadelphia Women's Apparel


### Using `groupby`

In [92]:
# Show the total number of women's athletic footwear sold for each retailer, region, state, and city.
# Rename the "units_sold" column to "{Product[idx]}_Units_Sold"
product_unit_sold = str(products[idx])+' Units Sold'
groupby_rsc_df = as_wfw_df.groupby(['retailer','region','state','city']).sum('units_sold').reset_index().rename(columns={'units_sold':product_unit_sold})
print (f"{products[idx]}\n{groupby_rsc_df[['retailer','region','state','city',product_unit_sold]].head(5).to_string(index=False)}\n")

# Show the top 5 results.

sort_groupby_rsc_df = groupby_rsc_df.sort_values(by=[product_unit_sold], ascending=False, ignore_index=(True))
print (f"{products[idx]}\n{sort_groupby_rsc_df[['retailer','region','state','city',product_unit_sold]].head(5).to_string(index=False)}")

Women's Apparel
retailer    region         state       city  Women's Apparel Units Sold
  Amazon   Midwest          Ohio   Columbus                        7755
  Amazon Northeast         Maine   Portland                        4083
  Amazon Northeast Massachusetts     Boston                        2096
  Amazon Northeast New Hampshire Manchester                        5078
  Amazon Northeast       Vermont Burlington                        6511

Women's Apparel
     retailer region      state          city  Women's Apparel Units Sold
    West Gear   West California San Francisco                       14480
Sports Direct  South      Texas        Dallas                       13056
       Kohl's   West California   Los Angeles                       12968
      Walmart  South      Texas       Houston                       12797
Sports Direct  South  Tennessee     Knoxville                       12345


#### Using `pivot_table`

In [77]:
# Show the total number of women's athletic footwear sold for each retailer, region, state, and city.
# Rename the "units_sold" column to "Womens_Footwear_Units_Sold"
pt_rsc_df = as_wfw_df.pivot_table(index=['retailer','region','state','city'],
                                    values='units_sold',
                                    aggfunc='sum').reset_index().rename(columns={'units_sold':'Womens_Footwear_Units_Sold'})
print(f"\n{pt_rsc_df.head(5).to_string(index=False)}\n")
# Show the top 5 results.
print(f"Sorted by units sold \n{pt_rsc_df.sort_values(by=['Womens_Footwear_Units_Sold'], ascending=False, ignore_index=(True)).head(5).to_string(index=False)}")


retailer    region         state       city  Womens_Footwear_Units_Sold
  Amazon   Midwest          Ohio   Columbus                        7755
  Amazon Northeast         Maine   Portland                        4083
  Amazon Northeast Massachusetts     Boston                        2096
  Amazon Northeast New Hampshire Manchester                        5078
  Amazon Northeast       Vermont Burlington                        6511

Sorted by units sold 
     retailer region      state          city  Womens_Footwear_Units_Sold
    West Gear   West California San Francisco                       14480
Sports Direct  South      Texas        Dallas                       13056
       Kohl's   West California   Los Angeles                       12968
      Walmart  South      Texas       Houston                       12797
Sports Direct  South  Tennessee     Knoxville                       12345


### 5. Determine the Day with the Most Women's Athletic Footwear Sales

In [78]:
# Create a pivot table with the 'invoice_date' column is the index, and the "total_sales" as the values.
pt_wfw_sales_df = as_wfw_df.pivot_table(index='invoice_date',
                                    values='total_sales',
                                    aggfunc='sum')\
                                    .rename(columns={'total_sales':'Total Sales'})

# Show the table.
pt_wfw_sales_df.head(5)

Unnamed: 0_level_0,Total Sales
invoice_date,Unnamed: 1_level_1
2020-01-01,216293
2020-01-06,524500
2020-01-07,224974
2020-01-13,189096
2020-01-19,216162


In [79]:
# Resample the pivot table into daily bins, and get the total sales for each day.
bin_daily_sum = pt_wfw_sales_df.resample('D').sum()
# Sort the resampled pivot table in ascending order on "Total Sales".
daily_high_wafs = bin_daily_sum.sort_values(by=['Total Sales'] , ascending=False).reset_index()

print   (f"The day with the Most {products[idx]} Sales is \n{daily_high_wafs.iloc[0,0].strftime('%Y-%m-%d')} with sales of {daily_high_wafs.iloc[0,1]}")


The day with the Most Women's Apparel Sales is 
2021-06-17 with sales of 2577406


### 6.  Determine the Week with the Most Women's Athletic Footwear Sales

In [80]:
# Resample the pivot table into weekly bins, and get the total sales for each week.
bin_weekly_sum = pt_wfw_sales_df.resample('1W').sum()

# Sort the resampled pivot table in ascending order on "Total Sales".
weekly_high_wafs = bin_weekly_sum.sort_values(by=['Total Sales'] , ascending=False).reset_index()
                
print (f"The week with the Most {products[idx]} Sales is \nweek {weekly_high_wafs.iloc[0, 0].week} of {weekly_high_wafs.iloc[0, 0].year} with sales of {weekly_high_wafs.iloc[0, 1]}")

The week with the Most Women's Apparel Sales is 
week 27 of 2021 with sales of 5774816
