### Imports 

In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import matplotlib.ticker as ticker
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Cap rendered frames at 21 rows and 21 columns so wide/long outputs stay readable.
pd.set_option("display.max_rows", 21)
pd.set_option("display.max_columns", 21)

### I. Prepare data

In [14]:
# Load the Superstore workbook (path is relative to the notebook's working directory).
# NOTE(review): this is a legacy .xls file; presumably the xlrd engine must be installed — confirm.
superstore_path = 'Sample - Superstore.xls'
data_frame = pd.read_excel(superstore_path)

In [12]:
# Dimensions of the loaded frame as (rows, columns).
data_frame.shape

(9994, 21)

In [16]:
# Peek at the first five rows to see what one record looks like.
data_frame.head(n=5)

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,California,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


In [18]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data_frame.describe()

Unnamed: 0,Row ID,Postal Code,Sales,Quantity,Discount,Profit
count,9994.0,9994.0,9994.0,9994.0,9994.0,9994.0
mean,4997.5,55190.379428,229.858001,3.789574,0.156203,28.656896
std,2885.163629,32063.69335,623.245101,2.22511,0.206452,234.260108
min,1.0,1040.0,0.444,1.0,0.0,-6599.978
25%,2499.25,23223.0,17.28,2.0,0.0,1.72875
50%,4997.5,56430.5,54.49,3.0,0.2,8.6665
75%,7495.75,90008.0,209.94,5.0,0.2,29.364
max,9994.0,99301.0,22638.48,14.0,0.8,8399.976


In [24]:
# Column names, non-null counts and dtypes, to check each column was parsed with a sensible type.
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Row ID         9994 non-null   int64         
 1   Order ID       9994 non-null   object        
 2   Order Date     9994 non-null   datetime64[ns]
 3   Ship Date      9994 non-null   datetime64[ns]
 4   Ship Mode      9994 non-null   object        
 5   Customer ID    9994 non-null   object        
 6   Customer Name  9994 non-null   object        
 7   Segment        9994 non-null   object        
 8   Country        9994 non-null   object        
 9   City           9994 non-null   object        
 10  State          9994 non-null   object        
 11  Postal Code    9994 non-null   int64         
 12  Region         9994 non-null   object        
 13  Product ID     9994 non-null   object        
 14  Category       9994 non-null   object        
 15  Sub-Category   9994 n

*Các kiểu giá trị ở các cột đều phù hợp*

In [25]:
# Number of missing values in each column (summing the boolean mask down the rows).
data_frame.isna().sum(axis=0)

Row ID           0
Order ID         0
Order Date       0
Ship Date        0
Ship Mode        0
Customer ID      0
Customer Name    0
Segment          0
Country          0
City             0
State            0
Postal Code      0
Region           0
Product ID       0
Category         0
Sub-Category     0
Product Name     0
Sales            0
Quantity         0
Discount         0
Profit           0
dtype: int64

*Không có giá trị null nào trong dataframe*

In [26]:
# Count rows that are exact copies of an earlier row (the first occurrence is not counted).
data_frame.duplicated(keep='first').sum()

0

*Không có giá trị trùng lặp nào trong dataframe*

### II. How have profit and sales changed over the years?

In [31]:
# Extract the calendar year from the order timestamp and keep it as its own column,
# so later cells can group and filter by year.
order_year = data_frame['Order Date'].dt.year
data_frame['Year'] = order_year
data_frame['Year']

0       2016
1       2016
2       2016
3       2015
4       2015
        ... 
9989    2014
9990    2017
9991    2017
9992    2017
9993    2017
Name: Year, Length: 9994, dtype: int64

In [39]:
# How many rows fall in each year, most frequent year first.
data_frame.loc[:, 'Year'].value_counts()

2017    3312
2016    2587
2015    2102
2014    1993
Name: Year, dtype: int64

In [49]:
# Mean Sales and mean Profit per row, one record per year ("Year" stays a regular column).
# NOTE(review): these are per-row averages, not yearly totals — switch to .sum() if totals are intended.
profit_sales_data = data_frame.groupby("Year", as_index=False)[["Sales", "Profit"]].mean()
profit_sales_data

Unnamed: 0,Year,Sales,Profit
0,2014,242.974159,24.858994
1,2015,223.849909,29.314274
2,2016,235.487282,31.617771
3,2017,221.381418,28.21234


In [64]:
# Yearly chart of the per-row mean Sales and Profit held in profit_sales_data.
# Bars carry the values; the lines drawn over the same points make the
# year-to-year trend easier to follow.
fig = go.Figure()

# Trend lines. They get their own legend names so the legend does not list
# "Sales" and "Profit" twice (once for the line, once for the bar).
fig.add_trace(go.Scatter(x=profit_sales_data["Year"], y=profit_sales_data["Sales"],
                         name='Sales (trend)',
                         line=dict(color='firebrick', width=4)))
fig.add_trace(go.Scatter(x=profit_sales_data["Year"], y=profit_sales_data["Profit"],
                         name='Profit (trend)',
                         line=dict(color='firebrick', width=4, dash='dot')))

# Bars for the same values.
fig.add_trace(go.Bar(x=profit_sales_data["Year"], y=profit_sales_data["Sales"],
                     name='Sales', marker_color='indianred'))
fig.add_trace(go.Bar(x=profit_sales_data["Year"], y=profit_sales_data["Profit"],
                     name='Profit', marker_color='lightsalmon'))

fig.update_layout(title='Mean Sales and Profit per order line, by year',
                  xaxis_title='Year',
                  yaxis_title='Mean value per order line')
# Years are integers: pin one tick per year so the axis never shows fractional years.
fig.update_xaxes(dtick=1)
fig.show()

**TODO:** Write an interpretation of the chart above.

### III. How does the profit rate (Profit/Sales) change over the years?

In [61]:
# Profit rate = Profit / Sales for each year, stored as a fraction (multiply by 100 for percent).
profit_sales_data["Profit Rate"] = profit_sales_data["Profit"].div(profit_sales_data["Sales"])
profit_sales_data

Unnamed: 0,Year,Sales,Profit,Profit Rate
0,2014,242.974159,24.858994,0.102311
1,2015,223.849909,29.314274,0.130955
2,2016,235.487282,31.617771,0.134265
3,2017,221.381418,28.21234,0.127438


In [81]:
# Combined chart: mean Sales / Profit as bars on the primary y-axis, and the
# profit rate (as a percentage) as a line on a secondary y-axis, because the
# two quantities live on very different scales.
fig_3 = make_subplots(specs=[[{"secondary_y": True}]])
fig_3.add_bar(x=profit_sales_data["Year"], y=profit_sales_data["Sales"], name='Sales',
              marker_color='MediumPurple', secondary_y=False)
fig_3.add_bar(x=profit_sales_data["Year"], y=profit_sales_data["Profit"], name='Profit',
              marker_color="lightskyblue", secondary_y=False)
# "Profit Rate" is stored as a fraction; scale to percent for display.
fig_3.add_trace(go.Scatter(x=profit_sales_data["Year"], y=profit_sales_data["Profit Rate"]*100,
                           name="Profit Rate"), secondary_y=True)

# A figure should stand alone: give it a title and label every axis.
fig_3.update_layout(title_text="Profit rate (Profit / Sales) by year", xaxis_title="Year")
# Years are integers: pin one tick per year so the axis never shows fractional years.
fig_3.update_xaxes(dtick=1)
fig_3.update_yaxes(title_text="Mean value per order line", secondary_y=False)
fig_3.update_yaxes(title_text="Profit Rate (%)", secondary_y=True)
# Render explicitly, like the other chart cells, instead of relying on the
# return value of the last call being displayed.
fig_3.show()


### IV. How do profit and sales change over months in 2017?

In [82]:
# Extract the calendar month (1-12) from the order timestamp into its own column,
# used below for the month-by-month breakdown.
order_month = data_frame['Order Date'].dt.month
data_frame['Month'] = order_month
data_frame['Month']

0       11
1       11
2        6
3       10
4       10
        ..
9989     1
9990     2
9991     2
9992     2
9993     5
Name: Month, Length: 9994, dtype: int64

In [86]:
# Keep only the rows ordered in 2017, then average Sales and Profit per row for each month.
is_2017 = data_frame["Year"] == 2017
data_frame_2017 = data_frame.loc[is_2017]
profit_sales_2017_over_months = (
    data_frame_2017
    .groupby("Month", as_index=False)[["Sales", "Profit"]]
    .mean()
)
profit_sales_2017_over_months

Unnamed: 0,Month,Sales,Profit
0,1,283.686284,46.067349
1,2,189.730219,15.082916
2,3,247.362827,61.982737
3,4,179.909045,4.597488
4,5,182.89715,26.20902
5,6,216.251942,33.564636
6,7,200.285027,30.763811
7,8,289.545358,41.472274
8,9,191.430614,23.946744
9,10,260.996387,31.125086


In [87]:
# Monthly chart for 2017 of the per-row mean Sales and Profit held in
# profit_sales_2017_over_months. Bars carry the values; the lines drawn over
# the same points make the month-to-month trend easier to follow.
fig_4 = go.Figure()

# Trend lines. They get their own legend names so the legend does not list
# "Sales" and "Profit" twice (once for the line, once for the bar).
fig_4.add_trace(go.Scatter(x=profit_sales_2017_over_months["Month"],
                           y=profit_sales_2017_over_months["Sales"],
                           name='Sales (trend)',
                           line=dict(color='firebrick', width=4)))
fig_4.add_trace(go.Scatter(x=profit_sales_2017_over_months["Month"],
                           y=profit_sales_2017_over_months["Profit"],
                           name='Profit (trend)',
                           line=dict(color='firebrick', width=4, dash='dot')))

# Bars for the same values.
fig_4.add_trace(go.Bar(x=profit_sales_2017_over_months["Month"],
                       y=profit_sales_2017_over_months["Sales"],
                       name='Sales', marker_color='indianred'))
fig_4.add_trace(go.Bar(x=profit_sales_2017_over_months["Month"],
                       y=profit_sales_2017_over_months["Profit"],
                       name='Profit', marker_color='lightsalmon'))

# The title and x-axis label previously said "years" (copied from the yearly
# chart); this figure is broken down by month within 2017.
fig_4.update_layout(title='Mean Sales and Profit per order line, by month (2017)',
                    xaxis_title='Month',
                    yaxis_title='Mean value per order line')
# Months are integers: show one tick per month.
fig_4.update_xaxes(dtick=1)
fig_4.show()

### V. Which states have the highest return rates?

### VI. Which month of the year has the most orders?

### VII. Top 10 best-selling products?

### VIII. Which sub-categories are usually sold together?