# Pandas (part 1)

In [1]:
import pandas as pd
import numpy as np

## 1. Data frames

- Data frame thể hiện cấu trúc dạng bảng như excel sheet
- Có rows và columns
- Mỗi column là một Series

### 1.1. Khởi tạo

- Từ dictionary

In [2]:
# Column-oriented input: every key becomes a column and its list holds that column's values.
d = dict(
    name=["John", "Bob", "Jane"],
    age=[18, 20, 30],
    edu=["BS", "MS", "BS"],
)

df = pd.DataFrame(data=d)
df

Unnamed: 0,name,age,edu
0,John,18,BS
1,Bob,20,MS
2,Jane,30,BS


- Từ 2d-list

In [3]:
# Row-oriented input: one tuple per row; the column labels are passed separately.
l = [("John", 18, "BS"), ("Bob", 20, "MS"), ("Jane", 30, "BS")]

df = pd.DataFrame(data=l, columns=["name", "age", "edu"])
df

Unnamed: 0,name,age,edu
0,John,18,BS
1,Bob,20,MS
2,Jane,30,BS


### 1.2. Basic operations

#### A) Input
- Dùng `pd.read_xxx`. VD: `read_csv`, `read_excel`, ...

In [8]:
# NOTE(review): absolute, machine-specific path - adjust it to where the data lives locally.
data_path = 'D:\\DA\\PhantichdulieuPython\\data\\'
data_name = 'OnlineRetail.csv'

# The file is not valid UTF-8, so a plain 'pd.read_csv(data_path + data_name)' fails with:
#   'utf-8' codec can't decode byte 0xa3 in position 79780: invalid start byte
# Decode it as Latin-1, which maps every byte to exactly one character.
# ('unicode_escape' also gets past the error, but it additionally interprets backslash
# sequences found in the data as escapes, which can alter text fields or raise.)
df = pd.read_csv(data_path + data_name, encoding='latin1')

#### B) Inspect

- View first few rows

In [9]:
df.head(2)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


- View last few rows

In [10]:
df.tail(2)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,12/9/2011 12:50,4.15,12680.0,France
541908,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,12/9/2011 12:50,4.95,12680.0,France


- Num rows and num cols

In [12]:
# Shape
df.shape

(541909, 8)

In [13]:
# Num rows
df.shape[0]

541909

In [14]:
# Num cols
df.shape[1]

8

- Column names

In [15]:
df.columns.tolist()

['InvoiceNo',
 'StockCode',
 'Description',
 'Quantity',
 'InvoiceDate',
 'UnitPrice',
 'CustomerID',
 'Country']

- Dtype của từng cột

In [None]:
df.dtypes

- Transpose đổi hàng thành cột (for better view)

In [None]:
df.head().T

#### C) Thao tác với cột

- Truy xuất 1 cột -> trả về 1 series

In [16]:
df["InvoiceNo"]

0         536365
1         536365
2         536365
3         536365
4         536365
           ...  
541904    581587
541905    581587
541906    581587
541907    581587
541908    581587
Name: InvoiceNo, Length: 541909, dtype: object

- Truy xuất nhiều cột -> trả về data frame

In [17]:
df[["InvoiceNo", "UnitPrice", "Quantity"]].head()

Unnamed: 0,InvoiceNo,UnitPrice,Quantity
0,536365,2.55,6
1,536365,3.39,6
2,536365,2.75,8
3,536365,3.39,6
4,536365,3.39,6


- Chọn nhiều cột, dùng `.loc` (recommended)

In [18]:
df.loc[:, ["InvoiceNo", "UnitPrice", "Quantity"]].head()

Unnamed: 0,InvoiceNo,UnitPrice,Quantity
0,536365,2.55,6
1,536365,3.39,6
2,536365,2.75,8
3,536365,3.39,6
4,536365,3.39,6


- Chọn nhiều cột liên tiếp

In [19]:
df.loc[:, "Quantity":"Country"].head()

Unnamed: 0,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


- Update cột có sẵn

In [20]:
# Assume Quantity is stored in units of 1000 (a stored 6 means 6000),
# so scale the column back to its real value.
# Caution: the column is overwritten, so re-running this cell scales it again.
df["Quantity"] = df["Quantity"].mul(1000)
df.head(n=2)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6000,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6000,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [21]:
# Convert the InvoiceDate column from text to datetime.
# The raw values look like "12/1/2010 8:26" (month/day/year hour:minute); stating the
# format explicitly removes any month-first/day-first guessing during parsing.
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"], format="%m/%d/%Y %H:%M")
df.dtypes

InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID            float64
Country                object
dtype: object

- Thêm cột mới

In [22]:
# New column: Revenue is the row-wise product Quantity * UnitPrice.
df["Revenue"] = df["Quantity"].mul(df["UnitPrice"])
df.head(n=2)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Revenue
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6000,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15300.0
1,536365,71053,WHITE METAL LANTERN,6000,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20340.0


In [23]:
# Add a new column from a literal (the same value is assigned to every row).
# Suppose this file comes from store 1 and may later be combined with files from other stores;
# create a column that records which store these orders came from.
df["Store"] = "store 1"
df.head(2)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Revenue,Store
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6000,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15300.0,store 1
1,536365,71053,WHITE METAL LANTERN,6000,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20340.0,store 1


In [24]:
# Add a new column whose value depends on conditions over an existing column:

# Define the classification function
def get_good_type(price, cheap_below=1, luxury_from=10):
    """Classify a unit price into a coarse price tier.

    Args:
        price: Unit price of a single item (a number).
        cheap_below: Prices strictly below this value are "Cheap". Defaults to 1.
        luxury_from: Prices at or above this value are "Luxury". Defaults to 10.

    Returns:
        "Cheap" if price < cheap_below, "Regular" if price < luxury_from,
        otherwise "Luxury".

    Note:
        NaN compares False in both checks, so a missing price falls through to
        "Luxury"; filter or fill missing prices first if that matters.
    """
    if price < cheap_below:
        return "Cheap"
    if price < luxury_from:
        return "Regular"
    return "Luxury"

In [25]:
# Test: 0.5, 1, 10, 50
get_good_type(0.5)

'Cheap'

In [26]:
# Apply the function to each UnitPrice value and store the resulting labels in a new GoodType column
df["GoodType"] = df["UnitPrice"].apply(get_good_type)
df.head(5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Revenue,Store,GoodType
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6000,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15300.0,store 1,Regular
1,536365,71053,WHITE METAL LANTERN,6000,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20340.0,store 1,Regular
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8000,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,22000.0,store 1,Regular
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6000,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20340.0,store 1,Regular
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6000,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20340.0,store 1,Regular


In [27]:
# View GoodType distribution
df["GoodType"].value_counts()

GoodType
Regular    402201
Cheap      114670
Luxury      25038
Name: count, dtype: int64

- Các thao tác với cột kiểu `datetime`

In [28]:
# Add a Year column taken from InvoiceDate via the .dt accessor
df["Year"] = df["InvoiceDate"].dt.year
df.head(2)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Revenue,Store,GoodType,Year
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6000,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15300.0,store 1,Regular,2010
1,536365,71053,WHITE METAL LANTERN,6000,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20340.0,store 1,Regular,2010


In [29]:
# Add a Month column taken from InvoiceDate via the .dt accessor
df["Month"] = df["InvoiceDate"].dt.month
df.head(2)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Revenue,Store,GoodType,Year,Month
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6000,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15300.0,store 1,Regular,2010,12
1,536365,71053,WHITE METAL LANTERN,6000,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20340.0,store 1,Regular,2010,12


In [30]:
# Add a Date column; note that .dt.day yields the day of the month, not a full calendar date
df["Date"] = df["InvoiceDate"].dt.day
df.head(2)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Revenue,Store,GoodType,Year,Month,Date
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6000,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15300.0,store 1,Regular,2010,12,1
1,536365,71053,WHITE METAL LANTERN,6000,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20340.0,store 1,Regular,2010,12,1


- Xóa cột

In [31]:
# Delete the Store column created earlier, using del
del df["Store"]
df.head(2)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Revenue,GoodType,Year,Month,Date
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6000,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15300.0,Regular,2010,12,1
1,536365,71053,WHITE METAL LANTERN,6000,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20340.0,Regular,2010,12,1


In [32]:
# Remove the Revenue and Date columns with drop.
# drop returns a new DataFrame, so assign the result back to keep the change
# (preferred over inplace=True). Passing a list makes it convenient to drop several columns.
df = df.drop(columns=["Revenue", "Date"])
df.head(2)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,GoodType,Year,Month
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6000,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,Regular,2010,12
1,536365,71053,WHITE METAL LANTERN,6000,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,Regular,2010,12


In [33]:
# Both columns are needed again in the following lessons, so rebuild them here.

df["Revenue"] = df["Quantity"].mul(df["UnitPrice"])  # row-wise Quantity * UnitPrice
df["Date"] = df["InvoiceDate"].dt.day  # day of the month

- Unique values

In [34]:
# Count the number of unique countries
df["Country"].nunique()

38

In [35]:
# Get the list of unique countries
df["Country"].unique().tolist()

['United Kingdom',
 'France',
 'Australia',
 'Netherlands',
 'Germany',
 'Norway',
 'EIRE',
 'Switzerland',
 'Spain',
 'Poland',
 'Portugal',
 'Italy',
 'Belgium',
 'Lithuania',
 'Japan',
 'Iceland',
 'Channel Islands',
 'Denmark',
 'Cyprus',
 'Sweden',
 'Austria',
 'Israel',
 'Finland',
 'Bahrain',
 'Greece',
 'Hong Kong',
 'Singapore',
 'Lebanon',
 'United Arab Emirates',
 'Saudi Arabia',
 'Czech Republic',
 'Canada',
 'Unspecified',
 'Brazil',
 'USA',
 'European Community',
 'Malta',
 'RSA']

In [36]:
# View the distribution of countries as absolute row counts
df["Country"].value_counts()

Country
United Kingdom          495478
Germany                   9495
France                    8557
EIRE                      8196
Spain                     2533
Netherlands               2371
Belgium                   2069
Switzerland               2002
Portugal                  1519
Australia                 1259
Norway                    1086
Italy                      803
Channel Islands            758
Finland                    695
Cyprus                     622
Sweden                     462
Unspecified                446
Austria                    401
Denmark                    389
Japan                      358
Poland                     341
Israel                     297
USA                        291
Hong Kong                  288
Singapore                  229
Iceland                    182
Canada                     151
Greece                     146
Malta                      127
United Arab Emirates        68
European Community          61
RSA                         58


In [37]:
# View the distribution of countries as percentages
# (normalize=True returns fractions; multiplying by 100 converts them to percent)
df["Country"].value_counts(normalize=True) * 100

Country
United Kingdom          91.431956
Germany                  1.752139
France                   1.579047
EIRE                     1.512431
Spain                    0.467422
Netherlands              0.437527
Belgium                  0.381798
Switzerland              0.369435
Portugal                 0.280305
Australia                0.232327
Norway                   0.200403
Italy                    0.148180
Channel Islands          0.139876
Finland                  0.128250
Cyprus                   0.114779
Sweden                   0.085254
Unspecified              0.082302
Austria                  0.073998
Denmark                  0.071783
Japan                    0.066063
Poland                   0.062926
Israel                   0.054806
USA                      0.053699
Hong Kong                0.053145
Singapore                0.042258
Iceland                  0.033585
Canada                   0.027864
Greece                   0.026942
Malta                    0.023436
United

- Các thao tác với cột dạng string

In [38]:
# Upper-case the Country column (the result is only displayed; df is not modified)
df["Country"].str.upper()

0         UNITED KINGDOM
1         UNITED KINGDOM
2         UNITED KINGDOM
3         UNITED KINGDOM
4         UNITED KINGDOM
               ...      
541904            FRANCE
541905            FRANCE
541906            FRANCE
541907            FRANCE
541908            FRANCE
Name: Country, Length: 541909, dtype: object

In [39]:
# Lower-case the Country column (the result is only displayed; df is not modified)
df["Country"].str.lower()

0         united kingdom
1         united kingdom
2         united kingdom
3         united kingdom
4         united kingdom
               ...      
541904            france
541905            france
541906            france
541907            france
541908            france
Name: Country, Length: 541909, dtype: object

In [40]:
# Chaining .str methods: strip surrounding whitespace first, then upper-case
df["Country"].str.strip().str.upper()

0         UNITED KINGDOM
1         UNITED KINGDOM
2         UNITED KINGDOM
3         UNITED KINGDOM
4         UNITED KINGDOM
               ...      
541904            FRANCE
541905            FRANCE
541906            FRANCE
541907            FRANCE
541908            FRANCE
Name: Country, Length: 541909, dtype: object

In [41]:
# Select the items whose description contains 'cake', ignoring upper/lower case.
# case=False does the case-insensitive match directly; na=False makes rows with a
# missing Description evaluate to False, so the mask is purely boolean.
cond = df["Description"].str.contains("cake", case=False, na=False)
# Select rows and the column in a single .loc call rather than chaining [] and .loc.
df.loc[cond, "Description"]

96            PACK OF 72 RETROSPOT CAKE CASES
97             PACK OF 60 DINOSAUR CAKE CASES
98         PACK OF 60 PINK PAISLEY CAKE CASES
99                60 TEATIME FAIRY CAKE CASES
129            CERAMIC CHERRY CAKE MONEY BANK
                         ...                 
541754             SET OF 3 REGENCY CAKE TINS
541826      SET OF 12 FAIRY CAKE BAKING CASES
541851          SET OF 3 CAKE TINS SKETCHBOOK
541890    LARGE CAKE STAND  HANGING STRAWBERY
541892          RED RETROSPOT ROUND CAKE TINS
Name: Description, Length: 26063, dtype: object

#### D) Thao tác với hàng (rows)

- Trước hết, inspect lại `df`

In [42]:
df.head(2)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,GoodType,Year,Month,Revenue,Date
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6000,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,Regular,2010,12,15300.0,1
1,536365,71053,WHITE METAL LANTERN,6000,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,Regular,2010,12,20340.0,1


In [43]:
df["Date"].unique().tolist()

[1,
 2,
 3,
 5,
 6,
 7,
 8,
 9,
 10,
 12,
 13,
 14,
 15,
 16,
 17,
 19,
 20,
 21,
 22,
 23,
 4,
 11,
 18,
 24,
 25,
 26,
 27,
 28,
 30,
 31,
 29]

- Lọc ra tất cả các hóa đơn trong ngày 10 đến 20 của tháng 7 năm 2011
- Có bao nhiêu dòng như vậy?

In [44]:
# Approach 1: the whole condition written inline inside a single .loc call
df2 = df.loc[(df["Year"] == 2011) & (df["Month"] == 7) & (df["Date"] >= 10) & (df["Date"] <= 20), :]
df2.shape

(15654, 13)

In [45]:
# Approach 2 (recommended): build the boolean mask first, then use it to select rows.
# Each comparison is parenthesised because & binds tighter than == / >= / <=.
cond = (
    (df["Year"] == 2011)
    & (df["Month"] == 7)
    & (df["Date"] >= 10)
    & (df["Date"] <= 20)
)

df2 = df.loc[cond, :]
df2.shape

(15654, 13)

- Lọc ra những sản phẩm description có chứa `'STRAWBERY'`

In [47]:
# 'STRAWBERY' is spelled the way it appears in the data.
# case=False matches regardless of case; na=False makes rows with a missing
# Description evaluate to False, so the mask is purely boolean.
cond = df["Description"].str.contains("STRAWBERY", case=False, na=False)
df2 = df.loc[cond, :]
df2.shape

(396, 13)

- Có bao nhiêu hóa đơn không có thông tin khách hàng

In [48]:
# Rows whose CustomerID is missing; shape[0] of the result is the number of such rows.
df2 = df.loc[df["CustomerID"].isna(), :]
df2.shape

(135080, 13)

- Có bao nhiêu hóa đơn có thông tin khách hàng

In [49]:
# Rows whose CustomerID is present; shape[0] of the result is the number of such rows.
df2 = df.loc[df["CustomerID"].notna(), :]
df2.shape

(406829, 13)

#### E) Thao tác hàng và cột cùng lúc

- Lọc ra những đơn hàng có quantity `>= 1,000,000` và chỉ giữ lại cột `InvoiceDate`, `Quantity`, `UnitPrice`

In [51]:
# Approach 1: break the selection down into a row mask and a column list
# (Quantity was multiplied by 1000 earlier in this notebook, hence the 1e6 threshold)
cond = df["Quantity"] >= 1e6
cols = ["InvoiceDate", "Quantity", "UnitPrice"]

df.loc[cond, cols]

Unnamed: 0,InvoiceDate,Quantity,UnitPrice
4850,2010-12-02 16:48:00,1824000,0.55
4945,2010-12-02 17:38:00,2880000,0.18
4946,2010-12-02 17:38:00,1400000,1.06
6365,2010-12-03 11:48:00,1440000,0.16
16435,2010-12-07 16:43:00,1008000,2.31
...,...,...,...
533812,2011-12-07 12:20:00,1404000,2.75
534952,2011-12-07 15:16:00,1440000,1.79
540070,2011-12-08 18:45:00,1500000,0.72
540071,2011-12-08 18:46:00,1200000,0.72


In [52]:
# Approach 2: everything written inline (not recommended - harder to debug)
# NOTE(review): this selects only Quantity and UnitPrice; InvoiceDate, which the task
# statement asks to keep, is not in the column list here.
df.loc[df["Quantity"] >= 1e6, ["Quantity", "UnitPrice"]]

Unnamed: 0,Quantity,UnitPrice
4850,1824000,0.55
4945,2880000,0.18
4946,1400000,1.06
6365,1440000,0.16
16435,1008000,2.31
...,...,...
533812,1404000,2.75
534952,1440000,1.79
540070,1500000,0.72
540071,1200000,0.72


### 1.3. Save cleaned data

In [None]:
df.head()

In [55]:
# Every backslash in the Windows path is doubled so the literal contains no invalid
# escape sequences (an unescaped '\D' or '\P' triggers a SyntaxWarning on recent Python).
# NOTE(review): absolute, machine-specific path - adjust it to the local output folder.
data_output_path = 'D:\\DA\\PhantichdulieuPython\\data_output\\'
data_output_name = 'output_OnlineRetail.csv'

# index=False: do not write the row index as an extra column in the CSV.
df.to_csv(data_output_path + data_output_name, index=False)