# Data Visualization with Modern Data Science

> Data Wrangling with Pandas

Yao-Jen Kuo <yaojenkuo@ntu.edu.tw> from [DATAINPOINT](https://www.datainpoint.com/)

In [1]:
import sqlite3
import pandas as pd

## Basic attributes and methods

In [2]:
# Relative path of the SQLite database file holding the election tables.
db_file_path = "data/taiwan_election_2024.db"
# Open a connection and read every row of the `presidents` table into a DataFrame.
conn = sqlite3.connect(database=db_file_path)
presidents = pd.read_sql(
    sql="""SELECT * FROM presidents;""",
    con=conn,
)
# The last expression of the cell is displayed: the class of the loaded object.
type(presidents)

pandas.core.frame.DataFrame

## Basic attributes of a `DataFrame` object

- `shape`
- `dtypes`
- `index`
- `columns`

In [3]:
# Print the four basic attributes one after another:
# dimensions, per-column dtypes, row labels, and column labels.
print(
    presidents.shape,
    presidents.dtypes,
    presidents.index,
    presidents.columns,
    sep="\n",
)

(53385, 7)
id                  int64
number              int64
district_id         int64
candidate_id        int64
votes               int64
election_type_id    int64
village_id          int64
dtype: object
RangeIndex(start=0, stop=53385, step=1)
Index(['id', 'number', 'district_id', 'candidate_id', 'votes',
       'election_type_id', 'village_id'],
      dtype='object')


## Basic methods of a `DataFrame` object

- `head(n)`
- `tail(n)`
- `describe()`
- `info()`
- `set_index()`
- `reset_index()`

## `head(n)` returns the top n observations with header

In [4]:
presidents.head(n=5)  # equivalent to presidents.head(): n defaults to 5

Unnamed: 0,id,number,district_id,candidate_id,votes,election_type_id,village_id
0,1,1,15035,330,146,1,1
1,2,1,15036,330,128,1,1
2,3,1,15037,330,239,1,2
3,4,1,15038,330,208,1,3
4,5,1,15039,330,210,1,4


## `tail(n)` returns the bottom n observations with header

In [5]:
# Show the last three rows together with the column header.
presidents.tail(n=3)

Unnamed: 0,id,number,district_id,candidate_id,votes,election_type_id,village_id
53382,53383,3,7916,329,224,1,5108
53383,53384,3,7917,329,238,1,484
53384,53385,3,7918,329,364,1,5109


## `describe()` returns the descriptive summary for numeric columns

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
presidents.describe()

Unnamed: 0,id,number,district_id,candidate_id,votes,election_type_id,village_id
count,53385.0,53385.0,53385.0,53385.0,53385.0,53385.0,53385.0
mean,26693.0,2.0,8898.0,330.0,261.262639,1.0,2161.525541
std,15411.066397,0.816504,5137.022125,0.816504,109.691107,0.0,1453.253065
min,1.0,1.0,1.0,329.0,1.0,1.0,1.0
25%,13347.0,1.0,4449.0,329.0,184.0,1.0,904.0
50%,26693.0,2.0,8898.0,330.0,256.0,1.0,1866.0
75%,40039.0,3.0,13347.0,331.0,333.0,1.0,3321.0
max,53385.0,3.0,17795.0,331.0,890.0,1.0,5109.0


## `info()` returns the concise information of the dataframe

In [7]:
# Prints the index range, each column's non-null count and dtype, and the memory usage.
presidents.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53385 entries, 0 to 53384
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   id                53385 non-null  int64
 1   number            53385 non-null  int64
 2   district_id       53385 non-null  int64
 3   candidate_id      53385 non-null  int64
 4   votes             53385 non-null  int64
 5   election_type_id  53385 non-null  int64
 6   village_id        53385 non-null  int64
dtypes: int64(7)
memory usage: 2.9 MB


## `set_index()` replaces current `Index` with a specific variable

In [8]:
# Use the `id` column as the row index. The result is a new DataFrame;
# `presidents` itself keeps its original index and its `id` column.
presidents.set_index(keys="id")

Unnamed: 0_level_0,number,district_id,candidate_id,votes,election_type_id,village_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,15035,330,146,1,1
2,1,15036,330,128,1,1
3,1,15037,330,239,1,2
4,1,15038,330,208,1,3
5,1,15039,330,210,1,4
...,...,...,...,...,...,...
53381,3,7914,329,192,1,794
53382,3,7915,329,268,1,5107
53383,3,7916,329,224,1,5108
53384,3,7917,329,238,1,484


## `reset_index()` replaces the current `Index` with the default `RangeIndex`

In [9]:
# Round trip: move `id` into the index, then restore the default integer index,
# which turns `id` back into an ordinary column.
(
    presidents
    .set_index("id")
    .reset_index()
)

Unnamed: 0,id,number,district_id,candidate_id,votes,election_type_id,village_id
0,1,1,15035,330,146,1,1
1,2,1,15036,330,128,1,1
2,3,1,15037,330,239,1,2
3,4,1,15038,330,208,1,3
4,5,1,15039,330,210,1,4
...,...,...,...,...,...,...,...
53380,53381,3,7914,329,192,1,794
53381,53382,3,7915,329,268,1,5107
53382,53383,3,7916,329,224,1,5108
53383,53384,3,7917,329,238,1,484


## Basic Wrangling

## Basic wrangling is like writing SQL queries

- Selecting: `SELECT FROM`
- Filtering: `WHERE`
- Subset: `SELECT FROM WHERE`

## Basic wrangling is like writing SQL queries (cont'd)

- Indexing
- Sorting: `ORDER BY`
- Deriving
- Summarizing
- Summarizing and Grouping: `GROUP BY`

## Selecting a column as `Series`

In [10]:
# Single brackets with one column name yield a Series: print it, then its type.
print(
    presidents["votes"],
    type(presidents["votes"]),
    sep="\n",
)

0        146
1        128
2        239
3        208
4        210
        ... 
53380    192
53381    268
53382    224
53383    238
53384    364
Name: votes, Length: 53385, dtype: int64
<class 'pandas.core.series.Series'>


## Selecting a column as `DataFrame`

In [11]:
# A list of column names inside the brackets yields a DataFrame, even for one column.
print(type(presidents[["votes"]]))
presidents[["votes"]]

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,votes
0,146
1,128
2,239
3,208
4,210
...,...
53380,192
53381,268
53382,224
53383,238


## Selecting multiple columns always returns a `DataFrame`

In [12]:
# Select two columns at once by passing a list of their names; the result is a DataFrame.
presidents[["number", "votes"]]

Unnamed: 0,number,votes
0,1,146
1,1,128
2,1,239
3,1,208
4,1,210
...,...,...
53380,3,192
53381,3,268
53382,3,224
53383,3,238


## Filtering rows with conditional statements

In [13]:
# Boolean Series: True for the rows whose `number` equals 2.
number_two = presidents["number"] == 2
# Indexing with the boolean Series keeps only the rows marked True.
presidents[number_two]

Unnamed: 0,id,number,district_id,candidate_id,votes,election_type_id,village_id
10,11,2,15035,331,56,1,1
11,12,2,15036,331,67,1,1
12,13,2,15037,331,103,1,2
13,14,2,15038,331,82,1,3
14,15,2,15039,331,84,1,4
...,...,...,...,...,...,...,...
51996,51997,2,7914,331,83,1,794
51997,51998,2,7915,331,58,1,5107
51998,51999,2,7916,331,60,1,5108
51999,52000,2,7917,331,48,1,484


## Subsetting columns and rows simultaneously

In [14]:
cols_to_select = ["number", "votes"]  # names of the columns to keep
rows_to_filter = presidents["number"] == 2  # boolean Series marking the rows to keep
# Rows first, then columns. This result is evaluated but not displayed,
# because only the last expression of a cell is rendered.
presidents[rows_to_filter][cols_to_select]
# Columns first, then rows. This is the expression whose result is displayed.
presidents[cols_to_select][rows_to_filter]

Unnamed: 0,number,votes
10,2,56
11,2,67
12,2,103
13,2,82
14,2,84
...,...,...
51996,2,83
51997,2,58
51998,2,60
51999,2,48


## Indexing `DataFrame` with

- `loc[]`
- `iloc[]`

## `loc[]` indexes a `DataFrame` by row/column labels (`Index`)

In [15]:
# Only the last expression is displayed; the first two are evaluated and discarded.
presidents.loc[:, ["number", "votes"]]  # all rows, two columns selected by label
presidents.loc[[0, 1], :]  # rows labelled 0 and 1, all columns
presidents.loc[[0, 1], ["number", "votes"]]  # row labels and column labels together

Unnamed: 0,number,votes
0,1,146
1,1,128


## `iloc[]` indexes a `DataFrame` by integer position

In [16]:
# Only the last expression is displayed; the first two are evaluated and discarded.
# Column positions 1 and 4 are `number` and `votes` in this DataFrame.
presidents.iloc[:, [1, 4]]  # all rows, columns at positions 1 and 4
presidents.iloc[[0, 1], :]  # rows at positions 0 and 1, all columns
presidents.iloc[[0, 1], [1, 4]]  # row positions and column positions together

Unnamed: 0,number,votes
0,1,146
1,1,128


## Sorting `DataFrame` with

- `sort_values`
- `sort_index`

## `sort_values` sorts a `DataFrame` by specific columns

In [17]:
# Only the last expression is displayed; the first one is evaluated and discarded.
# Sort by a single column.
presidents.sort_values(by="number")
# Sort by `number` in descending order, then by `district_id` in ascending order.
presidents.sort_values(
    by=["number", "district_id"],
    ascending=[False, True],
)

Unnamed: 0,id,number,district_id,candidate_id,votes,election_type_id,village_id
35739,35740,3,1,329,146,1,38
35740,35741,3,2,329,55,1,289
35741,35742,3,3,329,51,1,289
35742,35743,3,4,329,85,1,3780
35743,35744,3,5,329,85,1,3780
...,...,...,...,...,...,...,...
44212,44213,1,17791,330,129,1,4703
44213,44214,1,17792,330,126,1,2987
44214,44215,1,17793,330,137,1,4704
44215,44216,1,17794,330,113,1,4705


## `sort_index` sorts a `DataFrame` by its `Index`

In [18]:
# Order the rows by their index labels, largest label first.
presidents.sort_index(ascending=False)

Unnamed: 0,id,number,district_id,candidate_id,votes,election_type_id,village_id
53384,53385,3,7918,329,364,1,5109
53383,53384,3,7917,329,238,1,484
53382,53383,3,7916,329,224,1,5108
53381,53382,3,7915,329,268,1,5107
53380,53381,3,7914,329,192,1,794
...,...,...,...,...,...,...,...
4,5,1,15039,330,210,1,4
3,4,1,15038,330,208,1,3
2,3,1,15037,330,239,1,2
1,2,1,15036,330,128,1,1


## Deriving new variables from `DataFrame`

- Simple operations
- `map` with a `dict`
- `map` with a function

## Deriving a new variable with simple operations

In [19]:
# Element-wise comparison: a boolean Series, True where `votes` is at least 200.
presidents["votes"] >= 200

0        False
1        False
2         True
3         True
4         True
         ...  
53380    False
53381     True
53382     True
53383     True
53384     True
Name: votes, Length: 53385, dtype: bool

## Deriving a categorical variable from another categorical variable with `map`

- Passing a `dict`
- Passing a function

In [20]:
# Passing a dict: each value of `number` is looked up as a key,
# and the matching dict value becomes the new value.
candidate_names = {
    1: "柯文哲/吳欣盈",
    2: "賴清德/蕭美琴",
    3: "侯友宜/趙少康"
}
presidents["number"].map(candidate_names)

0        柯文哲/吳欣盈
1        柯文哲/吳欣盈
2        柯文哲/吳欣盈
3        柯文哲/吳欣盈
4        柯文哲/吳欣盈
          ...   
53380    侯友宜/趙少康
53381    侯友宜/趙少康
53382    侯友宜/趙少康
53383    侯友宜/趙少康
53384    侯友宜/趙少康
Name: number, Length: 53385, dtype: object

In [21]:
# Passing a function
def map_number_to_name(x):
    """Return the candidate names (as a single string) for the number ``x``.

    1 and 2 each have their own names; every other value falls through
    to the third pair of names.
    """
    if x == 1:
        return "柯文哲/吳欣盈"
    if x == 2:
        return "賴清德/蕭美琴"
    # Fallback for anything that is neither 1 nor 2.
    return "侯友宜/趙少康"

# `map` applies the function to each value of the `number` column.
presidents["number"].map(map_number_to_name)

0        柯文哲/吳欣盈
1        柯文哲/吳欣盈
2        柯文哲/吳欣盈
3        柯文哲/吳欣盈
4        柯文哲/吳欣盈
          ...   
53380    侯友宜/趙少康
53381    侯友宜/趙少康
53382    侯友宜/趙少康
53383    侯友宜/趙少康
53384    侯友宜/趙少康
Name: number, Length: 53385, dtype: object

## Summarizing `DataFrame` with aggregate methods

In [22]:
# Add up the `votes` column over all rows.
presidents["votes"].sum()

13947506

## Summarizing and grouping `DataFrame` with aggregate methods

In [23]:
# Split the rows by `number`, then add up `votes` within each group.
(
    presidents
    .groupby("number")["votes"]
    .sum()
)

number
1    3690466
2    5586019
3    4671021
Name: votes, dtype: int64