# Data Visualization with Modern Data Science

> Advanced Wrangling with Pandas

Yao-Jen Kuo <yaojenkuo@ntu.edu.tw> from [DATAINPOINT](https://www.datainpoint.com/)

In [1]:
import sqlite3
import pandas as pd

## Advanced Wrangling

In [2]:
excel_file_path = "data/總統-A05-4-候選人得票數一覽表-各投開票所(臺北市).xlsx"
presidents_taipei = pd.read_excel(excel_file_path, skiprows=[0, 1, 3, 4], thousands=",")

  warn("Workbook contains no default style, apply openpyxl's default")


In [3]:
presidents_taipei

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,(1)\n柯文哲\n吳欣盈,(2)\n賴清德\n蕭美琴,(3)\n侯友宜\n趙少康,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13
0,總　計,,,366854,587899,587258,1542011,10581,1552592,99,1552691,537371,2090062,74.28
1,北投區,,,35975,61151,51657,148783,1091,149874,6,149880,51950,201830,74.26
2,,建民里,1.0,208,401,311,920,6,926,0,926,283,1209,76.59
3,,建民里,2.0,209,455,272,936,3,939,0,939,279,1218,77.09
4,,建民里,3.0,221,439,306,966,11,977,0,977,263,1240,78.79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1772,,政大里,1726.0,208,313,477,998,10,1008,0,1008,337,1345,74.94
1773,,指南里,1727.0,217,323,409,949,7,956,0,956,335,1291,74.05
1774,,指南里,1728.0,279,314,378,971,5,976,0,976,353,1329,73.44
1775,,指南里,1729.0,156,286,144,586,6,592,0,592,216,808,73.27


In [4]:
presidents_taipei = presidents_taipei.iloc[:, 0:6]
presidents_taipei

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,(1)\n柯文哲\n吳欣盈,(2)\n賴清德\n蕭美琴,(3)\n侯友宜\n趙少康
0,總　計,,,366854,587899,587258
1,北投區,,,35975,61151,51657
2,,建民里,1.0,208,401,311
3,,建民里,2.0,209,455,272
4,,建民里,3.0,221,439,306
...,...,...,...,...,...,...
1772,,政大里,1726.0,208,313,477
1773,,指南里,1727.0,217,323,409
1774,,指南里,1728.0,279,314,378
1775,,指南里,1729.0,156,286,144


In [5]:
rename_columns = {
    "Unnamed: 0": "town",
    "Unnamed: 1": "village",
    "Unnamed: 2": "office"
}
presidents_taipei = presidents_taipei.rename(columns=rename_columns)
presidents_taipei

Unnamed: 0,town,village,office,(1)\n柯文哲\n吳欣盈,(2)\n賴清德\n蕭美琴,(3)\n侯友宜\n趙少康
0,總　計,,,366854,587899,587258
1,北投區,,,35975,61151,51657
2,,建民里,1.0,208,401,311
3,,建民里,2.0,209,455,272
4,,建民里,3.0,221,439,306
...,...,...,...,...,...,...
1772,,政大里,1726.0,208,313,477
1773,,指南里,1727.0,217,323,409
1774,,指南里,1728.0,279,314,378
1775,,指南里,1729.0,156,286,144


## Advanced wrangling including

- Dealing with missing values.
- Dealing with text values.
- Reshaping dataframes.
- Concatenating, merging and joining dataframes.

## Dealing with missing values

- Using `isnull` or `notnull` to check if `np.nan` exists.
- Using `dropna` to drop rows with `np.nan`.
- Using `fillna` to fill `np.nan` with specific values.

In [6]:
print(presidents_taipei.iloc[:, 0].size)
print(presidents_taipei.iloc[:, 0].isnull().sum())
print(presidents_taipei.iloc[:, 0].notnull().sum())

1777
1764
13


In [7]:
presidents_taipei["town"].fillna(method="ffill")

0        總　計
1       　北投區
2       　北投區
3       　北投區
4       　北投區
        ... 
1772    　文山區
1773    　文山區
1774    　文山區
1775    　文山區
1776    　文山區
Name: town, Length: 1777, dtype: object

In [8]:
presidents_taipei["town"] = presidents_taipei["town"].fillna(method="ffill")
presidents_taipei = presidents_taipei.dropna()
presidents_taipei

Unnamed: 0,town,village,office,(1)\n柯文哲\n吳欣盈,(2)\n賴清德\n蕭美琴,(3)\n侯友宜\n趙少康
2,北投區,建民里,1.0,208,401,311
3,北投區,建民里,2.0,209,455,272
4,北投區,建民里,3.0,221,439,306
5,北投區,文林里,4.0,181,396,282
6,北投區,文林里,5.0,206,445,299
...,...,...,...,...,...,...
1772,文山區,政大里,1726.0,208,313,477
1773,文山區,指南里,1727.0,217,323,409
1774,文山區,指南里,1728.0,279,314,378
1775,文山區,指南里,1729.0,156,286,144


## Summarizing text columns

- `unique`
- `nunique`
- `value_counts`

In [9]:
print(presidents_taipei['town'].nunique())
print(presidents_taipei['town'].unique())

12
['\u3000北投區' '\u3000士林區' '\u3000大同區' '\u3000中山區' '\u3000松山區' '\u3000內湖區'
 '\u3000南港區' '\u3000萬華區' '\u3000中正區' '\u3000大安區' '\u3000信義區' '\u3000文山區']


In [10]:
presidents_taipei['town'].value_counts()

town
　大安區    194
　士林區    187
　文山區    175
　北投區    169
　信義區    168
　內湖區    165
　中山區    152
　萬華區    137
　松山區    134
　中正區    116
　大同區     88
　南港區     79
Name: count, dtype: int64

## Splitting strings with `str.split` as a `Series`

In [11]:
candidates_info = pd.Series(presidents_taipei.columns[-3:])
candidates_info

0    (1)\n柯文哲\n吳欣盈
1    (2)\n賴清德\n蕭美琴
2    (3)\n侯友宜\n趙少康
dtype: object

In [12]:
candidates_info.str.split()

0    [(1), 柯文哲, 吳欣盈]
1    [(2), 賴清德, 蕭美琴]
2    [(3), 侯友宜, 趙少康]
dtype: object

## Replacing strings with `str.replace`

In [13]:
presidents_taipei['town'] = presidents_taipei['town'].str.replace("\u3000", "")
presidents_taipei['town'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  presidents_taipei['town'] = presidents_taipei['town'].str.replace("\u3000", "")


array(['北投區', '士林區', '大同區', '中山區', '松山區', '內湖區', '南港區', '萬華區', '中正區',
       '大安區', '信義區', '文山區'], dtype=object)

In [14]:
candidates_info.str.replace("(", "").str.replace(")", "")

0    1\n柯文哲\n吳欣盈
1    2\n賴清德\n蕭美琴
2    3\n侯友宜\n趙少康
dtype: object

## Testing for strings that match or contain a pattern with `str.contains`

In [15]:
contains_da = presidents_taipei['town'].str.contains("大")
presidents_taipei['town'][contains_da].unique()

array(['大同區', '大安區'], dtype=object)

## Two formats of a table

- Wide: data is presented with each different data variable in a separate column.
- Long: data is presented with one column containing all the values and another column listing the context of the value.

Source: <https://en.wikipedia.org/wiki/Wide_and_narrow_data>

## Wide, or unstacked data

In [16]:
request_url = "https://en.wikipedia.org/wiki/Wide_and_narrow_data"
wide_format = pd.read_html(request_url)[0]
wide_format

Unnamed: 0,Person,Age,Weight,Height
0,Bob,32,168,180
1,Alice,24,150,175
2,Steve,64,144,165


## Narrow, stacked, or long data

In [17]:
request_url = "https://en.wikipedia.org/wiki/Wide_and_narrow_data"
long_format = pd.read_html(request_url)[1]
long_format

Unnamed: 0,Person,Variable,Value
0,Bob,Age,32
1,Bob,Weight,168
2,Bob,Height,180
3,Alice,Age,24
4,Alice,Weight,150
5,Alice,Height,175
6,Steve,Age,64
7,Steve,Weight,144
8,Steve,Height,165


## Reshaping dataframes from wide to long format with `pd.melt()`

In [18]:
pd.melt(wide_format, 
        id_vars="Person", value_vars=["Age", "Weight", "Height"],
        var_name="Variable", value_name="Value")

Unnamed: 0,Person,Variable,Value
0,Bob,Age,32
1,Alice,Age,24
2,Steve,Age,64
3,Bob,Weight,168
4,Alice,Weight,150
5,Steve,Weight,144
6,Bob,Height,180
7,Alice,Height,175
8,Steve,Height,165


## Reshaping dataframes from long to wide format with `pivot()`

In [19]:
long_format.pivot(index="Person", columns="Variable", values="Value").reset_index().rename_axis(None, axis=1)

Unnamed: 0,Person,Age,Height,Weight
0,Alice,24,175,150
1,Bob,32,180,168
2,Steve,64,165,144


## A common problem is that a dataset where some of the column names are not names of variables, but values of a variable

In [20]:
presidents_taipei

Unnamed: 0,town,village,office,(1)\n柯文哲\n吳欣盈,(2)\n賴清德\n蕭美琴,(3)\n侯友宜\n趙少康
2,北投區,建民里,1.0,208,401,311
3,北投區,建民里,2.0,209,455,272
4,北投區,建民里,3.0,221,439,306
5,北投區,文林里,4.0,181,396,282
6,北投區,文林里,5.0,206,445,299
...,...,...,...,...,...,...
1772,文山區,政大里,1726.0,208,313,477
1773,文山區,指南里,1727.0,217,323,409
1774,文山區,指南里,1728.0,279,314,378
1775,文山區,指南里,1729.0,156,286,144


## We can pivot the columns into a new pair of variables

To describe that operation we need four parameters:

- The set of columns whose names are not values
- The set of columns whose names are values
- The name of the variable to move the column names to
- The name of the variable to move the column values to

## In this example, the four parameters are

- `id_vars`: `["town", "village", "office"]`
- `value_vars`: The columns except `id_vars`
- `var_name`: We can name it `candidate_info`
- `value_name`: Let's name it `votes`

In [21]:
idVars = ["town", "village", "office"]
presidents_taipei_long = pd.melt(presidents_taipei,
                                 id_vars=idVars,
                                 var_name="candidate_info",
                                 value_name="votes")
presidents_taipei_long

Unnamed: 0,town,village,office,candidate_info,votes
0,北投區,建民里,1.0,(1)\n柯文哲\n吳欣盈,208
1,北投區,建民里,2.0,(1)\n柯文哲\n吳欣盈,209
2,北投區,建民里,3.0,(1)\n柯文哲\n吳欣盈,221
3,北投區,文林里,4.0,(1)\n柯文哲\n吳欣盈,181
4,北投區,文林里,5.0,(1)\n柯文哲\n吳欣盈,206
...,...,...,...,...,...
5287,文山區,政大里,1726.0,(3)\n侯友宜\n趙少康,477
5288,文山區,指南里,1727.0,(3)\n侯友宜\n趙少康,409
5289,文山區,指南里,1728.0,(3)\n侯友宜\n趙少康,378
5290,文山區,指南里,1729.0,(3)\n侯友宜\n趙少康,144


## Concatenating, merging and joining dataframes

- `concat()` vertically or horizontally.
- `merge()` horizontally on column names.
- `join()` horizontally on index.

In [22]:
db_file_path = "data/taiwan_election_2024.db"
conn = sqlite3.connect(db_file_path)
presidents = pd.read_sql("""SELECT * FROM presidents;""", conn)
candidates = pd.read_sql("""SELECT * FROM candidates;""", conn)

## Use `concat()` function to concatenate vertically or horizontally

In [23]:
pd.concat((candidates.loc[:328, :], candidates.loc[328:, :]))

Unnamed: 0,id,name,party_id,election_type_id
0,1,丁學忠,1,2
1,2,傅崐萁,1,2
2,3,吳政杰,1,2
3,4,呂玉玲,1,2
4,5,周宏昌,1,2
...,...,...,...,...
327,328,高美珠,30,4
328,329,侯友宜/趙少康,1,1
328,329,侯友宜/趙少康,1,1
329,330,柯文哲/吳欣盈,15,1


In [24]:
pd.concat((candidates[["id"]], candidates[["name"]]), axis=1)

Unnamed: 0,id,name
0,1,丁學忠
1,2,傅崐萁
2,3,吳政杰
3,4,呂玉玲
4,5,周宏昌
...,...,...
326,327,陳政宗
327,328,高美珠
328,329,侯友宜/趙少康
329,330,柯文哲/吳欣盈


## Using `merge` function to join dataframes on columns

In [25]:
# default: inner join
pd.merge(presidents, candidates, left_on="candidate_id", right_on="id")

Unnamed: 0,id_x,number,district_id,candidate_id,votes,election_type_id_x,village_id,id_y,name,party_id,election_type_id_y
0,1,1,15035,330,146,1,1,330,柯文哲/吳欣盈,15,1
1,2,1,15036,330,128,1,1,330,柯文哲/吳欣盈,15,1
2,3,1,15037,330,239,1,2,330,柯文哲/吳欣盈,15,1
3,4,1,15038,330,208,1,3,330,柯文哲/吳欣盈,15,1
4,5,1,15039,330,210,1,4,330,柯文哲/吳欣盈,15,1
...,...,...,...,...,...,...,...,...,...,...,...
53380,53381,3,7914,329,192,1,794,329,侯友宜/趙少康,1,1
53381,53382,3,7915,329,268,1,5107,329,侯友宜/趙少康,1,1
53382,53383,3,7916,329,224,1,5108,329,侯友宜/趙少康,1,1
53383,53384,3,7917,329,238,1,484,329,侯友宜/趙少康,1,1


In [26]:
# left join
pd.merge(presidents[presidents["number"] == 2], candidates, left_on="candidate_id", right_on="id", how="left")

Unnamed: 0,id_x,number,district_id,candidate_id,votes,election_type_id_x,village_id,id_y,name,party_id,election_type_id_y
0,11,2,15035,331,56,1,1,331,賴清德/蕭美琴,29,1
1,12,2,15036,331,67,1,1,331,賴清德/蕭美琴,29,1
2,13,2,15037,331,103,1,2,331,賴清德/蕭美琴,29,1
3,14,2,15038,331,82,1,3,331,賴清德/蕭美琴,29,1
4,15,2,15039,331,84,1,4,331,賴清德/蕭美琴,29,1
...,...,...,...,...,...,...,...,...,...,...,...
17790,51997,2,7914,331,83,1,794,331,賴清德/蕭美琴,29,1
17791,51998,2,7915,331,58,1,5107,331,賴清德/蕭美琴,29,1
17792,51999,2,7916,331,60,1,5108,331,賴清德/蕭美琴,29,1
17793,52000,2,7917,331,48,1,484,331,賴清德/蕭美琴,29,1


In [27]:
# right join
pd.merge(presidents, candidates, left_on="candidate_id", right_on="id", how="right")

Unnamed: 0,id_x,number,district_id,candidate_id,votes,election_type_id_x,village_id,id_y,name,party_id,election_type_id_y
0,,,,,,,,1,丁學忠,1,2
1,,,,,,,,2,傅崐萁,1,2
2,,,,,,,,3,吳政杰,1,2
3,,,,,,,,4,呂玉玲,1,2
4,,,,,,,,5,周宏昌,1,2
...,...,...,...,...,...,...,...,...,...,...,...
53708,51997.0,2.0,7914.0,331.0,83.0,1.0,794.0,331,賴清德/蕭美琴,29,1
53709,51998.0,2.0,7915.0,331.0,58.0,1.0,5107.0,331,賴清德/蕭美琴,29,1
53710,51999.0,2.0,7916.0,331.0,60.0,1.0,5108.0,331,賴清德/蕭美琴,29,1
53711,52000.0,2.0,7917.0,331.0,48.0,1.0,484.0,331,賴清德/蕭美琴,29,1


## Using `join` method to join dataframes on index

In [28]:
# join dataframes on index
# default: left join
presidents[presidents["number"] == 2].set_index("candidate_id").join(candidates.set_index("id"), lsuffix='_x', rsuffix='_y')

Unnamed: 0,id,number,district_id,votes,election_type_id_x,village_id,name,party_id,election_type_id_y
331,11,2,15035,56,1,1,賴清德/蕭美琴,29,1
331,12,2,15036,67,1,1,賴清德/蕭美琴,29,1
331,13,2,15037,103,1,2,賴清德/蕭美琴,29,1
331,14,2,15038,82,1,3,賴清德/蕭美琴,29,1
331,15,2,15039,84,1,4,賴清德/蕭美琴,29,1
...,...,...,...,...,...,...,...,...,...
331,51997,2,7914,83,1,794,賴清德/蕭美琴,29,1
331,51998,2,7915,58,1,5107,賴清德/蕭美琴,29,1
331,51999,2,7916,60,1,5108,賴清德/蕭美琴,29,1
331,52000,2,7917,48,1,484,賴清德/蕭美琴,29,1


In [29]:
# inner join
presidents.set_index("candidate_id").join(candidates.set_index("id"), lsuffix='_x', rsuffix='_y', how="inner")

Unnamed: 0_level_0,id,number,district_id,votes,election_type_id_x,village_id,name,party_id,election_type_id_y
candidate_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
329,21,3,15035,414,1,1,侯友宜/趙少康,1,1
329,22,3,15036,341,1,1,侯友宜/趙少康,1,1
329,23,3,15037,612,1,2,侯友宜/趙少康,1,1
329,24,3,15038,398,1,3,侯友宜/趙少康,1,1
329,25,3,15039,566,1,4,侯友宜/趙少康,1,1
...,...,...,...,...,...,...,...,...,...
331,51997,2,7914,83,1,794,賴清德/蕭美琴,29,1
331,51998,2,7915,58,1,5107,賴清德/蕭美琴,29,1
331,51999,2,7916,60,1,5108,賴清德/蕭美琴,29,1
331,52000,2,7917,48,1,484,賴清德/蕭美琴,29,1


In [30]:
# right join
presidents.set_index("candidate_id").join(candidates.set_index("id"), lsuffix='_x', rsuffix='_y', how="right")

Unnamed: 0_level_0,id,number,district_id,votes,election_type_id_x,village_id,name,party_id,election_type_id_y
candidate_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,,,,,,,丁學忠,1,2
2,,,,,,,傅崐萁,1,2
3,,,,,,,吳政杰,1,2
4,,,,,,,呂玉玲,1,2
5,,,,,,,周宏昌,1,2
...,...,...,...,...,...,...,...,...,...
331,51997.0,2.0,7914.0,83.0,1.0,794.0,賴清德/蕭美琴,29,1
331,51998.0,2.0,7915.0,58.0,1.0,5107.0,賴清德/蕭美琴,29,1
331,51999.0,2.0,7916.0,60.0,1.0,5108.0,賴清德/蕭美琴,29,1
331,52000.0,2.0,7917.0,48.0,1.0,484.0,賴清德/蕭美琴,29,1
