# Pandas Modifying Data

[Reference](https://www.youtube.com/watch?v=DCDe29sIKcE&ab_channel=CoreySchafer)

In [1]:
import pandas as pd
import random

In [2]:
# Load the student exam dataset from a CSV file into a DataFrame.
df = pd.read_csv('../dataset/student_exam_data.csv')

In [3]:
# Example: upper-case every column label with a plain list comprehension.
df.columns = [label.upper() for label in df.columns]
df.head()

Unnamed: 0,STUDY HOURS,PREVIOUS EXAM SCORE,PASS/FAIL
0,4.370861,81.889703,0
1,9.556429,72.165782,1
2,7.587945,58.571657,0
3,6.387926,88.827701,1
4,2.404168,81.08387,0


In [4]:
# Edit the labels through the .str accessor of the column Index:
# every space becomes an underscore (literal match, not a regular expression).
df.columns = df.columns.str.replace(' ', '_', regex=False)
df.head()

Unnamed: 0,STUDY_HOURS,PREVIOUS_EXAM_SCORE,PASS/FAIL
0,4.370861,81.889703,0
1,9.556429,72.165782,1
2,7.587945,58.571657,0
3,6.387926,88.827701,1
4,2.404168,81.08387,0


In [5]:
df = pd.read_csv('../dataset/student_exam_data.csv')  # reset: reload the CSV to discard the column-label edits above

In [6]:
# Preview the first rows of the freshly reloaded frame.
df.head()

Unnamed: 0,Study Hours,Previous Exam Score,Pass/Fail
0,4.370861,81.889703,0
1,9.556429,72.165782,1
2,7.587945,58.571657,0
3,6.387926,88.827701,1
4,2.404168,81.08387,0


In [7]:
# Rename selected columns through an explicit old-name -> new-name mapping;
# labels that are not keys of the mapping are left as they are.
# The renamed frame is assigned back to `df` rather than mutated with
# inplace=True, so the cell reads as a plain "input -> output" step.
df = df.rename(
    columns={
        'Study Hours': 'Study_Hours',
        'Previous Exam Score': 'Previous_Exam_Score',
    }
)

In [8]:
# Preview the first rows to check the renamed column labels.
df.head()

Unnamed: 0,Study_Hours,Previous_Exam_Score,Pass/Fail
0,4.370861,81.889703,0
1,9.556429,72.165782,1
2,7.587945,58.571657,0
3,6.387926,88.827701,1
4,2.404168,81.08387,0


In [9]:
# Select the row whose index label is 2.
df.loc[2]

Study_Hours             7.587945
Previous_Exam_Score    58.571657
Pass/Fail               0.000000
Name: 2, dtype: float64

In [10]:
# Seed the random module first so the generated values are the same on every
# run (Restart & Run All reproduces the outputs). 42 is an arbitrary constant.
random.seed(42)
# Overwrite the whole row labelled 2: the list supplies one value per column.
df.loc[2] = [random.uniform(1, 10), random.uniform(0, 100), 1]
df.loc[2]

Study_Hours             7.128414
Previous_Exam_Score    11.393271
Pass/Fail               1.000000
Name: 2, dtype: float64

In [11]:
# Display row 2 again.
df.loc[2]

Study_Hours             7.128414
Previous_Exam_Score    11.393271
Pass/Fail               1.000000
Name: 2, dtype: float64

In [12]:
# The most direct way to modify data: address the row label and the target
# columns in a single .loc call, then assign one new value per selected column.
df.loc[2, ['Study_Hours', 'Previous_Exam_Score']] = [
    random.uniform(1, 10),
    random.uniform(0, 100),
]
df.loc[2]

Study_Hours             6.585229
Previous_Exam_Score    48.696410
Pass/Fail               1.000000
Name: 2, dtype: float64

In [13]:
# Use .at to set a single cell, addressed by row label and column label.
df.at[2, 'Study_Hours'] = random.uniform(1, 10)
df.loc[2]

Study_Hours             1.600156
Previous_Exam_Score    48.696410
Pass/Fail               1.000000
Name: 2, dtype: float64

In [14]:
# Combine filtering with column selection to modify values.
# `df` is rebound here to a second dataset; the 'utf-8-sig' codec skips a
# leading byte-order mark if the file has one.
df = pd.read_csv('../dataset/學生修課資料與成績.csv', encoding='utf-8-sig')
df.head()

Unnamed: 0,學年度,學期,學號,就讀班級,課程代碼,成績
0,110,1,A2372,資訊一甲,R53862,90
1,110,1,A4291,資訊一甲,R87963,71
2,110,1,A7024,資訊一甲,R93896,59
3,110,1,A9525,資訊一甲,R88116,66
4,110,1,A8406,材料二甲,R45288,6


In [15]:
# Boolean filtering: keep only the rows whose '學號' value equals 'A4291'.
df.loc[df['學號'] == 'A4291']

Unnamed: 0,學年度,學期,學號,就讀班級,課程代碼,成績
1,110,1,A4291,資訊一甲,R87963,71


In [16]:
# Boolean mask: True for every row whose '學號' value equals 'A4291'.
filt = df['學號'] == 'A4291'

In [17]:
# Write a new value into the '學號' column of every row selected by the mask.
# `filt` was computed before this assignment, so it still selects the same
# row(s) when used for the display below.
df.loc[filt, '學號'] = 'A2335'
df.loc[filt]

Unnamed: 0,學年度,學期,學號,就讀班級,課程代碼,成績
1,110,1,A2335,資訊一甲,R87963,71


建立新欄位 -> 從另外一個column複製(修改)資料

In [18]:
# Count how many rows there are for each distinct value of '就讀班級'.
df['就讀班級'].value_counts()

就讀班級
資訊一乙    8
材料二甲    6
資訊二甲    5
資訊一甲    4
化學三甲    3
Name: count, dtype: int64

※ 年級都位於字串中的第三個字元（索引 2）

In [19]:
# Take the character at index 2 of every '就讀班級' string and count the values.
df['就讀班級'].str[2].value_counts()

就讀班級
一    12
二    11
三     3
Name: count, dtype: int64

In [20]:
# Create a new column '年級' from the character at index 2 of each '就讀班級' string.
df['年級'] = df['就讀班級'].str[2]
df.head()

Unnamed: 0,學年度,學期,學號,就讀班級,課程代碼,成績,年級
0,110,1,A2372,資訊一甲,R53862,90,一
1,110,1,A2335,資訊一甲,R87963,71,一
2,110,1,A7024,資訊一甲,R93896,59,一
3,110,1,A9525,資訊一甲,R88116,66,一
4,110,1,A8406,材料二甲,R45288,6,二


## Pandas - Apply

In [21]:
# A simple function to use with apply: lower-case a student ID.
def update_id(id: str):
    """Return the student ID converted to lower case.

    The whole string is lower-cased, not only its first character.

    Args:
        id: Student ID string. The name shadows the built-in ``id`` inside
            this function; it is kept so existing callers are unaffected.

    Returns:
        A lower-cased copy of ``id``.
    """
    lowered = id.lower()
    return lowered

In [22]:
# Call update_id on every value of '學號' and preview the result.
# The result is not assigned back, so df itself is unchanged.
df['學號'].apply(update_id).head()

0    a2372
1    a2335
2    a7024
3    a9525
4    a8406
Name: 學號, dtype: object

### Apply Function 實作練習

In [23]:
import re
import numpy as np

In [24]:
# Preview the first rows of the current frame.
df.head()

Unnamed: 0,學年度,學期,學號,就讀班級,課程代碼,成績,年級
0,110,1,A2372,資訊一甲,R53862,90,一
1,110,1,A2335,資訊一甲,R87963,71,一
2,110,1,A7024,資訊一甲,R93896,59,一
3,110,1,A9525,資訊一甲,R88116,66,一
4,110,1,A8406,材料二甲,R45288,6,二


In [25]:
# Remove the '年級' column.
# errors='ignore' keeps this cell re-runnable: without it a second run raises
# KeyError because the column is already gone. The result is assigned back
# instead of using inplace=True.
df = df.drop(columns=['年級'], errors='ignore')
df.head()

Unnamed: 0,學年度,學期,學號,就讀班級,課程代碼,成績
0,110,1,A2372,資訊一甲,R53862,90
1,110,1,A2335,資訊一甲,R87963,71
2,110,1,A7024,資訊一甲,R93896,59
3,110,1,A9525,資訊一甲,R88116,66
4,110,1,A8406,材料二甲,R45288,6


In [26]:
def grab_grade(clas: str):
    """Return the grade character contained in a class name.

    Uses ``re.search`` to find the first occurrence of one of the characters
    一, 二, 三 or 四 (grades one to four) and returns that single character.

    Args:
        clas: Class name, e.g. '資訊一甲'.

    Returns:
        The first grade character found in ``clas``.

    Raises:
        ValueError: If ``clas`` contains none of the four grade characters.
    """
    found = re.search(r'[一二三四]', clas)
    if found is None:
        # re.search returns None when nothing matches; fail with a message that
        # names the offending value instead of an AttributeError on None.
        raise ValueError(f'no grade character (一/二/三/四) found in {clas!r}')
    # The pattern matches exactly one character, so group() is the grade itself.
    return found.group()

In [27]:
# Build the '年級' column by calling grab_grade on every '就讀班級' value.
df['年級'] = df['就讀班級'].apply(grab_grade)
df.head()

Unnamed: 0,學年度,學期,學號,就讀班級,課程代碼,成績,年級
0,110,1,A2372,資訊一甲,R53862,90,一
1,110,1,A2335,資訊一甲,R87963,71,一
2,110,1,A7024,資訊一甲,R93896,59,一
3,110,1,A9525,資訊一甲,R88116,66,一
4,110,1,A8406,材料二甲,R45288,6,二


In [28]:
# Use a lambda with apply so the '年級' column reads 大一, 大二, ...
# The prefix is added only when it is missing, so running this cell more than
# once does not produce values such as '大大一'.
df['年級'] = df['年級'].apply(lambda x: x if x.startswith('大') else '大' + x)
df.head()

Unnamed: 0,學年度,學期,學號,就讀班級,課程代碼,成績,年級
0,110,1,A2372,資訊一甲,R53862,90,大一
1,110,1,A2335,資訊一甲,R87963,71,大一
2,110,1,A7024,資訊一甲,R93896,59,大一
3,110,1,A9525,資訊一甲,R88116,66,大一
4,110,1,A8406,材料二甲,R45288,6,大二


### Mean Absolute Deviation實作

[Reference](https://vocus.cc/article/6767df46fd897800010f0954)

In [29]:
# Absolute deviation of every score from the mean score.
avg_score = df['成績'].mean()  # mean of the '成績' column
# abs + np.round() on the whole column at once: subtract the mean, round the
# difference to 2 decimals, then take the absolute value.
df['Absolute Deviation'] = abs(np.round(df['成績'] - avg_score, 2))
df.head()

Unnamed: 0,學年度,學期,學號,就讀班級,課程代碼,成績,年級,Absolute Deviation
0,110,1,A2372,資訊一甲,R53862,90,大一,36.81
1,110,1,A2335,資訊一甲,R87963,71,大一,17.81
2,110,1,A7024,資訊一甲,R93896,59,大一,5.81
3,110,1,A9525,資訊一甲,R88116,66,大一,12.81
4,110,1,A8406,材料二甲,R45288,6,大二,47.19


In [30]:
# Mean Absolute Deviation (MAD): the mean of the 'Absolute Deviation' column.
mean_absolute_deviation = df['Absolute Deviation'].mean()
mean_absolute_deviation

np.float64(22.77846153846154)

In [31]:
# Keep only the rows whose absolute deviation is greater than the MAD.
df.loc[df['Absolute Deviation'] > mean_absolute_deviation]

Unnamed: 0,學年度,學期,學號,就讀班級,課程代碼,成績,年級,Absolute Deviation
0,110,1,A2372,資訊一甲,R53862,90,大一,36.81
4,110,1,A8406,材料二甲,R45288,6,大二,47.19
7,110,1,A2950,資訊一乙,R31748,16,大一,37.19
8,110,1,A5486,資訊一乙,R45902,25,大一,28.19
9,110,1,A6658,資訊一乙,R38323,8,大一,45.19
10,110,1,A7583,化學三甲,R37639,77,大三,23.81
12,110,1,A4190,資訊二甲,R45944,83,大二,29.81
14,110,1,A5175,資訊二甲,R40624,83,大二,29.81
15,110,1,A8857,資訊二甲,R31315,76,大二,22.81
17,110,1,A8394,化學三甲,R20756,23,大三,30.19
