In [6]:
# Mount Google Drive into the Colab runtime so files stored on Drive can be
# read through the regular filesystem under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Folder that the data-loading cells below join file names onto.
data_path = '/content/drive/MyDrive/CDC_python_chunyi/data'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 資料處理

In [7]:
import pandas as pd
import os

## 資料讀取
### 讀取方式

In [9]:
# demo1.txt is tab-delimited, so the separator is passed explicitly.
txt_data = pd.read_csv(
    filepath_or_buffer=os.path.join(data_path, "demo1.txt"),
    sep='\t',
)
# demo2.csv uses the default comma separator.
csv_data = pd.read_csv(filepath_or_buffer=os.path.join(data_path, "demo2.csv"))

## 資料清洗
### 變數結構

In [None]:
# Only the last expression of a cell is rendered automatically, so the
# intermediate values are printed explicitly.

# Total number of cells (rows x columns)
print(txt_data.size)

# Dimension information
print(txt_data.ndim)   # number of dimensions
print(txt_data.shape)  # length of each dimension

# Data type of every column
print(txt_data.dtypes)

# Data overview. `info` and `describe` are methods and must be *called*;
# without the parentheses the cell only shows the bound-method repr.
# info() prints its summary itself; describe() is left as the last expression
# so its summary table becomes the cell output.
txt_data.info()
txt_data.describe()

### 遺失值處理


In [None]:
# Build a demo frame containing one missing value by appending a row whose
# Hearing is NaN.
# float('nan') is used instead of pd.NA: pd.NA inside a plain list produces an
# object-dtype column, which may change the dtype of Hearing after the concat
# (or raise a FutureWarning), depending on the pandas version.
missing_row = pd.DataFrame({'ID': [99], 'ListID': ['List5'], 'Hearing': [float('nan')]})

# ignore_index=True renumbers the rows so the appended row does not reuse the
# index label 0 that already exists in txt_data.
txt_data_missing = pd.concat([txt_data, missing_row], ignore_index=True)

In [None]:
# Locate missing values: boolean frame, True where a cell is missing
txt_data_missing.isnull()

# Count the missing values per column (sum down the rows)
txt_data_missing.isnull().sum(axis=0)

In [None]:
# Three alternative strategies for handling missing values. Each result gets
# its own name so one strategy does not silently overwrite another.

# Strategy 1: replace every missing value with 0
txt_data_filled_zero = txt_data_missing.fillna(0)

# Strategy 2: replace missing Hearing values with the Hearing mean.
# The dict restricts the fill to the Hearing column; a bare scalar would also
# be written into missing cells of every other column.
hearing_mean = txt_data_missing['Hearing'].mean()
hearing_maen = hearing_mean  # original (misspelled) name kept for any later cell that still uses it
txt_data_filled_mean = txt_data_missing.fillna({'Hearing': hearing_mean})

# Strategy 3: drop every observation (row) that contains a missing value.
# As before, this is the result left in txt_data_clean.
txt_data_clean = txt_data_missing.dropna()

### 重複值處理

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each.
txt_data1 = txt_data.drop_duplicates(keep='first')

### 異常值檢測

In [None]:
#####
# Standard deviation
#####
# numeric_only=True restricts the computation to numeric columns. Since
# pandas 2.0 the default is False, and a text column (ListID appears to hold
# strings such as 'List5') makes a plain .std() raise a TypeError.
txt_data.std(numeric_only=True)

#####
# Interquartile range (IQR)
#####
# Custom IQR helper
def fetch_IQR(data: pd.DataFrame, col: str):
    """Return the interquartile range (Q3 - Q1) of one column.

    Parameters
    ----------
    data : pd.DataFrame
        Frame that contains the column.
    col : str
        Name of the column to measure.

    Returns
    -------
    The 75th percentile minus the 25th percentile of ``data[col]``.
    """
    q1, q3 = data[col].quantile([0.25, 0.75])
    return q3 - q1

# IQR of the Hearing column in txt_data
fetch_IQR(txt_data, 'Hearing')

## 資料轉換

### 資料正規化/標準化

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Work on a copy so the loaded frame stays untouched.
txt_data_trans = txt_data.copy()

# Normalization (min-max scaling) of Hearing.
# Double brackets keep a 2-D frame, which is the input shape the scaler is given here.
x_scale_norm = MinMaxScaler().fit_transform(txt_data_trans[['Hearing']])

# Standardization (z-score scaling) of Hearing.
x_scale_std = StandardScaler().fit_transform(txt_data_trans[['Hearing']])

### 虛擬變數

In [None]:
txt_data_dummy = txt_data.copy()

# Convert ListID into dummy (one-hot) variables.
# `columns` names the column to encode explicitly: with only a list-valued
# `prefix`, get_dummies encodes every text/categorical column and raises if
# the prefix list length does not match that number of columns.
# The result is assigned back so txt_data_dummy actually holds the encoded
# frame instead of an untouched copy.
txt_data_dummy = pd.get_dummies(txt_data_dummy, columns=['ListID'], prefix='ListID')
txt_data_dummy

# 資料分析
載入 LARS 資料

In [None]:
# Load the LARS data from the same folder as the demo files and drop every
# row that contains a missing value.
# NOTE(review): assumes LARS_flu.csv sits in `data_path` next to demo1.txt /
# demo2.csv; the previous relative path "data/LARS_flu.csv" depended on the
# kernel's working directory — confirm the file location.
df_lars_data = pd.read_csv(os.path.join(data_path, "LARS_flu.csv")).dropna()
df_lars_data

## 描述性分析

In [None]:
# Build the figures needed for the descriptive analysis
def fetch_data_basic_info(data: pd.DataFrame, colname: str) -> dict:
    """Collect basic descriptive statistics for one column.

    Parameters
    ----------
    data : pd.DataFrame
        Frame that contains the column.
    colname : str
        Name of the column to summarise.

    Returns
    -------
    dict
        Keys 'Mean', 'StdDev', 'Min', 'Q1', 'Median', 'Q3', 'Max', in that
        order, each mapped to the corresponding statistic of ``data[colname]``.
    """
    column = data[colname]
    return {
        'Mean': column.mean(),
        'StdDev': column.std(),
        'Min': column.min(),
        'Q1': column.quantile(0.25),
        'Median': column.median(),
        'Q3': column.quantile(0.75),
        'Max': column.max(),
    }

# Show the result
print(fetch_data_basic_info(df_lars_data, 'Parainfluenza'))

In [None]:
# NOTE(review): this repeats the previous cell's call with the same column;
# presumably a different column was intended here — confirm.
print(fetch_data_basic_info(df_lars_data, 'Parainfluenza'))

## 樞紐分析表

In [None]:
# Reshape wide -> long: keep the week column as the identifier and stack every
# other column into (variable, value) pairs.
df_piv_lars_data = df_lars_data.melt(
    id_vars=['Year-Week of Specimen Received'],
)
df_piv_lars_data

In [None]:
# Pivot the long frame back: one row per week, one column per `variable`,
# cells aggregated with the mean.
piv_lars = df_piv_lars_data.pivot_table(
    index='Year-Week of Specimen Received',
    columns='variable',
    aggfunc='mean',
)
piv_lars