In [3]:
# 讀取 Adult 資料集，並以 pandas DataFrame 儲存
import pandas as pd

data = pd.read_csv("adult.data.csv")

In [4]:
# 看看資料有幾筆、幾個欄位
data.shape

(32561, 15)

In [5]:
# 設定欄位名稱
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
    'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
    'hours-per-week', 'native-country', 'calss'
]
data.columns = columns

In [6]:
# 計算重複筆數
data.duplicated().sum()

np.int64(24)

In [7]:
data.index.has_duplicates
data.columns.has_duplicates
data.index.duplicated()
data.columns.duplicated()
dup_index_vals = data.index[data.index.duplicated()].unique()
dup_col_vals   = data.columns[data.columns.duplicated()].unique()
print("重複索引：", dup_index_vals.tolist())
print("重複欄名：", dup_col_vals.tolist())
data.index.value_counts()[lambda s: s > 1]
data.columns.value_counts()[lambda s: s > 1]

重複索引： []
重複欄名： []


Series([], Name: count, dtype: int64)

In [8]:
# 列出五筆
data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,calss
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [9]:
# 去除所有字串欄位前後的空白
data = data.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

In [10]:
# 將多重空白合併為單一空白，並再次清理字串空白
data = data.apply(
    lambda x: x.str.replace(r'\s+', ' ', regex=True).str.strip() if x.dtype == "object" else x
)

In [18]:
# 指定數值欄位並轉換成數值型態，無法轉換的值設為 NaN
numeric_cols = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
data[numeric_cols] = data[numeric_cols].apply(pd.to_numeric, errors='coerce')

In [31]:
# 重新讀取原始 CSV（無標題列），取得欄位數與欄位索引
df_raw = pd.read_csv(CSV, header=None, na_filter=False, dtype=str)
ncol = df_raw.shape[1]
all_cols = list(range(ncol))
CSV = "adult.data.csv"
import numpy as np
all_cols = list(range(df_raw.shape[1]))
# 找出所有完全重複的資料列，建立重複群組編號
vc = df_raw.value_counts().reset_index(name="count")
dups = vc[vc["count"] > 1].copy()
dups["group_id"] = np.arange(1, len(dups) + 1)

# 將重複群組與原資料對應，產出每組重複資料的索引
dup_groups_members = (
    df_raw.reset_index(names="row_index")
          .merge(dups, on=all_cols, how="inner")
          .sort_values(["group_id", "row_index"], kind="mergesort")
)

# 彙整每個重複群組的大小與對應索引
dup_groups_summary = (
    dup_groups_members
    .groupby("group_id", as_index=False)
    .agg(
        group_size=("row_index", "size"),
        indices=("row_index", lambda s: list(s))
    )
)
# 統計重複資料筆數、群組數與重複總數
n_dup_all = int(dup_groups_members.shape[0])
n_dup_excess = int(df_raw.duplicated(keep='first').sum())
n_groups = int(dup_groups_summary.shape[0])
sum_group_sizes = int(dup_groups_summary["group_size"].sum())

print("A) 重複列（群內所有列皆計） n_dup_all =", n_dup_all)
print("B) 多出來的列（每組 k 計 k-1） n_dup_excess =", n_dup_excess)
print("C) 重複群組數 n_groups =", n_groups)
print("C') 各組 group_size 加總 =", sum_group_sizes, "(應等於 A)")
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 160)

# 輸出每組重複資料的詳細內容（顯示重複的列）
for gid, sub in dup_groups_members.groupby("group_id", sort=True):
    idxs = sub["row_index"].tolist()
    print(f"\n=== Group {gid} | size={len(sub)} | indices={idxs} ===")
    # 列出該組的每一列（移除 group_id 欄，保留原始各欄與 row_index）
    print(sub.drop(columns=["group_id"]).to_string(index=False))

A) 重複列（群內所有列皆計） n_dup_all = 47
B) 多出來的列（每組 k 計 k-1） n_dup_excess = 24
C) 重複群組數 n_groups = 23
C') 各組 group_size 加總 = 47 (應等於 A)

=== Group 1 | size=3 | indices=[5843, 13085, 22301] ===
 row_index  0        1      2        3 4              5                6              7      8       9 10 11 12         13     14  count
      5843 25  Private 195994  1st-4th 2  Never-married  Priv-house-serv  Not-in-family  White  Female  0  0 40  Guatemala  <=50K      3
     13085 25  Private 195994  1st-4th 2  Never-married  Priv-house-serv  Not-in-family  White  Female  0  0 40  Guatemala  <=50K      3
     22301 25  Private 195994  1st-4th 2  Never-married  Priv-house-serv  Not-in-family  White  Female  0  0 40  Guatemala  <=50K      3

=== Group 2 | size=2 | indices=[17674, 18699] ===
 row_index  0        1     2        3 4              5                6              7      8     9 10 11 12             13     14  count
     17674 19  Private 97261  HS-grad 9  Never-married  Farming-fishing  Not-in

In [23]:
# 移除完全重複的資料列，保留唯一紀錄
data = data.drop_duplicates()

In [24]:
# 計算重複筆數
data.duplicated().sum()

np.int64(0)

In [32]:
# 看看資料有幾筆、幾個欄位
data.shape

(32561, 15)