In [127]:
import pandas as pd
import numpy as np

# --- Load -------------------------------------------------------------------
# Column names are supplied explicitly through `names=` rather than read from
# the file.
dataset_path = 'adult.csv'
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
                'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
                'hours-per-week', 'native-country', 'income']
data = pd.read_csv(dataset_path, names=column_names)
print(data.shape)

# --- Clean ------------------------------------------------------------------
# Missing values are encoded as ' ?' (note the leading space); turn them into
# NaN and drop every row that contains at least one.
data = data.replace(' ?', np.nan).dropna()

# Encode the income label as 0 / 1.
# Mapping only the 'income' column (instead of DataFrame.replace over the whole
# frame) avoids the pandas FutureWarning about silent downcasting in `replace`.
# `astype(int)` makes the cell fail loudly if a label other than the two known
# values shows up, instead of leaving strings in the column.
income_codes = {' <=50K': 0, ' >50K': 1}
data = data.assign(income=data['income'].map(income_codes).astype(int))

# --- Restrict to the reference individual's immutable attributes -------------
# The first remaining row is the reference point. Only rows sharing its value
# for every immutable feature are kept.
keys_immutable = ['race', 'sex', 'native-country']  # Replace with your selected immutable feature names
reference_row = data.iloc[0]
for key in keys_immutable:
    data = data[data[key] == reference_row[key]]

# --- Keep only the numeric columns used downstream ---------------------------
# What remains: age, education-num, capital-gain, capital-loss, hours-per-week
# and the encoded income label.
dropped_columns = ['workclass', 'education', 'fnlwgt', 'occupation', 'race', 'sex',
                   'marital-status', 'relationship', 'native-country']
data = data.drop(columns=dropped_columns)

print(data.shape)

data = data.reset_index(drop=True)
print(data.head(3))

(32561, 15)
(16848, 6)
   age  education-num  capital-gain  capital-loss  hours-per-week  income
0   39             13          2174             0              40       0
1   50             13             0             0              13       0
2   38              9             0             0              40       0


  data = data.replace(' <=50K', 0).replace(' >50K', 1)


In [131]:
# Standardise the data, cluster it with k-means and plot the clusters.
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# StandardScaler rescales each column to zero mean and unit variance.
# NOTE: `data` is rebound to the scaler's output here, so the column labels are
# re-attached when the DataFrame is rebuilt just below.
scaler = StandardScaler()
data = scaler.fit_transform(data)

# Wrap the scaled values in a DataFrame with the original column names.
feature_names = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week', 'income']
df = pd.DataFrame(data, columns=feature_names)

# Cluster on every column of the scaled frame.
x = df.iloc[:, :]
model = KMeans(n_clusters=18, n_init='auto', random_state=1)
model.fit(x)

# Scatter plot of the first two features, coloured by cluster assignment.
plt.scatter(x['age'], x['education-num'], c=model.labels_)
plt.xlabel('age')
plt.ylabel('education-num')
plt.show()

# Coordinates of each cluster centre (in standardised units).
centers = model.cluster_centers_
print(centers)



[[-2.91142097e-02  1.40109120e+00  1.31857162e-01 -2.41687797e-01
   2.90858753e+00  1.41364716e+00]
 [-3.83170367e-01 -4.41916832e-01 -1.44164908e-01 -2.41687797e-01
  -1.32448339e+00 -7.07390096e-01]
 [ 1.87226098e-01 -2.68924945e-01  8.18278734e-01 -2.41687797e-01
  -8.87341056e-02  1.41364716e+00]
 [ 9.66728338e-01 -3.32114168e-01 -1.32720099e-01 -2.41687797e-01
   6.04812670e-01 -7.07390096e-01]
 [ 1.21821111e+00 -4.14944908e-01 -1.65421391e-01  4.18584716e+00
   1.91554453e-01  1.41364716e+00]
 [ 2.18342278e-01  1.69051760e+00  1.13404651e+01 -2.41687797e-01
   2.37472033e-01  1.41364716e+00]
 [ 7.84216239e-01  1.21804623e+00 -1.41860367e-01 -2.41687797e-01
  -4.19357213e-01 -7.07390096e-01]
 [ 2.28346100e-01  1.19072408e+00 -6.06936051e-02 -2.41687797e-01
   1.40023908e+00  1.41364716e+00]
 [ 2.22226809e+00 -4.48070386e-01 -1.43539775e-01 -2.28908316e-01
  -2.70208041e+00 -7.07390096e-01]
 [-2.19344983e-01 -3.59336548e-01 -1.49748903e-01 -2.41687797e-01
   4.51220146e-01 -7.0739

In [None]:
# Elbow method: fit k-means for each candidate number of clusters and record
# the inertia, then look for the "elbow" in the curve to choose K.
Dist = []
K = range(1, 18)
for k in K:
    # `fit` returns the fitted estimator, so one fit per k is all that is
    # needed; fitting a second time would only double the runtime.
    kmeanModel = KMeans(n_clusters=k, n_init='auto', random_state=1).fit(x)
    Dist.append(kmeanModel.inertia_)
print(Dist)

# Plot against the same K values that were iterated over, so the x-axis cannot
# drift out of sync with the loop range.
plt.plot(list(K), Dist, 'bx-')
plt.xlabel('number of clusters (k)')
plt.ylabel('inertia')
plt.show()



In [125]:
import pandas as pd
import queue
from sklearn.neighbors import NearestNeighbors
import numpy as np

# --- Build the neighbour index once ------------------------------------------
# The frame, the feature subset and the fitted model do not depend on which
# row is being queried, so they are created outside the processing loop.
df = pd.DataFrame(data)

# Features used for the neighbour search, selected by position (columns 0, 1
# and 4: age, education-num and hours-per-week in the column order used when
# the data was standardised).
features = df.iloc[:, [0, 1, 4]]

# Six neighbours are requested so that five remain once the query row itself
# is removed from the result.
knn_model = NearestNeighbors(n_neighbors=6, algorithm='auto')  # Adjust n_neighbors as needed
knn_model.fit(features)

# --- Process the queue of row indices to look up -----------------------------
# The queue is seeded with row 1.
q = queue.Queue()
q.put(1)

while not q.empty():
    index = q.get()

    # Row whose neighbours are wanted.
    target_data_point = features.iloc[index]

    # Positions of the nearest rows, with the query row itself filtered out.
    _, indices = knn_model.kneighbors([target_data_point])
    neighbor_indices: np.ndarray = indices[0]
    neighbor_indices = neighbor_indices[neighbor_indices != index]

    # NOTE: to expand the search, the neighbours could be enqueued here with
    # q.put(i). That needs a "visited" set first, otherwise rows keep
    # re-enqueueing each other and the loop never terminates.

    # Print the neighbours (all columns, not just the search features).
    print("Nearest neighbors:")
    print(df.iloc[neighbor_indices])

    # Keep the neighbour positions as a plain Python list.
    neighbor_indices = neighbor_indices.tolist()

Nearest neighbors:
              0         1         2         3         4        5
15081  0.810470  1.097452 -0.165421 -0.241688 -2.961453 -0.70739
10727  1.197121  1.097452 -0.165421 -0.241688 -2.372177 -0.70739
8454   0.423820  1.097452 -0.165421 -0.241688 -2.372177 -0.70739
4792   0.887800  1.097452 -0.165421 -0.241688 -2.961453 -0.70739
2774   0.346489  1.097452 -0.165421 -0.241688 -2.624724 -0.70739


In [None]:
def generate_cost_table(data_point_1, data_point_2):
    """Return the cost of moving between two data points.

    The cost is the sum of squared per-feature differences (the squared
    Euclidean distance). Despite the function's name, the result is a single
    scalar, not a table.

    Features are paired by position, so the function works for lists, tuples,
    NumPy arrays and pandas Series regardless of the Series' index labels.

    Args:
        data_point_1: Iterable of numeric feature values.
        data_point_2: Iterable of numeric feature values, same length as
            ``data_point_1``.

    Returns:
        The sum of ``(a - b) ** 2`` over corresponding features; ``0`` when
        both points are empty.

    Raises:
        ValueError: If the two points do not have the same number of features.
    """
    # Materialise both inputs so they are iterated positionally; indexing a
    # pandas Series with `series[i]` is label-based and breaks for non-default
    # indexes.
    first = list(data_point_1)
    second = list(data_point_2)

    # Refuse mismatched inputs instead of silently ignoring trailing features.
    if len(first) != len(second):
        raise ValueError(
            "data points must have the same number of features: "
            f"{len(first)} != {len(second)}"
        )

    return sum(((a - b) ** 2 for a, b in zip(first, second)), 0)