In [5]:
import numpy as np

# Location of the plain-text expression matrix, relative to the notebook
file_path = 'gene_expression_matrix.txt'

# Parse the text file into a NumPy array (np.loadtxt defaults: whitespace-delimited floats)
data = np.loadtxt(fname=file_path)

# Show the array; NumPy abbreviates large arrays with "..." when printing
print(data)


[[8.5894163e+03 5.4682409e+03 4.2634075e+03 ... 7.5677500e+01
  8.3522500e+01 2.8701250e+01]
 [3.8257050e+03 6.9703614e+03 5.3699688e+03 ... 4.2656250e+01
  1.6092500e+01 1.5156250e+01]
 [3.2303287e+03 3.6944500e+03 3.4007400e+03 ... 5.7598750e+01
  7.4887500e+00 3.1812500e+01]
 ...
 [6.7306250e+03 3.4721250e+03 2.5594625e+03 ... 1.3352125e+02
  9.3098750e+01 7.4325000e+00]
 [7.4720100e+03 3.6539341e+03 2.7282162e+03 ... 7.7215000e+01
  4.9862500e+01 3.9631250e+01]
 [7.1211737e+03 5.3466477e+03 4.2541175e+03 ... 8.9797500e+01
  4.0756250e+01 3.7982500e+01]]


In [6]:
# Dimensions of the loaded matrix as a (rows, columns) tuple
np.shape(data)

(62, 2000)

In [7]:
# Display every row restricted to the three leading columns of the matrix
first_three_columns = data[:, 0:3]
print(first_three_columns)

[[ 8589.4163  5468.2409  4263.4075]
 [ 3825.705   6970.3614  5369.9688]
 [ 3230.3287  3694.45    3400.74  ]
 [ 7126.5988  3779.0682  3705.5537]
 [ 9330.6787  7017.2295  4723.7825]
 [14876.407   3201.9045  2327.6263]
 [ 4469.09    5167.0568  4773.68  ]
 [ 4913.7988  5215.0477  4288.6162]
 [ 7144.4062  2071.4023  1619.2762]
 [ 5382.3938  3848.4432  3372.4887]
 [ 7434.8213  6471.2114  5029.6175]
 [ 4214.9     2213.3568  1611.5188]
 [ 8865.4587  5447.1864  4887.0575]
 [ 5934.8888  3744.9886  3528.8337]
 [ 5821.6175  3748.2477  3439.9538]
 [ 9767.0275  9785.775   8605.0438]
 [13324.729   9505.0341  7740.9875]
 [12977.712   7565.6159  5735.2   ]
 [ 8753.2388  8978.1341  7777.8412]
 [ 5012.02    1383.4886  1269.6487]
 [ 6904.8012  2260.7773  1987.0012]
 [ 8347.9838  9852.2977  8178.965 ]
 [ 5100.5363  3343.3205  2925.67  ]
 [ 4554.5762  3139.3114  2958.4025]
 [ 5466.93   10152.273   7760.9175]
 [ 4201.5075  2425.6273  2228.8175]
 [ 9128.1188  5502.7159  4590.1338]
 [ 3799.0888  5665.7795  515

In [8]:
# Standardize each column to zero mean and unit variance (z-score).
column_means = np.mean(data, axis=0)
column_stds = np.std(data, axis=0)

# A constant column has a standard deviation of exactly 0; dividing by it
# would produce NaN/inf and poison every later similarity computation.
# Scaling such a column by 1 instead leaves it as all zeros after centering.
safe_stds = np.where(column_stds == 0, 1.0, column_stds)
standardized_data = (data - column_means) / safe_stds

# Print the first three columns of the standardized matrix
print(standardized_data[:, :3])

[[ 0.51292947  0.23088092  0.09353633]
 [-1.03981707  0.92273048  0.70714741]
 [-1.23388184 -0.58609512 -0.3848306 ]
 [ 0.03611955 -0.5471215  -0.21580511]
 [ 0.75454626  0.94431708  0.34882377]
 [ 2.56219368 -0.81295267 -0.97989429]
 [-0.83010373  0.09216096  0.37649296]
 [-0.68514951  0.1142647   0.10751508]
 [ 0.04192392 -1.33364156 -1.37268904]
 [-0.53240951 -0.51516863 -0.40049653]
 [ 0.13658563  0.69283101  0.51841559]
 [-0.91295775 -1.26825988 -1.37699068]
 [ 0.60290635  0.2211836   0.43936313]
 [-0.35232204 -0.56281795 -0.3138    ]
 [-0.38924318 -0.56131687 -0.36308574]
 [ 0.89677551  2.2194591   2.50106326]
 [ 2.0564194   2.09015492  2.02192621]
 [ 1.94330813  1.19689394  0.90967565]
 [ 0.56632792  1.8474743   2.04236235]
 [-0.653134   -1.65048218 -1.56656471]
 [-0.03617607 -1.24641886 -1.16877794]
 [ 0.43423379  2.25009826  2.26479377]
 [-0.62428184 -0.74781902 -0.64826672]
 [-0.80223924 -0.84178193 -0.63011587]
 [-0.50485467  2.38826146  2.0329778 ]
 [-0.91732308 -1.17049193

In [15]:

# Implement k-means clustering with cosine similarity
def cosine_similarity(a, b):
    """Return the cosine of the angle between two 1-D vectors.

    Parameters
    ----------
    a, b : array_like
        Vectors of equal length.

    Returns
    -------
    float
        ``dot(a, b) / (||a|| * ||b||)``, a value in ``[-1, 1]`` up to
        floating-point rounding. If either vector has zero norm the angle
        is undefined; ``0.0`` is returned in that case instead of the NaN
        (and RuntimeWarning) that a 0/0 division would produce.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # Undefined direction: treat as "no similarity" rather than NaN,
        # which would otherwise win every np.argmax comparison downstream.
        return 0.0
    return np.dot(a, b) / norm_product

def _init_centroids_kmeanspp(data, k):
    """Pick ``k`` rows of ``data`` as starting centroids using k-means++ seeding."""
    num_samples = data.shape[0]
    first = data[np.random.choice(num_samples)]
    centroids = [first]
    # Squared Euclidean distance from every sample to its closest chosen centroid.
    closest_sq_dist = np.sum((data - first) ** 2, axis=1)
    for _ in range(1, k):
        total = closest_sq_dist.sum()
        if total > 0:
            # k-means++ samples proportionally to the *squared* distance.
            probabilities = closest_sq_dist / total
        else:
            # Every sample coincides with a chosen centroid; dividing by the
            # zero total would give NaN probabilities, so fall back to uniform.
            probabilities = np.full(num_samples, 1.0 / num_samples)
        chosen = data[np.random.choice(num_samples, p=probabilities)]
        centroids.append(chosen)
        closest_sq_dist = np.minimum(closest_sq_dist, np.sum((data - chosen) ** 2, axis=1))
    # np.array copies the rows, so later centroid updates never write into data.
    return np.array(centroids, dtype=float)


def _pairwise_cosine_similarity(samples, centroids):
    """Return the (num_samples, k) matrix of cosine similarities; zero-norm pairs get 0."""
    sample_norms = np.linalg.norm(samples, axis=1, keepdims=True)
    centroid_norms = np.linalg.norm(centroids, axis=1, keepdims=True)
    norm_products = sample_norms * centroid_norms.T
    dot_products = samples @ centroids.T
    similarities = np.zeros_like(dot_products)
    # Only divide where the denominator is non-zero; other entries stay 0
    # instead of becoming NaN (which np.argmax would treat as the maximum).
    np.divide(dot_products, norm_products, out=similarities, where=norm_products > 0)
    return similarities


def k_means_clustering(data, k, max_iterations=100):
    """Cluster the rows of ``data`` into ``k`` groups using cosine similarity.

    Centroids are seeded with k-means++ (squared Euclidean distance weighting,
    drawn from NumPy's global random state, so ``np.random.seed`` controls
    reproducibility). Each iteration assigns every sample to the centroid with
    the highest cosine similarity and then moves each centroid to the mean of
    its assigned samples. Iteration stops early once assignments no longer
    change, which yields the same result as running all ``max_iterations``.

    Parameters
    ----------
    data : array_like, shape (num_samples, num_features)
        Samples to cluster. Converted to float so that centroid means are not
        truncated when integer data is passed.
    k : int
        Number of clusters; must be at least 1.
    max_iterations : int, optional
        Upper bound on assignment/update iterations (default 100).

    Returns
    -------
    labels : numpy.ndarray of float, shape (num_samples,)
        Cluster index of every sample (stored as floats, e.g. ``0.0``).
    average_distortion : float
        Mean squared Euclidean distance between each sample and its assigned
        centroid. Note this is a Euclidean measure even though assignment
        uses cosine similarity.
    centroids : numpy.ndarray of float, shape (k, num_features)
        Final centroid positions. A cluster that receives no samples keeps
        its previous centroid.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    data = np.asarray(data, dtype=float)
    num_samples = data.shape[0]

    centroids = _init_centroids_kmeanspp(data, k)
    labels = np.zeros(num_samples)

    for iteration in range(max_iterations):
        # Assign each sample to the most similar centroid (ties -> lowest index).
        similarities = _pairwise_cosine_similarity(data, centroids)
        new_labels = np.argmax(similarities, axis=1).astype(float)

        # Unchanged assignments mean the centroids computed in the previous
        # iteration are already the means of these clusters: converged.
        # The check is skipped on the first pass because `labels` is still
        # the all-zero placeholder, not a real assignment.
        if iteration > 0 and np.array_equal(new_labels, labels):
            break
        labels = new_labels

        # Move each centroid to the mean of its members; leave empty clusters alone.
        for j in range(k):
            assigned_samples = data[labels == j]
            if len(assigned_samples) > 0:
                centroids[j] = assigned_samples.mean(axis=0)

    # Average squared Euclidean error between samples and their centroid.
    assigned_centroids = centroids[labels.astype(int)]
    average_distortion = np.sum((data - assigned_centroids) ** 2) / num_samples

    return labels, average_distortion, centroids
# Run k-means several times from different random initial centroids and keep
# the run with the lowest average distortion.
num_clusters = 2
num_runs = 5
# Number of leading rows treated as cancer patients in the summary below
num_cancer_patients = 40

# Seed NumPy's global random state once so the sequence of random
# initialisations (and every number printed below) is reproducible on
# Restart & Run All. The runs still differ from one another because the
# generator keeps advancing between calls.
np.random.seed(42)

# Variables to store the best clustering solution
best_distortion = float('inf')
best_labels = None
best_centroids = None

# List to store distortions for each run
run_distortions = []

for run in range(num_runs):
    cluster_labels, distortion, centroids = k_means_clustering(standardized_data, num_clusters)
    run_distortions.append(distortion)

    # Keep the solution with the smallest distortion seen so far
    if distortion < best_distortion:
        best_distortion = distortion
        best_labels = cluster_labels
        best_centroids = centroids

# Print the best clustering solution
print(f"\nBest Clustering Solution - Average Squared Error Distortion per Sample: {best_distortion}")
print("Best Cluster Labels:", best_labels)

# Share of the cancer patients (first num_cancer_patients rows) that landed in
# each cluster. These describe how the cancer patients are split, not what
# fraction of each cluster is made up of cancer patients.
cancer_patient_indices = range(num_cancer_patients)
cancer_labels = best_labels[:num_cancer_patients]
cluster_0_percentage = np.mean(cancer_labels == 0) * 100
cluster_1_percentage = np.mean(cancer_labels == 1) * 100

print(f"Percentage of Cancer Patients in Cluster 0: {cluster_0_percentage:.2f}%")
print(f"Percentage of Cancer Patients in Cluster 1: {cluster_1_percentage:.2f}%")

# Determine the cluster to which the 62nd patient (the last row) belongs
patient_62_cluster = best_labels[-1]
print(f"\nCluster Assignment for Patient 62: {patient_62_cluster}")

# Print all run distortions separately
print("\nSquared Error Distortions for Each Run:")
for run, distortion in enumerate(run_distortions):
    print(f"Run {run + 1}: {distortion}")


    


Best Clustering Solution - Average Squared Error Distortion per Sample: 1403.2046988217805
Best Cluster Labels: [1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 0. 1. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 1. 1.
 0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1.
 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0.]
Percentage of Cancer Patients in Cluster 0: 37.50%
Percentage of Cancer Patients in Cluster 1: 62.50%

Cluster Assignment for Patient 62: 0.0

Squared Error Distortions for Each Run:
Run 1: 1409.3311279185232
Run 2: 1409.3311279185232
Run 3: 1403.2046988217805
Run 4: 1409.3311279185232
Run 5: 1403.2046988217805


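Across the five runs, the best k-means solution (k = 2, cosine-similarity assignment) had an average squared error distortion per sample of roughly 1403.20. Of the 40 cancer patients, 37.50% were assigned to Cluster 0 and 62.50% to Cluster 1. These percentages describe how the cancer patients are split between the clusters; they do not by themselves show what each cluster is made of. Counting the printed labels, Cluster 0 contains 25 patients (15 cancer, 10 non-cancer) and Cluster 1 contains 37 patients (25 cancer, 12 non-cancer), so both clusters are majority cancer and the clustering does not cleanly separate cancer patients from the others. The presence of two groups is a consequence of choosing k = 2 rather than a finding in itself. Patient 62 was placed in Cluster 0, which means that patient's standardized expression profile is more similar (by cosine similarity) to the Cluster 0 centroid than to the Cluster 1 centroid. These results should be interpreted with caution: k-means is an unsupervised technique that only looks for patterns in the data, it never sees the cancer labels, and its outcome depends on the random initialisation (the runs above converged to two different solutions).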





