In [33]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Clustering analysis of the salaries dataset: scale the features, fit KMeans
# models, choose the cluster count with the highest silhouette score, and print
# the summary figures for questions Q1-Q7.

# Load the dataset
data = pd.read_csv('/Users/jeevandeep/Downloads/ds_salaries_assignment2.csv')

# Prepare the feature set X by excluding the target column 'salary_in_usd'.
# drop() returns a new frame, so X is not affected by the 'cluster_labels'
# column that is added to `data` further down.
X = data.drop(columns=['salary_in_usd'])

# Scale the feature set using MinMaxScaler, keeping the column names so the
# scaled features can still be addressed by name.
# NOTE(review): presumably every column in the CSV is already numeric/encoded
# (the checks below compare columns against 0/1) -- confirm against the data.
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Train a KMeans clustering algorithm on the scaled feature set with 3 clusters,
# using random_state=0. It is fitted on its own (rather than taken from the
# sweep below) so Q3 does not depend on 3 being inside the sweep range.
kmeans_3 = KMeans(n_clusters=3, random_state=0)
kmeans_3.fit(X_scaled)
cluster_counts_3 = pd.Series(kmeans_3.labels_).value_counts()
largest_cluster_size_3 = cluster_counts_3.max()

# Calculate silhouette scores for a range of cluster numbers to find the
# optimal number. Each fitted model is kept so the winner can be reused
# instead of being fitted a second time.
range_clusters = range(2, 10)
silhouette_scores = []
fitted_models = {}
for n_clusters in range_clusters:
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    cluster_labels = kmeans.fit_predict(X_scaled)
    score = silhouette_score(X_scaled, cluster_labels)
    silhouette_scores.append(score)
    fitted_models[n_clusters] = kmeans

# Determine the best number of clusters based on the highest silhouette score
# (list.index returns the first match, so ties resolve to the smaller count).
# NOTE(review): the recorded run selected 9, the upper end of the sweep, so a
# wider range might score higher -- worth checking.
best_score = max(silhouette_scores)
best_n_clusters = range_clusters[silhouette_scores.index(best_score)]

# With a fixed integer random_state the fit is deterministic, so the model from
# the sweep carries the same labels a fresh fit with these settings would give.
kmeans_best = fitted_models[best_n_clusters]
cluster_counts_best = pd.Series(kmeans_best.labels_).value_counts()
largest_cluster_size_best = cluster_counts_best.max()

# Analyze the characteristics of clusters
data['cluster_labels'] = kmeans_best.labels_
smallest_cluster_index = cluster_counts_best.idxmin()
largest_cluster_index = cluster_counts_best.idxmax()
smallest_cluster_data = data[data['cluster_labels'] == smallest_cluster_index]
largest_cluster_data = data[data['cluster_labels'] == largest_cluster_index]
# Each flag is True only when the condition holds for every row (or, for
# majority_remote, for every cluster's mean remote_ratio).
smallest_non_us = (smallest_cluster_data['company_location'] == 0).all()
majority_remote = (data.groupby('cluster_labels')['remote_ratio'].mean() > 50).all()
largest_full_time = (largest_cluster_data['employment_type_FT'] == 1).all()
smallest_full_time = (smallest_cluster_data['employment_type_FT'] == 1).all()
largest_us = (largest_cluster_data['company_location'] == 1).all()

# Print answers to questions
# Q1 reads from X (the unscaled feature set) to match its label.
print(f"Q1: Average experience level in X: {X['experience_level'].mean():.2f}")
print(f"Q2: Average experience level in scaled X: {X_scaled['experience_level'].mean():.2f}")
print(f"Q3: Number of instances in the largest cluster (3 clusters): {largest_cluster_size_3}")
print(f"Q4: Highest silhouette score: {best_score:.2f}")
print(f"Q5: Best n_clusters: {best_n_clusters} clusters")
print(f"Q6: Number of instances in the largest cluster ({best_n_clusters} clusters): {largest_cluster_size_best}")
print(f"Q7: All non-US in smallest cluster: {smallest_non_us}, Majority remote: {majority_remote}, "
      f"All FT in largest cluster: {largest_full_time}, All FT in smallest cluster: {smallest_full_time}, "
      f"All US in largest cluster: {largest_us}")


Q1: Average experience level in X: 1.82
Q2: Average experience level in scaled X: 0.61
Q3: Number of instances in the largest cluster (3 clusters): 329
Q4: Highest silhouette score: 0.41
Q5: Best n_clusters: 9 clusters
Q6: Number of instances in the largest cluster (9 clusters): 143
Q7: All non-US in smallest cluster: False, Majority remote: False, All FT in largest cluster: True, All FT in smallest cluster: False, All US in largest cluster: True
