In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap
import matplotlib.pyplot as plt

# Load the dataset
data = pd.read_csv('dataset.csv')

# Drop columns that are categorical or unnecessary for dimensionality reduction
data_cleaned = data.drop(columns=['STN Code', 'Name of Monitoring Location', 'State Name', 'Type Water Body'])

# Handle missing data by dropping rows with missing values (or alternatively you can impute missing values)
data_cleaned = data_cleaned.dropna()

# Standardize the data (important for dimensionality reduction)
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_cleaned)

# Apply PCA to reduce dimensionality to 50 dimensions first (for t-SNE/UMAP to work better)
pca = PCA(n_components=50)
data_pca = pca.fit_transform(data_scaled)

# Now apply t-SNE for 2D visualization
tsne = TSNE(n_components=2, perplexity=30, n_iter=300)
data_tsne = tsne.fit_transform(data_pca)

# Apply UMAP for 2D visualization
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2)
data_umap = umap_model.fit_transform(data_pca)

# Plotting t-SNE results
plt.figure(figsize=(10, 8))
plt.scatter(data_tsne[:, 0], data_tsne[:, 1], c='blue', edgecolor='k', s=100, alpha=0.5)
plt.title("t-SNE Dimensionality Reduction")
plt.xlabel("t-SNE Dimension 1")
plt.ylabel("t-SNE Dimension 2")
plt.show()

# Plotting UMAP results
plt.figure(figsize=(10, 8))
plt.scatter(data_umap[:, 0], data_umap[:, 1], c='red', edgecolor='k', s=100, alpha=0.5)
plt.title("UMAP Dimensionality Reduction")
plt.xlabel("UMAP Dimension 1")
plt.ylabel("UMAP Dimension 2")
plt.show()


ValueError: could not convert string to float: '-'

In [7]:
# --- Add these imports here too, just in case ---
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd # Assuming pandas is needed if X_orig is DataFrame

# --- Enable Matplotlib Interactive Backend ---
%matplotlib notebook

# --- Verification before plotting ---
print("--- Checking variables before plotting ---")
variables_ok = True
try:
    print(f"Type of X_pca_reduced_df: {type(X_pca_reduced_df)}, Shape: {X_pca_reduced_df.shape}")
except NameError:
    print("Error: X_pca_reduced_df is not defined.")
    variables_ok = False

try:
    print(f"Type of hard_labels: {type(hard_labels)}, Length: {len(hard_labels)}")
except NameError:
    print("Error: hard_labels is not defined.")
    variables_ok = False

try:
    print(f"Type of membership_u: {type(membership_u)}, Shape: {membership_u.shape}")
except NameError:
    print("Error: membership_u is not defined.")
    variables_ok = False

try:
    print(f"Value of optimal_c: {optimal_c}")
except NameError:
    print("Error: optimal_c is not defined.")
    variables_ok = False

if not variables_ok:
    print("!!! Please ensure Cell 1 ran correctly and defined the necessary variables. !!!")
print("--- Variable check complete ---")
# --- End of Verification ---


def plot_interactive_clusters_3d(X_orig, labels, k, title_suffix="Hard Labels"):
    # ... (rest of function is the same) ...
    plt.show() # Ensure this is here

def plot_interactive_fuzzy_partition_3d(X_orig, u, labels, k, title_suffix="Fuzzy Partition (Alpha=Certainty)"):
    # ... (rest of function is the same) ...
    plt.show() # Ensure this is here


# --- Generate the plots if data loaded AND VERIFIED ---
# Use the 'variables_ok' flag we just set
if variables_ok:
     print("Variables seem okay, attempting to generate plots...")
     plot_interactive_clusters_3d(X_pca_reduced_df, hard_labels, optimal_c)
     plot_interactive_fuzzy_partition_3d(X_pca_reduced_df, membership_u, hard_labels, optimal_c)
else:
     print("Skipping interactive plots due to missing data or FCM results (based on variable check).")


--- Checking variables before plotting ---
Type of X_pca_reduced_df: <class 'pandas.core.frame.DataFrame'>, Shape: (620, 10)
Type of hard_labels: <class 'numpy.ndarray'>, Length: 620
Type of membership_u: <class 'numpy.ndarray'>, Shape: (2, 620)
Value of optimal_c: 2
--- Variable check complete ---
Variables seem okay, attempting to generate plots...
