In [None]:
from sklearn.preprocessing import StandardScaler
from umap import UMAP
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# End-to-end pipeline in a single cell:
# encode -> scale -> UMAP -> KMeans -> PCA -> RandomForest -> report.
# NOTE(review): this cell reads `df` without loading it and needs umap-learn to be
# installed; in this notebook both happen in later cells, so it cannot be the first
# cell executed on a fresh kernel — confirm whether it should stay at the top.

# Data Preprocessing
# Replace the 'State' column with its integer category codes.
df['State'] = df['State'].astype('category').cat.codes

# Assuming no NaN values as per the dataset info.
# If NaN values are present, impute before scaling, e.g.:
#   df.fillna(df.mean(), inplace=True)

# Standardize the data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

# Dimensionality Reduction using UMAP
# n_jobs=1 is passed explicitly: umap-learn forces single-threaded execution whenever
# random_state is set and warns about the override if n_jobs is left at its default.
umap = UMAP(n_components=3, random_state=42, n_jobs=1)  # 3 components as an example
df_reduced = umap.fit_transform(df_scaled)

# Clustering
# n_init is pinned because scikit-learn changed its default (10 -> 'auto'); fixing it
# keeps the cluster assignment independent of the installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)  # 3 clusters as an example
clusters = kmeans.fit_predict(df_reduced)

# Feature Extraction for Classification
# Here we use PCA for simplicity, but this can be adjusted.
# random_state only matters if PCA picks a randomized/arpack solver; it is set so that
# every estimator in the pipeline is seeded the same way.
pca = PCA(n_components=5, random_state=42)  # 5 components as an example
df_features = pca.fit_transform(df_scaled)

# Classification
# NOTE: the targets are the KMeans labels computed above from the same scaled data, so
# the score measures how well the classifier reproduces the clustering.
X_train, X_test, y_train, y_test = train_test_split(df_features, clusters, test_size=0.3, random_state=42)
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

# Predictions and Evaluation
predictions = classifier.predict(X_test)
classification_report_output = classification_report(y_test, predictions)

# Show the classification report for evaluation.
# print() renders the report's newlines; a bare string expression would be echoed as an
# escaped repr.
print(classification_report_output)



In [5]:
!pip install umap-learn


Collecting umap-learn
  Downloading umap-learn-0.5.5.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.9/90.9 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.11-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.8/55.8 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: umap-learn
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Created wheel for umap-learn: filename=umap_learn-0.5.5-py3-none-any.whl size=86832 sha256=711e5059f3ce07bf4c96321f1c2a83aa015c4f0c6a5ceefbdcac7f7b4c6e52b7
  Stored in directory: /root/.cache/pip/wheels/3a/70/07/428d2b58660a1a3b431db59b806a10da736612ebbc66c1bcc5
Successfully built umap-learn
Installing collected packages: pynndescent, umap-learn
Successfully installed pynndescent-0.5.11 umap-learn-0.5.5


In [6]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from umap import UMAP
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [7]:
# Load the raw CSV into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer='/content/availability-of-boys-toilet.csv')

In [8]:
# Data preprocessing: swap the 'State' column for its integer category codes.
# This overwrites the column on `df` in place.
df['State'] = pd.Categorical(df['State']).codes

In [9]:
# Standardize the data: learn the scaling parameters from `df`, then apply them to
# the same frame. `scaler` is kept so the fitted transformer stays available.
scaler = StandardScaler().fit(df)
df_scaled = scaler.transform(df)

In [10]:
# Dimensionality Reduction using UMAP
# n_jobs=1 is passed explicitly: umap-learn forces single-threaded execution whenever
# random_state is set and emits a warning about the override if n_jobs is left at its
# default. The embedding itself is unchanged.
umap = UMAP(n_components=3, random_state=42, n_jobs=1)  # 3 components as an example
df_reduced = umap.fit_transform(df_scaled)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [11]:
# Clustering
# n_init is pinned because scikit-learn changed its default (10 -> 'auto'); fixing it
# keeps the cluster assignment independent of the installed version and avoids the
# deprecation warning on versions that still default to 10.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)  # 3 clusters as an example
clusters = kmeans.fit_predict(df_reduced)



In [12]:
# Feature Extraction for Classification
# Here we use PCA for simplicity, but this can be adjusted.
# random_state only has an effect if PCA selects a randomized/arpack solver; it is set
# so that this estimator is seeded like the other estimators in the notebook.
pca = PCA(n_components=5, random_state=42)  # 5 components as an example
df_features = pca.fit_transform(df_scaled)

In [13]:
# Classification: hold out 30% of the rows, then fit a random forest on the rest.
# NOTE: the targets (`clusters`) are KMeans labels and the inputs (`df_features`) are
# PCA components; both come from the same `df_scaled`, so the resulting score describes
# how well the classifier reproduces the clustering.
X_train, X_test, y_train, y_test = train_test_split(
    df_features,
    clusters,
    test_size=0.3,
    random_state=42,
)
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X=X_train, y=y_train)

In [14]:
# Predict on the held-out split and build the text report
# (precision / recall / f1 per cluster label).
predictions = classifier.predict(X=X_test)
classification_report_output = classification_report(
    y_true=y_test,
    y_pred=predictions,
)


In [15]:
# Show the classification report for evaluation.
# print() is used because a bare string expression is echoed as an escaped repr
# (literal "\n" sequences) rather than as a formatted table.
print(classification_report_output)


'              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00         3\n           1       1.00      1.00      1.00         4\n           2       1.00      1.00      1.00         4\n\n    accuracy                           1.00        11\n   macro avg       1.00      1.00      1.00        11\nweighted avg       1.00      1.00      1.00        11\n'

In [17]:
# Re-run prediction on the held-out split and print the report as a formatted table.
predictions = classifier.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         4
           2       1.00      1.00      1.00         4

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11



In [24]:
# Combine reduced data with cluster labels
# Column names are derived from the embedding width, so this keeps working if the
# number of UMAP components is changed (three components still yield UMAP1..UMAP3).
umap_columns = [f'UMAP{i + 1}' for i in range(df_reduced.shape[1])]
reduced_df = pd.DataFrame(df_reduced, columns=umap_columns)
reduced_df['Cluster'] = clusters

In [25]:
# Persist the embedding plus cluster labels; the row index is not written.
reduced_df.to_csv(path_or_buf='reduced_data_with_clusters.csv', index=False)