In [1]:
# import requirements
from sklearn import datasets
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.decomposition import PCA
import os
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
from sklearn.cluster import KMeans

In [2]:

# Helper that loads every CSV file in a region directory and labels its rows
def load_data_with_labels(path, label):
    """Load every CSV file in ``path`` into one array and label each row.

    Parameters
    ----------
    path : str
        Directory scanned (non-recursively) for files whose name ends in
        ``.csv``.
    label : scalar
        Value assigned to every row loaded from this directory.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The rows of all CSV files concatenated in file-name order (as
        ``DataFrame.values``), and a 1-D array of the same length filled
        with ``label``.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist (raised by ``os.listdir``).
    ValueError
        If ``path`` contains no ``.csv`` files.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the result reproducible across machines and runs.
    csv_names = sorted(
        name for name in os.listdir(path) if name.endswith('.csv')
    )
    if not csv_names:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # keep the exception type but say which directory was empty.
        raise ValueError(f"No CSV files found in {path!r}")

    frames = [pd.read_csv(os.path.join(path, name)) for name in csv_names]
    combined_data = pd.concat(frames, ignore_index=True)
    labels = np.full(combined_data.shape[0], label)
    return combined_data.values, labels

# Directory holding the CSV files of each region (relative to the working directory)
region1_path = r"Datasets/Sample Data/Region1"
region2_path = r"Datasets/Sample Data/Region2"
region3_path = r"Datasets/Sample Data/Region3"
region4_path = r"Datasets/Sample Data/Region4"
region5_path = r"Datasets/Sample Data/Region5"

# Read the regions in order; the region number (1-5) doubles as its label
(X1, y1), (X2, y2), (X3, y3), (X4, y4), (X5, y5) = (
    load_data_with_labels(region_dir, region_id)
    for region_id, region_dir in enumerate(
        [region1_path, region2_path, region3_path, region4_path, region5_path],
        start=1,
    )
)

# Stack the per-region arrays along the first axis: X holds the rows, y their labels
X = np.concatenate([X1, X2, X3, X4, X5])
y = np.concatenate([y1, y2, y3, y4, y5])

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'Datasets/Sample Data/Region1'

In [None]:
# Distribution of the first column of X, drawn as a 30-bin histogram
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(X[:, 0], bins=30)
ax.set_title('Histogram of Feature 1')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Column 0 of X against column 1; alpha=0.5 keeps overlapping points distinguishable
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(X[:, 0], X[:, 1], alpha=0.5)
ax.set_title('Scatter Plot of First Two Features')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
plt.show()

In [None]:
# K-Means clustering on a 2-D PCA projection of X
# Project X onto its first two principal components so the clusters can be plotted.
# random_state seeds the randomized SVD solver that svd_solver='auto' may select,
# so the projection is reproducible between runs.
# NOTE(review): StandardScaler is imported but X is not scaled before PCA; PCA is
# variance-based, so confirm the features share a comparable scale.
pca = PCA(n_components=2, random_state=1)
X_pca = pca.fit_transform(X)

# Cluster in the projected space.
# n_init is pinned explicitly: its default changed from 10 to 'auto' across
# scikit-learn releases, which would otherwise alter results (and emit a
# FutureWarning on intermediate versions).
# NOTE(review): five regions are loaded earlier but n_clusters is 3 -- confirm intended.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=1)
clusters = kmeans.fit_predict(X_pca)

# Show the projected points, coloured by their assigned cluster index
plt.figure(figsize=(6, 4))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='viridis', alpha=0.5)
plt.title('K-Means Clustering')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()

In [None]:
# Pairwise relationships among the first four columns of X
first_four = pd.DataFrame(X[:, :4])
sns.pairplot(first_four)
# y=1.02 (figure coordinates) lifts the title just above the top of the grid
plt.suptitle('Pair Plot of First Four Features', y=1.02)
plt.show()

In [None]:
# Spread and outliers of the first column of X as a single box plot
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=X[:, 0], ax=ax)
ax.set_title('Box Plot of Feature 1')
plt.show()

In [None]:
# Pairwise correlation between all columns of X, annotated with the coefficients
corr_matrix = pd.DataFrame(X).corr()
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Heatmap of Correlation Matrix')
plt.show()