In [None]:

import pandas as pd

# Remote location of the MODIS vegetation-index CSV (raw file hosted on GitHub).
file_path = "https://raw.githubusercontent.com/fangayou90/Unsupervised_Project_EDA/main/MODIS_Vegetation_Indices.csv"

# Fetch and parse the CSV into the DataFrame that later cells read from.
modis_data = pd.read_csv(filepath_or_buffer=file_path)

# Report the (rows, columns) tuple so the dataset size is visible at a glance.
dataset_shape = modis_data.shape
print("Dataset Shape:", dataset_shape)

# Keep the preview as the final expression so the notebook renders it as a table.
modis_data.head(n=5)
    

In [None]:

# Descriptive-statistics table produced by DataFrame.describe() with its default
# settings; held in a name first, then displayed as the cell's output.
summary_stats = modis_data.describe()
summary_stats
    

In [None]:

import seaborn as sns
import matplotlib.pyplot as plt

# Grid of pairwise plots for the two vegetation indices and the two coordinates.
pair_columns = ['NDVI', 'EVI', 'Latitude', 'Longitude']
pair_frame = modis_data[pair_columns]
sns.pairplot(pair_frame)
plt.show()
    

In [None]:

# Pairwise correlations (DataFrame.corr() defaults) among the vegetation indices
# and the coordinates.
corr_columns = ['NDVI', 'EVI', 'Latitude', 'Longitude']
corr_matrix = modis_data.loc[:, corr_columns].corr()

# Draw the annotated heatmap on an explicitly created Axes.
fig, ax = plt.subplots()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Matrix Heatmap")
plt.show()
    

In [None]:

# Histogram of NDVI with a kernel density estimate drawn over the bars.
ndvi_values = modis_data['NDVI']
hist_ax = sns.histplot(ndvi_values, kde=True)
hist_ax.set_title("NDVI Distribution")
hist_ax.set_xlabel("NDVI")
plt.show()
    

In [None]:

# One point per row: latitude on the x-axis, NDVI on the y-axis.
scatter_ax = sns.scatterplot(x='Latitude', y='NDVI', data=modis_data)
scatter_ax.set_title("Latitude vs NDVI")
scatter_ax.set_xlabel("Latitude")
scatter_ax.set_ylabel("NDVI")
plt.show()
    

In [None]:

# Empirical cumulative distribution of NDVI, drawn on an explicitly created Axes.
fig, ax = plt.subplots()
sns.ecdfplot(x='NDVI', data=modis_data, ax=ax)
ax.set_title("CDF of NDVI")
ax.set_xlabel("NDVI")
ax.set_ylabel("Proportion")
plt.show()
    

In [None]:

# Flag every null cell, then total the flags down each column to get a
# per-column count of missing values.
null_mask = modis_data.isna()
missing_data = null_mask.sum(axis=0)
print("Missing Data:\n", missing_data)
    

In [None]:

from sklearn.preprocessing import StandardScaler

# Select numerical features for analysis.
# Later cells append cluster-label columns to `modis_data`. They are dropped here
# so that re-running this cell never feeds an earlier clustering result back in
# as an input feature. On a fresh kernel the columns do not exist yet and
# errors='ignore' turns the drop into a no-op.
CLUSTER_LABEL_COLUMNS = ['KMeans_Cluster', 'DBSCAN_Cluster']
numerical_features = (
    modis_data
    .drop(columns=CLUSTER_LABEL_COLUMNS, errors='ignore')
    .select_dtypes(include=['float64', 'int64'])
    .columns
)

# Standardize the selected columns with StandardScaler's default settings.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(modis_data[numerical_features])

# Wrap the scaled array in a DataFrame that keeps both the column names and the
# row index of `modis_data`, so the two frames stay aligned row for row.
scaled_df = pd.DataFrame(scaled_data, columns=numerical_features, index=modis_data.index)
scaled_df.head()
        

In [None]:

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Apply KMeans clustering.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to 'auto'
# in version 1.4 (and emits a FutureWarning on 1.2/1.3 when it is omitted), so
# leaving it out makes the fitted clusters depend on the installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans_labels = kmeans.fit_predict(scaled_data)

# Add cluster labels to the dataset
modis_data['KMeans_Cluster'] = kmeans_labels

# Scatter of the first two standardized feature columns (columns 0 and 1 of
# `scaled_data`), coloured by the assigned cluster.
fig, ax = plt.subplots()
cluster_points = ax.scatter(scaled_data[:, 0], scaled_data[:, 1], c=kmeans_labels, cmap='viridis', s=10)
ax.set_title('KMeans Clustering (2D Projection)')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
fig.colorbar(cluster_points, ax=ax, label='Cluster')
plt.show()
        

# Evaluate the KMeans partition with the mean silhouette coefficient over all
# samples (values lie between -1 and 1; higher means better-separated clusters).
from sklearn.metrics import silhouette_score

# The standardized feature matrix in this notebook is `scaled_data`; the previous
# name `data_scaled` was never defined and raised a NameError.
silhouette_avg = silhouette_score(scaled_data, kmeans.labels_)
print(f'Silhouette Score for KMeans: {silhouette_avg}')

In [None]:

from sklearn.cluster import DBSCAN

# Density-based clustering of the standardized features.
dbscan = DBSCAN(min_samples=5, eps=0.5)
dbscan_labels = dbscan.fit_predict(scaled_data)

# Store the DBSCAN assignment next to the original observations.
modis_data['DBSCAN_Cluster'] = dbscan_labels

# Scatter of the first two standardized feature columns, coloured by DBSCAN label.
fig, ax = plt.subplots()
density_points = ax.scatter(scaled_data[:, 0], scaled_data[:, 1], c=dbscan_labels, cmap='plasma', s=10)
ax.set_title('DBSCAN Clustering (2D Projection)')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
fig.colorbar(density_points, ax=ax, label='Cluster')
plt.show()
        

In [None]:

from sklearn.decomposition import PCA

# Project the standardized features onto their first two principal components.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_data)

# Plot the two component scores against each other, coloured by the KMeans labels.
first_component = pca_result[:, 0]
second_component = pca_result[:, 1]
fig, ax = plt.subplots()
component_points = ax.scatter(first_component, second_component, c=kmeans_labels, cmap='viridis', s=10)
ax.set_title('PCA Dimensionality Reduction')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
fig.colorbar(component_points, ax=ax, label='Cluster (KMeans)')
plt.show()
        

# Evaluate the KMeans partition with the mean silhouette coefficient over all
# samples (values lie between -1 and 1; higher means better-separated clusters).
from sklearn.metrics import silhouette_score

# The standardized feature matrix in this notebook is `scaled_data`; the previous
# name `data_scaled` was never defined and raised a NameError.
silhouette_avg = silhouette_score(scaled_data, kmeans.labels_)
print(f'Silhouette Score for KMeans: {silhouette_avg}')

# Fraction of the total variance captured by each of the fitted principal
# components, in component order.
explained_variance_ratio = pca.explained_variance_ratio_
print(f'Explained Variance Ratio: {explained_variance_ratio}')

### Data Loading and Overview
The dataset is loaded from an external source into a pandas DataFrame. The first step is to inspect the dataset's structure using the `.shape` attribute and the `.head()` method. This shows the number of samples (rows) and features (columns) available for analysis. Initial inspection is crucial to ensure data quality and compatibility with the planned analysis steps. For this project, the MODIS Vegetation Indices dataset contains over 1,000 samples, surpassing the minimum required for meaningful unsupervised learning. This step ensures that the dataset meets size requirements and sets the foundation for further exploration.

### Descriptive Statistics
Descriptive statistics provide a quick summary of the dataset's numerical features, helping to understand the central tendency, spread, and range of each variable. Metrics such as the mean, standard deviation, minimum, and maximum are computed using the `.describe()` method. These statistics are key to identifying potential outliers and understanding the overall distribution of the data. For example, the NDVI and EVI indices' ranges indicate vegetation health, which is critical for understanding patterns in this dataset. Detecting anomalies at this stage helps guide the subsequent preprocessing and visualization efforts.

### Correlation Matrix Heatmap
A heatmap of the correlation matrix visualizes the relationships between variables in the dataset. Each cell in the heatmap represents the strength and direction of the correlation between two variables, ranging from -1 (perfect negative correlation) to +1 (perfect positive correlation). For this dataset, a strong positive correlation is observed between NDVI and EVI, which are both vegetation indices. Annotating the heatmap with numerical values further clarifies these relationships. This visualization is essential for identifying variables that may influence clustering results and for understanding inter-variable dependencies.

### Visualization of NDVI Distribution
Analyzing the distribution of NDVI values provides insights into the dataset's underlying structure. A histogram with a kernel density estimate (KDE) overlay is used to visualize the frequency of different NDVI values, helping to identify patterns such as skewness or the presence of outliers. NDVI, which stands for Normalized Difference Vegetation Index, measures vegetation health and is a critical feature in this dataset. Understanding its distribution ensures that the clustering and dimensionality reduction methods applied later are informed by the dataset's characteristics. This step sets the stage for more detailed analysis and interpretation.