In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.manifold import MDS
from sklearn.preprocessing import StandardScaler

In [None]:


# Load the thresholded network statistics. The 'date' column is parsed into
# datetimes so each point can be labelled with a formatted date below.
df = pd.read_csv(
    '../netstats/processed/d_corr-netstats-threshold-0.4.csv',
    parse_dates=['date'],
)

# Feature matrix: one row per record, columns are density and global_cc.
X = df[['density', 'global_cc']].values

# Standardize so both features contribute on a comparable scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Embed the standardized features in two dimensions with MDS. The fixed
# random_state keeps the layout reproducible between runs.
mds = MDS(n_components=2, random_state=42)
X_mds = mds.fit_transform(X_scaled)

# Scatter plot of the embedding.
plt.figure(figsize=(12, 8))
plt.scatter(X_mds[:, 0], X_mds[:, 1], alpha=0.6)

# Label every point with the date of the record it came from.
label_style = {'fontsize': 8, 'alpha': 0.7}
for date, point in zip(df['date'], X_mds):
    plt.annotate(date.strftime('%Y-%m-%d'), (point[0], point[1]), **label_style)

plt.title('MDS of Density vs Global Clustering Coefficient')
plt.xlabel('MDS Dimension 1')
plt.ylabel('MDS Dimension 2')
plt.tight_layout()
plt.show()

# Find pairs that are close together
def find_close_pairs(X_mds, df, threshold=0.5):
    """Return the date pairs whose embedded points lie closer than *threshold*.

    Args:
        X_mds: Array-like of point coordinates, one row per record (for
            example the output of ``MDS.fit_transform``). Rows are matched
            to ``df`` by position.
        df: DataFrame with a datetime-like ``'date'`` column holding at
            least as many rows as ``X_mds``.
        threshold: Pairs with Euclidean distance strictly below this value
            are reported.

    Returns:
        A list of ``(date_i, date_j)`` tuples of ``'%Y-%m-%d'`` strings with
        ``i < j``, ordered by ``i`` and then by ``j``.
    """
    coords = np.asarray(X_mds)
    n_points = len(coords)
    if n_points < 2:
        # No pair can be formed; this also avoids reshaping an empty array.
        return []

    # One flat coordinate vector per point, so a row-wise norm works for
    # 1-D input (scalars per point) as well as the usual 2-D case.
    coords = coords.reshape(n_points, -1)
    dates = df['date']

    close_pairs = []
    for i in range(n_points - 1):
        # Distances from point i to every later point, computed in one call.
        distances = np.linalg.norm(coords[i + 1:] - coords[i], axis=1)
        for offset in np.flatnonzero(distances < threshold):
            j = i + 1 + int(offset)
            close_pairs.append(
                (
                    dates.iloc[i].strftime('%Y-%m-%d'),
                    dates.iloc[j].strftime('%Y-%m-%d'),
                )
            )
    return close_pairs

# Report every pair of dates whose embedded points fall within the default
# distance threshold of each other.
close_pairs = find_close_pairs(X_mds, df)
print("Pairs of dates with similar density and global_cc values:")
for first_date, second_date in close_pairs:
    print(f"{first_date} and {second_date}")