In [None]:
# Install the required packages for the project

%pip install -r requirements.txt

In [None]:
# Read the environment variables from the .env file

import os
from dotenv import load_dotenv

# Load the variables declared in the .env file into the process environment.
# override=True lets values from .env replace variables that are already set
# (python-dotenv semantics), so editing .env and re-running the cell takes effect.
load_dotenv(override=True) 

# GitHub API tokens, read from the comma-separated TOKENS environment variable.
# Blank entries are dropped: "".split(",") yields [""] when TOKENS is unset or
# empty, and an empty token would later be sent as a bogus "Bearer " header.
# Surrounding whitespace is stripped so "a, b" parses as two clean tokens.
TOKENS = [token.strip() for token in os.getenv("TOKENS", "").split(",") if token.strip()]

In [None]:
# Define constants for API usage

# Base URL that relative request paths are appended to.
API_URL = 'https://api.github.com'

# Initial request budget assigned to every token in the local counter.
# NOTE(review): GitHub documents 5,000 requests/hour for authenticated users, so
# 100000 presumably relies on the 403 handling to retire exhausted tokens — confirm.
LIMIT_REQUESTS_BY_TOKEN_PER_HOUR = 100000
# Initial request budget for unauthenticated (token-less) requests.
LIMIT_REQUESTS_BY_IP_PER_HOUR = 60

In [None]:
# Set API request utils (RUN IT ONLY ONCE TO KEEP COUNTER SYNCED)

import requests

# Remaining request budget per credential; the None key tracks token-less requests
counter = dict.fromkeys(TOKENS, LIMIT_REQUESTS_BY_TOKEN_PER_HOUR)
counter[None] = LIMIT_REQUESTS_BY_IP_PER_HOUR

# Get the next available token to use for the request
def get_avaliable_token():
    """Return the first credential in ``counter`` that still has budget left.

    Entries whose budget is zero or negative are invalidated along the way.

    Returns:
        A token string, or ``None``. ``None`` is returned both when the
        token-less (``None``) entry is selected and when nothing is left, so
        callers fall back to an unauthenticated request in either case.
    """
    # Iterate over a snapshot of the keys: invalidate_token() deletes entries
    # from ``counter``, and deleting from a dict while iterating over it raises
    # "RuntimeError: dictionary changed size during iteration".
    for candidate in list(counter):
        if counter[candidate] > 0:
            return candidate
        invalidate_token(candidate)

    return None
    
# Remove a token from the pool after an error so it is never picked again
def invalidate_token(token):
    """Delete ``token`` from ``counter`` and report the outcome on stdout.

    NOTE(review): both messages print the raw token value; consider masking it
    before sharing the notebook output.
    """
    if token not in counter:
        print(f"Token {token} not found in the list.")
        return

    counter.pop(token)
    print(f"Token {token} has been invalidated.")

# Record one request against a token's remaining budget
def used_token(token):
    """Decrement the budget of ``token`` and invalidate it once it reaches zero.

    Unknown tokens are only reported on stdout; nothing is modified for them.
    """
    if token not in counter:
        print(f"Token {token} not found in the list.")
        return

    counter[token] -= 1
    remaining = counter[token]
    # print(f"Token {token} used, remaining requests: {remaining}")
    if remaining == 0:
        invalidate_token(token)

# Function to make a request to the API
def request(path, timeout=30):
    """Send a GET request to the GitHub API using the next available token.

    Args:
        path: Path relative to ``API_URL`` (e.g. ``"users/octocat"``) or a full
            URL that already starts with ``API_URL``.
        timeout: Seconds to wait for the server; passed to ``requests.get``.
            When it elapses ``requests`` raises an exception, which propagates
            to the caller.

    Returns:
        The response of the last attempt. Failures are only reported on
        stdout, so callers must check ``response.ok`` themselves.
    """
    # Accept both relative paths and absolute URLs that already point at the API
    url = path if path.startswith(API_URL) else f'{API_URL}/{path}'

    # Define headers with the token (None means an unauthenticated request)
    token = get_avaliable_token()
    headers = {}
    if token is not None:
        headers['Authorization'] = f'Bearer {token}'

    # Without a timeout a stalled connection would block the whole crawl forever
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.ok:
        # print(f"Request successful for token: {token}")
        used_token(token)
        return response

    status = response.status_code
    if status == 401:
        print(f"Unauthorized access for token: {token}")
        if token is not None:
            # A rejected token will be rejected on every later request as well,
            # so drop it instead of merely decrementing its budget, then retry.
            invalidate_token(token)
            print("Retrying with the next available token...")
            return request(path, timeout=timeout)
        used_token(token)

    elif status == 403:
        print(f"Rate limit exceeded for token: {token}")
        invalidate_token(token)
        # Retry with the next available token; every retry removes one token
        # from the pool, so the recursion depth is bounded by the pool size.
        if token is not None:
            print("Retrying with the next available token...")
            return request(path, timeout=timeout)

    elif status == 404:
        print("Resource not found")

    elif status == 429:
        print("Rate limit exceeded for IP")
        invalidate_token(token)

    return response

In [None]:
# Fetch and form/save initial graph

import networkx as nx
import time

# Configuration
SEED_USER = 'gabriel-dp' # Initial node (the crawl starts from this user)
MAX_REQUESTS = 15000 # Max Requests (generally keep 5k per token)
MAX_DEPTH = 3 # Maximum crawl depth, counted in hops from SEED_USER
BLOCKED_USERS = {'torvalds'} # Users to disregard: never expanded and never linked
SLEEP_TIME = 0.2  # Delay between requests, in seconds

# In-memory caches (username -> list of logins) to avoid repeating requests
followers_cache = {}
following_cache = {}

# Fetch followers/following (the number of pages can be limited)
def get_all_follow_data(username, follow_type='followers', pages_limit=10, per_page=100):
    """Return the logins that follow ``username`` or that ``username`` follows.

    Args:
        username: GitHub login to query.
        follow_type: ``'followers'`` or ``'following'``.
        pages_limit: Maximum number of pages fetched per user.
        per_page: Page size requested from the API (GitHub caps it at 100).

    Returns:
        A list of logins. It may be partial when the page limit is hit or when
        a request fails part-way through.

    Raises:
        ValueError: If ``follow_type`` is neither ``'followers'`` nor
            ``'following'`` (previously such calls silently polluted
            ``following_cache``).
    """
    if follow_type == 'followers':
        cache = followers_cache
    elif follow_type == 'following':
        cache = following_cache
    else:
        raise ValueError(f"Unsupported follow_type: {follow_type!r}")

    print(f"getting all follow data for {username}")
    if username in cache:
        return cache[username]

    results = []
    fetch_failed = False
    page = 1

    while True:
        path = f'users/{username}/{follow_type}?per_page={per_page}&page={page}'
        response = request(path)
        if response is None or not response.ok:
            fetch_failed = True
            break
        data = response.json()
        if not data:
            break
        results.extend(user['login'] for user in data)
        # A short page means this was the last one
        if len(data) < per_page or page >= pages_limit:
            break
        page += 1
        time.sleep(SLEEP_TIME)

    # Do not memoize failures: a transient error must not be remembered as
    # "this user has no followers" for the rest of the session.
    if not fetch_failed:
        cache[username] = results

    return results

# Build graph (mutual)
def build_mutual_graph(seed_user, max_requests=10000, max_depth=3):
    """Crawl mutual-follow relations breadth-first and return them as a graph.

    Two users are linked when each one follows the other. Users listed in
    ``BLOCKED_USERS`` are neither expanded nor linked.

    Args:
        seed_user: Login the crawl starts from (level 0).
        max_requests: Budget checked against ``total_requests``. That counter
            is incremented once per ``get_all_follow_data`` call (two per
            user), so it is a lower bound on the HTTP requests actually sent
            when a user has more than one page of followers/following.
        max_depth: Deepest level whose users are still expanded.

    Returns:
        An undirected ``networkx.Graph`` of mutual-follow edges.
    """
    # Local import keeps this cell self-contained; deque gives O(1) pops from
    # the left, whereas list.pop(0) shifts the whole queue on every iteration.
    from collections import deque

    G = nx.Graph()
    visited = set()
    queue = deque([(seed_user, 0)])
    total_requests = 0

    while queue and total_requests < max_requests:
        user, level = queue.popleft()

        if user in visited or user in BLOCKED_USERS or level > max_depth:
            continue

        print(f"Processing {user} (level {level}) | Total requests: {total_requests}")
        visited.add(user)

        try:
            followers = get_all_follow_data(user, 'followers')
            total_requests += 1
            following = get_all_follow_data(user, 'following')
            total_requests += 1
        except Exception as e:
            print(f"Error fetching data for {user}: {e}")
            continue

        mutuals = set(followers).intersection(following)
        for mutual in mutuals:
            if mutual in BLOCKED_USERS:
                continue
            G.add_edge(user, mutual)

            # Users beyond max_depth would be discarded when dequeued anyway,
            # so do not enqueue them in the first place.
            if mutual not in visited and level < max_depth:
                queue.append((mutual, level + 1))

        time.sleep(SLEEP_TIME)

    print(f"Finished: {len(G.nodes())} nodes, {len(G.edges())} edges, {total_requests} requests")
    return G

# Persist the graph to disk
def save_graph(graph, filename='github_mutual_follow_graph.graphml'):
    """Write ``graph`` to ``filename`` in GraphML format and log the destination."""
    nx.write_graphml(graph, filename)
    confirmation = f"Graph saved to {filename}"
    print(confirmation)

# Entry point: crawl starting from SEED_USER and persist the resulting graph.
# Inside a notebook kernel __name__ is "__main__", so this runs whenever the cell is executed.
if __name__ == "__main__":
    graph = build_mutual_graph(SEED_USER, MAX_REQUESTS, MAX_DEPTH)
    save_graph(graph)


In [None]:
# Plot and print the characterization of the network

import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load the crawled graph and rebuild it from its reciprocated edges only
filename = 'github_mutual_follow_graph.graphml'
G_original = nx.read_graphml(filename)
mutual_edges = []
for u, v in G_original.edges():
    # NOTE(review): if the stored graph is undirected this check is always true — confirm
    if G_original.has_edge(v, u):
        mutual_edges.append((u, v))
G = nx.Graph()
G.add_edges_from(mutual_edges)

print("Número de vértices:", G.number_of_nodes())
print("Número de arestas:", G.number_of_edges())

# Degree distribution histogram
degree_sequence = [degree for _, degree in G.degree()]
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(degree_sequence, bins=30, color='skyblue', edgecolor='black')
ax.set_title("Distribuição de graus")
ax.set_xlabel("Grau")
ax.set_ylabel("Frequência")
ax.grid(True)
plt.show()

# Local clustering coefficient of every node and its average
clustering = nx.clustering(G)
clustering_values = list(clustering.values())
avg_clustering = np.mean(clustering_values)
print("Coeficiente de clustering médio:", avg_clustering)

# Nodes whose neighbours are all connected to each other
full_clustering_nodes = []
for n, c in clustering.items():
    if c == 1.0:
        full_clustering_nodes.append(n)
print("Número de nós com clustering = 1.0:", len(full_clustering_nodes))

# Degree distribution of the nodes with clustering = 1.0
sizes = [G.degree(n) for n in full_clustering_nodes]
mean_size = np.mean(sizes)
print("Graus médios dos nós com clustering = 1.0:", mean_size)
fig, ax = plt.subplots()
ax.hist(sizes, bins=20, color='orange', edgecolor='black')
ax.set_title("Distribuição de grau dos nós com clustering = 1.0")
ax.set_xlabel("Grau")
ax.set_ylabel("Frequência")
plt.show()

# Calculates centralities (degree and eigenvector)
degree_centrality = nx.degree_centrality(G)
try:
    eigenvector_centrality = nx.eigenvector_centrality(G, max_iter=1000)
except nx.PowerIterationFailedConvergence:
    # Repeating the call with identical arguments would fail in exactly the
    # same way, so retry with more iterations and a looser tolerance
    # (the networkx default tolerance is 1e-06).
    print("eigenvector centrality did not converge in 1000 iterations; retrying with max_iter=5000, tol=1e-04")
    eigenvector_centrality = nx.eigenvector_centrality(G, max_iter=5000, tol=1e-04)
print("eigenvector centrality done")

# Five highest-ranked users by degree centrality
top_5_degree = sorted(degree_centrality.items(), key=lambda entry: entry[1], reverse=True)
top_5_degree = top_5_degree[:5]
print("Top 5 usuários por centralidade de grau:")
for node, centrality in top_5_degree:
    print(f"Nó: {node}, Centralidade de grau: {centrality:.4f}")

# Five highest-ranked users by eigenvector centrality
top_5_eigen = sorted(eigenvector_centrality.items(), key=lambda entry: entry[1], reverse=True)
top_5_eigen = top_5_eigen[:5]
print("\nTop 5 usuários por centralidade de autovetor:")
for node, centrality in top_5_eigen:
    print(f"Nó: {node}, Centralidade de autovetor: {centrality:.4f}")

# Drop every node whose degree is 5 or lower, for performance and a more readable plot.
# Note: this mutates G in place, so re-running the cell prunes the already-pruned graph again.
low_degree_nodes = []
for n, d in G.degree():
    if d <= 5:
        low_degree_nodes.append(n)
G.remove_nodes_from(low_degree_nodes)
print("Número de vértices:", G.number_of_nodes())
print("Número de arestas:", G.number_of_edges())


# Map both centralities of the remaining nodes onto marker sizes in [5, 150]
scaler = MinMaxScaler(feature_range=(5, 150))
remaining_nodes = list(G.nodes())
deg_vals = np.array([degree_centrality[n] for n in remaining_nodes]).reshape(-1, 1)
eig_vals = np.array([eigenvector_centrality[n] for n in remaining_nodes]).reshape(-1, 1)
degree_sizes = scaler.fit_transform(deg_vals).flatten()
eigen_sizes = scaler.fit_transform(eig_vals).flatten()
print("scaling done")

# Layout (kamada_kawai or spring); the fixed seed keeps the positions reproducible
pos = nx.spring_layout(G, k=1.5, iterations=200, seed=42)
print("positioning done")

# Full graph, marker size proportional to degree centrality
fig_degree, ax_degree = plt.subplots(figsize=(14, 10))
nx.draw_networkx_nodes(G, pos, node_size=degree_sizes, alpha=0.8, node_color='teal', ax=ax_degree)
print("drawn nodes")
nx.draw_networkx_edges(G, pos, alpha=0.05, edge_color='gray', width=0.3, ax=ax_degree)
print("drawn edges")
ax_degree.set_title("Tamanhos proporcionais à centralidade de grau")
ax_degree.axis('off')
plt.show()

# Full graph, marker size proportional to eigenvector centrality
fig_eigen, ax_eigen = plt.subplots(figsize=(14, 10))
nx.draw_networkx_nodes(G, pos, node_size=eigen_sizes, alpha=0.8, node_color='royalblue', ax=ax_eigen)
nx.draw_networkx_edges(G, pos, alpha=0.05, edge_color='gray', width=0.3, ax=ax_eigen)
ax_eigen.set_title("Tamanhos proporcionais à centralidade de autovetor")
ax_eigen.axis('off')
plt.show()

In [None]:
# Plotting subgraphs

k = 300  # number of top-ranked nodes kept in each subgraph

# Rank only the nodes that are still present in G. The centrality dicts were
# computed before low-degree nodes were pruned, and G.subgraph() silently
# ignores unknown nodes, so ranking the dict keys could yield fewer than k nodes.
# Top k degree graph
top_nodes_degree = sorted(G.nodes(), key=lambda n: degree_centrality.get(n, 0.0), reverse=True)[:k]
G_top_degree = G.subgraph(top_nodes_degree)

# Top K eigenvector graph
top_nodes_eigenvector = sorted(G.nodes(), key=lambda n: eigenvector_centrality.get(n, 0.0), reverse=True)[:k]
G_top_eigenvector = G.subgraph(top_nodes_eigenvector)

def scale_sizes(subgraph, centrality_dict):
    """Return one marker size per node of ``subgraph``, min-max scaled to [5, 150].

    Sizes follow the iteration order of ``subgraph.nodes()``; every node must
    have an entry in ``centrality_dict``.
    """
    column = np.array([centrality_dict[node] for node in subgraph.nodes()]).reshape(-1, 1)
    return MinMaxScaler(feature_range=(5, 150)).fit_transform(column).flatten()

def _plot_top_subgraph(subgraph, centrality_dict, node_color, title):
    """Draw one top-k subgraph with centrality-scaled markers; return (sizes, positions)."""
    sizes = scale_sizes(subgraph, centrality_dict)
    # nx.kamada_kawai_layout(subgraph) is a possible alternative layout
    positions = nx.spring_layout(subgraph, k=0.5, iterations=200, seed=42)

    plt.figure(figsize=(14, 10))
    nx.draw_networkx_nodes(subgraph, positions, node_size=sizes, alpha=0.8, node_color=node_color)
    nx.draw_networkx_edges(subgraph, positions, alpha=0.1, edge_color='gray', width=0.4)
    plt.title(title)
    plt.axis('off')
    plt.show()
    return sizes, positions

# degree plot
degree_sizes_sub, pos_degree = _plot_top_subgraph(
    G_top_degree, degree_centrality, 'teal', f"Top {k} Nós - Centralidade de Grau"
)

# eigenvector plot
eigen_sizes_sub, pos_eigen = _plot_top_subgraph(
    G_top_eigenvector, eigenvector_centrality, 'royalblue', f"Top {k} Nós - Centralidade de Autovetor"
)

In [None]:
# Collecting data from users in our network
import networkx as nx
import json

G = nx.read_graphml("github_mutual_follow_graph.graphml")
# Users are ranked by degree and collected in batches; this cell handles one batch.
# NOTE(review): the slice starts at 15001 while the output file is named
# "15000_17500" — verify that rank 15000 is covered by another batch.
top_users = sorted(G.degree, key=lambda x: x[1], reverse=True)[15001:17500]
top_usernames = [user for user, _ in top_users]

collected_data = {}
for i, username in enumerate(top_usernames, start=1):
    print(i)
    try:
        print(f"Fetching data for: {username}")
        user_response = request(f"users/{username}")
        # request() only reports failures on stdout; without this check the
        # API error payload would be stored as if it were a user profile.
        if not user_response.ok:
            print(f"Skipping {username}: profile request failed with status {user_response.status_code}")
            continue

        # Only the first page (up to 100 repositories) is collected per user
        repos_response = request(f"users/{username}/repos?per_page=100")
        if not repos_response.ok:
            print(f"Skipping {username}: repos request failed with status {repos_response.status_code}")
            continue

        user_data = user_response.json()
        repos_data = repos_response.json()
        collected_data[username] = {
            "login": user_data.get("login"),
            "name": user_data.get("name"),
            "followers": user_data.get("followers"),
            "following": user_data.get("following"),
            "location": user_data.get("location"),
            "created_at": user_data.get("created_at"),
            "company": user_data.get("company"),
            "bio": user_data.get("bio"),
            "public_repos": user_data.get("public_repos"),
            "repos": repos_data
        }
    except Exception as e:
        print(f"Failed to fetch data for {username}: {e}")

with open("github_user_data_15000_17500.json", "w", encoding="utf-8") as f:
    json.dump(collected_data, f, indent=2, ensure_ascii=False)

In [None]:
# Turning the Data into tabular numerical attributes
import json
import csv
import os
from collections import Counter
from datetime import datetime

# Load all JSON files matching the pattern.
# os.listdir() returns names in arbitrary order; sorting makes the merge
# deterministic (which file wins for a duplicated login, and the row order of
# everything derived from raw_data).
all_files = sorted(
    name for name in os.listdir()
    if name.startswith("github_user_data_") and name.endswith(".json")
)
raw_data = {}

print(len(all_files))

for file in all_files:
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
    # Later files override earlier ones when the same login appears twice
    raw_data.update(data)

print(f"Total users loaded: {len(raw_data)}")

# For each (lower-cased) language, the number of users with at least one repo in it
language_user_counts = Counter()
# One row per user; each row temporarily carries its own per-language repo counts
temp_processed_data = []

for username, user in raw_data.items():
    try:
        created_at = user.get("created_at")
        repos = user.get("repos", [])

        # Records without a creation date are skipped entirely
        if not created_at:
            continue

        created_year = datetime.strptime(created_at, "%Y-%m-%dT%H:%M:%SZ").year
        time_in_years = 2025 - created_year  # account age measured against the year 2025

        # Repositories per language, ignoring repos without a detected language
        lang_counter = Counter(repo["language"] for repo in repos if repo.get("language"))

        # Each user contributes at most once to a language's user count
        language_user_counts.update(lang.lower() for lang in lang_counter)

        total_stars = sum(repo.get("stargazers_count", 0) for repo in repos)
        if repos:
            average_stars = total_stars / len(repos)
        else:
            average_stars = 0

        user_row = {
            "login": user.get("login"),
            "followers": user.get("followers"),
            "following": user.get("following"),
            "company": int(user.get("company") is not None),
            "has_bio": int(user.get("bio") is not None),
            "time_in_years": time_in_years,
            "public_repos": user.get("public_repos"),
            "average_stars": round(average_stars, 2),
            "languages_used": len(lang_counter),
            "_lang_counter": {lang.lower(): count for lang, count in lang_counter.items()},
        }
        temp_processed_data.append(user_row)

    except Exception as e:
        print(f"Error processing {username}: {e}")

# Languages used by at least 50 users become columns; the rest are dropped
valid_languages = set()
for lang, count in language_user_counts.items():
    if count >= 50:
        valid_languages.add(lang)

# Replace each row's temporary language counts with one column per kept language
processed_data = []
for user in temp_processed_data:
    lang_counts = user.pop("_lang_counter")
    user.update({lang: lang_counts.get(lang, 0) for lang in valid_languages})
    processed_data.append(user)

# Column order of the CSV: base attributes first, then languages alphabetically
base_fields = [
    "login", "followers", "following",
    "company", "has_bio", "time_in_years",
    "public_repos", "average_stars", "languages_used"
]
language_fields = sorted(valid_languages)
fieldnames = [*base_fields, *language_fields]

output_file = "github_user_tab_data.csv"
with open(output_file, "w", newline='', encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(processed_data)



In [None]:
# Base Stats and Plots for the base attributes
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

df = pd.read_csv("github_user_tab_data.csv")

# Numeric attributes summarized and plotted below
base_fields = [
    "followers", "following", "company", "has_bio",
    "time_in_years", "public_repos", "average_stars", "languages_used"
]

os.makedirs("docs2", exist_ok=True)

# Summary table: one row per attribute, restricted to the columns of interest
stats = df[base_fields].describe().T
stats["median"] = df[base_fields].median()
stats = stats.loc[:, ["min", "max", "mean", "median", "std"]]
stats.to_csv("docs2/github_user_base_stats.csv")

# One figure per attribute: histogram on the left, boxplot on the right
for col in base_fields:
    fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 5))

    sns.histplot(df[col], bins=30, kde=True, ax=ax_hist)
    ax_hist.set_title(f"Histogram of {col}")

    sns.boxplot(x=df[col], ax=ax_box)
    ax_box.set_title(f"Boxplot of {col}")

    fig.tight_layout()
    fig.savefig(f"docs2/{col}_distribution.png")
    # Close each figure so the loop does not accumulate open figures
    plt.close(fig)


In [None]:
# Correlation Heatmap for our base fields
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

os.makedirs("docs2", exist_ok=True)

# Attributes included in the correlation matrix
base_fields = [
    "followers", "following",
    "company", "has_bio", "time_in_years",
    "public_repos", "average_stars", "languages_used"
]

df = pd.read_csv("github_user_tab_data.csv")
df_base = df.loc[:, base_fields].copy()

# Pairwise correlation (pandas default method), saved as CSV and as a heatmap
corr = df_base.corr()
corr.to_csv("docs2/correlation_matrix.csv")

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", cbar=True, ax=ax)
ax.set_title("Correlation Heatmap of Base Fields")
fig.tight_layout()
fig.savefig("docs2/correlation_heatmap.png")
plt.close(fig)


In [None]:
# Normalizing the values from 0 to 1
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Attributes rescaled to [0, 1]; the language columns are not carried over
base_fields = [
    "followers", "following",
    "company", "has_bio", "time_in_years",
    "public_repos", "average_stars", "languages_used"
]

df = pd.read_csv("github_user_tab_data.csv")
login_col = df["login"]
df_to_scale = df[base_fields]

# Min-max scaling, fitted column by column on the whole dataset
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df_to_scale)
df_scaled = pd.DataFrame(scaled_values, columns=base_fields)

# Re-attach the login so that rows stay identifiable after scaling
df_final = pd.concat((login_col, df_scaled), axis="columns")
df_final.to_csv("github_user_tab_data_normalized.csv", index=False)


In [None]:
# Elbow method to choose best K
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import pandas as pd

import os

df = pd.read_csv("github_user_tab_data_normalized.csv")
X = df.drop(columns=["login"])

# Fit k-means for every candidate k and record the within-cluster sum of squares
inertias = []
k_values = range(2, 11)

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X)
    inertias.append(kmeans.inertia_)

# Create the output directory like the other reporting cells do, so this cell
# does not fail with FileNotFoundError when it is the first one to write there.
os.makedirs("docs2", exist_ok=True)

plt.figure(figsize=(8, 5))
plt.plot(k_values, inertias, marker='o')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.grid(True)
plt.tight_layout()
plt.savefig('docs2/kmeans_elbow_plot.png')
plt.close()



In [None]:
# Clustering data and visualizing it
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import seaborn as sns

import os

df = pd.read_csv("github_user_tab_data_normalized.csv")

# Every column except the login is a feature
feature_columns = df.columns.drop('login')
X = df[feature_columns]

k = 4  # chosen through the elbow method
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(X)

df[['login', 'cluster']].to_csv('user_clusters.csv', index=False)

print(f"Silhouette Score: {silhouette_score(X, df['cluster'])}")

# Project the features onto two principal components for plotting only;
# the clustering itself was computed on the full feature set.
pca = PCA(n_components=2, random_state=42)
components = pca.fit_transform(X)

# Create the output directory like the other reporting cells do, so this cell
# does not fail with FileNotFoundError when it is the first one to write there.
os.makedirs("docs2", exist_ok=True)

plt.figure(figsize=(8, 6))
sns.scatterplot(x=components[:, 0], y=components[:, 1], hue=df['cluster'], palette='tab10', s=30)
plt.title('User Clusters')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend(title='Cluster')
plt.tight_layout()
plt.savefig('docs2/user_clusters_pca.png')
plt.close()



In [None]:
import pandas as pd
import networkx as nx
from collections import defaultdict
from math import comb

G = nx.read_graphml("github_mutual_follow_graph.graphml")

df_clusters = pd.read_csv("user_clusters.csv")
user_cluster = dict(zip(df_clusters['login'], df_clusters['cluster']))

cluster_edge_counts = defaultdict(int)
cluster_possible_edges = defaultdict(int)
total_edges = G.number_of_edges()

# Count the edges whose two endpoints were both assigned to the same cluster
for u, v in G.edges():
    cluster_u = user_cluster.get(u)
    cluster_v = user_cluster.get(v)
    if cluster_u is not None and cluster_u == cluster_v:
        cluster_edge_counts[cluster_u] += 1

k = df_clusters['cluster'].nunique()
# Iterate over the labels that actually occur instead of assuming 0..k-1
cluster_labels = sorted(set(user_cluster.values()))

# Tally cluster sizes in a single pass instead of rescanning all users per cluster
cluster_sizes = defaultdict(int)
for label in user_cluster.values():
    cluster_sizes[label] += 1

# Edge density per cluster, computed once and reused by both reports below
cluster_density = {}
for cluster in cluster_labels:
    n = cluster_sizes[cluster]
    cluster_possible_edges[cluster] = comb(n, 2) if n >= 2 else 0
    possible = cluster_possible_edges[cluster]
    cluster_density[cluster] = cluster_edge_counts[cluster] / possible if possible > 0 else 0

print("\nIntra-cluster edge density:")
for cluster in cluster_labels:
    edges = cluster_edge_counts[cluster]
    possible = cluster_possible_edges[cluster]
    density = cluster_density[cluster]
    print(f"Cluster {cluster}: {edges} edges out of {possible} possible (density={density:.4f})")

# Guard against graphs with fewer than two nodes (comb(n, 2) == 0)
possible_graph_edges = comb(G.number_of_nodes(), 2)
graph_density = total_edges / possible_graph_edges if possible_graph_edges > 0 else 0
# NOTE(review): this baseline covers every node of the graph, while the clusters
# only contain users present in user_clusters.csv — confirm the comparison is intended.
print(f"\nGraph-wide edge density: {graph_density:.4f}")

for cluster in cluster_labels:
    edges = cluster_edge_counts[cluster]
    possible = cluster_possible_edges[cluster]
    density = cluster_density[cluster]
    density_ratio = density / graph_density if graph_density > 0 else 0
    print(f"{cluster}\t{edges}\t{possible}\t{density:.6f}\t{density_ratio:.2f}x")