In [1]:
import networkx as nx
from utility_funcs import readcirclefile, read_nodeadjlist, readfeaturelist, readfeatures, convert_profile_dict_to_vector, match_vector, generate_feature_matrix, generate_class_matrix
import os
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Anchor every data path at the directory the notebook is launched from.
root_dir = Path.cwd().resolve()
# Input files used below (feature list, profiles, egonets, training circles) live here.
data_dir = root_dir.joinpath('data')

In [2]:
# Load the profile feature names; they label the importance scores computed later.
feature_list_path = data_dir / 'featureList.txt'
features = readfeaturelist(feature_list_path)

# Show the names, then how many there are.
print(features)
print(len(features))

['birthday', 'education;classes;description', 'education;classes;from;id', 'education;classes;from;name', 'education;classes;id', 'education;classes;name', 'education;classes;with;id', 'education;classes;with;name', 'education;concentration;id', 'education;concentration;name', 'education;degree;id', 'education;degree;name', 'education;school;id', 'education;school;name', 'education;type', 'education;with;id', 'education;with;name', 'education;year;id', 'education;year;name', 'first_name', 'gender', 'hometown;id', 'hometown;name', 'id', 'languages;id', 'languages;name', 'last_name', 'locale', 'location', 'location;id', 'location;name', 'middle_name', 'name', 'political', 'religion', 'work;description', 'work;employer;id', 'work;employer;name', 'work;end_date', 'work;from;id', 'work;from;name', 'work;location;id', 'work;location;name', 'work;position;id', 'work;position;name', 'work;projects;description', 'work;projects;end_date', 'work;projects;from;id', 'work;projects;from;name', 'work

In [None]:
# Read the raw profiles, then turn each one into a vector over `features`.
# NOTE(review): iterating `profiles_dict` directly assumes readfeatures returns a
# sequence of profiles (iterating a dict would yield its keys) — confirm.
profiles_dict = readfeatures(data_dir / 'features.txt')
profile_matrix = [
    convert_profile_dict_to_vector(raw_profile, features)
    for raw_profile in profiles_dict
]

In [None]:
ego = 345

# Load the labelled circles and the friendship graph for this ego.
true_circles = readcirclefile(data_dir / 'Training' / f"{ego}.circles")
G = read_nodeadjlist(data_dir / 'egonets' / f"{ego}.egonet")
print('Total friends:', G.number_of_nodes())

class_matrix = generate_class_matrix(G, true_circles)
feature_matrix = generate_feature_matrix(profile_matrix, ego, G)

# Fit one random forest per circle and chart its feature importances.
for label, circle in class_matrix.items():
    print('Training Ego:', ego, 'Circle:', label, '...')
    forest = RandomForestClassifier(n_estimators=100).fit(feature_matrix, circle)

    # (feature name, score) pairs ranked from most to least important.
    important_features = sorted(
        zip(features, forest.feature_importances_),
        key=lambda pair: pair[1],
        reverse=True,
    )
    importance_labels = [name for name, _ in important_features]
    importance_scores = [score for _, score in important_features]
    ind = range(len(importance_scores))

    # NOTE(review): the bars show the unsorted importances (in `features` order),
    # not the ranked lists computed above, and the x ticks carry no feature names.
    plt.bar(ind, forest.feature_importances_)
    plt.title("Feature Importance in Social Circle Prediction", fontsize=14)
    plt.xlabel("Feature Names", fontsize=12)
    plt.ylabel("Importance Score", fontsize=12)
    # Fixed y range so the charts of different circles are directly comparable.
    plt.xlim(min(ind), max(ind))
    plt.ylim(0, 0.7)
    plt.show()

In [None]:
# Report the size of the ego network currently bound to `G`.
node_total = G.number_of_nodes()
edge_total = G.number_of_edges()
print("Total number of nodes in Graph 1:", node_total)
print("Total number of edges in Graph 1:", edge_total)

In [None]:
ego = 345

# Load the labelled circles and the friendship graph for this ego.
true_circles = readcirclefile(data_dir / 'Training' / f"{ego}.circles")
G = read_nodeadjlist(data_dir / 'egonets' / f"{ego}.egonet")
print('Total friends:', G.number_of_nodes())

class_matrix = generate_class_matrix(G, true_circles)
feature_matrix = generate_feature_matrix(profile_matrix, ego, G)

# One figure per circle: features ranked by the importance the forest assigns them.
for label, circle in class_matrix.items():
    print('Training Ego:', ego, 'Circle:', label, '...')

    forest = RandomForestClassifier(n_estimators=100).fit(feature_matrix, circle)

    # (feature name, score) pairs, highest score first.
    important_features = sorted(
        zip(features, forest.feature_importances_),
        key=lambda pair: pair[1],
        reverse=True,
    )
    importance_labels = [name for name, _ in important_features]
    importance_scores = [score for _, score in important_features]
    ind = range(len(importance_scores))

    fig, ax = plt.subplots(figsize=(15, 5))
    ax.bar(ind, importance_scores, color="skyblue")

    ax.set_title("Feature Importance in Social Circle Prediction", fontsize=14)
    ax.set_xlabel("Feature Names", fontsize=12)
    ax.set_ylabel("Importance Score", fontsize=12)

    # One tick per bar, labelled with the feature name and tilted for legibility.
    ax.set_xticks(ind)
    ax.set_xticklabels(importance_labels, rotation=45, ha="right", fontsize=10)

    # Fixed y range so the charts of different circles are directly comparable.
    ax.set_ylim(0, 0.7)
    plt.show()

In [None]:
ego = 345  # ego user whose circles are analysed

true_circles = readcirclefile(data_dir / 'Training' / f"{ego}.circles")
G = read_nodeadjlist(data_dir / 'egonets' / f"{ego}.egonet")

class_matrix = generate_class_matrix(G, true_circles)
# Converted to an array so the number of feature columns can be read from .shape.
feature_matrix = np.array(generate_feature_matrix(profile_matrix, ego, G))

num_features = feature_matrix.shape[1]
feature_importances = np.zeros(num_features)

# Sum the importance vector of one forest per circle ...
for label, circle in class_matrix.items():
    circle = np.array(circle)
    forest = RandomForestClassifier(n_estimators=100).fit(feature_matrix, circle)
    feature_importances += forest.feature_importances_

# ... then divide by the number of circles to get the mean importance.
feature_importances /= len(class_matrix)

# Rank features from most to least important; names are reordered the same way.
sorted_indices = np.argsort(feature_importances)[::-1]
sorted_importances = feature_importances[sorted_indices]
sorted_labels = np.array(features)[sorted_indices]

plt.figure(figsize=(20, 6))
plt.bar(sorted_labels, sorted_importances, color="skyblue")

plt.title("Overall Feature Importance Across All Social Circles", fontsize=14)
plt.xlabel("Feature Names", fontsize=12)
plt.ylabel("Importance Score", fontsize=12)

# Tilt the feature names and leave 10% headroom above the tallest bar.
plt.xticks(rotation=45, ha="right", fontsize=10)
plt.ylim(0, max(sorted_importances) * 1.1)

plt.show()

In [None]:
# List the circle labels held in the current `class_matrix`.
circle_labels = list(class_matrix)
print("Available Circles for Ego:", circle_labels)

In [None]:
# NOTE(review): `forest` is not fitted in this cell; it is whichever estimator an
# earlier cell fitted last, so the chart describes that single circle only.
ind = np.arange(len(forest.feature_importances_))  # one bar position per feature

fig, ax = plt.subplots(figsize=(30, 5))

ax.bar(ind, forest.feature_importances_, color='skyblue')
# Label each bar with its feature name, tilted for legibility.
ax.set_xticks(ind)
ax.set_xticklabels(features, rotation=45, ha="right", fontsize=10)

ax.set_title("Feature Importance in Social Circle Prediction", fontsize=14)
ax.set_xlabel("Feature Names", fontsize=12)
ax.set_ylabel("Importance Score", fontsize=12)

# Same fixed y range as the per-circle charts.
ax.set_ylim(0, 0.7)

plt.show()

In [None]:
# Per circle, a 0/1 list over the nodes of G: 1 where the node is in the circle.
{
    circle_name: [int(node in members) for node in G.nodes()]
    for circle_name, members in true_circles.items()
}


In [None]:
# NOTE(review): `ego` is not set in this cell; it keeps the value left by an earlier cell.
true_circles = readcirclefile(data_dir / 'Training' / f"{ego}.circles")
G = read_nodeadjlist(data_dir / 'egonets' / f"{ego}.egonet")
print('Total friends:', G.number_of_nodes())

class_matrix = generate_class_matrix(G, true_circles)
feature_matrix = generate_feature_matrix(profile_matrix, ego, G)

# Fit one random forest per circle and chart its feature importances.
for label, circle in class_matrix.items():
    print('Training Ego:', ego, 'Circle:', label, '...')
    forest = RandomForestClassifier(n_estimators=100).fit(feature_matrix, circle)

    # (feature name, score) pairs ranked from most to least important.
    important_features = sorted(
        zip(features, forest.feature_importances_),
        key=lambda pair: pair[1],
        reverse=True,
    )
    importance_labels = [name for name, _ in important_features]
    importance_scores = [score for _, score in important_features]
    ind = range(len(importance_scores))

    # NOTE(review): the bars show the unsorted importances (in `features` order),
    # not the ranked lists computed above.
    plt.bar(ind, forest.feature_importances_)
    plt.xlim(min(ind), max(ind))
    plt.ylim(0, 0.7)
    plt.show()

In [None]:
ego = 345

G = read_nodeadjlist(data_dir / 'egonets' / f"{ego}.egonet")

# Link the ego to every friend in the egonet. The friend list is taken first and
# the ego itself is excluded: looping over G.nodes() after add_node(ego) would
# also visit the ego and add an ego-ego self-loop, making the ego its own
# neighbour and drawing a spurious loop in the figure.
friend_nodes = [node for node in G.nodes() if node != ego]
G.add_node(ego)  # ensure the ego node exists even when there are no friends
G.add_edges_from((ego, friend) for friend in friend_nodes)

# The ego together with its direct neighbours.
ego_subgraph = G.subgraph([ego] + list(G.neighbors(ego)))

pos = nx.spring_layout(ego_subgraph, k=0.8)  # k=0.8 spreads out nodes

nx.draw(
    ego_subgraph,
    pos,
    node_size=300,  # adjusted for visibility
    node_color="skyblue",  # friend nodes in blue
    edge_color="gray",
    alpha=0.7,  # light transparency to reduce clutter
    with_labels=True,
    font_size=9
)

# Draw the ego again on top, larger and in red, so it stands out from the friends.
nx.draw_networkx_nodes(ego_subgraph, pos, nodelist=[ego], node_size=500, node_color="red")

plt.title("Ego 345’s Social Circle", fontsize=14)
plt.show()

In [None]:
# Peek at the ego's neighbours in the current graph; only the first ten are kept.
ego_neighbors = list(G.neighbors(ego))
friends = ego_neighbors[:10]
print("First 10 friends of Ego 345:", friends)

In [None]:
ego = 345  # the ego user

G = read_nodeadjlist(data_dir / 'egonets' / f"{ego}.egonet")

# Link the ego to every friend in the egonet. The friend list is taken first and
# the ego itself is excluded: looping over G.nodes() after add_node(ego) would
# also visit the ego and add an ego-ego self-loop, which would then be drawn.
friend_nodes = [node for node in G.nodes() if node != ego]
G.add_node(ego)  # ensure the ego node exists even when there are no friends
G.add_edges_from((ego, friend) for friend in friend_nodes)

# Only the first 10 direct friends are kept, for clarity.
selected_friends = list(G.neighbors(ego))[:10]
ego_subgraph = G.subgraph([ego] + selected_friends)

pos = nx.spring_layout(ego_subgraph, k=1.2)  # larger k increases spacing

nx.draw(
    ego_subgraph,
    pos,
    node_size=300,  # medium-sized nodes
    node_color="skyblue",  # friends in blue
    edge_color="gray",
    alpha=0.7,
    with_labels=True,
    font_size=10
)

# Draw the ego again on top, larger and in red, so it stands out from the friends.
nx.draw_networkx_nodes(ego_subgraph, pos, nodelist=[ego], node_size=500, node_color="red")

plt.title("Ego 345’s Social Circle (First 10 Direct Friends)", fontsize=14)
plt.show()

In [None]:
training_dir = data_dir / 'Training'

# Keep only "<ego id>.circles" files, ordered by ego id. os.listdir() returns
# every directory entry in arbitrary order, so a stray file (e.g. .DS_Store)
# would crash the int() parse below and the row order of `df` would vary
# between machines.
trainingfiles = sorted(
    (
        name for name in os.listdir(training_dir)
        if name.endswith('.circles') and name.split('.')[0].isdecimal()
    ),
    key=lambda name: int(name.split('.')[0]),
)

# One row per (ego, circle): the two identifiers followed by one importance per feature.
df_labels = ['Ego', 'Circle'] + features
characteristic_profiles = []

for item in trainingfiles:
    ego = int(item.split('.')[0])
    true_circles = readcirclefile(training_dir / item)
    G = read_nodeadjlist(data_dir / 'egonets' / f"{ego}.egonet")
    class_matrix = generate_class_matrix(G, true_circles)
    feature_matrix = generate_feature_matrix(profile_matrix, ego, G)

    # Fit one forest per circle and record its feature importances.
    for label, circle in class_matrix.items():
        print('Training Ego:', ego, 'Circle:', label, '...')
        forest = RandomForestClassifier(n_estimators=100)
        forest = forest.fit(feature_matrix, circle)
        characteristic_profiles.append([ego, label, *forest.feature_importances_])

# Build the frame once from the collected rows.
df = pd.DataFrame(data=characteristic_profiles, columns=df_labels)

In [None]:
# Display the importance table: one row per (ego, circle), one column per feature.
df

In [None]:
# Rank features by their mean importance over all (ego, circle) rows.
# 'Ego' and 'Circle' are identifiers, not importances, so they are dropped
# first; otherwise their means would be ranked alongside the features (and a
# non-numeric 'Circle' column makes DataFrame.mean() raise on recent pandas).
df.drop(columns=['Ego', 'Circle']).mean().sort_values(ascending=False)

In [None]:
# Save the importance table as CSV; the relative path resolves against the current working directory.
# NOTE(review): the file name reads 'characterist_profiles' — possibly a typo for
# 'characteristic'; left unchanged because other code may read this exact path.
df.to_csv('characterist_profiles.csv')

In [None]:
# Split the rows by the sign of their smallest entry. The row minimum is
# computed once and reused for both masks.
row_min = df.min(axis=1)
df_pos = df[row_min >= 0]
df_neg = df[row_min < 0]

# Column means over the non-negative rows, computed once and reused below.
df_pos_mean = df_pos.mean()

# Number of entries left after discounting the two leading identifier columns.
# NOTE(review): assumes mean() keeps both 'Ego' and 'Circle' as its first two
# entries; presumably 1 / rand_chance is the share each feature would get if
# importance were spread evenly — confirm.
rand_chance = len(df_pos_mean) - 2

print("Most important features sorted:")
# Skip the two leading identifier entries by position, then rank the features.
df_pos_mean.iloc[2:].sort_values(ascending=False)