In [8]:
import networkx as nx
from utility_funcs import readcirclefile, read_nodeadjlist, cost_function
import os
import sklearn.cluster
from sklearn.ensemble import RandomForestClassifier
from pathlib import Path
from node2vec import Node2Vec
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import deepwalk


root_dir = Path.cwd().resolve()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def readfeaturelist(filename):
    """
    reads a featurelist file and returns a list of the feature names
    """
    with open(filename) as f:
        out = []        # list of feature names
        for line in f:
            out.append(line.strip())
        return sorted(out)

In [9]:
features = readfeaturelist(root_dir / 'featureList.txt')
print( features)
print(len(features))

['birthday', 'education;classes;description', 'education;classes;from;id', 'education;classes;from;name', 'education;classes;id', 'education;classes;name', 'education;classes;with;id', 'education;classes;with;name', 'education;concentration;id', 'education;concentration;name', 'education;degree;id', 'education;degree;name', 'education;school;id', 'education;school;name', 'education;type', 'education;with;id', 'education;with;name', 'education;year;id', 'education;year;name', 'first_name', 'gender', 'hometown;id', 'hometown;name', 'id', 'languages;id', 'languages;name', 'last_name', 'locale', 'location', 'location;id', 'location;name', 'middle_name', 'name', 'political', 'religion', 'work;description', 'work;employer;id', 'work;employer;name', 'work;end_date', 'work;from;id', 'work;from;name', 'work;location;id', 'work;location;name', 'work;position;id', 'work;position;name', 'work;projects;description', 'work;projects;end_date', 'work;projects;from;id', 'work;projects;from;name', 'work

In [3]:
def readfeatures(featurefile):
    """
    reads a featurefile consisting of userid feature;value feature;value
    returns a list where index is user id, elements are dictionaries 
    of features as keys pointing to list of values maybe should be sets
    """
    with open(featurefile) as f:
        out = [] 
        for line in f:
            tokens = line.split()
            profile = {}  # empty profile for the user
            for tok in tokens[1:]:
                feature,val = tok.rsplit(';',1)
                val = int(val)
                if feature not in profile:
                    profile[feature]=set([val])
                else:
                    profile[feature].add(val)
            out.append( profile )
        for i in range(len(out)):
            assert out[i]['id'] == set([i])  # check that each line was read and placed in the correct place in the list
        return out

In [4]:
def convert_profile_dict_to_vector(profile,features):
    out = []
    for feature in features:
        if feature in profile:
            out.append(profile[feature])
        else:
            out.append(set())
    return out

In [5]:
def match_vector(profile1,profile2):
    return [len(x.intersection(y)) for x,y in zip(profile1,profile2)]
     

def generate_feature_matrix(profiles_dict,ego,G):
    return [match_vector(profiles_dict[ego], profiles_dict[g]) for g in G.nodes()]
     

def generate_class_matrix(G,true_circles):
    return dict(zip(true_circles.keys(),[[int(g in circle) for g in G.nodes()] for circle in true_circles.values()]))
     

In [None]:
profiles_dict = readfeatures('features.txt')

profile_matrix = [convert_profile_dict_to_vector(profile,features) for profile in profiles_dict]

In [None]:
import matplotlib.pyplot as plt

In [None]:
ego = 345
true_circles = readcirclefile('./Training/'+str(ego)+'.circles')
G = read_nodeadjlist('./egonets/'+str(ego)+'.egonet')
print('Total friends:', len(G.nodes()))
class_matrix = generate_class_matrix(G,true_circles)
feature_matrix = generate_feature_matrix(profile_matrix,ego,G)

for label,circle in class_matrix.items():
    print('Training Ego:', ego,'Circle:',label,'...')
    forest = RandomForestClassifier(n_estimators=100)
    forest = forest.fit( feature_matrix, circle )
    important_features = sorted(zip(features,forest.feature_importances_), key=lambda x: x[1], reverse=True)
    importance_scores = [val for key,val in important_features]
    importance_labels = [key for key,val in important_features]
    ind = range(len(importance_scores))
    plt.bar(ind, forest.feature_importances_)
    plt.axis([min(ind), max(ind), 0, 0.7])
    plt.show()

In [None]:
dict(zip(true_circles.keys(),[[int(g in circle) for g in G.nodes()] for circle in true_circles.values()]))


In [None]:
true_circles = readcirclefile('./Training/'+str(ego)+'.circles')
G = read_nodeadjlist('./egonets/'+str(ego)+'.egonet')
print('Total friends:', len(G.nodes()))
class_matrix = generate_class_matrix(G,true_circles)
feature_matrix = generate_feature_matrix(profile_matrix,ego,G)

for label,circle in class_matrix.items():
    print('Training Ego:', ego,'Circle:',label,'...')
    forest = RandomForestClassifier(n_estimators=100)
    forest = forest.fit( feature_matrix, circle )
    important_features = sorted(zip(features,forest.feature_importances_), key=lambda x: x[1], reverse=True)
    importance_scores = [val for key,val in important_features]
    importance_labels = [key for key,val in important_features]
    ind = range(len(importance_scores))
    plt.bar(ind, forest.feature_importances_)
    plt.axis([min(ind), max(ind), 0, 0.7])
    plt.show()

In [None]:
trainingfiles = os.listdir('./Training/')

df_labels = ['Ego','Circle']+features
characteristic_profiles = []

for item in trainingfiles:
    ego = int((item.split('.')[0]))
    true_circles = readcirclefile('./Training/'+item)
    G = read_nodeadjlist('./egonets/'+str(ego)+'.egonet')
    class_matrix = generate_class_matrix(G,true_circles)
    feature_matrix = generate_feature_matrix(profile_matrix,ego,G)
    
    for label,circle in class_matrix.items():
        print('Training Ego:', ego,'Circle:',label,'...')
        forest = RandomForestClassifier(n_estimators=100)
        forest = forest.fit( feature_matrix, circle )
        characteristic_profiles.append([ego]+[label]+list(forest.feature_importances_))

df = pd.DataFrame(data=characteristic_profiles,columns=df_labels)

In [None]:
df

In [None]:
df.mean().sort_values() 

In [None]:
df.to_csv('characterist_profiles.csv')

In [None]:
df_pos = df[df.min(axis=1)>=0]
df_neg = df[df.min(axis=1)<0]

In [None]:
df_pos_mean = df_pos.mean()
df_pos_mean.sort_values(ascending=False)
rand_chance = (len(df_pos_mean)-2)
df_pos_mean[df_pos_mean.gt(1./(rand_chance))]*rand_chance

In [None]:
df_pos_mean*rand_chance

In [None]:
df_pos.mean()[2:]

In [10]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
profiles_dict = readfeatures('features.txt')

specific_profile = {}
labels = ['locale', 'education;school;name', 'education;school;id', 'last_name']
for profile in profiles_dict:
    id = profile['id'].pop()
    inisde_dict = {}
    for spec in labels:
        inisde_dict[spec] = profile.get(spec, set({-1})).pop()
    specific_profile[id] = inisde_dict


profile_matrix = [convert_profile_dict_to_vector(profile,features) for profile in profiles_dict]

# The most impactful features are as follows:
# work;employer;id                     0.019194
# work;employer;name                   0.026036
# hometown;name                        0.026805
# hometown;id                          0.028106
# work;start_date                      0.028649
# location;id                          0.031029
# location;name                        0.031354
# education;year;id                    0.033277
# education;year;name                  0.034244
# last_name                            0.034566
# education;school;id                  0.052966
# education;school;name                0.056747
# locale                               0.133719
# gender                               0.173660
# education;type                       0.184633

# [print(profile['education;type']) for profile in profiles_dict]
trainingfiles = os.listdir('./Training/')

edges = []
for item in trainingfiles:
    ego = int((item.split('.')[0]))
    true_circles = readcirclefile('./Training/'+item)
    # print(true_circles)
    for key in true_circles.keys():
        values = true_circles[key]
        for value in values:
            edges.append((key, value))


# Initialize an undirected graph using NetworkX
G = nx.Graph()
G.add_edges_from(edges)

# Apply DeepWalk to generate embeddings for each node (user)
node2vec = Node2Vec(G, dimensions=64, walk_length=50, num_walks=200, workers=1000)
model = node2vec.fit()

# Extract embeddings for each node (user)
embeddings = np.array([model.wv[str(i)] for i in G.nodes()])
edges = list(G.edges())
non_edges = list(nx.non_edges(G))

user_combined_features = []
train_data = []

for user_id in G.nodes():
    combined_features = np.concatenate([embeddings[user_id], [specific_profile[user_id]['locale'], specific_profile[user_id]['locale'], specific_profile[user_id]['locale']]])
    user_combined_features.append(combined_features)

# Add positive pairs (edges)
for edge in edges:
    user1, user2 = edge
    label = 1  # They are friends
    features = np.concatenate([user_combined_features[user1], user_combined_features[user2]])
    train_data.append((features, label))

# Add negative pairs (non-edges)
for non_edge in non_edges:
    user1, user2 = non_edge
    label = 0  # They are not friends
    features = np.concatenate([user_combined_features[user1], user_combined_features[user2]])
    train_data.append((features, label))

# Split data into features and labels
X = np.array([data[0] for data in train_data])
y = np.array([data[1] for data in train_data])

# Train a classifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Evaluate model
accuracy = clf.score(X_test, y_test)
print(f"Classifier Accuracy: {accuracy:.4f}")

# Predicting friendship for a pair of users
prediction = clf.predict([np.concatenate([user_combined_features[user1], user_combined_features[user2]])])
print(f"Are users {user1} and {user2} friends? {'Yes' if prediction[0] == 1 else 'No'}")


Computing transition probabilities: 100%|██████████| 11812/11812 [00:03<00:00, 3046.68it/s]


IndexError: index 14829 is out of bounds for axis 0 with size 11812