In [2]:
import pandas as pd
import numpy as np

In [200]:
import json

with open('../../nodes2.json', 'r', encoding='utf-8') as f:
    json_file = json.load(f)

nodes2 = pd.json_normalize(json_file['people'])

In [208]:
edges2 = pd.read_csv('../../edges.csv')
sampled_edges2 = edges2.drop_duplicates(subset='target_user_id')
set_ids2 = list(set(sampled_edges2['target_user_id']) | set(sampled_edges2['interactor_id']))

In [209]:
friendship2 = pd.DataFrame(json_file['edges']).drop(columns=[2]).rename(columns={0: 'user_id', 1: 'friend_id'})

In [211]:
friendship_set = set(friendship2.apply(lambda row: frozenset([row['user_id'], row['friend_id']]), axis=1))


def is_friend_pair(row):
    pair = frozenset([row['target_user_id'], row['interactor_id']])
    return pair in friendship_set


edges2['is_friend'] = edges2.apply(is_friend_pair, axis=1)

In [255]:
valid_users = set(friendship2['user_id']) | set(friendship2['friend_id'])

edges2= edges2[
    edges2['target_user_id'].isin(valid_users) &
    edges2['interactor_id'].isin(valid_users)
].reset_index(drop=True)

In [265]:
nodes2

Unnamed: 0,id,first_name,last_name,domain,photo_file,photo_url,meta.bdate,meta.can_see_audio,meta.interests,meta.books,...,meta.status_audio.short_videos_allowed,meta.status_audio.stories_allowed,meta.status_audio.stories_cover_allowed,meta.status_audio.release_audio_id,meta.status_audio.subtitle,meta.status_audio.no_search,meta.status_audio.featured_artists,meta.status_audio.legal_notices_type,meta.status_audio.genre_id,meta.status_audio.album_id
0,530188288,Gleb,Volkin,id530188288,,https://sun1-16.userapi.com/s/v1/ig2/DGlicXTkB...,11.3.2007,1.0,,,...,,,,,,,,,,
1,32130083,Pasha,Evmenov,ewmenow,,https://sun1-24.userapi.com/s/v1/ig2/medM7uNml...,2.12,0.0,,,...,,,,,,,,,,
2,78190777,Alyona,Funtanina,id78190777,,https://sun1-89.userapi.com/s/v1/ig2/XgwLU-8-V...,15.10.1986,1.0,,,...,,,,,,,,,,
3,103354795,Denis,Lukomsky,denislukom,,https://sun1-27.userapi.com/s/v1/ig2/ze8H88HVS...,5.12.2002,1.0,,,...,,,,,,,,,,
4,110610897,Sladon,Komposter,id110610897,,https://sun1-98.userapi.com/s/v1/ig2/0fERp9xm6...,22.8.1997,0.0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100980,376421110,Igor,Kaspruk,id376421110,,https://sun1-84.userapi.com/s/v1/ig1/sg6MA0dUB...,27.7.1961,1.0,,,...,,,,,,,,,,
100981,387007491,Israel,Work,id387007491,,https://sun1-19.userapi.com/s/v1/ig1/0xCs6Ovl9...,,1.0,,,...,,,,,,,,,,
100982,391949446,Arm,Armenia,id391949446,,https://sun1-95.userapi.com/impg/DW4IDqvukChyc...,11.11.1986,1.0,,,...,,,,,,,,,,
100983,402101271,Veronika,Mostova,id402101271,,https://sun1-97.userapi.com/impg/_nb2NhfE_jv_q...,27.8.1990,,,,...,,,,,,,,,,


In [257]:
from collections import defaultdict

friends_dict = defaultdict(set)
for _, row in friendship2.iterrows():
    friends_dict[row['user_id']].add(row['friend_id'])
    friends_dict[row['friend_id']].add(row['user_id'])


def count_common_friends(row):
    u1 = row['target_user_id']
    u2 = row['interactor_id']
    return len(friends_dict[u1] & friends_dict[u2])


edges2['common_friends_count'] = edges2.apply(count_common_friends, axis=1)

edges2['user1'] = edges2[['target_user_id', 'interactor_id']].min(axis=1)
edges2['user2'] = edges2[['target_user_id', 'interactor_id']].max(axis=1)
aggregated = edges2.groupby(['user1', 'user2']).agg({
    'likes_count': 'sum',
    'comments_count': 'sum',
    'common_friends_count': 'first',
    'is_friend': 'max'
}).reset_index()



aggregated

Unnamed: 0,user1,user2,likes_count,comments_count,common_friends_count,is_friend
0,104206,30587838,1,0,0,True
1,126049,657252850,5,0,0,True
2,173080,116869195,1,0,0,True
3,204506,116869195,3,0,0,True
4,222197,116869195,2,0,0,True
...,...,...,...,...,...,...
7007,826979631,826979631,2,0,1,False
7008,828157101,828157101,0,1,3,False
7009,828331048,828331048,4,0,184,False
7010,828331048,831749173,5,0,0,True


In [264]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
from xgboost import XGBClassifier


feature_columns = ['likes_count', 'comments_count', 'common_friends_count']

X = aggregated[feature_columns].fillna(0)
y = aggregated['is_friend'].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

model = RandomForestClassifier(n_estimators=100,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1)
pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])
model_xgb = XGBClassifier(
        scale_pos_weight=pos_weight,
        random_state=42,
        eval_metric='logloss',
        use_label_encoder=False
    )
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.71      0.81       871
           1       0.66      0.93      0.77       532

    accuracy                           0.79      1403
   macro avg       0.80      0.82      0.79      1403
weighted avg       0.84      0.79      0.80      1403


Confusion Matrix:
[[615 256]
 [ 35 497]]


In [4]:
nodes = nodes.dropna(subset=['first_name'])

In [5]:
nodes = nodes.drop_duplicates(subset=['id'])

In [6]:
nodes

Unnamed: 0,id,first_name,last_name,domain,photo_file,photo_url,meta.city.id,meta.city.title,meta.can_see_audio,meta.interests,...,meta.status_audio.owner_id,meta.status_audio.title,meta.status_audio.duration,meta.status_audio.is_explicit,meta.status_audio.is_focus_track,meta.status_audio.track_code,meta.status_audio.url,meta.status_audio.stream_duration,meta.status_audio.date,meta.status_audio.genre_id
0,179301939,Lyubov,Novoselova,id179301939,,https://sun1-95.userapi.com/impg/DW4IDqvukChyc...,501.0,Sarapul,1.0,,...,,,,,,,,,,
2,327327093,Anzhela,Levkina,id327327093,,https://sun1-16.userapi.com/s/v1/ig1/VNiVPfPXI...,281.0,Brest,1.0,,...,,,,,,,,,,
4,288357777,Vasily,Volkov,id288357777,,https://sun1-24.userapi.com/s/v1/ig1/y6Dnjgjvq...,1032.0,Polotsk,1.0,,...,,,,,,,,,,
5,55846307,Pashka,Polyakov,id55846307,,https://sun1-18.userapi.com/s/v1/ig1/xdvoQkERj...,1463.0,Zelenograd,1.0,,...,,,,,,,,,,
8,210288,Yana,Vasilyeva,dabasssista,,https://sun1-23.userapi.com/s/v1/ig2/Z4emj1vbY...,1.0,Moscow,1.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8346,59192221,Gulshat,Ilshatovna,id59192221,,https://sun1-20.userapi.com/s/v1/ig2/5X_P4-WTP...,,,0.0,,...,,,,,,,,,,
8347,59259047,Lera,Shukaylo,inkognito___vl,,https://sun1-56.userapi.com/s/v1/ig2/7ZzD51eFE...,88.0,Naberezhnye Chelny,1.0,,...,,,,,,,,,,
8348,59285071,Gulia,Nizamutdinova,id59285071,,https://sun1-26.userapi.com/s/v1/ig2/Voh4hHi_y...,88.0,Naberezhnye Chelny,1.0,,...,,,,,,,,,,
8349,60031769,Vlad,Mitenkov,id60031769,,https://sun1-97.userapi.com/impg/_nb2NhfE_jv_q...,49.0,Yekaterinburg,,,...,,,,,,,,,,


In [7]:
edges = pd.read_csv('../../edges.csv')

In [8]:
edges

Unnamed: 0,target_user_id,interactor_id,likes_count,comments_count,reposts_count
0,260762184,189326838,1,0,0
1,260762184,260762184,1,0,0
2,260762184,370224313,1,0,0
3,260762184,400282413,1,0,0
4,260762184,367322161,1,0,0
...,...,...,...,...,...
86134,224120056,814004012,1,0,0
86135,224120056,821004817,1,0,0
86136,224120056,120942437,1,0,0
86137,224120056,482835606,1,0,0


In [30]:
with open('../../nodes.json', 'r', encoding='utf-8') as f:
    json_file = json.load(f)

friendship = pd.DataFrame(json_file['edges']).drop(columns=[2]).rename(columns={0: 'user_id', 1: 'friend_id'})

In [31]:
friendship = friendship.dropna()
friendship = friendship.drop_duplicates(subset=['user_id', 'friend_id'])

In [32]:
friendship

Unnamed: 0,user_id,friend_id
0,288357777,55846307
2,55846307,210288
3,55846307,278285
4,55846307,444271
5,55846307,586269
...,...,...
19488,189244363,20232183
19489,189244363,34613926
19490,189244363,35951655
19491,189244363,30414089


In [33]:
friendship_set = set(friendship.apply(lambda row: frozenset([row['user_id'], row['friend_id']]), axis=1))


def is_friend_pair(row):
    pair = frozenset([row['target_user_id'], row['interactor_id']])
    return pair in friendship_set


edges['is_friend'] = edges.apply(is_friend_pair, axis=1)

In [35]:
edges['is_friend'].value_counts()

is_friend
False    85884
True       255
Name: count, dtype: int64

In [199]:
edges = pd.read_csv('../../edges.csv')
sampled_edges = edges.drop_duplicates(subset='target_user_id')
sampled_edges = sampled_edges.sample(n=100)
set_ids = set(sampled_edges['target_user_id']) | set(sampled_edges['interactor_id'])
len(set_ids)

186

In [37]:
nodes[nodes['id'] == 288357777]

Unnamed: 0,id,first_name,last_name,domain,photo_file,photo_url,meta.city.id,meta.city.title,meta.can_see_audio,meta.interests,...,meta.status_audio.owner_id,meta.status_audio.title,meta.status_audio.duration,meta.status_audio.is_explicit,meta.status_audio.is_focus_track,meta.status_audio.track_code,meta.status_audio.url,meta.status_audio.stream_duration,meta.status_audio.date,meta.status_audio.genre_id
4,288357777,Vasily,Volkov,id288357777,,https://sun1-24.userapi.com/s/v1/ig1/y6Dnjgjvq...,1032.0,Polotsk,1.0,,...,,,,,,,,,,


In [45]:
edges.iloc[100]

target_user_id    283301972
interactor_id     336410992
likes_count               1
comments_count            0
reposts_count             0
is_friend             False
Name: 100, dtype: object

In [94]:
edges = edges.drop(columns=['reposts_count'])

In [95]:
edges[edges['is_friend']==True]

Unnamed: 0,target_user_id,interactor_id,likes_count,comments_count,is_friend
129,55846307,1590824,1,1,True
130,55846307,2559871,4,0,True
131,55846307,5123553,1,0,True
133,55846307,6432619,1,0,True
134,55846307,8220590,1,0,True
...,...,...,...,...,...
55180,432805423,202629375,1,0,True
55181,432805423,281302757,1,0,True
55183,432805423,378701994,1,0,True
72920,28978854,432805423,2,0,True


In [160]:
edges[edges['target_user_id']==281302757]

Unnamed: 0,target_user_id,interactor_id,likes_count,comments_count,is_friend


In [124]:
s = len(set(edges['interactor_id'].unique()) & set(edges['target_user_id']))
print(s)

771


In [156]:
edges['comments_count'].value_counts()

comments_count
0      83292
1       2265
2        422
3         76
4         45
5         13
6          8
7          3
12         2
8          2
9          2
36         1
33         1
77         1
328        1
17         1
13         1
64         1
25         1
498        1
Name: count, dtype: int64

In [153]:
nodes[nodes['id'] == 482835606]

Unnamed: 0,id,first_name,last_name,domain,photo_file,photo_url,meta.city.id,meta.city.title,meta.can_see_audio,meta.interests,...,meta.status_audio.owner_id,meta.status_audio.title,meta.status_audio.duration,meta.status_audio.is_explicit,meta.status_audio.is_focus_track,meta.status_audio.track_code,meta.status_audio.url,meta.status_audio.stream_duration,meta.status_audio.date,meta.status_audio.genre_id


In [149]:
friendship[friendship['user_id']==260762184]

Unnamed: 0,user_id,friend_id


In [None]:
for i in edges:
    if not (friendship.loc[i]).any():
    print('+')

+


In [None]:
len(edges['target_user_id'].unique())

77462

In [126]:
nodes['id']

0       179301939
2       327327093
4       288357777
5        55846307
8          210288
          ...    
8346     59192221
8347     59259047
8348     59285071
8349     60031769
8350     60137743
Name: id, Length: 6714, dtype: int64

In [58]:
have_friends = friendship['user_id']

In [69]:
shf = set(have_friends)

In [70]:
len(shf)

120

In [None]:
len(edges['interactor_id'].value_counts())

77462

In [189]:
set(edges['interactor_id']) & set(edges['target_user_id'])

{1186205,
 1939920,
 2096458,
 2529463,
 2888163,
 2992385,
 3336589,
 3654241,
 3822632,
 3993225,
 4183711,
 4777171,
 4796540,
 5123553,
 5267669,
 6432619,
 6505209,
 7571350,
 8030497,
 8045670,
 8292833,
 8853225,
 9068194,
 9923336,
 10059255,
 10085279,
 10428907,
 11728726,
 12038899,
 12756666,
 13976479,
 14040748,
 14499087,
 14634020,
 17192893,
 19440457,
 19694780,
 20214034,
 20699390,
 20708651,
 22330354,
 22935063,
 24014538,
 24980376,
 26520039,
 26661366,
 27110902,
 27295701,
 28978854,
 29341724,
 29594861,
 30028655,
 30078265,
 31781662,
 34490689,
 34675402,
 34777600,
 35729654,
 36572399,
 37176576,
 38402115,
 39371659,
 41616104,
 42118770,
 47094883,
 49531240,
 49672579,
 50486094,
 50706057,
 51278027,
 51952587,
 53809248,
 54327597,
 54572101,
 54918623,
 54971617,
 55846307,
 59997518,
 60015021,
 60220605,
 61325544,
 61458119,
 61828309,
 62043573,
 64561543,
 66281180,
 66353461,
 66742299,
 68816480,
 70152728,
 72908141,
 73055924,
 74290648,
 

In [187]:
nodes[nodes['id'].isin((set(edges['interactor_id']) & set(edges['target_user_id'])))]

Unnamed: 0,id,first_name,last_name,domain,photo_file,photo_url,meta.city.id,meta.city.title,meta.can_see_audio,meta.interests,...,meta.status_audio.owner_id,meta.status_audio.title,meta.status_audio.duration,meta.status_audio.is_explicit,meta.status_audio.is_focus_track,meta.status_audio.track_code,meta.status_audio.url,meta.status_audio.stream_duration,meta.status_audio.date,meta.status_audio.genre_id
5,55846307,Pashka,Polyakov,id55846307,,https://sun1-18.userapi.com/s/v1/ig1/xdvoQkERj...,1463.0,Zelenograd,1.0,,...,,,,,,,,,,
15,1186205,Vera,Dorokhova,id1186205,,https://sun1-18.userapi.com/s/v1/ig1/Jks3BOhKq...,1.0,Moscow,1.0,,...,,,,,,,,,,
20,1939920,Kirill,But,id1939920,,https://sun1-83.userapi.com/s/v1/ig2/Iuf_2wY3n...,1.0,Moscow,1.0,,...,,,,,,,,,,
23,2096458,Andrey,Gerasimov,id2096458,,https://sun1-92.userapi.com/s/v1/ig2/yuYiFh951...,1.0,Moscow,0.0,,...,,,,,,,,,,
26,2888163,Diman,Dimanovich,id2888163,,https://sun1-86.userapi.com/s/v1/ig2/IMwAy31Rc...,1.0,Moscow,0.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2778,221041596,Alyona,Marchenko,id221041596,,https://sun1-24.userapi.com/s/v1/ig1/Kqkn-kNsf...,1.0,Moscow,1.0,,...,,,,,,,,,,
2781,221555175,Oleg,Ilyukin,ishk.ruslan,,https://sun1-19.userapi.com/s/v1/ig2/nvTYDHeCE...,1767.0,Sibay,1.0,,...,,,,,,,,,,
2782,222280817,Zhanna,Kirina,id222280817,,https://sun1-25.userapi.com/s/v1/ig1/MD8kdG-kE...,1.0,Moscow,1.0,,...,,,,,,,,,,
2783,222690071,Barsukov,Roman,evil_sithe,,https://sun1-96.userapi.com/s/v1/ig2/4ZMdN95R0...,1.0,Moscow,1.0,"Интересует только моя деятельность, читай выше.",...,,,,,,,,,,


In [None]:
nodes

In [None]:
nodes[n]

In [183]:
set(edges['interactor_id']) & set(edges['target_user_id'])

{1186205,
 1939920,
 2096458,
 2529463,
 2888163,
 2992385,
 3336589,
 3654241,
 3822632,
 3993225,
 4183711,
 4777171,
 4796540,
 5123553,
 5267669,
 6432619,
 6505209,
 7571350,
 8030497,
 8045670,
 8292833,
 8853225,
 9068194,
 9923336,
 10059255,
 10085279,
 10428907,
 11728726,
 12038899,
 12756666,
 13976479,
 14040748,
 14499087,
 14634020,
 17192893,
 19440457,
 19694780,
 20214034,
 20699390,
 20708651,
 22330354,
 22935063,
 24014538,
 24980376,
 26520039,
 26661366,
 27110902,
 27295701,
 28978854,
 29341724,
 29594861,
 30028655,
 30078265,
 31781662,
 34490689,
 34675402,
 34777600,
 35729654,
 36572399,
 37176576,
 38402115,
 39371659,
 41616104,
 42118770,
 47094883,
 49531240,
 49672579,
 50486094,
 50706057,
 51278027,
 51952587,
 53809248,
 54327597,
 54572101,
 54918623,
 54971617,
 55846307,
 59997518,
 60015021,
 60220605,
 61325544,
 61458119,
 61828309,
 62043573,
 64561543,
 66281180,
 66353461,
 66742299,
 68816480,
 70152728,
 72908141,
 73055924,
 74290648,
 

In [163]:
len(edges['target_user_id'].value_counts())

1052

In [166]:
sampled_edges = edges.drop_duplicates(subset='target_user_id')

In [190]:
sampled_edges = sampled_edges.sample(n=100)

In [191]:
sampled_edges

Unnamed: 0,target_user_id,interactor_id,likes_count,comments_count,is_friend
71502,17768893,247433459,5,0,False
25985,552225130,213349976,1,0,False
53595,766763133,809734803,1,0,False
40586,376935387,222369716,1,0,False
37368,743889968,417104642,1,0,False
...,...,...,...,...,...
84511,205252464,29693732,5,0,False
36276,663883570,169023084,4,0,False
46652,114449243,33148155,1,0,False
82252,197077057,143540690,2,0,False


In [192]:
set_ids = set(sampled_edges['target_user_id']) | set(sampled_edges['interactor_id'])

In [193]:
print(len(set_ids))

193


In [66]:
nodes_without_friends = nodes[~nodes['id'].isin(shf)]

In [76]:
nodes_without_friends.to_csv('1.csv')

In [84]:
friendship[friendship['user_id'].isin(friendship['friend_id'])]['user_id'].unique()

array([288357777,  55846307, 592630078, 332323773, 425554768, 496597108,
       360981711, 382089357, 355512739, 371404462, 754110083, 460132198,
       432805423, 141263294, 137953186, 272332784, 324794954, 454723886,
       849853976, 395926934, 404284509, 464651141, 375431034, 231957672,
       371235977,  32139041,  13212791, 345684715, 335012059, 586593132,
       583602971, 389763287, 375532131,  54966692, 394460318, 395474310,
       197833289, 582564854, 723206013, 848843702, 380221737, 374539357,
       746055142, 593319553, 286603903, 159700076,  96903539,  41191971,
       177195982, 164560631, 173818302, 152247182,  20371699, 153872531,
       136493605, 122570289, 137723804, 237805392, 463419391, 420567656,
       218946760, 170307138, 454321589, 343063348, 744967389, 782036426,
       382025708, 541164978, 539436914,  28958461,  28782936, 431838326,
       431524100, 317661612, 314814303, 815348202, 188697522, 455559973,
         1890598])