In [166]:
import requests      
import pandas as pd  
import numpy as np   
import json          
import random as rnd
from time import sleep
import tqdm
from collections import defaultdict
import networkx as nx

rnd.seed(42)

In [183]:
# valid for 24 hours 
token = "<your VK API token>"

In [15]:
# Generate user links
# initial 1000 users 
# it's not actually 1000
initial_1000_ids = [str(rnd.randint(10000000, 1000000000)) for _ in range(10000)]
initial_1000_ids

['793543540',
 '503303705',
 '585770529',
 '144031070',
 '416448196',
 '94611066',
 '602749116',
 '324797776',
 '900566476',
 '684996843',
 '674130526',
 '960746571',
 '935250736',
 '398302652',
 '629927151',
 '216468299',
 '766528252',
 '84684276',
 '59203558',
 '720026086',
 '254703907',
 '840075810',
 '320727955',
 '95675980',
 '928390409',
 '259957310',
 '940379756',
 '118449460',
 '418157429',
 '308471886',
 '496845604',
 '692560971',
 '905619255',
 '401735568',
 '184648506',
 '407478786',
 '391469012',
 '234956459',
 '729595113',
 '296665249',
 '763573823',
 '743953718',
 '705822698',
 '86667861',
 '664049436',
 '691802744',
 '193758720',
 '583528321',
 '792893941',
 '272863730',
 '185452091',
 '506348124',
 '417437181',
 '299854268',
 '697194506',
 '748908273',
 '608020238',
 '245809993',
 '745098955',
 '358195935',
 '915005353',
 '834970419',
 '843223566',
 '70062915',
 '255938494',
 '892403818',
 '44467368',
 '874411347',
 '348715135',
 '440747414',
 '297484583',
 '81069472',


In [4]:
# Get data of the first one thousand users
def get_user_data(user_id: str, token=token) -> pd.DataFrame:

    """This function takes in user_id (str), 
    scrapes VK profile of the corresponding user, 
    and returns a pandas.DataFrame with all the availible info
    
    Args:
        :user_id: str: VK id of a user's profile who's data is being scraped;
        :token: str: access token to VK's API, it is set by default
        
    Returns:
        :df :pandas.DataFrame: a pandas DataFrame that contains all of the infromation \
        about user's page"""
    
    parameters = "fields=sex,about,screen_name,bdate,city,country,home_town,career,education,followers_count,schools,verified,games,interests,military,music,occupation,nickname,quotes,relationmti,ezone,tv,universities"
    url = "https://api.vk.com/method/users.get?" + parameters + "&v=5.199&access_token=" + token + "&user_id=" + user_id
    response = requests.get(url)
    user_data = response.json()

    df = pd.DataFrame.from_dict(user_data["response"][0], orient='index').transpose()
    
    return df

In [11]:
def get_friends_of_initial_1000(user_id: str, token=token) -> pd.DataFrame:

    """This function takes in user_id (str), 
    scrapes VK profiles of 12 friends of the corresponding user, 
    and returns a pandas.DataFrame with all the availible info
    
    Args:
        :user_id: str: VK id of a user's profile who's friend's profiles are to be scraped;
        :token: str: access token to VK's API, it is set by default
        
    Returns:
        :df :pandas.DataFrame: a pandas DataFrame that contains all of the infromation \
        about user's friend's pages"""
    
    parameters = "fields=sex,about,screen_name,bdate,city,country,home_town,career,education,followers_count,schools,verified,games,interests,military,music,occupation,nickname,quotes,relationmti,ezone,tv,universities"
    url = "https://api.vk.com/method/friends.get?" + parameters + "&v=5.199&access_token=" + token + "&user_id=" + user_id
    response = requests.get(url)
    user_data = response.json()
    d = user_data["response"]["items"]

    
    friends = rnd.sample(d, k = 25 if len(d) >= 25 else len(d))

    df = pd.DataFrame()
    
    for i in range(0, 25 if len(d) >= 25 else len(d)):
        friends[i]["friend_of"] = user_id
        df = pd.concat([df, pd.DataFrame.from_dict(friends[i], orient='index').transpose()], ignore_index=True)

    return df 

In [17]:
df_of_gen_users = pd.DataFrame()
df_of_friends_of_gen_users = pd.DataFrame() 
error_list = []

for i in tqdm.tqdm(initial_1000_ids):
    try:
        sub_df = get_friends_of_initial_1000(i)
        df_of_gen_users = pd.concat([df_of_gen_users, get_user_data(i)], ignore_index=True)
        sleep(0.33)
        df_of_friends_of_gen_users = pd.concat([df_of_friends_of_gen_users, sub_df], ignore_index=True)
    except:
        error_list.append(i)
        
df_of_gen_users.to_excel("df_of_gen_users.xlsx")
df_of_friends_of_gen_users.to_excel("df_of_friends_of_gen_users.xlsx")

100%|██████████████████████████████████████████████████████████████████████████| 10000/10000 [6:54:56<00:00,  2.49s/it]


In [26]:
error_list

['229068042',
 '164263081',
 '935012552',
 '352863363',
 '934252660',
 '438118701',
 '150409084',
 '269564054',
 '510871106',
 '297195524',
 '113950725',
 '64337019',
 '321612760',
 '937888868',
 '458968760',
 '939116851',
 '978848875',
 '883465185',
 '630286733',
 '181092615',
 '953900305',
 '34885365',
 '432240582',
 '552716859',
 '268480413',
 '636360971',
 '64267347',
 '957404799',
 '915002919',
 '712506364',
 '732113500',
 '18780419',
 '919644407',
 '153811545',
 '957529631',
 '791155839',
 '830184955',
 '442990767',
 '613246271',
 '399256938',
 '604018597',
 '88954113',
 '424838889',
 '551440255',
 '493564328',
 '830772671',
 '896171278',
 '740572478',
 '137572362',
 '148151258',
 '863514394',
 '402566853',
 '820637681',
 '164905897',
 '58644109',
 '776213410',
 '518678910',
 '968992851',
 '160088694',
 '914736933',
 '331300425',
 '371178244',
 '899368954',
 '557301347',
 '581595541',
 '614435323',
 '519912042',
 '885903641',
 '405521185',
 '733494111',
 '340366556',
 '975043826'

In [153]:
df_of_gen_users_excel = pd.read_excel("df_of_gen_users.xlsx")
df_of_friends_of_gen_users_excel = pd.read_excel("df_of_friends_of_gen_users.xlsx")

In [154]:
def str_to_dict_ti_pd(str_line: str) -> list[dict]:
    
    future_dict = [i.split(", ") for i in str_line.replace("'", "").replace("{", "").replace("[", "").replace("]", "").split("}, ")]

    default_dict = defaultdict(list)

    for dictionary in [dict([tuple(j.split(": ")) for j in i]) for i in future_dict]:
        for k, v in dictionary.items():
            default_dict[k].append(v)

    return default_dict


def pseudo_lambda(x):
    try:
        if x != "NaN" and x != "[]" and x != []:
            return pd.json_normalize(str_to_dict_ti_pd(x))
        else: 
            return pd.DataFrame({"dummy_col": "NaN"}, index=[0])
    except:
        return pd.DataFrame({"dummy_col": "NaN"}, index=[0])

In [162]:
def df_processor(excel_df: pd.DataFrame) -> pd.DataFrame:

    """
    This function takes in an unprocessed pd.DataFrame that was extracted from an Excel file,
    adds "has_quote", "has_about", "has_games", "has_music", "has_university" columns, 
    replaces empty lists, empty strings, "nan", np.nan for "NaN" in "university", "faculty", "graduation", "universities" columns,
    unpacks "universities", "schools", "military", "occupation", "city" columns,
    and returns the resulting pd.DataFrame
    """

    error_list_ = ["NaN", "nan", np.nan, "", []]
    
    # adding "has_quote", "has_about", "has_games", "has_music", "has_university" columns
    # replacing empty lists, empty strings, "nan", np.nan for "NaN" in "university", "faculty", "graduation", "universities" columns
    excel_df["has_quote"] = excel_df["quotes"].apply(lambda x: 0 if x in error_list_ else 1)
    excel_df["has_about"] = excel_df["about"].apply(lambda x: 0 if x in error_list_ else 1)
    excel_df["has_games"] = excel_df["games"].apply(lambda x: 0 if x in error_list_ else 1)
    excel_df["has_music"] = excel_df["music"].apply(lambda x: 0 if x in error_list_ else 1)
    excel_df["has_university"] = excel_df["university"].apply(lambda x: 0 if x in error_list_ + [0] else 1)
    excel_df["university"] = excel_df["university"].apply(lambda x: "NaN" if x == 0 else x)
    excel_df["faculty"] = excel_df["faculty"].apply(lambda x: "NaN" if x == 0 else x)
    excel_df["graduation"] = excel_df["graduation"].apply(lambda x: "NaN" if x == 0 else x)
    excel_df["universities"] = excel_df["universities"].apply(lambda x: "NaN" if x in error_list_ or x == [] else x) 
    
    # unpacking "universities" column
    dummy_df = pd.DataFrame({"dummy_col": "NaN"}, index=[0])
    
    for i in tqdm.tqdm(excel_df["universities"], desc="universities_col"):
        dummy_df = pd.concat([dummy_df, pseudo_lambda(i)], ignore_index=True)
    
    excel_df = excel_df.join(dummy_df.iloc[1:].drop(["dummy_col"], axis=1), rsuffix="_unis").drop(["universities"], axis=1)
    
    # unpacking "schools" column
    dummy_df = pd.DataFrame({"dummy_col": "NaN"}, index=[0])
    
    for i in tqdm.tqdm(excel_df["schools"], desc="schools_uni"):
        dummy_df = pd.concat([dummy_df, pseudo_lambda(i)], ignore_index=True)
    
    excel_df = excel_df.join(dummy_df.iloc[1:].drop(["dummy_col"], axis=1), rsuffix="_schools").drop(["schools"], axis=1)
    
    # unpacking "military" column
    dummy_df = pd.DataFrame({"dummy_col": "NaN"}, index=[0])
    
    for i in tqdm.tqdm(excel_df["military"], desc="military_col"):
        dummy_df = pd.concat([dummy_df, pseudo_lambda(i)], ignore_index=True)
    
    excel_df = excel_df.join(dummy_df.iloc[1:].drop(["dummy_col"], axis=1), rsuffix="_military").drop(["military"], axis=1)
    
    # unpacking "occupation" column
    dummy_df = pd.DataFrame({"dummy_col": "NaN"}, index=[0])
    
    for i in tqdm.tqdm(excel_df["occupation"], desc="occupation_col"):
        dummy_df = pd.concat([dummy_df, pseudo_lambda(i)], ignore_index=True)
    
    excel_df = excel_df.join(dummy_df.iloc[1:].drop(["dummy_col"], axis=1), rsuffix="_occupation").drop(["occupation"], axis=1)
    
    # unpacking "city" column
    dummy_df = pd.DataFrame({"dummy_col": "NaN"}, index=[0])
    
    for i in tqdm.tqdm(excel_df["city"], desc="city_col"):
        dummy_df = pd.concat([dummy_df, pseudo_lambda(i)], ignore_index=True)
    
    excel_df = excel_df.join(dummy_df.iloc[1:].drop(["dummy_col"], axis=1), rsuffix="_city").drop(["city"], axis=1)
    
    # drops "Unnamed: 0" column
    excel_df = excel_df.drop(["Unnamed: 0"], axis=1)


    return excel_df

In [171]:
df_orig_users = df_processor(df_of_gen_users_excel)

universities_col: 100%|███████████████████████████████████████████████████████████| 3822/3822 [00:04<00:00, 774.26it/s]
schools_uni: 100%|████████████████████████████████████████████████████████████████| 3822/3822 [00:04<00:00, 820.16it/s]
military_col: 100%|██████████████████████████████████████████████████████████████| 3822/3822 [00:02<00:00, 1520.94it/s]
occupation_col: 100%|████████████████████████████████████████████████████████████| 3822/3822 [00:03<00:00, 1253.24it/s]
city_col: 100%|██████████████████████████████████████████████████████████████████| 3822/3822 [00:02<00:00, 1749.67it/s]


In [169]:
df_fr_of_orig_users = df_processor(df_of_friends_of_gen_users_excel)

universities_col: 100%|█████████████████████████████████████████████████████████| 21194/21194 [00:35<00:00, 595.16it/s]
schools_uni: 100%|██████████████████████████████████████████████████████████████| 21194/21194 [00:34<00:00, 609.62it/s]
military_col: 100%|████████████████████████████████████████████████████████████| 21194/21194 [00:12<00:00, 1710.33it/s]
occupation_col: 100%|███████████████████████████████████████████████████████████| 21194/21194 [00:23<00:00, 904.78it/s]
city_col: 100%|████████████████████████████████████████████████████████████████| 21194/21194 [00:15<00:00, 1382.08it/s]


In [172]:
df_orig_users.to_excel("df_of_gen_users_processed.xlsx")
df_fr_of_orig_users.to_excel("df_of_friends_of_gen_users_processed.xlsx")

In [174]:
df_graph_of_vk_users = df_orig_users.merge(
    df_fr_of_orig_users, 
    how="right",
    right_on="friend_of",
    left_on="id",
    suffixes=["_user_y", "_user_x"]
) #friend_of

In [180]:
df_graph_of_vk_users.to_excel("df_graph_of_vk_users.xlsx")

In [181]:
G = nx.from_pandas_edgelist(df_graph_of_vk_users, "id_user_x", "id_user_y")

In [182]:
nx.write_gexf(G, "nx_graph_of_vk_users.gexf")