***
# **Required Variable Settings** 

In [10]:
subreddit = "Ask_Politics"

pos_mean_pb_threshold = 0.75
neg_mean_pb_treshold = 0.25
var_threshold = 0.025


model = "Llama_3.3_70B_Instruct_scale"


# define optional sample fraction 

sample_boolean = False
sample_fraction = 0.1

num_groups = 2

# adapt file name for csv saving
import_path = f"output/{model}/{subreddit}_{model}_filtered_relations.csv"
output_path = f"output/{model}/optimization_data_{subreddit}_{model}_frac1.csv"


***

In [2]:
import pandas as pd
import numpy as np
import networkx as nx
import json
import matplotlib.pyplot as plt
import seaborn as sns
import community as community_louvain

from sklearn.model_selection import train_test_split
from scipy import sparse
import random
from itertools import combinations
import collections
import multiprocessing
import copy
import time
import gurobipy
from gurobipy import *

In [4]:
# # import self written functions
# from functions.script_faultana_functions import filter_subreddit
# from functions.script_faultana_functions import aggregate_interactions_directed
# from functions.script_faultana_functions import get_counts_and_shares
# from functions.script_faultana_functions import find_max
# from functions.script_faultana_functions import build_undirected


In [4]:
# import prepared data for optimization
data_for_optim = pd.read_csv(import_path)
data_for_optim

Unnamed: 0,user_1,user_2,neg,neu,pos,interact,final_edge,final_edge_num,direction,neg_adj,pos_adj,interact_adj,mean_pb,variance,old_var,edge_adj,user_1_id,user_2_id
0,-ThePhallus-,sleep-apnea,0.0,0,2.0,2.0,pos,1,bi,0.0,2.0,2.0,1.0,0.0,0.0,1.0,g8jh6lh,i4kbvbu
1,02C_here,AdvocateReason,0.0,0,3.0,3.0,pos,1,bi,0.0,3.0,3.0,1.0,0.0,0.0,1.0,fsqbss8,fwkxx37
2,2343252621,bassadorable,0.0,0,2.0,2.0,pos,1,bi,0.0,2.0,2.0,1.0,0.0,0.0,1.0,gt3s16g,gra3nqc
3,2A_is_the_best_A,Dennis_Langley,4.0,0,0.0,4.0,neg,-1,bi,4.0,0.0,4.0,0.0,0.0,0.0,-1.0,f1fpzvz,g78czp4
4,2A_is_the_best_A,EbilSmurfs,2.0,0,0.0,2.0,neg,-1,bi,2.0,0.0,2.0,0.0,0.0,0.0,-1.0,f1fpzvz,dce4hul
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44087,zvika,eecity,0.0,0,1.0,1.0,pos,1,uni,0.0,1.0,1.0,1.0,0.0,0.0,1.0,fzdljst,fwl0qku
44088,zyadcvnhgb,CoolLikeAFoolinaPool,0.0,0,1.0,1.0,pos,1,uni,0.0,1.0,1.0,1.0,0.0,0.0,1.0,d6zd2tg,ctfsd5v
44089,zyadcvnhgb,Steavee,1.0,0,0.0,1.0,neg,-1,uni,1.0,0.0,1.0,0.0,0.0,0.0,-1.0,d6zd2tg,fkqlew5
44090,zyadcvnhgb,cruisintom,0.0,0,1.0,1.0,pos,1,uni,0.0,1.0,1.0,1.0,0.0,0.0,1.0,d6zd2tg,ctfsaor


### Make data

In [5]:
# whether to sample or not, defined in the beginning

if sample_boolean:

    # Perform a stratified split
    net_sample, _ = train_test_split(
        data_for_optim[(data_for_optim['edge_adj'] == 1) | (data_for_optim['edge_adj'] == -1)],
        test_size = 1 - sample_fraction,
        stratify = data_for_optim[(data_for_optim['edge_adj'] == 1) | (data_for_optim['edge_adj'] == -1)]['edge_adj'],
        random_state=42
    )

else: # take whole df
    net_sample = data_for_optim

In [6]:
net_sample

Unnamed: 0,user_1,user_2,neg,neu,pos,interact,final_edge,final_edge_num,direction,neg_adj,pos_adj,interact_adj,mean_pb,variance,old_var,edge_adj,user_1_id,user_2_id
0,-ThePhallus-,sleep-apnea,0.0,0,2.0,2.0,pos,1,bi,0.0,2.0,2.0,1.0,0.0,0.0,1.0,g8jh6lh,i4kbvbu
1,02C_here,AdvocateReason,0.0,0,3.0,3.0,pos,1,bi,0.0,3.0,3.0,1.0,0.0,0.0,1.0,fsqbss8,fwkxx37
2,2343252621,bassadorable,0.0,0,2.0,2.0,pos,1,bi,0.0,2.0,2.0,1.0,0.0,0.0,1.0,gt3s16g,gra3nqc
3,2A_is_the_best_A,Dennis_Langley,4.0,0,0.0,4.0,neg,-1,bi,4.0,0.0,4.0,0.0,0.0,0.0,-1.0,f1fpzvz,g78czp4
4,2A_is_the_best_A,EbilSmurfs,2.0,0,0.0,2.0,neg,-1,bi,2.0,0.0,2.0,0.0,0.0,0.0,-1.0,f1fpzvz,dce4hul
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44087,zvika,eecity,0.0,0,1.0,1.0,pos,1,uni,0.0,1.0,1.0,1.0,0.0,0.0,1.0,fzdljst,fwl0qku
44088,zyadcvnhgb,CoolLikeAFoolinaPool,0.0,0,1.0,1.0,pos,1,uni,0.0,1.0,1.0,1.0,0.0,0.0,1.0,d6zd2tg,ctfsd5v
44089,zyadcvnhgb,Steavee,1.0,0,0.0,1.0,neg,-1,uni,1.0,0.0,1.0,0.0,0.0,0.0,-1.0,d6zd2tg,fkqlew5
44090,zyadcvnhgb,cruisintom,0.0,0,1.0,1.0,pos,1,uni,0.0,1.0,1.0,1.0,0.0,0.0,1.0,d6zd2tg,ctfsaor


In [7]:
# extract users of this sample --> could be a sub-sample

user_sample = set(list(net_sample['user_1']) + list(net_sample['user_2']))

# then assign ids
ids = list(range(len(user_sample)))
name_to_optimization_id_dict = dict(zip(user_sample, ids))

In [8]:
# assign id to each user
net_sample['user_1_optimization_id'] = [name_to_optimization_id_dict[user] for user in net_sample['user_1']]
net_sample['user_2_optimization_id'] = [name_to_optimization_id_dict[user] for user in net_sample['user_2']]
net_sample

Unnamed: 0,user_1,user_2,neg,neu,pos,interact,final_edge,final_edge_num,direction,neg_adj,pos_adj,interact_adj,mean_pb,variance,old_var,edge_adj,user_1_id,user_2_id,user_1_optimization_id,user_2_optimization_id
0,-ThePhallus-,sleep-apnea,0.0,0,2.0,2.0,pos,1,bi,0.0,2.0,2.0,1.0,0.0,0.0,1.0,g8jh6lh,i4kbvbu,8611,10200
1,02C_here,AdvocateReason,0.0,0,3.0,3.0,pos,1,bi,0.0,3.0,3.0,1.0,0.0,0.0,1.0,fsqbss8,fwkxx37,6880,15186
2,2343252621,bassadorable,0.0,0,2.0,2.0,pos,1,bi,0.0,2.0,2.0,1.0,0.0,0.0,1.0,gt3s16g,gra3nqc,12419,13478
3,2A_is_the_best_A,Dennis_Langley,4.0,0,0.0,4.0,neg,-1,bi,4.0,0.0,4.0,0.0,0.0,0.0,-1.0,f1fpzvz,g78czp4,6822,3065
4,2A_is_the_best_A,EbilSmurfs,2.0,0,0.0,2.0,neg,-1,bi,2.0,0.0,2.0,0.0,0.0,0.0,-1.0,f1fpzvz,dce4hul,6822,4197
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44087,zvika,eecity,0.0,0,1.0,1.0,pos,1,uni,0.0,1.0,1.0,1.0,0.0,0.0,1.0,fzdljst,fwl0qku,10121,705
44088,zyadcvnhgb,CoolLikeAFoolinaPool,0.0,0,1.0,1.0,pos,1,uni,0.0,1.0,1.0,1.0,0.0,0.0,1.0,d6zd2tg,ctfsd5v,17489,14209
44089,zyadcvnhgb,Steavee,1.0,0,0.0,1.0,neg,-1,uni,1.0,0.0,1.0,0.0,0.0,0.0,-1.0,d6zd2tg,fkqlew5,17489,10115
44090,zyadcvnhgb,cruisintom,0.0,0,1.0,1.0,pos,1,uni,0.0,1.0,1.0,1.0,0.0,0.0,1.0,d6zd2tg,ctfsaor,17489,1104


In [9]:
net_sample = net_sample[['user_1_optimization_id', 'user_2_optimization_id', 'user_1_id', 'user_2_id', 'final_edge_num']].reset_index(drop = True)
net_sample.rename(columns={'user_1_optimization_id' : 'source', 'user_2_optimization_id' : 'target', 'final_edge_num' : 'sign'}, inplace=True)
net_sample

Unnamed: 0,source,target,user_1_id,user_2_id,sign
0,8611,10200,g8jh6lh,i4kbvbu,1
1,6880,15186,fsqbss8,fwkxx37,1
2,12419,13478,gt3s16g,gra3nqc,1
3,6822,3065,f1fpzvz,g78czp4,-1
4,6822,4197,f1fpzvz,dce4hul,-1
...,...,...,...,...,...
44087,10121,705,fzdljst,fwl0qku,1
44088,17489,14209,d6zd2tg,ctfsd5v,1
44089,17489,10115,d6zd2tg,fkqlew5,-1
44090,17489,1104,d6zd2tg,ctfsaor,1


In [11]:
# save: name defined in beginning of notebook

net_sample.to_csv(output_path, index = False)

In [12]:
net_sample

Unnamed: 0,source,target,user_1_id,user_2_id,sign
0,8611,10200,g8jh6lh,i4kbvbu,1
1,6880,15186,fsqbss8,fwkxx37,1
2,12419,13478,gt3s16g,gra3nqc,1
3,6822,3065,f1fpzvz,g78czp4,-1
4,6822,4197,f1fpzvz,dce4hul,-1
...,...,...,...,...,...
44087,10121,705,fzdljst,fwl0qku,1
44088,17489,14209,d6zd2tg,ctfsd5v,1
44089,17489,10115,d6zd2tg,fkqlew5,-1
44090,17489,1104,d6zd2tg,ctfsaor,1
