# **Make final analysis data**

In [1]:
import pandas as pd

In [2]:
subreddits = ['Ask_Politics' , 'Askpolitics', 'PoliticalDiscussion', 'PoliticalDebate', 'NeutralPolitics','politics']

subreddits_reduced = ['Ask_Politics' , 'Askpolitics', 'PoliticalDiscussion', 'PoliticalDebate', 'NeutralPolitics']


In [3]:
# get the final labels

subr_labels = {}

for subr in subreddits_reduced:
    subr_labels[subr] = pd.read_csv(f"../../../data/classified_labels/incl_true_probability/{subr}_Llama_3.1_8B_ft_classified.csv")

subr_labels["Ask_Politics"]

Unnamed: 0,msg_id_parent,msg_id_child,submission_id,predictions_label_ft,predictions_score_ft,predictions_prob_disagree,predictions_ft
0,c7ecp8d,c7ecs2n,14m325,0,0.6180,0.38180,no_disagreement
1,c7ecp8d,c7ecxpk,14m325,0,0.7134,0.28640,no_disagreement
2,c7edr13,c7edsp8,14m26x,0,0.8633,0.13650,no_disagreement
3,c7ee45j,c7ee4yl,14m26x,0,0.9126,0.08750,no_disagreement
4,c7ee45j,c7ee63a,14m26x,0,0.9310,0.06885,no_disagreement
...,...,...,...,...,...,...,...
53420,eildsfz,kcank7h,b1fvxd,1,0.9850,0.98500,disagree
53421,ffre1be,kcpthf9,eufkoe,1,0.6510,0.65100,disagree
53422,jqjjy5q,kd3bh9v,14ot71b,0,0.9800,0.02010,no_disagreement
53423,eh3fmh4,kd4b3a5,attjcv,1,0.8403,0.84030,disagree


In [4]:
# politics missing labels

subr_labels["politics"] = pd.DataFrame()

for i in range(6):
    part = pd.read_csv(f"../../../data/classified_labels/politics_{i}_Llama_3.1_8B_ft_classified.csv")
    subr_labels["politics"] = pd.concat([subr_labels["politics"], part]).reset_index(drop = True)

subr_labels["politics"] 


Unnamed: 0,msg_id_parent,msg_id_child,submission_id,predictions_label_ft,predictions_score_ft,predictions_prob_disagree,predictions_ft
0,c0ae9r5,c0aeb6y,8tp50,0,0.5230,0.52300,no_disagreement
1,c0aeds9,c0aee4d,8tp50,0,0.8896,0.11050,no_disagreement
2,c0aeds9,c0aeglj,8tp50,1,0.7925,0.79250,disagree
3,c0aeds9,c0aehqw,8tp50,1,0.9850,0.98500,disagree
4,c0aecro,c0aegcb,8tp50,1,0.9727,0.97270,disagree
...,...,...,...,...,...,...,...
4168047,kfkh5ib,kfmd52k,18uirzt,0,0.9550,0.04495,no_disagreement
4168048,kfklsb6,kforzbx,18uirzt,0,0.9920,0.00774,no_disagreement
4168049,kfmq5kr,kfp6tzv,18uirzt,1,0.8500,0.85000,disagree
4168050,kfp3d4v,kfp8xh8,18uirzt,0,0.9414,0.05875,no_disagreement


# **Clipped**

In [5]:
# get EMI scores (they have labels, but not the final ones, instead those of 512 tokens)

subr_dict_clipped = {}

for subr in subreddits:
    subr_dict_clipped[subr] = pd.read_csv(f"output_clipped/{subr}_EMI.csv")

    # drop labels
    subr_dict_clipped[subr] = subr_dict_clipped[subr].drop(columns = ["predictions_score_ft", "predictions_ft"])
    

In [6]:
for subr in subreddits:

    print(subr)
    print("Tokens", len(subr_dict_clipped[subr]))
    print("Labels", len(subr_labels[subr]))
    print("-----")

Ask_Politics
Tokens 54203
Labels 53425
-----
Askpolitics
Tokens 6354
Labels 6327
-----
PoliticalDiscussion
Tokens 667335
Labels 663806
-----
PoliticalDebate
Tokens 8738
Labels 8649
-----
NeutralPolitics
Tokens 45221
Labels 43015
-----
politics
Tokens 4171700
Labels 4168052
-----


**Interpretation**

* Classification with token limit 1024 lead to fewer labels, than I have EMI scores for $\to$ retain classified labels


## **Merge the final labels**

In [7]:
final_subr_dict_clipped = {}

for subr in subreddits:
    
    final_subr_dict_clipped[subr] = subr_dict_clipped[subr].merge(subr_labels[subr][['msg_id_parent', "msg_id_child", "submission_id", 
                                                                                     "predictions_label_ft", "predictions_score_ft", "predictions_prob_disagree", "predictions_ft"]],
                                            left_on = ['msg_id_parent', "msg_id_child", "submission_id"], 
                                            right_on = ['msg_id_parent', "msg_id_child", "submission_id"],
                                            how = "inner")


In [8]:
for subr in subreddits:

    print(subr)
    print("Tokens", len(subr_dict_clipped[subr]))
    print("Labels", len(subr_labels[subr]))
    print("Final", len(final_subr_dict_clipped[subr]))
    print("-----")

Ask_Politics
Tokens 54203
Labels 53425
Final 53366
-----
Askpolitics
Tokens 6354
Labels 6327
Final 6319
-----
PoliticalDiscussion
Tokens 667335
Labels 663806
Final 663008
-----
PoliticalDebate
Tokens 8738
Labels 8649
Final 8646
-----
NeutralPolitics
Tokens 45221
Labels 43015
Final 43006
-----
politics
Tokens 4171700
Labels 4168052
Final 4164459
-----


In [9]:
df_all_clipped = pd.DataFrame()

for subr in subreddits:
    df_all_clipped = pd.concat([df_all_clipped, final_subr_dict_clipped[subr]]).reset_index(drop = True)

df_all_clipped

Unnamed: 0,msg_id_parent,msg_id_child,submission_id,subreddit,exact_time_child,author_child,author_parent,author_submission,finetuned_child_E_norm,finetuned_child_I_norm,...,finetuned_parent_len_norm_z_trans_EMI,finetuned_child_len_norm_z_trans_EMI,finetuned_submission_len_norm_z_trans_EMI,self_parent_len_norm_z_trans_EMI,self_child_len_norm_z_trans_EMI,self_submission_len_norm_z_trans_EMI,predictions_label_ft,predictions_score_ft,predictions_prob_disagree,predictions_ft
0,c7ecp8d,c7ecs2n,14m325,Ask_Politics,1355166551.0,karmanaut,Ramblin_Dash,karmanaut,0.056203,0.047469,...,0.635128,0.111758,0.101339,1.240049,0.499480,-0.135574,0,0.6180,0.38180,no_disagreement
1,c7ecp8d,c7ecxpk,14m325,Ask_Politics,1355167049.0,zoolander951,Ramblin_Dash,karmanaut,0.084537,0.065886,...,0.635128,0.224463,0.101339,1.240049,0.861832,-0.135574,0,0.7134,0.28640,no_disagreement
2,c7ecp8d,c7eflag,14m325,Ask_Politics,1355175187.0,zossima,Ramblin_Dash,karmanaut,0.038321,-0.036267,...,0.635128,0.777773,0.101339,1.240049,1.104092,-0.135574,0,0.6580,0.34180,no_disagreement
3,c7efx8q,c7ejh1d,14m325,Ask_Politics,1355187644.0,fathermocker,senatorskeletor,karmanaut,0.028981,-0.000700,...,0.234619,0.314984,0.101339,1.187509,0.728544,-0.135574,1,0.9507,0.95070,disagree
4,c7efx8q,c7f8140,14m325,Ask_Politics,1355286254.0,PKMKII,senatorskeletor,karmanaut,0.018974,-0.067049,...,0.234619,0.886919,0.101339,1.187509,1.356398,-0.135574,0,0.8580,0.14180,no_disagreement
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4938799,kfkh5ib,kfmd52k,18uirzt,politics,1970-01-01 00:00:01.703977996,Knightforlife,freddiethebaer,optimalg,0.249565,0.178504,...,3.112414,0.819523,-0.065721,2.081093,0.213635,-0.224799,0,0.9550,0.04495,no_disagreement
4938800,kfklsb6,kforzbx,18uirzt,politics,1970-01-01 00:00:01.704023024,bakerfredricka,4blockhead,optimalg,0.152297,0.168448,...,1.810992,-0.080931,-0.065721,1.475308,0.316076,-0.224799,0,0.9920,0.00774,no_disagreement
4938801,kfmq5kr,kfp6tzv,18uirzt,politics,1970-01-01 00:00:01.704032059,glassjar1,DoubleTFan,optimalg,0.059733,0.064333,...,0.599825,-0.014813,-0.065721,,-0.405775,-0.224799,1,0.8500,0.85000,disagree
4938802,kfp3d4v,kfp8xh8,18uirzt,politics,1970-01-01 00:00:01.704033069,HaulinBoats,4blockhead,optimalg,0.111970,0.181527,...,-0.241116,-0.622660,-0.065721,0.968095,-0.515354,-0.224799,0,0.9414,0.05875,no_disagreement


In [10]:
df_all_clipped.to_csv("../../../data/analysis_data/df_all_clipped.csv", index = False)

In [11]:
# reimport test

df_all_clipped_reimport = pd.read_csv("../../../data/analysis_data/df_all_clipped.csv")
df_all_clipped_reimport

  df_all_clipped_reimport = pd.read_csv("../../../data/analysis_data/df_all_clipped.csv")


Unnamed: 0,msg_id_parent,msg_id_child,submission_id,subreddit,exact_time_child,author_child,author_parent,author_submission,finetuned_child_E_norm,finetuned_child_I_norm,...,self_child_len_EMI_z_trans,self_submission_len_EMI_z_trans,finetuned_parent_len_norm_z_trans_EMI,finetuned_child_len_norm_z_trans_EMI,finetuned_submission_len_norm_z_trans_EMI,self_parent_len_norm_z_trans_EMI,self_child_len_norm_z_trans_EMI,self_submission_len_norm_z_trans_EMI,predictions_score_ft,predictions_ft
0,c7ecp8d,c7ecs2n,14m325,Ask_Politics,1355166551.0,karmanaut,Ramblin_Dash,karmanaut,0.056203,0.047469,...,0.665806,-0.154065,0.635128,0.111758,0.101339,1.240049,0.499480,-0.135574,0.6180,no_disagreement
1,c7ecp8d,c7ecxpk,14m325,Ask_Politics,1355167049.0,zoolander951,Ramblin_Dash,karmanaut,0.084537,0.065886,...,1.050497,-0.154065,0.635128,0.224463,0.101339,1.240049,0.861832,-0.135574,0.7134,no_disagreement
2,c7ecp8d,c7eflag,14m325,Ask_Politics,1355175187.0,zossima,Ramblin_Dash,karmanaut,0.038321,-0.036267,...,1.268032,-0.154065,0.635128,0.777773,0.101339,1.240049,1.104092,-0.135574,0.6580,no_disagreement
3,c7efx8q,c7ejh1d,14m325,Ask_Politics,1355187644.0,fathermocker,senatorskeletor,karmanaut,0.028981,-0.000700,...,0.844547,-0.154065,0.234619,0.314984,0.101339,1.187509,0.728544,-0.135574,0.9507,disagree
4,c7efx8q,c7f8140,14m325,Ask_Politics,1355286254.0,PKMKII,senatorskeletor,karmanaut,0.018974,-0.067049,...,1.498303,-0.154065,0.234619,0.886919,0.101339,1.187509,1.356398,-0.135574,0.8580,no_disagreement
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4938799,kfkh5ib,kfmd52k,18uirzt,politics,1970-01-01 00:00:01.703977996,Knightforlife,freddiethebaer,optimalg,0.249565,0.178504,...,0.373386,-0.194895,3.112414,0.819523,-0.065721,2.081093,0.213635,-0.224799,0.9550,no_disagreement
4938800,kfklsb6,kforzbx,18uirzt,politics,1970-01-01 00:00:01.704023024,bakerfredricka,4blockhead,optimalg,0.152297,0.168448,...,0.537517,-0.194895,1.810992,-0.080931,-0.065721,1.475308,0.316076,-0.224799,0.9920,no_disagreement
4938801,kfmq5kr,kfp6tzv,18uirzt,politics,1970-01-01 00:00:01.704032059,glassjar1,DoubleTFan,optimalg,0.059733,0.064333,...,-0.298872,-0.194895,0.599825,-0.014813,-0.065721,,-0.405775,-0.224799,0.8500,disagree
4938802,kfp3d4v,kfp8xh8,18uirzt,politics,1970-01-01 00:00:01.704033069,HaulinBoats,4blockhead,optimalg,0.111970,0.181527,...,-0.367522,-0.194895,-0.241116,-0.622660,-0.065721,0.968095,-0.515354,-0.224799,0.9414,no_disagreement


# **Non Clipped**

In [3]:
# get EMI scores (they have labels, but not the final ones, instead those of 512 tokens)

subr_dict_non_clipped = {}

for subr in subreddits:
    subr_dict_non_clipped[subr] = pd.read_csv(f"output_non_clipped/{subr}_EMI.csv")

    # drop labels
    subr_dict_non_clipped[subr] = subr_dict_non_clipped[subr].drop(columns = ["predictions_score_ft", "predictions_ft"])
    

In [12]:
for subr in subreddits:

    print(subr)
    print("Tokens", len(subr_dict_non_clipped[subr]))
    print("Labels", len(subr_labels[subr]))
    print("-----")

Ask_Politics
Tokens 54203
Labels 53425
-----
Askpolitics
Tokens 6354
Labels 6327
-----
PoliticalDiscussion
Tokens 667335
Labels 663806
-----
PoliticalDebate
Tokens 8738
Labels 8649
-----
NeutralPolitics
Tokens 45221
Labels 43015
-----
politics
Tokens 4171700
Labels 4168052
-----


**Interpretation**

* Classification with token limit 1024 lead to fewer labels, than I have EMI scores for $\to$ retain classified labels


## **Merge the final labels**

In [13]:
final_subr_dict_non_clipped = {}

for subr in subreddits:
    
    final_subr_dict_non_clipped[subr] = subr_dict_non_clipped[subr].merge(subr_labels[subr][['msg_id_parent', "msg_id_child", "submission_id", 
                                                                                             "predictions_label_ft", "predictions_score_ft", 
                                                                                             "predictions_prob_disagree", "predictions_ft"]],
                                            left_on = ['msg_id_parent', "msg_id_child", "submission_id"], 
                                            right_on = ['msg_id_parent', "msg_id_child", "submission_id"],
                                            how = "inner")


In [14]:
for subr in subreddits:

    print(subr)
    print("Tokens", len(subr_dict_non_clipped[subr]))
    print("Labels", len(subr_labels[subr]))
    print("Final", len(final_subr_dict_non_clipped[subr]))
    print("-----")

Ask_Politics
Tokens 54203
Labels 53425
Final 53366
-----
Askpolitics
Tokens 6354
Labels 6327
Final 6319
-----
PoliticalDiscussion
Tokens 667335
Labels 663806
Final 663008
-----
PoliticalDebate
Tokens 8738
Labels 8649
Final 8646
-----
NeutralPolitics
Tokens 45221
Labels 43015
Final 43006
-----
politics
Tokens 4171700
Labels 4168052
Final 4164459
-----


In [15]:
df_all_non_clipped = pd.DataFrame()

for subr in subreddits:
    df_all_non_clipped = pd.concat([df_all_non_clipped, final_subr_dict_non_clipped[subr]]).reset_index(drop = True)

df_all_non_clipped

Unnamed: 0,msg_id_parent,msg_id_child,submission_id,subreddit,exact_time_child,author_child,author_parent,author_submission,finetuned_child_E_norm,finetuned_child_I_norm,...,finetuned_parent_len_norm_z_trans_EMI,finetuned_child_len_norm_z_trans_EMI,finetuned_submission_len_norm_z_trans_EMI,self_parent_len_norm_z_trans_EMI,self_child_len_norm_z_trans_EMI,self_submission_len_norm_z_trans_EMI,predictions_label_ft,predictions_score_ft,predictions_prob_disagree,predictions_ft
0,c7ecp8d,c7ecs2n,14m325,Ask_Politics,1355166551.0,karmanaut,Ramblin_Dash,karmanaut,0.053025,0.072843,...,0.514532,-0.116730,0.594431,0.624110,-0.011457,0.172262,0,0.6180,0.38180,no_disagreement
1,c7ecp8d,c7ecxpk,14m325,Ask_Politics,1355167049.0,zoolander951,Ramblin_Dash,karmanaut,0.040605,0.001874,...,0.514532,0.363800,0.594431,0.624110,0.012864,0.172262,0,0.7134,0.28640,no_disagreement
2,c7ecp8d,c7eflag,14m325,Ask_Politics,1355175187.0,zossima,Ramblin_Dash,karmanaut,0.037034,-0.026841,...,0.514532,0.571814,0.594431,0.624110,0.600814,0.172262,0,0.6580,0.34180,no_disagreement
3,c7efx8q,c7ejh1d,14m325,Ask_Politics,1355187644.0,fathermocker,senatorskeletor,karmanaut,0.046751,0.040446,...,0.269966,0.096978,0.594431,0.130100,0.070296,0.172262,1,0.9507,0.95070,disagree
4,c7efx8q,c7f8140,14m325,Ask_Politics,1355286254.0,PKMKII,senatorskeletor,karmanaut,0.029991,-0.043360,...,0.269966,0.644853,0.594431,0.130100,0.612353,0.172262,0,0.8580,0.14180,no_disagreement
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4938799,kfkh5ib,kfmd52k,18uirzt,politics,1970-01-01 00:00:01.703977996,Knightforlife,freddiethebaer,optimalg,0.289315,0.222072,...,1.250681,0.671616,-0.580210,1.062374,0.498045,-0.857240,0,0.9550,0.04495,no_disagreement
4938800,kfklsb6,kforzbx,18uirzt,politics,1970-01-01 00:00:01.704023024,bakerfredricka,4blockhead,optimalg,0.192579,0.210383,...,1.367783,-0.043416,-0.580210,1.165189,-0.381238,-0.857240,0,0.9920,0.00774,no_disagreement
4938801,kfmq5kr,kfp6tzv,18uirzt,politics,1970-01-01 00:00:01.704032059,glassjar1,DoubleTFan,optimalg,0.078374,0.101258,...,-0.197753,-0.140261,-0.580210,0.242695,-0.122822,-0.857240,1,0.8500,0.85000,disagree
4938802,kfp3d4v,kfp8xh8,18uirzt,politics,1970-01-01 00:00:01.704033069,HaulinBoats,4blockhead,optimalg,0.137089,0.224533,...,0.600514,-0.617072,-0.580210,0.207855,-0.789836,-0.857240,0,0.9414,0.05875,no_disagreement


In [16]:
df_all_non_clipped.to_csv("../../../data/analysis_data/df_all_non_clipped.csv", index = False)

In [18]:
# reimport test

df_all_non_clipped_reimport = pd.read_csv("../../../data/analysis_data/df_all_non_clipped.csv")
df_all_non_clipped_reimport

  df_all_non_clipped_reimport = pd.read_csv("../../../data/analysis_data/df_all_non_clipped.csv")


Unnamed: 0,msg_id_parent,msg_id_child,submission_id,subreddit,exact_time_child,author_child,author_parent,author_submission,finetuned_child_E_norm,finetuned_child_I_norm,...,self_child_len_EMI_z_trans,self_submission_len_EMI_z_trans,finetuned_parent_len_norm_z_trans_EMI,finetuned_child_len_norm_z_trans_EMI,finetuned_submission_len_norm_z_trans_EMI,self_parent_len_norm_z_trans_EMI,self_child_len_norm_z_trans_EMI,self_submission_len_norm_z_trans_EMI,predictions_score_ft,predictions_ft
0,c7ecp8d,c7ecs2n,14m325,Ask_Politics,1355166551.0,karmanaut,Ramblin_Dash,karmanaut,0.053025,0.072843,...,-0.001232,0.214311,0.514532,-0.116730,0.594431,0.624110,-0.011457,0.172262,0.6180,no_disagreement
1,c7ecp8d,c7ecxpk,14m325,Ask_Politics,1355167049.0,zoolander951,Ramblin_Dash,karmanaut,0.040605,0.001874,...,0.024870,0.214311,0.514532,0.363800,0.594431,0.624110,0.012864,0.172262,0.7134,no_disagreement
2,c7ecp8d,c7eflag,14m325,Ask_Politics,1355175187.0,zossima,Ramblin_Dash,karmanaut,0.037034,-0.026841,...,0.726132,0.214311,0.514532,0.571814,0.594431,0.624110,0.600814,0.172262,0.6580,no_disagreement
3,c7efx8q,c7ejh1d,14m325,Ask_Politics,1355187644.0,fathermocker,senatorskeletor,karmanaut,0.046751,0.040446,...,0.101941,0.214311,0.269966,0.096978,0.594431,0.130100,0.070296,0.172262,0.9507,disagree
4,c7efx8q,c7f8140,14m325,Ask_Politics,1355286254.0,PKMKII,senatorskeletor,karmanaut,0.029991,-0.043360,...,0.725678,0.214311,0.269966,0.644853,0.594431,0.130100,0.612353,0.172262,0.8580,no_disagreement
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4938799,kfkh5ib,kfmd52k,18uirzt,politics,1970-01-01 00:00:01.703977996,Knightforlife,freddiethebaer,optimalg,0.289315,0.222072,...,0.720232,-0.732976,1.250681,0.671616,-0.580210,1.062374,0.498045,-0.857240,0.9550,no_disagreement
4938800,kfklsb6,kforzbx,18uirzt,politics,1970-01-01 00:00:01.704023024,bakerfredricka,4blockhead,optimalg,0.192579,0.210383,...,-0.295144,-0.732976,1.367783,-0.043416,-0.580210,1.165189,-0.381238,-0.857240,0.9920,no_disagreement
4938801,kfmq5kr,kfp6tzv,18uirzt,politics,1970-01-01 00:00:01.704032059,glassjar1,DoubleTFan,optimalg,0.078374,0.101258,...,-0.075895,-0.732976,-0.197753,-0.140261,-0.580210,0.242695,-0.122822,-0.857240,0.8500,disagree
4938802,kfp3d4v,kfp8xh8,18uirzt,politics,1970-01-01 00:00:01.704033069,HaulinBoats,4blockhead,optimalg,0.137089,0.224533,...,-0.749237,-0.732976,0.600514,-0.617072,-0.580210,0.207855,-0.789836,-0.857240,0.9414,no_disagreement
