# Re-allocation
This is the final step of topic modeling. 

In the first step, we used a SOTA embedding model to obtain vector representations of all answers and fitted a range of clustering models. Based on the intrinsic metrics, the extrinsic metrics, and 2D plots of the clusters, we concluded that the HDBSCAN model with 100-dimensional features and a min_samples/min_cluster_size of 60 generates the best result. We then called a SOTA generative model to summarise the topic of each cluster. In the second step, we combined, added, split, and fine-tuned the topics to make them more reasonable from a human perspective. We tried a series of re-summarisations and repeatedly reviewed the re-allocation results for these different topic sets before finally deciding on one set of topics.

Now in the third step, we are going to do a final re-allocation of the points, keeping the original clustering information as much as possible while adapting to the re-summarised set of topics. 



## Read in the original clustering results, re-summarised set of topics

In [None]:
# Load the per-answer cluster labels written by the two HDBSCAN runs
# (dm100/sz60 and dm100/sz40).
import pandas as pd


def _read_cluster_labels(csv_path, label_column):
    """Read `text_id` and `cluster_topic` from csv_path and rename the topic column."""
    labels = pd.read_csv(csv_path, usecols=["text_id", "cluster_topic"])
    return labels.rename(columns={"cluster_topic": label_column})


m_hdb10060_results = _read_cluster_labels(
    "06 Data analysis/04 Topic Modeling/outputs/GPT_summarization/ds1_sub_clustering_sum/m_hdb_dm100_sz60_sub_all_rows.csv",
    "hdb10060_original",
)
m_hdb10040_results = _read_cluster_labels(
    "06 Data analysis/04 Topic Modeling/outputs/GPT_summarization/ds1_sub_clustering_sum/m_hdb_dm100_sz40_sub_all_rows.csv",
    "hdb10040_original",
)

# Ids of the answers that the sz40 run put into the academic-grading cluster;
# the re-allocation step keeps these answers in that topic.
_academic_rows = m_hdb10040_results.query("hdb10040_original=='Power Imbalance in Academic Grading'")
academic_text_id_list_hdb10040 = list(_academic_rows["text_id"])

In [None]:
import re

# Load every answer, then attach the label of the original HDBSCAN
# (dm100/sz60) clustering to it.
text_df = pd.read_csv("06 Data analysis/00 data/python_datasets/dataset1_text_ID_created.csv")
text_df = text_df.merge(m_hdb10060_results, on="text_id", how="left")

original_labels = text_df["hdb10060_original"]
# Answers with no row in the clustering output get a placeholder label.
original_labels = original_labels.fillna("Relevant_PNAS_0")
# This cluster is folded into "Other".
original_labels = original_labels.mask(
    original_labels == "Experiences of Powerlessness and Regaining Control",
    "Other",
)
# Strip stray double quotes from the labels. This runs after the comparison
# above, so that comparison sees the raw (possibly quoted) label.
double_quote = re.compile(r'"')
text_df["hdb10060_original"] = original_labels.map(lambda label: double_quote.sub("", label))


In [None]:
# Load the human-refined topic table: one row per topic, holding its id, the raw
# GPT summary, and the final (re-summarised) topic name.
topics = pd.read_csv(
    "06 Data analysis/04 Topic Modeling/outputs/GPT_summarization/ds1_sub_clustering_sum/m_hdb_dm100_sz60_sub_topics_info_Human_Refined.csv",
    usecols=["topic_id", "GPT_summ_raw", "renamed_no_power3_splitted"],
)
topics = topics.rename(columns={"renamed_no_power3_splitted": "re_summarised_topic"})

In [None]:
# Load the pre-calculated pairwise similarities between each answer and each topic.
# As used further down, rows are looked up by text_id and the column labels are topic ids.
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
import pickle

pw_sim_path = "06 Data analysis/04 Topic Modeling/outputs/ds1_ex_post_guided_learning/pairwise_relevance/pw_relevance_topics_no_power3_splitted.DataFrame"
with open(pw_sim_path, "rb") as handle:
    pw_sim = pickle.load(handle)

## Re-allocate each answer

In [7]:
# Temporary column `renamed_topics`: the re-summarised name of each answer's
# original cluster (no answer has been re-allocated yet).
# NOTE: rows 0, 2, 4 and 10 of `topics` are dropped by their default integer row
# labels, so this cell has to run before `topics` is re-indexed by `topic_id`.
kept_topics = topics.drop(index=[0, 2, 4, 10])
text_df = (
    text_df.merge(
        kept_topics,
        left_on="hdb10060_original",
        right_on="GPT_summ_raw",
        how="left",
    )
    .drop(columns=["topic_id", "GPT_summ_raw"])
    .rename(columns={"re_summarised_topic": "renamed_topics"})
)

In [8]:
# Index the topic table by topic id (the `topic_id` column is kept as well) so the
# re-allocation rules can look up a topic name with `topics.loc[<topic id>]`.
topics = topics.set_index("topic_id", drop=False)

# the logic of re-allocating each row

# Original labels treated as noise / not clustered: such answers simply go to
# their nearest topic.
_NEAREST_TOPIC_LABELS = (
    "Other",
    "Relevant_PNAS_0",
    "Experiences of Power Dynamics in Influence and Control Over Desired Outcomes",
)
# The two topics added by hand; an answer closest to one of them moves there.
_ADDED_TOPIC_IDS = ("Topic_13_Acade...", "Topic_14_Nonse...")
# The two original clusters about job / workplace / professional context.
_WORKPLACE_LABELS = (
    "Professional Leadership and Decision-Making Experiences",
    "Feelings of powerlessness and lack of control in the workplace due to managerial decisions and authority.",
)
# Candidate topics for the two relationship-related original clusters.
# NOTE(review): some ids end in "..." and others in the single character "…";
# they are kept exactly as written and must match the labels in `pw_sim`/`topics`.
_DEPENDENTS_CANDIDATES = ["Topic_5_Relat...", "Topic_11_Relat...", "Topic_15_Sibli…", "Topic_16_Pets…", "Topic_17_Friends…"]
_RELATIONSHIP_CANDIDATES = ["Topic_11_Relat...", "Topic_17_Friends…"]


def re_allocate_each_answer(row):
    """Return the final (re-summarised) topic name for one answer.

    Reads the module-level `academic_text_id_list_hdb10040`, `pw_sim` (looked up
    by text_id, columns are topic ids) and `topics` (indexed by topic_id).
    The rules are applied in order and the first match wins.

    Args:
        row: one row of `text_df`; uses `text_id`, `hdb10060_original` and
            `renamed_topics`.

    Returns:
        The `re_summarised_topic` of the selected topic, or the row's
        `renamed_topics` value when no rule applies.
    """
    text_id = row["text_id"]

    # 1. Clustered as academic grading in the HDB_100_40 model: keep that result.
    if text_id in academic_text_id_list_hdb10040:
        return topics.loc["Topic_13_Acade...", "re_summarised_topic"]

    original_label = row["hdb10060_original"]
    # Most similar topic overall, computed once and shared by rules 2 and 3.
    nearest_topic_id = pw_sim.loc[text_id].idxmax()

    # 2. Noise or not included in the clustering: allocate to the nearest topic.
    if original_label in _NEAREST_TOPIC_LABELS:
        return topics.loc[nearest_topic_id, "re_summarised_topic"]

    # 3. Closest to academic grading or nonsense: move to that added topic.
    if nearest_topic_id in _ADDED_TOPIC_IDS:
        return topics.loc[nearest_topic_id, "re_summarised_topic"]

    # 4. Merge the two job / workplace / professional clusters into one topic.
    if original_label in _WORKPLACE_LABELS:
        return topics.loc["Topic_1_Profe...", "re_summarised_topic"]

    # 5. Relationship-related clusters: pick the best of a restricted candidate set.
    if original_label == "Exerting Control Over Dependents (Children, Siblings, or Pets)":
        best_candidate = pw_sim.loc[text_id, _DEPENDENTS_CANDIDATES].idxmax()
        return topics.loc[best_candidate, "re_summarised_topic"]
    if original_label == "Experiencing Feelings of Powerlessness and Control in Relationships":
        best_candidate = pw_sim.loc[text_id, _RELATIONSHIP_CANDIDATES].idxmax()
        return topics.loc[best_candidate, "re_summarised_topic"]

    # 6. Everything else keeps its original cluster, under the modified name.
    return row["renamed_topics"]

In [9]:
# Apply the re-allocation rules to every answer, one row at a time.
text_df["re_allocation"] = text_df.apply(re_allocate_each_answer, axis="columns")

In [10]:
# Variant of the final allocation in which answers with Relevant_PNAS == 0 are not
# re-allocated: they carry the placeholder label "Relevant_PNAS_0" instead.
text_df["re_allocation_without_RelePANS0"] = text_df["re_allocation"].mask(
    text_df["Relevant_PNAS"] == 0,
    "Relevant_PNAS_0",
)

In [None]:
# Save the final allocation; the temporary `renamed_topics` column is left out of the file.
final_results = text_df.drop(columns=["renamed_topics"])
final_results.to_csv(
    "06 Data analysis/04 Topic Modeling/outputs/final_topic_modeling_results/ds1_topic_modeling_results.csv",
    index=False,
)

## Plots of topics X power conditions

We use Plotly rather than BERTopic to create our topics-per-condition plots, since our clustering results are customized. 


The default normalization in BERTopic's topics_per_class uses L2 normalization: the frequencies of the conditions are first normalized within each topic, and the topics per condition are then plotted using the normalized values. In this way, the per-condition values are no longer affected by differences in topic size. 


We apply the same normalization in our custom code. 


In [None]:
# Reload the saved final allocation so the plotting section can run on its own.
results_csv = "06 Data analysis/04 Topic Modeling/outputs/final_topic_modeling_results/ds1_topic_modeling_results.csv"
text_df = pd.read_csv(results_csv)


In [13]:
# Sanity check: number of answers in each experimental condition.
text_df["Condition"].value_counts()


Condition
LP                 3460
HP                 2442
C (Grocery)        1440
C (Last Meal)       624
C (Equal Power)     434
Name: count, dtype: int64

In [14]:
# Maps the condition codes stored in the data to the readable labels shown in the plots.
abbreviations = {
    "HP": "High Power",
    "LP": "Low Power",
    "C (Grocery)": "Grocery",
    "C (Last Meal)": "Last Meal",
    "C (Equal Power)": "Equal Power",
}

In [15]:
# Swap each condition code for its readable label; values without an entry stay as they are.
condition_codes = text_df["Condition"]
text_df["Condition"] = condition_codes.replace(to_replace=abbreviations)

In [16]:
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import normalize



# Group by class and topic to get the counts

def plot_normalized_topic_per_condition(clustering_result_df, condition_column_name='Condition', topic_column_name='re_allocation'):
    """Build a horizontal grouped bar chart of normalized topic frequency per condition.

    The answers are counted per (condition, topic) pair. Within each topic the
    counts over the conditions are then L2-normalized, so that topics of
    different sizes are comparable.

    Args:
        clustering_result_df: DataFrame with one row per answer, containing the
            condition and topic columns named below.
        condition_column_name: Column holding the condition of each answer.
        topic_column_name: Column holding the topic of each answer.

    Returns:
        The Plotly figure (one bar group per condition, one colour per topic).
    """
    # Count the answers in every (condition, topic) combination.
    topics_per_class = (
        clustering_result_df
        .groupby([condition_column_name, topic_column_name])
        .size()
        .reset_index(name='counts')
    )

    def l2_normalize(counts):
        # `normalize` expects a 2-D input; axis=0 normalizes the single column.
        return normalize(counts.to_frame(), axis=0).ravel()

    # Normalize with `transform` on the counts column only. The earlier
    # `groupby(...).apply(...)` mutated each group and relied on the grouping
    # column being kept in the result, which newer pandas versions deprecate.
    topics_per_class['normalized_frequency'] = (
        topics_per_class.groupby(topic_column_name)['counts'].transform(l2_normalize)
    )
    # Keep the topic-major row order the chart was built from before, so the
    # order of the legend entries and bars does not change.
    topics_per_class = topics_per_class.sort_values(
        [topic_column_name, condition_column_name], kind='stable'
    ).reset_index(drop=True)

    # Create the plot.
    fig = px.bar(
        topics_per_class,
        y=condition_column_name,
        x='normalized_frequency',
        color=topic_column_name,
        barmode='group',
        orientation='h',
        title='Topics per Condition',
        color_discrete_sequence=px.colors.qualitative.Light24,
    )

    # Customize the layout.
    fig.update_layout(
        width=1500,
        height=2000,
        yaxis_title='Condition',
        xaxis_title='Normalized Frequency',
        legend_title='Topic',
        template='plotly_white',
    )

    return fig



In [None]:
# Topics per condition for all answers.
all_points_fig = plot_normalized_topic_per_condition(text_df)
all_points_fig.write_html("06 Data analysis/04 Topic Modeling/outputs/visual_topicXcondition/ds1_topicXcondition_final.html")

In [None]:
# Topics per condition for the answers with Relevant_PNAS == 1 only (PNAS0 answers excluded).
relevant_answers = text_df[text_df["Relevant_PNAS"] == 1]
relevant_fig = plot_normalized_topic_per_condition(relevant_answers)
relevant_fig.write_html("06 Data analysis/04 Topic Modeling/outputs/visual_topicXcondition/ds1_topicXcondition_final_no_PNAS0.html")