#### Import Libraries

In [None]:
import pandas as pd

#### Import Data

In [None]:
# Load the filtered content table; the path is relative to the notebook's working directory.
content = pd.read_csv("../data/content_filtered.csv")

#### Create combine function

In [None]:
def combine(correlations: pd.DataFrame, topics: pd.DataFrame, content: pd.DataFrame):
    '''
    Join topic and content text features onto the exploded topic/content correlations.

    - Each topic row is reduced to its "id" plus one "features" string built from
      "language", "channel", "title" and "description" (space-separated); each content
      row likewise from "language", "title", "description" and "text".
    - Missing content "text" / topic "description" become empty strings; rows that still
      contain a missing value in any column are dropped.
    - "content_ids" in correlations is split on whitespace so every (topic, content) pair
      gets its own row, then inner-joined with both feature tables.
    - The input DataFrames are not modified, so the function can be called repeatedly
      (e.g. when re-running a notebook cell) with the same arguments.

    Returns a 4-tuple:
    - corr_topics: DataFrame with columns "id" (row position of the pair after the topic
      join) and "features" (topic text).
    - corr_content: DataFrame with the same "id" column and "features" (content text).
    - index_to_topic: dict mapping each output row label to its topic id.
    - index_to_content: dict mapping each output row label to its content id.
    '''
    # Build the derived frames with assign() instead of writing columns back into the
    # arguments: the original in-place writes changed the caller's data, and a second
    # call then ran .str.split() on already-split lists, producing NaN and empty joins.
    content = content.assign(text=content["text"].fillna("")).dropna()
    content_combined = pd.DataFrame({
        "id": content["id"],
        "features": (
            content["language"] + " " + content["title"] + " "
            + content["description"] + " " + content["text"]
        ),
    })

    topics = topics.assign(description=topics["description"].fillna("")).dropna()
    topics_combined = pd.DataFrame({
        "id": topics["id"],
        "features": (
            topics["language"] + " " + topics["channel"] + " "
            + topics["title"] + " " + topics["description"]
        ),
    })

    # One row per (topic, content) pair.
    correlations = correlations.assign(
        content_ids=correlations["content_ids"].str.split()
    ).explode("content_ids")

    # Attach topic features first; reset_index() then records each pair's position in an
    # "index" column so the original pair order can be restored after the content join.
    merged = correlations.merge(topics_combined, how="inner", left_on="topic_id", right_on="id")
    merged = (
        merged.reset_index()
        .merge(
            content_combined,
            how="inner",
            left_on="content_ids",
            right_on="id",
            sort=False,
            suffixes=("_topics", "_content"),
        )
        .sort_values(axis=0, by="index")
    )
    merged = merged.drop(["content_ids", "topic_id"], axis=1)

    # Split into two parallel tables that share the same "id" column.
    corr_topics = merged[["index", "features_topics"]].rename(
        columns={"index": "id", "features_topics": "features"}
    )
    corr_content = merged[["index", "features_content"]].rename(
        columns={"index": "id", "features_content": "features"}
    )

    # NOTE(review): `merged.index` is the row labels of the final join, not the "index"
    # column exposed as "id" above; the two differ once the content join drops a pair.
    # Behaviour is kept as-is -- confirm which key the callers expect.
    index_to_topic = pd.Series(merged["id_topics"].values, index=merged.index).to_dict()
    index_to_content = pd.Series(merged["id_content"].values, index=merged.index).to_dict()

    return corr_topics, corr_content, index_to_topic, index_to_content