# Pre Processing Functions

These functions are responsible for generating the objects needed for data visualization.

In [1]:
def create_per_topic_tuple_df(topic_level_sentiment, df_topics):
    """
    Merge per-topic sentiment and per-topic relevancy into one dataframe.

    Args:
        topic_level_sentiment: Input handed to ``dictionary_to_data_frame``;
            described by the authors as a dictionary of articles holding
            the sentiment analysis value for each topic.
        df_topics (pandas.DataFrame): Articles (rows) by topics (columns),
            each cell being the relevancy of that topic for that article.

    Returns:
        pandas.DataFrame: One row per index label of the stacked frames.
        Each cell is a tuple of the values that label had in that column,
        in stacking order (sentiment rows first, relevancy rows second).
    """
    # Build the sentiment frame and flip it so that its layout matches
    # df_topics before the two are stacked.
    sentiment_by_topic = dictionary_to_data_frame(topic_level_sentiment).transpose()

    # Column names are converted with int() so they can line up with the
    # column labels of df_topics.
    sentiment_by_topic = sentiment_by_topic.rename(int, axis='columns')

    # Stack both frames, then collapse rows sharing an index label into tuples.
    stacked = pd.concat([sentiment_by_topic, df_topics])
    return stacked.groupby(level=0).agg(tuple)


In [2]:
def _base_site_name(url):
    """Return the host part of ``url``: leading scheme removed, cut at the first '/'."""
    # Only a scheme at the very start is stripped. Searching the whole string
    # would pick up URLs embedded in a query string (e.g. redirect targets).
    for scheme in ("https://", "http://"):
        if url.startswith(scheme):
            url = url[len(scheme):]
            break
    return url.split("/")[0]


def preprocess_dataframe_for_datavis(dataframe, lda_model, corpus):
    """
    Build the enriched article dataframe used by the visualization functions.

    Works on a deep copy of ``dataframe`` and adds these columns:

    * ``Topics`` - the result of ``lda_model.get_document_topics(corpus)``.
    * ``Main Topic`` / ``Main Topic Score`` - the topic with the highest
      score in ``Topics`` and that score. A row without any topic scoring
      above 0 gets ``-1`` and ``0``. On ties the first topic listed wins.
    * ``Associated Words`` - a one-element list holding the list of words
      returned by ``lda_model.show_topic`` for the main topic.
    * ``Shortened Address`` - the base site name parsed from ``URL``.

    Rows keep their original order; no sorting is applied here.

    Args:
        dataframe (pandas.DataFrame): Sentiment analysis dataframe. Must
            contain a ``URL`` column of strings.
        lda_model: Model exposing ``get_document_topics`` and ``show_topic``.
        corpus: Corpus passed unchanged to ``get_document_topics``.

    Returns:
        pandas.DataFrame: The processed copy; ``dataframe`` is not modified.
    """
    # Deep copy so the caller's dataframe is left untouched
    df = dataframe.copy(deep=True)

    df['Topics'] = lda_model.get_document_topics(corpus)

    # Find the most relevant topic for each article
    main_topics = []
    main_scores = []
    for doc_topics in df['Topics']:
        best_topic, best_score = -1, 0
        for topic, score in doc_topics:
            if score > best_score:
                best_topic, best_score = topic, score
        main_topics.append(best_topic)
        main_scores.append(best_score)

    df['Main Topic'] = main_topics
    df['Main Topic Score'] = main_scores

    # Look up each topic's words once; many articles share a main topic.
    words_by_topic = {}
    associated_words = []
    for topic_id in df['Main Topic']:
        if topic_id not in words_by_topic:
            words_by_topic[topic_id] = [
                word for word, _ in lda_model.show_topic(topic_id)
            ]
        # Each row gets its own copy, wrapped in an outer one-element list
        # (the shape the hover text has always displayed).
        associated_words.append([list(words_by_topic[topic_id])])
    df['Associated Words'] = associated_words

    # Shorten the urls to make them easier to read in hover text
    df['Shortened Address'] = [_base_site_name(url) for url in df['URL']]

    return df


In [3]:
def generate_topic_relevancy_dataframe(lda_model, corpus=None):
    """
    Build a dataframe of all articles (rows) by all topics (columns) where a
    cell is the relevancy of that topic for that article. So cell [2,3] with
    a value of 0.988 means article 2 has a 0.988 relevancy score for Topic 3.

    Args:
        lda_model: Model exposing
            ``get_document_topics(item, minimum_probability=...)`` that
            returns ``(topic_id, score)`` pairs for one document.
        corpus: Iterable of documents to score. When omitted, the
            module-level variable named ``corpus`` is used, which is what
            this function always relied on before the parameter existed.

    Returns:
        pandas.DataFrame: One row per document (in corpus order) and one
        column per topic id, in order of first appearance. A topic missing
        from a document's result gets a relevancy of 0.0. An empty corpus
        yields an empty dataframe.

    Raises:
        NameError: If ``corpus`` is not passed and no module-level
            ``corpus`` exists.
    """
    if corpus is None:
        try:
            corpus = globals()['corpus']
        except KeyError:
            raise NameError(
                "name 'corpus' is not defined; pass it as the second argument"
            ) from None

    # Gather all topics per document as a list of lists of tuples
    document_topics = [
        lda_model.get_document_topics(item, minimum_probability=0.0)
        for item in corpus
    ]

    # Column order: topic ids in order of first appearance across documents.
    topic_cols = list(dict.fromkeys(
        topic_id for doc in document_topics for topic_id, _ in doc
    ))

    # Build the frame in one go. Keying each row by topic id keeps scores in
    # the right column even if a document's result omits some topics
    # (NOTE(review): the model may drop near-zero topics despite
    # minimum_probability=0.0 - confirm against the library version in use).
    rows = [dict(doc) for doc in document_topics]
    df_topics = pd.DataFrame(rows, columns=topic_cols).fillna(0.0)

    return df_topics


# Data Visualization Functions

Below are the functions that generate our data visualizations.

In [4]:
def visualize_all_articles_on_main_topic(df, plot_type='scatter'):
    """
    Plot every article's sentiment score against its main topic and show it.

    Args:
        df (pandas.DataFrame): The pre-processed article dataframe. Needs the
            'Main Topic' and 'Sentiment Score' columns; the scatter variant
            also reads 'Main Topic Score', 'ID', 'Article Title',
            'Associated Words', 'Shortened Address' and 'Sentiment Label'.
        plot_type (str): 'scatter' (default) for one marker per article with
            hover text, or 'box' for one box per main topic.

    Raises:
        ValueError: If ``plot_type`` is neither 'scatter' nor 'box'.
    """
    # Order in which the main topics first appear in df (used by the box plot)
    topic_order = df['Main Topic'].unique()
    plot_title = "Articles Sorted By Main Topic"

    if plot_type == 'scatter':
        hover_columns = ['ID',
                         'Article Title',
                         'Associated Words',
                         'Shortened Address',
                         'Sentiment Label',
                         'Main Topic Score']
        hover_lines = ["ID: %{customdata[0]}",
                       "Article Title: %{customdata[1]}",
                       "Associated Words: %{customdata[2]}",
                       "Address: %{customdata[3]}",
                       "Sentiment Label: %{customdata[4]}",
                       "Main Topic Score : %{customdata[5]}"]
        # Marker size encodes how strongly the article belongs to its topic
        fig = px.scatter(df,
                         x="Main Topic",
                         y="Sentiment Score",
                         size="Main Topic Score",
                         custom_data=hover_columns,
                         title=plot_title)
        # Hover text shows exactly the columns listed in custom_data
        fig.update_traces(hovertemplate="<br>".join(hover_lines))
    elif plot_type == 'box':
        fig = px.box(df,
                     x="Main Topic",
                     y="Sentiment Score",
                     category_orders={"Main Topic": topic_order},
                     title=plot_title,
                     labels={'Main Topic': 'Main Topic',
                             'Sentiment Score': 'Sentiment Score'})
    else:
        raise ValueError("Invalid plot_type. Choose 'scatter' or 'box'.")

    # Both plot types use a y-axis that is symmetrical around zero
    largest_magnitude = df['Sentiment Score'].abs().max()
    fig.update_yaxes(range=[-largest_magnitude, largest_magnitude])

    # Show the plot
    fig.show()

In [5]:
def visualize_topic_cluster_tsne(lda_model, df):
    """
    Show a 2D t-SNE topic clustering graph of all articles.

    Each article's sparse ``Topics`` entry is expanded into a dense vector
    of topic weights, the vectors are embedded in 2D with t-SNE, and the
    result is drawn as a plotly scatter plot coloured by main topic and
    sized by main topic score.

    Args:
        lda_model: LDA model; only ``num_topics`` is read.
        df (pandas.DataFrame): Pre-processed article dataframe with the
            'Topics', 'Main Topic', 'Main Topic Score', 'ID',
            'Article Title', 'Associated Words', 'Shortened Address' and
            'Sentiment Label' columns. It is not modified; all plotting
            work happens on a copy.
    """
    num_topics = lda_model.num_topics

    # One dense row of topic weights per article; topics absent from an
    # article's 'Topics' entry keep a weight of 0.
    topic_weights = []
    for doc_topics in df["Topics"]:
        weights = [0.0] * num_topics
        for entry in doc_topics:
            weights[entry[0]] = entry[1]
        topic_weights.append(weights)

    # Array of topic weights
    arr = pd.DataFrame(topic_weights).fillna(0).to_numpy(dtype=float)

    # tSNE Model Creation
    tsne_model = TSNE(n_components=2, verbose=1,
                      random_state=0, angle=.99,
                      init='pca', perplexity=(arr.shape[0] - 1) / 3)

    tsne_lda = tsne_model.fit_transform(arr)

    # Work on a copy and attach the coordinates BEFORE sorting. The
    # embedding rows follow the original row order of df, so sorting the
    # frame alone would pair every point with another article's hover
    # text, colour and size.
    plot_df = df.copy()
    plot_df['_tsne_x'] = tsne_lda[:, 0]
    plot_df['_tsne_y'] = tsne_lda[:, 1]

    # Sort numerically by main topic to make the legend pretty, then turn
    # the topic into a string so plotly treats it as categorical data.
    plot_df['Main Topic'] = plot_df['Main Topic'].apply(int)
    plot_df = plot_df.sort_values(by=['Main Topic'], ascending=True)
    plot_df['Main Topic'] = plot_df['Main Topic'].apply(str)

    # Creating the cluster graph in plotly. Coordinates are passed as plain
    # arrays (now in the sorted row order) so the axes keep their generic
    # "x" / "y" titles.
    fig_cluster = px.scatter(plot_df,
                             x=plot_df['_tsne_x'].to_numpy(),
                             y=plot_df['_tsne_y'].to_numpy(),
                             custom_data=['ID',
                                          'Article Title',
                                          'Associated Words',
                                          'Shortened Address',
                                          'Sentiment Label',
                                          'Main Topic Score'],
                             color="Main Topic",
                             size="Main Topic Score",
                             title="Topic Clustering Graph",
                             labels=dict(color="Main Topic"))

    # Set the hover text to show what was in custom_data
    fig_cluster.update_traces(hovertemplate=\
                              "<br>".join(["ID: %{customdata[0]}",
                                           "Article Title: %{customdata[1]}",
                                           "Associated Words: %{customdata[2]}",
                                           "Address: %{customdata[3]}",
                                           "Sentiment Label: %{customdata[4]}",
                                           "Main Topic Score: %{customdata[5]}"]))

    fig_cluster.show()

In [None]:
def visualize_topic_cluster_tsne_3d(lda_model, df):
    """
    Show a 3D topic clustering graph: t-SNE topic space plus sentiment.

    Each article's sparse ``Topics`` entry is expanded into a dense vector
    of topic weights and embedded in 2D with t-SNE. The x-axis and y-axis
    are that embedding (the topic space / article clustering) and the
    z-axis is the article's sentiment score. Points are coloured by main
    topic and sized by main topic score.

    Args:
        lda_model: LDA model; only ``num_topics`` is read.
        df (pandas.DataFrame): Pre-processed article dataframe with the
            'Topics', 'Main Topic', 'Main Topic Score', 'Sentiment Score',
            'ID', 'Article Title', 'Associated Words', 'Shortened Address'
            and 'Sentiment Label' columns. It is not modified; all plotting
            work happens on a copy.
    """
    num_topics = lda_model.num_topics

    # One dense row of topic weights per article; topics absent from an
    # article's 'Topics' entry keep a weight of 0.
    topic_weights = []
    for doc_topics in df["Topics"]:
        weights = [0.0] * num_topics
        for entry in doc_topics:
            weights[entry[0]] = entry[1]
        topic_weights.append(weights)

    # Array of topic weights
    arr = pd.DataFrame(topic_weights).fillna(0).to_numpy(dtype=float)

    # tSNE Model Creation
    tsne_model = TSNE(n_components=2, verbose=1,
                      random_state=0, angle=.99,
                      init='pca', perplexity=(arr.shape[0] - 1) / 3)

    tsne_lda = tsne_model.fit_transform(arr)

    # Work on a copy and attach the coordinates BEFORE sorting. The
    # embedding rows follow the original row order of df, so sorting the
    # frame alone would pair every x/y position with another article's
    # sentiment (z), hover text, colour and size.
    plot_df = df.copy()
    plot_df['_tsne_x'] = tsne_lda[:, 0]
    plot_df['_tsne_y'] = tsne_lda[:, 1]

    # Sort numerically by main topic to make the legend pretty, then turn
    # the topic into a string so plotly treats it as categorical data.
    plot_df['Main Topic'] = plot_df['Main Topic'].apply(int)
    plot_df = plot_df.sort_values(by=['Main Topic'], ascending=True)
    plot_df['Main Topic'] = plot_df['Main Topic'].apply(str)

    # Creating the cluster graph in plotly. x and y are passed as plain
    # arrays (now in the sorted row order) so the axes keep their generic
    # "x" / "y" titles.
    fig_cluster = px.scatter_3d(plot_df,
                                x=plot_df['_tsne_x'].to_numpy(),
                                y=plot_df['_tsne_y'].to_numpy(),
                                z="Sentiment Score",
                                custom_data=['ID',
                                             'Article Title',
                                             'Associated Words',
                                             'Shortened Address',
                                             'Sentiment Label',
                                             'Main Topic'],
                                color="Main Topic",
                                size="Main Topic Score",
                                title="Topic Clustering Graph",
                                labels=dict(color="Main Topic"))

    # Set the hover text to show what was in custom_data
    fig_cluster.update_traces(hovertemplate=\
                              "<br>".join(["ID: %{customdata[0]}",
                                           "Article Title: %{customdata[1]}",
                                           "Associated Words: %{customdata[2]}",
                                           "Address: %{customdata[3]}",
                                           "Sentiment Label: %{customdata[4]}",
                                           "Main Topic: %{customdata[5]}"]))

    fig_cluster.show()


In [6]:
def visualize_single_topic_subjectivity_vs_sentiment(df, topic_num):
    """
    Show a 2D scatter plot of subjectivity vs. sentiment for one topic.

    Only the articles whose 'Main Topic' equals ``topic_num`` are plotted:
    subjectivity score on the x-axis (higher is more subjective), sentiment
    score on the y-axis, marker size given by the main topic score.

    Args:
        df (pandas.DataFrame): The main (pre-processed) dataframe.
        topic_num (int): The topic number to visualize.
    """
    # Keep only the rows that belong to the requested topic
    articles_in_topic = df.loc[df['Main Topic'] == topic_num]

    hover_columns = ['ID',
                     'Article Title',
                     'Associated Words',
                     'Shortened Address',
                     'Sentiment Score',
                     'Subjectivity Score',
                     'Main Topic']
    hover_lines = ["ID: %{customdata[0]}",
                   "Article Title: %{customdata[1]}",
                   "Associated Words: %{customdata[2]}",
                   "Address: %{customdata[3]}",
                   "Sentiment Score: %{customdata[4]}",
                   "Subjectivity Score: %{customdata[5]}",
                   "Main Topic: %{customdata[6]}"]

    fig = px.scatter(articles_in_topic,
                     x="Subjectivity Score",
                     y="Sentiment Score",
                     size="Main Topic Score",
                     hover_name="Shortened Address",
                     title=f"Sentiment Analysis on Topic {topic_num}",
                     custom_data=hover_columns)

    # Hover text shows exactly the columns listed in custom_data
    fig.update_traces(hovertemplate="<br>".join(hover_lines))

    fig.show()


In [None]:
def visualize_topic_word_cloud(lda_model, topic_id):
    """
    Show a word cloud of the top 10 words of one topic.

    The words and their weights come from
    ``lda_model.show_topic(topic_id, topn=10)``; a word's size in the cloud
    follows its weight for the topic.

    Args:
        lda_model: The LDA model providing ``show_topic``.
        topic_id: The topic to visualize.
    """
    # Word cloud generator: white canvas, at most 10 horizontal words
    word_cloud = WordCloud(background_color='white',
                           width=2500,
                           height=1800,
                           max_words=10,
                           colormap='tab10',
                           prefer_horizontal=1.0)

    # A single square plot holds the cloud
    _, axis = plt.subplots(figsize=(6, 6))

    # Word -> weight mapping for the requested topic
    word_weights = dict(lda_model.show_topic(topic_id, topn=10))
    word_cloud.generate_from_frequencies(word_weights, max_font_size=300)

    axis.imshow(word_cloud)
    axis.set_title(f"Topic {topic_id}", fontdict={'size': 16})
    axis.axis('off')

    plt.tight_layout()
    plt.show()


# K-means clustering and helper functions

In [7]:
def visualize_optimal_cluster_count(df_topics, max_clusters, pca_components):
    """
    Plot the k-means inertia (WCSS) for a range of cluster counts.

    The topic relevancy data is first reduced with PCA, then a k-means model
    is fitted for every cluster count from 2 up to, but not including,
    ``max_clusters``, and the inertia of each model is plotted.

    Args:
        df_topics: The topic relevancy dataframe.
        max_clusters (int): Exclusive upper bound of the cluster counts tried.
        pca_components: Number of PCA components to reduce the data to.
    """
    # Apply principal component analysis with the requested component count
    reduced = PCA(pca_components).fit_transform(df_topics)

    cluster_counts = range(2, max_clusters)
    inertias = []
    for cluster_count in cluster_counts:
        fitted = KMeans(n_clusters=cluster_count,
                        init="k-means++",
                        n_init=10).fit(reduced)
        inertias.append(fitted.inertia_)

    # Plot inertia for the different number of clusters
    plt.figure(figsize=(10, 10))
    plt.plot(cluster_counts, inertias)
    plt.xlabel('Number of clusters')
    plt.ylabel('WCSS')
    plt.show()


In [8]:
def visualize_kmeans_clustering(dataframe, df_topics, num_clusters, pca_components):
    """
    Show a 2D k-means clustering of the articles, with cluster centers.

    The topic relevancy data is reduced with PCA, clustered with k-means,
    and the first two PCA components of every article are plotted, coloured
    by cluster. The clustering is based solely on the topic relevancy of
    each article. The center of each cluster is drawn with an 'X'.

    Args:
        dataframe (pandas.DataFrame): Main article dataframe; needs the 'ID',
            'Article Title', 'Associated Words', 'Shortened Address',
            'Sentiment Label' and 'Main Topic' columns. Not modified.
        df_topics: Topic relevancy dataframe. Its row *positions* are matched
            to the index *labels* of ``dataframe`` when the results are
            joined (assumes ``dataframe`` carries the default 0..n-1 index
            in the same article order - TODO confirm with callers).
        num_clusters (int): Number of k-means clusters.
        pca_components: Number of PCA components; at least 2 are needed
            because components 0 and 1 are plotted.
    """
    # Make deep copy of our dataframe to add new temp variables
    main_df = dataframe.copy(deep=True)
    main_df = main_df.sort_values('ID')

    # Dimension reduction to [pca_components] variables
    pca = PCA(pca_components)
    data = pca.fit_transform(df_topics)

    # Creating KMeans model
    model = KMeans(n_clusters=num_clusters, init="k-means++", n_init=10)
    cluster_ids = model.fit_predict(data)
    centers = model.cluster_centers_

    # Clustering results, one row per row of df_topics
    data_df = pd.DataFrame({'x': data[:, 0],
                            'y': data[:, 1],
                            'Cluster': cluster_ids})

    # Add clustering data to our main dataframe (index-aligned join)
    main_df = main_df.join(data_df[['x', 'y', 'Cluster']])

    # Sort by cluster to make the legend pretty
    main_df = main_df.sort_values(by=['Cluster'], ascending=True)

    # Strings make plotly treat the cluster as categorical data (legend
    # entries instead of a continuous colour bar)
    main_df['Cluster'] = main_df['Cluster'].apply(str)

    # Generate our scatter plot
    fig = px.scatter(main_df, x='x',
                     y='y',
                     color='Cluster',
                     labels=dict(color="Cluster"),
                     custom_data=['ID',
                                  'Article Title',
                                  'Associated Words',
                                  'Shortened Address',
                                  'Sentiment Label',
                                  'Main Topic'],)

    # Add hover text
    fig.update_traces(hovertemplate=\
                              "<br>".join(["ID: %{customdata[0]}",
                                           "Article Title: %{customdata[1]}",
                                           "Associated Words: %{customdata[2]}",
                                           "Address: %{customdata[3]}",
                                           "Sentiment Label: %{customdata[4]}",
                                           "Main Topic: %{customdata[5]}"]))

    # Mark the cluster centers with an 'X'. Added after update_traces so the
    # article hover template is not applied to the centers.
    fig.add_scatter(x=centers[:, 0],
                    y=centers[:, 1],
                    mode='markers',
                    marker=dict(symbol='x', size=12, color='black'),
                    name='Cluster Centers',
                    hovertemplate="Cluster center: (%{x}, %{y})<extra></extra>")

    fig.show()


In [None]:
def visualize_kmeans_clustering_3d(dataframe, df_topics, num_clusters, pca_components):
    """
    Show a 3D scatter plot: k-means topic clusters plus sentiment.

    The topic relevancy data is reduced with PCA and clustered with k-means.
    The first two PCA components of every article form the x-axis and
    y-axis, the article's sentiment score is the z-axis, and each article is
    coloured by its cluster. The clustering is based solely on the topic
    relevancy of each article.

    Args:
        dataframe (pandas.DataFrame): Main article dataframe; needs the 'ID',
            'Sentiment Score', 'Article Title', 'Associated Words',
            'Shortened Address', 'Sentiment Label' and 'Main Topic' columns.
            Not modified.
        df_topics: Topic relevancy dataframe. Its row *positions* are matched
            to the index *labels* of ``dataframe`` when the results are
            joined (assumes ``dataframe`` carries the default 0..n-1 index
            in the same article order - TODO confirm with callers).
        num_clusters (int): Number of k-means clusters.
        pca_components: Number of PCA components; at least 2 are needed
            because components 0 and 1 are plotted.
    """
    # Make deep copy of our dataframe to add new temp variables
    main_df = dataframe.copy(deep=True)
    main_df = main_df.sort_values('ID')

    # Dimension reduction to [pca_components] variables
    pca = PCA(pca_components)
    data = pca.fit_transform(df_topics)

    # Creating KMeans model and assigning every row to a cluster
    model = KMeans(n_clusters=num_clusters, init="k-means++", n_init=10)
    cluster_ids = model.fit_predict(data)

    # Clustering results, one row per row of df_topics
    data_df = pd.DataFrame({'x': data[:, 0],
                            'y': data[:, 1],
                            'Cluster': cluster_ids})

    # Add clustering data to our main dataframe (index-aligned join)
    main_df = main_df.join(data_df[['x', 'y', 'Cluster']])

    # Sort by cluster to make the legend pretty
    main_df = main_df.sort_values(by=['Cluster'], ascending=True)

    # Strings make plotly treat the cluster as categorical data (legend
    # entries instead of a continuous colour bar)
    main_df['Cluster'] = main_df['Cluster'].apply(str)

    # Generate our scatter plot
    fig = px.scatter_3d(main_df, x='x',
                        y='y',
                        z='Sentiment Score',
                        color='Cluster',
                        labels={"Cluster": "Cluster"},
                        custom_data=['ID',
                                     'Article Title',
                                     'Associated Words',
                                     'Shortened Address',
                                     'Sentiment Label',
                                     'Main Topic'],)

    # Add hover text
    fig.update_traces(hovertemplate=\
                              "<br>".join(["ID: %{customdata[0]}",
                                           "Article Title: %{customdata[1]}",
                                           "Associated Words: %{customdata[2]}",
                                           "Address: %{customdata[3]}",
                                           "Sentiment Label: %{customdata[4]}",
                                           "Main Topic: %{customdata[5]}"]))

    fig.show()


# Sentiment X Topic Relevance

In [None]:
def multiply_tuples_in_dataframe(input_df):
    """
    Turn a DataFrame of pairs into a DataFrame of their products.

    Args:
        input_df (pd.DataFrame): DataFrame whose cells are tuples (or any
            other object indexable at positions 0 and 1).

    Returns:
        pd.DataFrame: A new DataFrame with the same index and columns as
        ``input_df`` in which every cell is ``cell[0] * cell[1]``.
    """
    def _product(pair):
        return pair[0] * pair[1]

    # Multiply column by column, then assemble the result with the original
    # row and column labels.
    products = {
        name: input_df[name].apply(_product) for name in input_df.columns
    }
    return pd.DataFrame(products,
                        index=input_df.index,
                        columns=input_df.columns)


In [None]:
def plot_sentiment_relevance(data):
    """
    Show a scatter plot of sentiment values against relevance values.

    Parameters:
    data (list of tuples): One ``(sentiment, relevance)`` pair per data point.
        For example: [(0.5, 0.8), (0.3, 0.6), (-0.2, 0.4)]

    Returns:
    None

    Sentiment is drawn on the x-axis and relevance on the y-axis, with axis
    labels and a title.

    Example Usage:
    >>> data = [(0.5, 0.8), (0.3, 0.6), (-0.2, 0.4)]
    >>> plot_sentiment_relevance(data)
    """
    # Split the pairs into one sequence per axis
    x_values, y_values = zip(*data)

    plt.figure(figsize=(10, 6))
    plt.scatter(x_values, y_values, c='blue', marker='o', alpha=0.5)

    # Axis labels and title
    plt.xlabel('Sentiment')
    plt.ylabel('Relevance')
    plt.title('Sentiment vs Relevance')

    plt.show()


In [None]:
def dimension_reduction(doc_topic_matrix):
    """
    Reduce a document-topic matrix to a 2-dimensional topic space with PCA.

    Parameters:
    doc_topic_matrix (numpy.ndarray): A matrix where each column represents a
    topic and each row represents a document.

    Returns:
    numpy.ndarray: The PCA projection of the transposed matrix, i.e. one row
    of 2 coordinates per topic.

    The matrix is transposed before fitting, so the topics (not the
    documents) are the samples that get placed in the reduced space.

    Example Usage:
    >>> reduced_topic_space = dimension_reduction(doc_topic_matrix)
    """
    # The topic space always has two dimensions
    reducer = PCA(n_components=2)
    return reducer.fit_transform(doc_topic_matrix.T)

def plot_pca(topic_space):
    """
    Show a labelled scatter plot of the points of a 2D topic space.

    Parameters:
    topic_space (numpy.ndarray): A reduced-dimensional topic space
    representation; each entry is one ``(x, y)`` point.

    Returns:
    None

    Every point is annotated with its position in ``topic_space``
    ("0", "1", ...), drawn slightly above the marker.

    Example Usage:
    >>> plot_pca(reduced_topic_space)
    """
    point_count = len(topic_space)

    # Scatter plot of all points
    plt.figure(figsize=(8, 6))
    plt.scatter(*zip(*topic_space), marker='o', s=100, c='b')

    # Label each point with its position in topic_space
    for position, (x, y) in enumerate(topic_space):
        if position >= point_count:
            break
        plt.annotate(str(position),
                     (x, y),
                     textcoords="offset points",
                     xytext=(0, 10),
                     ha='center')

    plt.title("Intertopic Distance Map")
    plt.xlabel("X-axis")
    plt.ylabel("Y-axis")

    plt.show()


In [4]:
def plot_documents(doc_topic_matrix, topic_space, df_main):
    """
    Show documents in the reduced topic space, coloured by sentiment score.

    Parameters:
    doc_topic_matrix (numpy.ndarray or pandas.DataFrame): A matrix where each
    row represents a document and each column represents a topic; a cell is
    the relevance of that topic for that document. Values are read by
    position.
    topic_space (numpy.ndarray): A reduced-dimensional topic space
    representation; row ``i`` holds the (x, y) coordinates of topic ``i``.
    df_main (pandas.DataFrame): A DataFrame containing document information,
    including a "Sentiment Score" column with one value per document.

    Returns:
    None

    Each document is placed at the relevance-weighted sum of the topic
    coordinates. Its colour comes from a red-white-green colormap after
    mapping sentiment scores from [-0.2, 0.2] onto [0, 1]. The topics
    themselves are drawn as small black markers.

    Example Usage:
    >>> plot_documents(doc_topic_matrix, reduced_topic_space, df_main)
    """
    # Local import keeps this block self-contained.
    import numpy as np

    # Positional (documents x topics) view of the input. Indexing the raw
    # input as [topic_id][doc_id] only worked for DataFrames with 0..n-1
    # labels; for the documented ndarray layout it read the transpose.
    weights = np.asarray(doc_topic_matrix, dtype=float)
    topics = np.asarray(topic_space, dtype=float)

    # Document coordinates: sum over topics of (topic coordinate * relevance)
    document_space = weights @ topics[:, :2]

    colormap = mcolors.LinearSegmentedColormap.from_list('rg', ["r", "w", "g"], N=256)
    colors = []
    for sentiment in df_main["Sentiment Score"]:
        # Normalize sentiment score to the range [0, 1]
        normalized_sentiment = (sentiment + 0.2) / 0.4  # Map -0.2 to 0.2 to [0, 1]
        colors.append(colormap(normalized_sentiment, alpha=1.0))

    # Create a scatter plot of the documents
    plt.scatter(document_space[:, 0], document_space[:, 1],
                marker='o', label='Documents', c=colors)
    plt.xlabel('X-coordinate')
    plt.ylabel('Y-coordinate')
    plt.title('Scatter Plot of Data Points')

    # Plot topics
    plt.scatter(topics[:, 0], topics[:, 1],
                color='black', marker='o', s=5, label='Topics')

    # Display the plot
    plt.legend()
    plt.grid(True)
    plt.show()


In [None]:
def plot_topic_documents(df, topic_space):
    """
    Show one point per (document, topic) pair in the reduced topic space.

    Parameters:
    df (pandas.DataFrame): Documents (rows) by topics (columns); every cell is
    a pair. Element 0 of the pair is used as the topic relevance and element
    1 as the sentiment score.
    NOTE(review): create_per_topic_tuple_df in this file documents its cells
    as (sentiment, relevancy) - confirm which order callers pass here.
    topic_space (numpy.ndarray): A reduced-dimensional topic space
    representation; entry ``i`` holds the (x, y) coordinates of topic ``i``.

    Returns:
    None

    A point is drawn for every document/topic combination whose relevance is
    greater than 0.1. It is placed at the topic's coordinates scaled by that
    relevance, so more relevant pairs lie further from the origin. The colour
    is taken from a red-white-green colormap using the sentiment value as is
    (NOTE(review): no normalization is applied before the colormap lookup -
    confirm the expected value range).

    Example Usage:
    >>> plot_topic_documents(df, reduced_topic_space)
    """
    def _first(cell):
        return cell[0]

    def _second(cell):
        return cell[1]

    red_white_green = mcolors.LinearSegmentedColormap.from_list(
        'rg', ["r", "w", "g"], N=256)

    # Split the pair cells into two frames: element 0 and element 1
    doc_topic_matrix = pd.DataFrame()
    sentiment_matrix = pd.DataFrame()
    for name in df.columns:
        cells = df[name]
        doc_topic_matrix[name] = cells.apply(_first)
        sentiment_matrix[name] = cells.apply(_second)

    # Collect position and colour of every sufficiently relevant pair
    n_docs, n_topics = doc_topic_matrix.shape
    points = []
    point_colors = []
    for doc_id in range(n_docs):
        for topic_id in range(n_topics):
            relevance = doc_topic_matrix[topic_id][doc_id]
            if not relevance > 0.1:
                continue
            # Scale the topic's coordinates by the relevance of the topic
            topic_x = topic_space[topic_id][0]
            topic_y = topic_space[topic_id][1]
            points.append((topic_x * relevance, topic_y * relevance))
            # Colour of the point follows the sentiment
            point_colors.append(
                red_white_green(sentiment_matrix[topic_id][doc_id]))

    x_coords, y_coords = zip(*points)

    # Create a scatter plot
    plt.scatter(x_coords, y_coords, marker='o', label='Data Points',
                c=point_colors)
    plt.xlabel('X-coordinate')
    plt.ylabel('Y-coordinate')
    plt.title('Scatter Plot of Data Points')

    # Display the plot
    plt.legend()
    plt.grid(True)
    plt.show()
    

In [None]:
def plot_topic_heatmap(doc_topic_matrix, topic_space):
    """
    Create a heatmap of document density in a reduced-dimensional topic space,
    with the topics themselves overlaid as black points.

    Parameters:
    doc_topic_matrix: Topic relevance per document. Its ``.shape`` is read as 
    (n_docs, n_topics) and it is indexed as ``doc_topic_matrix[topic_id][doc_id]``
    (topic first, then document) -- presumably a pandas.DataFrame with integer 
    topic columns and a 0..n_docs-1 index; confirm against the caller.
    topic_space (numpy.ndarray): A reduced-dimensional topic space representation.
    Each entry must be an (x, y) pair.

    Returns:
    None

    Raises:
    ValueError: If doc_topic_matrix contains no documents.

    Each document is placed at the relevance-weighted sum of the topic 
    coordinates. The documents are then counted into a grid that is symmetric
    around the origin (2 cells per coordinate unit), and the counts are drawn
    as a white-to-red heatmap.

    Example Usage:
    >>> plot_topic_heatmap(doc_topic_matrix, reduced_topic_space)
    """
    # Calculate the coordinates for each document
    n_docs, n_topics = doc_topic_matrix.shape
    x_coords = []
    y_coords = []

    for doc_id in range(n_docs):
        x = 0
        y = 0

        for topic_id in range(n_topics):
            # Multiply the coordinates of the topic by the relevance of the topic
            relevance = doc_topic_matrix[topic_id][doc_id]
            x += topic_space[topic_id][0] * relevance
            y += topic_space[topic_id][1] * relevance

        x_coords.append(x)
        y_coords.append(y)

    if not x_coords:
        raise ValueError("doc_topic_matrix contains no documents to plot")

    # Create a 2D grid for the heatmap. The grid is symmetric around the origin,
    # so size it by the largest magnitude on each axis. Using only the largest
    # value would let strongly negative coordinates produce negative indices,
    # which wrap around to the opposite edge of the grid or raise IndexError.
    x_max = max(abs(x) for x in x_coords)
    y_max = max(abs(y) for y in y_coords)
    x_min, y_min = -x_max, -y_max

    resolution = 2
    x_range = int((x_max - x_min) * resolution) + 1
    y_range = int((y_max - y_min) * resolution) + 1

    heatmap = [[0] * (x_range + 1) for _ in range(y_range + 1)]

    for x, y in zip(x_coords, y_coords):
        # Shift by half the grid size so that the origin lands in the middle
        x_idx = int((x * resolution) + x_range/2)
        y_idx = int((y * resolution) + y_range/2)
        heatmap[y_idx][x_idx] += 1

    plt.figure(figsize=(10, 8))
    colors = ["white", "red"]
    colormap = mcolors.LinearSegmentedColormap.from_list("custom", colors, N=256)
    plt.imshow(heatmap, cmap=colormap, extent=[x_min, x_max, y_min, y_max], origin='lower')
    plt.colorbar(label='Topic_Density')

    # Plot topics
    x_coords_topics = [x for x, _ in topic_space]
    y_coords_topics = [y for _, y in topic_space]
    plt.scatter(x_coords_topics,
                y_coords_topics, 
                color='black', 
                marker='o', 
                s=10, 
                label='Data Points')


    plt.xlabel('X-coordinate')
    plt.ylabel('Y-coordinate')
    plt.title('Topic Heatmap')

    # Display the plot
    plt.show()


In [None]:
def plot_sentiment_heatmap(doc_topic_matrix, topic_space, df_main):
    """
    Create a heatmap of average sentiment scores in a reduced-dimensional topic 
    space, with the topics themselves overlaid as black points.

    Parameters:
    doc_topic_matrix: Topic relevance per document. Its ``.shape`` is read as 
    (n_docs, n_topics) and it is indexed as ``doc_topic_matrix[topic_id][doc_id]``
    (topic first, then document) -- presumably a pandas.DataFrame with integer 
    topic columns and a 0..n_docs-1 index; confirm against the caller.
    topic_space (numpy.ndarray): A reduced-dimensional topic space representation.
    Each entry must be an (x, y) pair.
    df_main (pandas.DataFrame): A DataFrame containing document information. Its 
    "Sentiment Score" column is paired with the documents in iteration order.

    Returns:
    None

    Raises:
    ValueError: If doc_topic_matrix contains no documents.

    Each document is placed at the relevance-weighted sum of the topic 
    coordinates. The sentiment scores are then accumulated into a grid that is
    symmetric around the origin (2 cells per coordinate unit) and divided by the
    number of documents per cell to get an average. Cells without documents stay 
    at 0. The colour scale is symmetric around 0 (red negative, white neutral,
    green positive).

    Example Usage:
    >>> plot_sentiment_heatmap(doc_topic_matrix, reduced_topic_space, df_main)
    """
    # Calculate the coordinates for each document
    n_docs, n_topics = doc_topic_matrix.shape
    x_coords = []
    y_coords = []
    sentiment_scores = df_main["Sentiment Score"]

    for doc_id in range(n_docs):
        x = 0
        y = 0

        for topic_id in range(n_topics):
            # Multiply the coordinates of the topic by the relevance of the topic
            relevance = doc_topic_matrix[topic_id][doc_id]
            x += topic_space[topic_id][0] * relevance
            y += topic_space[topic_id][1] * relevance

        x_coords.append(x)
        y_coords.append(y)

    if not x_coords:
        raise ValueError("doc_topic_matrix contains no documents to plot")

    # Create a 2D grid for the heatmap. The grid is symmetric around the origin,
    # so size it by the largest magnitude on each axis. Using only the largest
    # value would let strongly negative coordinates produce negative indices,
    # which wrap around to the opposite edge of the grid or raise IndexError.
    x_max = max(abs(x) for x in x_coords)
    y_max = max(abs(y) for y in y_coords)
    x_min, y_min = -x_max, -y_max

    resolution = 2
    x_range = int((x_max - x_min) * resolution) + 1
    y_range = int((y_max - y_min) * resolution) + 1

    heatmap = [[0] * (x_range + 1) for _ in range(y_range + 1)]
    # Number of documents that fell into each cell, used for averaging
    doc_counts = [[0] * (x_range + 1) for _ in range(y_range + 1)]

    for x, y, sentiment in zip(x_coords, y_coords, sentiment_scores):
        # Shift by half the grid size so that the origin lands in the middle
        x_idx = int((x * resolution) + x_range/2)
        y_idx = int((y * resolution) + y_range/2)
        heatmap[y_idx][x_idx] += sentiment
        doc_counts[y_idx][x_idx] += 1

    # Normalize the values to get an average sentiment
    for sentiment_row, count_row in zip(heatmap, doc_counts):
        for x_idx, count in enumerate(count_row):
            if count != 0:
                sentiment_row[x_idx] /= count

    # Specify the colors for -1 (red), 0 (white), and 1 (green)
    colors = ["red", "white", "green"]
    colormap = mcolors.LinearSegmentedColormap.from_list("custom", colors, N=256)

    # Symmetric colour limits keep a sentiment of 0 on the white midpoint
    sentiment_max = max(abs(min(sentiment_scores)), max(sentiment_scores))

    # Create a heatmap using the custom colormap
    plt.imshow(heatmap, cmap=colormap, extent=[x_min, x_max, y_min, y_max], vmin=-sentiment_max, vmax=sentiment_max, origin='lower', aspect='auto')
    plt.colorbar(label='Sentiment Score')

    # Plot topics
    x_coords_topics = [x for x, _ in topic_space]
    y_coords_topics = [y for _, y in topic_space]
    plt.scatter(x_coords_topics, y_coords_topics, color='black', marker='o', s=10, label='Data Points')

    plt.xlabel('X-coordinate')
    plt.ylabel('Y-coordinate')
    plt.title('Sentiment Heatmap')

    # Display the plot
    plt.show()