In [None]:
%load_ext autoreload
%autoreload 2


import sys
import os

# Make the project's `src` directory importable. The relative path is resolved
# against the current working directory, so this assumes the notebook is run
# from a folder that sits next to `src` (e.g. `notebooks/`).
sys.path.append(os.path.abspath('../src'))

import meneame as mn
import s3_create_attitudes as s3

import s4_figures_paper as s4

import pandas as pd

import networkx as nx
import numpy as np
import pylab as plt
import seaborn as sns


from sklearn.preprocessing import StandardScaler
import textalloc as ta

# Silence pandas' SettingWithCopyWarning for the whole session: later cells add
# columns to frames that were produced by boolean filtering.
pd.options.mode.chained_assignment = None

In [None]:
# Set up paths and topics to study.
# `../dirs.txt` holds one folder per line: line 1 is the base folder (`path`),
# line 2 is the figures folder (`path_figures`).
with open('../dirs.txt', 'r') as f:
    lines = [line.strip() for line in f]

# Fail with an explicit message instead of a bare IndexError further down.
if len(lines) < 2:
    raise ValueError(
        "'../dirs.txt' must contain at least two lines: "
        "the base folder and the figures folder"
    )

path = lines[0]

# Define the data path. os.path.join only inserts a separator when `path` does
# not already end with one, and the trailing '' keeps the final separator so
# that `path_data` can still be used as a prefix.
path_data = os.path.join(path, 'data_snapshot', '')
path_save_embeddings = "../data/embeddings/"
path_figures = lines[1]


topics = ["Politics", "Russia",  "Public services", "Crime", "Crypto/net", "Inflation"]

In [None]:
## Read media ideology (bias) scores from the Political Watch source list
df = pd.read_csv(
    "../ideology_twitter/media_ideology_politicalwatch/media_general_topics/Filtered_Sources.csv"
).loc[:, ["domain", "bias"]]
# Keep only the second dot-separated token of each domain
# (presumably the site name in values such as "www.name.tld" -- confirm against the CSV).
df["domain"] = df["domain"].str.split(".").str[1]
# Lookup table: shortened domain -> bias
ideology_media_watch = df.set_index("domain")["bias"].to_dict()

## Read the user-story vote data (bipartite=True) and keep votes cast
## between 2022-12-01 and 2023-08-01 (both bounds inclusive).
# Disabled alternative kept from the original call: , min_time="2022-12-01", max_time="2023-08-01")
data_st = mn.read_comments_topics(path_data, bipartite=True).loc[
    lambda votes: (votes["story_vote_time"] >= "2022-12-01")
    & (votes["story_vote_time"] <= "2023-08-01")
]

# Rows and distinct stories, plus distinct voters, left after the date filter
print(data_st.agg({"story_id": ["count", "nunique"], "username_vote": "nunique"}))

## Read the user-user (comment vote) data, merged with topics, over the same window
# Disabled restriction to the stories present in data_st:
#data_com = data_com.loc[data_com["story_id"].isin(set(data_st["story_id"]))]
data_com = mn.read_comments_topics(path_data).loc[
    lambda votes: (votes["comment_vote_time"] >= "2022-12-01")
    & (votes["comment_vote_time"] <= "2023-08-01")
]

## Descriptive statistics of the user-story data, one row per topic:
## distinct stories, total votes, upvotes (vote > 0) and downvotes (vote < 0).
data_stats = (data_st
           .groupby("final_topic")
           .agg({"story_id": ["nunique", "count"],
                 "story_vote": [lambda x: np.sum(x > 0), lambda x: np.sum(x < 0)]})
           .sort_values(by=("story_id", "nunique"), ascending=False)
           .reset_index()
)
# Flatten the two-level header produced by .agg into readable table headers
# (the stray colon in "Number votes:" has been removed).
data_stats.columns = ["Topic", "Number stories", "Number votes", "Number upvotes", "Number downvotes"]


def _format_count(value):
    """Render numeric cells with thousands separators; leave other cells untouched."""
    # NumPy scalar types are listed explicitly so that counts are formatted
    # even when pandas hands them over without converting to Python numbers.
    if isinstance(value, (int, float, np.integer, np.floating)):
        return f"{value:,.0f}"
    return value


# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# fall back to applymap only on older pandas versions.
if hasattr(pd.DataFrame, "map"):
    data_stats = data_stats.map(_format_count)
else:
    data_stats = data_stats.applymap(_format_count)

# Print the table as LaTeX with a descriptive caption and label
print(data_stats.to_latex(index=False, caption="Statistics of the stories in the dataset", label="tab:data_stats",
                    column_format="lcccc")
)

# Sets used as filters later on: every story domain, and every user that
# either posted a comment or voted on one.
domains = set(data_st["story_original_url_domain"].unique())
domains_users = set(data_com["username_post"].unique()).union(
    data_com["username_vote"].unique()
)

# Node colours = average vote cast by each user
# (+1 = always positive, -1 = always negative).
nodes_st_color = data_st.groupby("username_vote")["story_vote"].mean()

# For comment votes, negative averages are cut off at 0 since there are few negative votes.
nodes_com_color = data_com.groupby("username_vote")["comment_vote"].mean()
nodes_com_color.loc[nodes_com_color < 0] = 0
nodes_color = nodes_com_color.to_dict()

In [None]:
# Load the user-user (comment vote) data and keep votes cast between
# 2022-12-01 and 2023-08-01 (both bounds inclusive).
data = mn.read_comments_topics(path_data)
data = data.loc[(data["comment_vote_time"] >= "2022-12-01") &
                (data["comment_vote_time"] <= "2023-08-01")]

# Agreement: mean comment vote within each story, repeated on every row of that story.
# The string alias "mean" replaces np.mean: pandas >= 2.1 warns that NumPy
# callables passed to groupby transform will stop being mapped to the built-in
# groupby mean, so the alias pins the current behaviour.
data["positive"] = data.groupby("story_id")["comment_vote"].transform("mean")

# Keep the set of users (posters and voters) to filter later on
domains_users = set(data["username_post"].unique()) | set(data["username_vote"].unique())

# Distribution of the per-story agreement, drawn before the topic filter below
sns.histplot(data["positive"], bins=200)

# Keep only the topics under study
data["topic"] = data["final_topic"]  # .map(mn.topic_names) # Add name of each topic
data = data.loc[data["final_topic"].isin(set(topics))]

print(data["final_topic"].value_counts())

In [None]:
# Build the embeddings for the three methods compared in this notebook
# ("sheep", "sheep-adj" and "ca"). The same vote/comment filters are used for every run.
_shared_filters = {
    "bipartite": False,
    "min_votes_from_user": 10,
    "min_votes_to_users_or_domains": 0,
    "min_comments_or_stories": 10,
}
# Extra options passed only to the two SHEEP variants.
_sheep_options = {
    "plot_sheep": False,
    "normalize_laplacian": True,
    "adjust_weight": False,
}


def _embed(method, **extra_options):
    """Embed `data` with `method`, move the index into columns and rename `user` to `story`."""
    embedding = mn.create_embeddings_third(
        data, method=method, **_shared_filters, **extra_options
    )
    return embedding.reset_index().rename(columns={"user": "story"})


df_all_embeddings_sheep = _embed("sheep", **_sheep_options)

# The "sheep-adj" run (labelled "SHEEP Null Model" in the figure cell) takes an
# additional adj_weight argument.
df_all_embeddings_sheep_adj = _embed("sheep-adj", **_sheep_options, adj_weight=-50)

df_all_embeddings_ca = _embed("ca")

# s3.add_values returns the processed frame together with a second value,
# stored here as outliers_*.
df_all_embeddings_sheep_m, outliers_sheep = s3.add_values(df_all_embeddings_sheep)
df_all_embeddings_sheep_adj_m, outliers_sheep_adj = s3.add_values(df_all_embeddings_sheep_adj)
df_all_embeddings_ca_m, outliers_ca = s3.add_values(df_all_embeddings_ca)

# Both SHEEP variants go through mn.create_pca_emb with identical options
# (note that emb="sheep" is used for the "sheep-adj" frame as well).
_pca_options = {
    "topics": topics,
    "domains": domains_users,
    "normalize": True,
    "emb": "sheep",
}
df_pca_emb_sheep = mn.create_pca_emb(df_all_embeddings_sheep_m, **_pca_options)
df_pca_emb_sheep_adj = mn.create_pca_emb(df_all_embeddings_sheep_adj_m, **_pca_options)

In [None]:
#plt.figure(figsize=(4,12))
# Constant value (0.5) for every row of the CA embedding frame
d_color = pd.Series(0.5, index=df_all_embeddings_ca.index)

# Standardise every column except the first one, refitting the scaler on each
# frame in turn (so `sc` ends up fitted on df_pca_emb_sheep_adj).
# NOTE(review): the plotting cell passes df_all_embeddings_ca, not the
# standardised df_all_embeddings_ca_m -- confirm which one is intended.
sc = StandardScaler()
for frame in (df_all_embeddings_ca_m, df_pca_emb_sheep, df_pca_emb_sheep_adj):
    value_columns = frame.columns[1:]
    frame[value_columns] = sc.fit_transform(frame[value_columns])

In [None]:
# Three-panel comparison of the "Russia" attitude scores: each panel plots one
# embedding against another with a lowess regression line, points coloured by
# the users' average comment vote (nodes_com_color).
# Each entry: (title, x frame, y frame, x column suffix, y column suffix, axis labels).
panels = [
    ("(A) SHEEP vs CA",
     df_all_embeddings_ca, df_pca_emb_sheep, "_0", "_pca_1d",
     # Label fixed: a space was missing before "(SHEEP)"
     ["Attitude towards Russia (CA)", "Attitude towards Russia (SHEEP)"]),
    ("(B) SHEEP Null Model vs CA",
     df_all_embeddings_ca, df_pca_emb_sheep_adj, "_0", "_pca_1d",
     ["Attitude towards Russia (CA)", "Attitude towards Russia (SHEEP Null Model)"]),
    ("(C) SHEEP Null Model vs SHEEP",
     df_pca_emb_sheep, df_pca_emb_sheep_adj, "_pca_1d", "_pca_1d",
     ["Attitude towards Russia (SHEEP)", "Attitude towards Russia (SHEEP Null Model)"]),
]

plt.figure(figsize=(18, 4))  # Wider figure for 3 plots
for position, (title, df_x, df_y, suffix_x, suffix_y, labels) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    s4.plot_two_dim(df_x, df_y, flip=None, domains=domains_users,
                    top="Russia", suffix1=suffix_x, suffix2=suffix_y,
                    columns=labels,
                    annotate=False, show_regression="lowess", s=5, alpha=0.5,
                    cmap=plt.cm.RdBu, d_color=nodes_com_color)
    plt.title(title)

# Create the output folder if needed so that savefig does not fail on a fresh checkout
os.makedirs("../figures", exist_ok=True)
plt.savefig("../figures/sheep-null-model-russia.pdf", bbox_inches="tight")
