In [2]:
import pandas as pd
from itertools import combinations
from scipy.sparse import dok_matrix

# File path
file_path = "../../data/wiki-RfA.txt"

# Read and parse the file
data = []
with open(file_path, "r", encoding="utf-8") as file:
    current_session = None
    for line in file:
        if line.startswith("SRC:"):
            src = line.split("SRC:")[1].strip()
        elif line.startswith("TGT:"):
            tgt = line.split("TGT:")[1].strip()
            # Check if it's a new session (new TGT indicates a new voting session)
            if tgt != current_session:
                current_session = tgt
                session_id = len(data)  # Increment session ID
                data.append({"Session": session_id, "TGT": tgt, "SRCs": []})
        elif line.startswith("VOT:"):
            vot = int(line.split("VOT:")[1].strip())
        elif line.startswith("RES:"):
            res = int(line.split("RES:")[1].strip())
        elif line.startswith("YEA:"):
            yea = int(line.split("YEA:")[1].strip())
        elif line.startswith("DAT:"):
            dat = line.split("DAT:")[1].strip()
        elif line.startswith("TXT:"):
            # Add the SRC to the current session's SRC list
            if data and "SRCs" in data[-1]:
                data[-1]["SRCs"].append(src)

# Create a flat DataFrame
records = []
for session in data:
    for src in session["SRCs"]:
        records.append({"Session": session["Session"], "TGT": session["TGT"], "SRC": src})

df = pd.DataFrame(records)

# Group SRC values by session
grouped = df.groupby("Session")["SRC"].apply(list)

# Create a list of all unique SRCs and a mapping to indices
users = sorted(df["SRC"].unique())
user_to_idx = {user: idx for idx, user in enumerate(users)}
idx_to_user = {idx: user for user, idx in user_to_idx.items()}

# Initialize a sparse matrix
n_users = len(users)
co_occurrence = dok_matrix((n_users, n_users), dtype=int)

# Populate the co-occurrence matrix
for user_list in grouped:
    for user1, user2 in combinations(user_list, 2):
        idx1, idx2 = user_to_idx[user1], user_to_idx[user2]
        co_occurrence[idx1, idx2] += 1
        co_occurrence[idx2, idx1] += 1

# Convert the sparse matrix to a DataFrame for display
co_occurrence_df = pd.DataFrame.sparse.from_spmatrix(
    co_occurrence, index=users, columns=users
)

# Display the sparse co-occurrence matrix
print("Sparse Co-occurrence Matrix:")
print(co_occurrence_df)

# Optionally save to a CSV
# co_occurrence_df.to_csv("wiki_rfa_co_occurrence.csv")


Sparse Co-occurrence Matrix:
                         !---slappdash---!  %D0%90  'sed  (.Y.)  (:Julien:)  \
                   5020                  0       9     0      0           0   
!---slappdash---!     0                  0       0     0      0           0   
%D0%90                9                  0       0     0      0           0   
'sed                  0                  0       0     0      0           0   
(.Y.)                 0                  0       0     0      0           0   
...                 ...                ...     ...   ...    ...         ...   
とある白い猫                1                  0       0     0      0           0   
シ                     3                  0       0     0      0           0   
石                     0                  0       0     0      0           0   
龗                     1                  0       0     0      0           0   
완젬스                   0                  0       0     0      0           0   

                   (ae

In [13]:
import pandas as pd

# Assuming `co_occurrence_df` is the co-occurrence DataFrame from the previous step

# Calculate Weighted Participation (sum of co-occurrence values)
weighted_participation = co_occurrence_df.sum(axis=1)

# Combine results into a DataFrame for ranking
power_voters = pd.DataFrame({
    "User": co_occurrence_df.index,
    "Weighted Participation": weighted_participation
})

# Load scores.csv
scores = pd.read_csv("../../data/scores.csv")  # Update the path as needed

# Merge the power voters DataFrame with the scores DataFrame
power_voters = power_voters.merge(scores, left_on="User", right_on="username", how="left")

# Remove rows with missing data (users without total_score or Weighted Participation)
power_voters = power_voters.dropna(subset=["Weighted Participation", "total_score"])

# Sort primarily by Weighted Participation and then by Admin Score (total_score)
power_voters = power_voters.sort_values(
    by=["Weighted Participation", "total_score"],
    ascending=[False, False]
)

# Reset index for better readability
power_voters.reset_index(drop=True, inplace=True)

In [14]:
top_n = 10  # Adjust as needed
print(f"Top {top_n} Power Voters:")
for _, row in power_voters.head(top_n).iterrows():
    user = f'"{row["User"]}"'  # Add quotes around the username
    weighted_participation = row["Weighted Participation"]
    total_score = row["total_score"]
    print(f"User: {user}, Weighted Participation: {weighted_participation}, Admin Score: {total_score}")


Top 10 Power Voters:
User: "Siva1979", Weighted Participation: 89307, Admin Score: 827.0
User: "Acalamari", Weighted Participation: 61987, Admin Score: 1300.0
User: "Stifle", Weighted Participation: 57998, Admin Score: 1297.0
User: "Bearian", Weighted Participation: 56537, Admin Score: 1300.0
User: "Xoloz", Weighted Participation: 53741, Admin Score: 865.0
User: "Newyorkbrad", Weighted Participation: 47998, Admin Score: 1017.0
User: "Axl", Weighted Participation: 47218, Admin Score: 837.0
User: "Malinaccier", Weighted Participation: 47010, Admin Score: 1180.0
User: "Merovingian", Weighted Participation: 46465, Admin Score: 933.0
User: "MONGO", Weighted Participation: 43164, Admin Score: 1039.0


In [15]:
# Optionally save the results to a CSV file
power_voters.to_csv("../../data/power_voters_with_scores.csv", index=False)