## Use of Data Wrangler extension recommended

In [56]:
import pandas as pd
from itertools import combinations
from scipy.sparse import dok_matrix

# File path
file_path = "../../data/wiki-RfA.txt"

# Read the vote records and group the voters (SRC) by voting session.
# Only the SRC, TGT and TXT lines matter here: SRC names the voter, TGT names
# the candidate, and TXT is used as the "end of record" marker at which the
# voter is attached to the current session. The other fields (VOT, RES, YEA,
# DAT) are not needed for the co-occurrence analysis and are skipped.
data = []
with open(file_path, "r", encoding="utf-8") as file:
    current_session = None
    src = None  # voter of the record being read; None until a SRC line is seen
    for line in file:
        if line.startswith("SRC:"):
            # Slice off the tag instead of splitting on it, so a name that
            # happens to contain the text "SRC:" is not truncated.
            src = line[len("SRC:"):].strip()
        elif line.startswith("TGT:"):
            tgt = line[len("TGT:"):].strip()
            # A change of TGT between consecutive records starts a new session
            if tgt != current_session:
                current_session = tgt
                # Session IDs are simply the position in `data`
                data.append({"Session": len(data), "TGT": tgt, "SRCs": []})
        elif line.startswith("TXT:"):
            # Attach the voter to the session opened by the latest TGT line
            if data and src is not None:
                data[-1]["SRCs"].append(src)

# Flatten the sessions into one row per (session, voter)
records = [
    {"Session": session["Session"], "TGT": session["TGT"], "SRC": voter}
    for session in data
    for voter in session["SRCs"]
]
df = pd.DataFrame(records)

# One list of voters per session
grouped = df.groupby("Session")["SRC"].apply(list)

# Sorted unique voters, plus lookups in both directions
users = sorted(df["SRC"].unique())
user_to_idx = {name: position for position, name in enumerate(users)}
idx_to_user = dict(enumerate(users))

# Square sparse matrix, one row/column per voter
n_users = len(users)
co_occurrence = dok_matrix((n_users, n_users), dtype=int)

# Count, for every pair of votes cast in the same session, one co-occurrence
# in each direction so the matrix stays symmetric.
# NOTE(review): a voter listed more than once in a session is paired with
# itself, and both increments then land on the same diagonal cell.
for voters in grouped:
    positions = [user_to_idx[name] for name in voters]
    for row, col in combinations(positions, 2):
        co_occurrence[row, col] += 1
        co_occurrence[col, row] += 1

# Wrap the sparse matrix in a labelled DataFrame for display
co_occurrence_df = pd.DataFrame.sparse.from_spmatrix(
    co_occurrence,
    index=users,
    columns=users,
)


In [57]:
co_occurrence_df

Unnamed: 0,Unnamed: 1,!---slappdash---!,%D0%90,'sed,(.Y.),(:Julien:),(aeropagitica),*Kat*,*Spark*,*drew,...,ברוקולי,כתר,පසිඳු කාවින්ද,බිඟුවා,‎Jetijonez,とある白い猫,シ,石,龗,완젬스
,5020,0,9,0,0,0,282,3,0,30,...,0,0,1,0,1,1,3,0,1,0
!---slappdash---!,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
%D0%90,9,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
'sed,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
(.Y.),0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
とある白い猫,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
シ,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
石,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
龗,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
print(co_occurrence_df.index.is_unique)

True


In [59]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler


# Participation of a user = sum of its row in the co-occurrence matrix
participation = co_occurrence_df.sum(axis=1)

# One row per user, ready to be joined with the scores
power_voters = pd.DataFrame(
    {
        "User": co_occurrence_df.index,
        "Participation": participation,
    }
)
print(power_voters['User'].is_unique)

# Load scores.csv and keep the first row of every duplicated username
scores = pd.read_csv("../../data/scores.csv")  # Update the path as needed
print(scores['username'].is_unique)
scores = scores.drop_duplicates(subset='username', keep='first')
print(scores['username'].is_unique)

# Attach the scores to the voters; users without a match get NaN ...
power_voters = power_voters.merge(
    scores,
    left_on="User",
    right_on="username",
    how="left",
)
# ... and are removed here, together with any row lacking a Participation
power_voters = power_voters.dropna(subset=["Participation", "total_score"])
print(power_voters['User'].is_unique)

# Min-max scale both components, then average them into the Power Score
scaler = MinMaxScaler()
scaled_participation = scaler.fit_transform(power_voters[['Participation']].to_numpy())
scaled_total_score = scaler.fit_transform(power_voters[['total_score']].to_numpy())
power_voters['Power Score'] = (scaled_participation + scaled_total_score) / 2

# Rank by Power Score (highest first), renumber the rows and drop the
# join key that duplicates the "User" column
power_voters = (
    power_voters
    .sort_values(by=['Power Score'], ascending=[False])
    .reset_index(drop=True)
    .drop(columns=['username'])
)

True
False
True
True


In [60]:
power_voters

Unnamed: 0,User,Participation,total_score,Power Score
0,Acalamari,61987,1300.0,0.847043
1,Stifle,57998,1297.0,0.823556
2,Siva1979,89307,827.0,0.818077
3,Bearian,56537,1300.0,0.816530
4,Juliancolton,41846,1277.0,0.725433
...,...,...,...,...
7216,64.230.2.27,62,0.0,0.000342
7217,129.177.19.120,41,0.0,0.000224
7218,128.83.101.111,29,0.0,0.000157
7219,172.162.10.219,25,0.0,0.000134


In [61]:
# Save the ranked power-voter table to a CSV file.
# index=False leaves the row index out of the file.
power_voters.to_csv("../../data/power_voters_with_scores.csv", index=False)

In [62]:
import pandas as pd
from GalPlots import *
from textblob import TextBlob
import nltk
import os
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# NOTE(review): at module level a `global` statement has no effect and does
# not create the variable. `df_sentiment_verification` only exists once some
# code assigns it; analyze_sentiment() below works with it through its own
# `global` declaration. Nothing visible in this notebook initialises it.
global df_sentiment_verification

# Step 1: Parse the dataset
def parse_rfa_file(file_path):
    """
    Reads and parses the wiki-RfA.txt file, returning a DataFrame.

    Records are separated by blank lines. Every line of a record that
    contains a colon is split at the first colon into a column name and a
    value; surrounding whitespace and surrounding quote characters are
    removed from the value. Lines without a colon are ignored, and records
    that yield no field at all (for example extra blank lines, or an empty
    file) are skipped instead of becoming all-NaN rows.

    Args:
        file_path (str): Path to the input file.

    Returns:
        pd.DataFrame: One row per record. The 'VOT', 'RES' and 'YEA'
        columns, when present, are converted to numbers (unparseable
        values become NaN); all other columns stay strings.

    Raises:
        FileNotFoundError: If file_path does not exist.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file {file_path} does not exist.")

    with open(file_path, 'r', encoding='utf-8') as file:
        raw_data = file.read()

    votes = []
    for entry in raw_data.strip().split('\n\n'):
        vote = {}
        for line in entry.strip().split('\n'):
            # Split at the first colon only: values such as timestamps
            # contain colons of their own.
            key, separator, value = line.partition(':')
            if not separator:
                continue
            # str.strip takes a set of characters, so strip("'") also removes
            # the ''' wiki-bold markers at either end of the value.
            vote[key.strip()] = value.strip().strip("'").strip('"').strip("'")
        # A blank entry produces no fields; keeping it would add a phantom row
        if vote:
            votes.append(vote)

    df = pd.DataFrame(votes)

    for field in ('VOT', 'RES', 'YEA'):
        if field in df.columns:
            df[field] = pd.to_numeric(df[field], errors='coerce')

    return df

# Step 2: Filter users with diverse voting history
def filter_users(df, min_years=4):
    """
    Restrict the votes to users who voted in enough different years.

    Args:
        df (pd.DataFrame): Votes with at least the 'SRC' and 'YEA' columns.
        min_years (int): Minimum number of distinct 'YEA' values a user
            must have to be kept.

    Returns:
        pd.DataFrame: The rows of df whose 'SRC' meets the threshold.
    """
    # Number of distinct voting years per user, indexed by user
    years_per_user = df.groupby('SRC')['YEA'].nunique()
    eligible_users = years_per_user.index[years_per_user >= min_years]

    keep = df['SRC'].isin(eligible_users)
    return df[keep]

# Step 3: Analyze sentiment of votes
def analyze_sentiment(df):
    """
    Adds a sentiment polarity score for each vote.

    The VADER compound score of every 'TXT' value is stored in a new
    'Polarity' column of a copy of df. As a side effect, the (text, score)
    pairs are recorded in the module-level DataFrame
    df_sentiment_verification (columns 'Text' and 'Sentiment') so they can
    be inspected afterwards. If that DataFrame already holds rows, the new
    pairs are appended to it; otherwise it is created.

    Args:
        df (pd.DataFrame): Input DataFrame with a 'TXT' column.

    Returns:
        pd.DataFrame: Copy of df with the added 'Polarity' column.
    """
    global df_sentiment_verification

    analyzer = SentimentIntensityAnalyzer()

    df = df.copy()  # Work on a copy to avoid chained assignment issues
    df['Polarity'] = df['TXT'].apply(
        lambda text: analyzer.polarity_scores(text)['compound']
    )

    # Build the verification rows in one go: concatenating one row at a time
    # copies the whole frame on every vote.
    batch = pd.DataFrame({
        'Text': df['TXT'].to_numpy(),
        'Sentiment': df['Polarity'].to_numpy(),
    })

    # The global may not have been created yet, so look it up defensively
    previous = globals().get('df_sentiment_verification')
    if previous is None or previous.empty:
        df_sentiment_verification = batch
    else:
        df_sentiment_verification = pd.concat([previous, batch], ignore_index=True)

    return df


# Step 4: Calculate correlation metrics
def compute_correlation_ratio(df):
    """
    Computes the correlation between sentiment polarity and vote positivity.

    For every user ('SRC') the positive (VOT > 0) and negative (VOT < 0)
    votes are counted and the share of positive votes among those two is
    computed. Users with neither positive nor negative votes have no
    defined ratio and are left out. The ratio is then correlated with the
    user's mean 'Polarity'.

    Args:
        df (pd.DataFrame): DataFrame with 'SRC', 'VOT' and 'Polarity'.

    Returns:
        pd.DataFrame: One row per user with the columns 'SRC',
            'Positive_Votes', 'Negative_Votes', 'Positive_Vote_Ratio' and
            'Average_Polarity'.
        float: Correlation coefficient between 'Average_Polarity' and
            'Positive_Vote_Ratio'.
    """
    vote_aggregation = df.groupby('SRC').agg(
        Positive_Votes=('VOT', lambda votes: (votes > 0).sum()),
        Negative_Votes=('VOT', lambda votes: (votes < 0).sum())
    ).reset_index()

    # Vectorised ratio. Where a user has no positive or negative vote the
    # denominator is masked to NaN, so the ratio is NaN and the row is
    # dropped below. This also works when the aggregation is empty.
    decided_votes = vote_aggregation['Positive_Votes'] + vote_aggregation['Negative_Votes']
    vote_aggregation['Positive_Vote_Ratio'] = (
        vote_aggregation['Positive_Votes'] / decided_votes.where(decided_votes > 0)
    )
    vote_aggregation = vote_aggregation.dropna(subset=['Positive_Vote_Ratio'])

    avg_polarity = (
        df.groupby('SRC')['Polarity']
        .mean()
        .reset_index()
        .rename(columns={'Polarity': 'Average_Polarity'})
    )
    merged_df = pd.merge(vote_aggregation, avg_polarity, on='SRC')

    correlation = merged_df['Average_Polarity'].corr(merged_df['Positive_Vote_Ratio'])
    return merged_df, correlation




file_path = '../../data/wiki-RfA.txt'

# Parse every vote record of the dataset
print("Loading and parsing data...")
df = parse_rfa_file(file_path)
print(f"Parsed {len(df)} votes.")

# Keep only the users who voted in at least four different years
print("Filtering users with sufficient voting history...")
filtered_df = filter_users(df, min_years=4)
print(f"Filtered down to {filtered_df['SRC'].nunique()} users.")

# Sentiment scoring and the correlation only make sense with data left
if not filtered_df.empty:
    print("Analyzing sentiment...")
    filtered_df = analyze_sentiment(filtered_df)

    print("Calculating correlation metrics...")
    merged_df, correlation = compute_correlation_ratio(filtered_df)
    print(f"Correlation coefficient: {correlation:.2f}")
else:
    print("No users meet the criteria for analysis.")




Loading and parsing data...
Parsed 198275 votes.
Filtering users with sufficient voting history...
Filtered down to 1021 users.
Analyzing sentiment...
Calculating correlation metrics...
Correlation coefficient: 0.52


In [63]:
df_sentiment_verification
#Conclusion: seems to work reasonably well, although not perfect.

Unnamed: 0,Text,Sentiment
0,Support''' There are plenty of important roles...,0.7083
1,Support''' barring a completely unexpected ans...,0.9801
2,Support''' because I see no good reason not to.,0.0757
3,Support''' if a qualified editor is asking for...,0.7096
4,Support''' From his hard work in copyediting t...,0.8883
...,...,...
186073,Support. --,0.4019
186074,Support. Didn't have enough information to su...,0.6694
186075,Support,0.4019
186076,Support. --,0.4019


In [64]:
# Join the power ranking with the per-user sentiment metrics. Only users
# present in both tables are kept (inner join); the raw vote counts and the
# duplicate join key 'SRC' are not carried over.
final_df = (
    power_voters
    .merge(
        merged_df.drop(columns=['Positive_Votes', 'Negative_Votes']),
        left_on='User',
        right_on='SRC',
        how='inner',
    )
    .drop(columns=['SRC'])
    # Normally unnecessary but for precaution: re-sort by Power Score
    .sort_values(by=['Power Score'], ascending=[False])
    .reset_index(drop=True)
)


In [65]:
final_df

Unnamed: 0,User,Participation,total_score,Power Score,Positive_Vote_Ratio,Average_Polarity
0,Acalamari,61987,1300.0,0.847043,0.976356,0.505817
1,Stifle,57998,1297.0,0.823556,0.622921,0.091755
2,Siva1979,89307,827.0,0.818077,0.850085,0.451205
3,Bearian,56537,1300.0,0.816530,0.907143,0.491877
4,Juliancolton,41846,1277.0,0.725433,0.787879,0.364920
...,...,...,...,...,...,...
667,Amatulic,1672,106.0,0.050125,0.636364,0.403646
668,Katr67,399,116.0,0.046844,0.714286,0.510043
669,Dougweller,1077,106.0,0.046793,0.636364,0.570755
670,Ctjf83,1114,103.0,0.045847,0.555556,0.417927


In [66]:
# Pairwise correlation of the columns that remain once the identifier and
# the two raw Power Score components are dropped.
final_df.drop(columns=['User','Participation','total_score']).corr()

Unnamed: 0,Power Score,Positive_Vote_Ratio,Average_Polarity
Power Score,1.0,0.071424,-0.007625
Positive_Vote_Ratio,0.071424,1.0,0.55162
Average_Polarity,-0.007625,0.55162,1.0


In [67]:

def _trendline_scatter(frame, x, y, title, labels, marker_color, line_color):
    """Build a scatter plot of frame[y] against frame[x] with an OLS trendline.

    Markers are drawn at size 8 in marker_color, the trendline in line_color,
    and the 'User' column is added to the hover information.
    """
    fig = px.scatter(
        frame,
        x=x,
        y=y,
        trendline="ols",
        title=title,
        labels=labels,
        hover_data=["User"],
    )
    fig.update_traces(marker=dict(size=8, color=marker_color), selector=dict(mode='markers'))
    fig.update_traces(line=dict(color=line_color), selector=dict(mode='lines'))
    return fig


def _export_plot(fig, path):
    """Write fig to path as an HTML fragment that loads plotly.js from the CDN."""
    # Explicit UTF-8: the hover data carries user names, which are not all
    # ASCII, and the platform default encoding may not be able to encode them.
    with open(path, "w", encoding="utf-8") as f:
        f.write(fig.to_html(full_html=False, include_plotlyjs='cdn'))


# Plot 1: Positive Vote Ratio vs. Average Polarity
fig1 = _trendline_scatter(
    final_df,
    x="Average_Polarity",
    y="Positive_Vote_Ratio",
    title="Sentiment vs. Positive Vote Ratio",
    labels={"Average_Polarity": "Average Sentiment Polarity (-1: Negative, +1: Positive)",
            "Positive_Vote_Ratio": "Positive Vote Ratio (0: All Negative, 1: All Positive)"},
    marker_color="blue",
    line_color="red",  # Red trendline
)
fig1.show()
_export_plot(fig1, "../../docs/_includes/plots/pvr_ap.html")

# Plot 2: Average Polarity vs. Power Score
fig2 = _trendline_scatter(
    final_df,
    x="Average_Polarity",
    y="Power Score",
    title="Sentiment vs. Power Score",
    labels={"Average_Polarity": "Average Sentiment Polarity (-1: Negative, +1: Positive)",
            "Power Score": "Power Score"},
    marker_color="green",
    line_color="purple",  # Purple trendline
)
fig2.show()
_export_plot(fig2, "../../docs/_includes/plots/ps_ap.html")

# Plot 3: Positive Vote Ratio vs. Power Score
fig3 = _trendline_scatter(
    final_df,
    x="Power Score",
    y="Positive_Vote_Ratio",
    title="Power Score vs. Positive Vote Ratio",
    labels={"Power Score": "Power Score",
            "Positive_Vote_Ratio": "Positive Vote Ratio (0: All Negative, 1: All Positive)"},
    marker_color="orange",
    line_color="blue",  # Blue trendline
)
fig3.show()
_export_plot(fig3, "../../docs/_includes/plots/pvr_ps.html")
