In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import mwparserfromhell

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Loading Data

And correct some date inconsistencies

### From early version of Emma's code

In [2]:
# Parse the raw dump into a list of records.
# The file is a sequence of "KEY:value" lines; a blank line closes the current record.
records = []
current_entry = {}

# Explicit encoding so the comment text is decoded identically on every platform
# (assumes the dump is UTF-8 -- TODO confirm). The file is streamed line by line
# instead of being loaded entirely with readlines().
with open('wiki-RfA.txt', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if line:
            # Split on the first ':' only: the value itself may contain colons (e.g. times)
            key, value = line.split(':', 1)
            current_entry[key] = value
        elif current_entry:
            # A blank line ends a record; repeated/leading blank lines are skipped
            # so they do not create empty records.
            records.append(current_entry)
            current_entry = {}

# Append the last record when the file does not end with a blank line
if current_entry:
    records.append(current_entry)

# Convert into DataFrame and store in csv
df = pd.DataFrame(records)
# Positional rename: relies on every record listing its seven keys in this order
df.columns = ['Source', 'Target', 'Vote', 'Results', 'Year', 'Date', 'Comment']

df.to_csv('wiki-RfA.csv')

# Fields that are empty or contain only whitespace are treated as missing values
df = df.replace(r'^\s*$', np.nan, regex=True)

# Misspelled and abbreviated month names mapped to the full English name expected by
# the '%B' directive below. Every key ends with a space so that an abbreviation never
# matches inside an already complete month name. Order is preserved from the original cell.
month_fixes = {
    'Julu ': 'July ',
    'Janry ': 'January ',
    'Mya ': 'May ',
    'Jan ': 'January ',
    'Feb ': 'February ',
    'Mar ': 'March ',
    'Apr ': 'April ',
    'Jun ': 'June ',
    'Jul ': 'July ',
    'Aug ': 'August ',
    'Sep ': 'September ',
    'Oct ': 'October ',
    'Nov ': 'November ',
    'Dec ': 'December ',
}
for wrong_spelling, full_name in month_fixes.items():
    # regex=False: these are literal substrings; being explicit avoids depending on the
    # default of `regex`, which differs between pandas versions.
    df['Date'] = df['Date'].str.replace(wrong_spelling, full_name, regex=False)

# Convert Date to datetime; values that still do not match the format become NaT
df['Date'] = pd.to_datetime(df['Date'], format='%H:%M, %d %B %Y', errors='coerce')

### Adds to do (OLD)

In [3]:
# Nullable integer dtype so that missing years stay missing instead of forcing floats
df['Year'] = df['Year'].astype('Int64')

# Empty comments were replaced by NaN during loading. Casting NaN with astype(str)
# yields the literal text 'nan', which would later be treated as a real word, so
# missing comments are turned back into empty strings before the cast.
df['Comment'] = df['Comment'].fillna('').astype(str)

# Information Extraction from Comments

First, create a parsed column that keeps only the text information from the Wikitext (markup language).

In [4]:
def _strip_wikitext(raw_comment):
    """Parse a wikitext string and return it with the markup code stripped."""
    return mwparserfromhell.parse(raw_comment).strip_code()


# Plain-text version of every comment, stored next to the raw wikitext
df["Parsed_Comment"] = df.Comment.apply(_strip_wikitext)

In [5]:
def tf_idf_matrix(comments, max_features=5000, stop_words='english'):
    """Build the TF-IDF matrix of a collection of comments.

    Parameters
    ----------
    comments : dict
        Mapping whose values are the comment texts. Rows of the result follow the
        iteration order of ``comments.values()``.
    max_features : int, optional
        Upper bound on the vocabulary size passed to ``TfidfVectorizer``
        (default 5000, the value previously hard-coded).
    stop_words : str or list, optional
        Stop-word setting passed to ``TfidfVectorizer`` (default ``'english'``).

    Returns
    -------
    The matrix returned by ``TfidfVectorizer.fit_transform``: one row per comment,
    one column per retained term.
    """
    vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=max_features)
    return vectorizer.fit_transform(comments.values())

In [6]:
def get_idx_lower_bound(keys):
    """Return the smallest second element found among the given key tuples.

    ``keys`` is an iterable of indexable items (e.g. ``(source, uid)`` tuples);
    only position 1 of each item is compared.
    """
    return min(key[1] for key in keys)

def get_idx_upper_bound(keys):
    """Return the largest second element found among the given key tuples.

    ``keys`` is an iterable of indexable items (e.g. ``(source, uid)`` tuples);
    only position 1 of each item is compared.
    """
    return max(key[1] for key in keys)

In [7]:
# Group by Target: one row per target, holding the (Source, Parsed_Comment) pair
# of every vote addressed to it
grouped_by_target = (
    df.groupby('Target')
    .apply(lambda votes: list(zip(votes['Source'], votes['Parsed_Comment'])))
    .reset_index()
    .rename(columns={0: 'Text'})
)

# Build a UID for every (SRC, Comment) pair. UIDs are consecutive integers that keep
# increasing from one target to the next, so each target owns a contiguous UID range.
# Each target's pairs are stored as a dictionary {(source, uid): comment} so that the
# comments can later be accessed in a well-defined order.
global_index = 0
uid_keyed_comments = []
for vote_pairs in grouped_by_target['Text']:
    uid_keyed_comments.append({
        (source, uid): comment
        for uid, (source, comment) in enumerate(vote_pairs, start=global_index)
    })
    global_index += len(vote_pairs)
grouped_by_target['Text'] = uid_keyed_comments

# Extract and flatten the comments: merge the per-target dictionaries into a single
# {(source, uid): comment} mapping, in the same order as the rows of grouped_by_target
comments_list = [row.Text for _, row in grouped_by_target.iterrows()]
comments_dict = {}
for target_comments in comments_list:
    comments_dict.update(target_comments)

# Compute the TF-IDF vectors; row i of the result is the i-th comment of comments_dict
tfidf_m_sparse = tf_idf_matrix(comments_dict)

# Convert to a regular, row-indexable array. toarray() gives a plain ndarray, whereas
# todense() gives the legacy np.matrix type.
# NOTE(review): this still materialises one dense row per comment; slicing the sparse
# matrix directly would save memory but change the type stored in 'tfidf_matrix'.
tfidf_m = tfidf_m_sparse.toarray()

# Attach to every target the block of rows spanning its smallest to largest UID
grouped_by_target['tfidf_matrix'] = grouped_by_target.Text.apply(
    lambda uid_map: tfidf_m[
        get_idx_lower_bound(uid_map.keys()):get_idx_upper_bound(uid_map.keys()) + 1
    ]
)

# Compute the cosine similarity: for each target, the pairwise similarity between
# the TF-IDF vectors of all comments addressed to it
new_rows = [
    cosine_similarity(target_vectors, target_vectors)
    for target_vectors in map(np.asarray, grouped_by_target["tfidf_matrix"])
]
grouped_by_target["cosinus_similarity"] = new_rows

In [9]:
# Write the per-target table to disk, without the row index.
# NOTE(review): 'Text', 'tfidf_matrix' and 'cosinus_similarity' hold dicts/arrays; CSV keeps
# only their string form, so these columns presumably cannot be reloaded as arrays --
# confirm whether a binary format was intended.
grouped_by_target.to_csv("CosinusSimByTarget.csv", index=False)

# Work in progress

In [None]:
# Load the cosine-similarity batches stored as .npy files in working_dir
cosinus_sims = [
    np.load(f"working_dir/cosine_sim_{i}_batch.npy")
    for i in range(25*7)
]

# Display how many batches were loaded. This was previously indented inside the
# loop, where the bare expression had no effect and was never shown.
len(cosinus_sims)

In [9]:
# Show the comment stored at row position 176233
df["Comment"].iloc[176233]

'Looks good.  --'

In [None]:
# Largest, smallest and average value of the Year column
year_column = df['Year']
max_year, min_year, mean_year = year_column.max(), year_column.min(), year_column.mean()

print(f"max_year: {max_year}\n min_year: {min_year}\n mean_year: {mean_year}")

In [None]:
## Obsolete

# The tag helpers below use string operations, so every comment must be a str.
# (The Comment column is already cast to str earlier in the notebook.)
df["Comment"] = df["Comment"].astype('str')

def getting_tags_number(comment):
    """Count the '''bold''' wikitext tags contained in a comment.

    A tag is delimited by an opening and a closing ``'''`` marker, so the number of
    complete tags is half the number of markers; an unmatched marker is ignored.
    The previous formula (number of split pieces minus 2) was only correct for
    comments with at most one tag: two tags were reported as 3.

    Parameters
    ----------
    comment : str
        Raw wikitext of the comment.

    Returns
    -------
    int
        Number of complete bold tags, 0 when there is none.
    """
    return comment.count("'''") // 2

def getting_tags_list(comment):
    """Return the leading bold tag of a comment when it is a long one.

    The comment qualifies only if a ``'''`` marker occurs within its first four
    characters. The candidate text is what lies between the first and the second
    marker (or everything after the first marker when there is no second one).
    That text is returned if splitting it on single spaces gives more than five
    pieces; in every other case an empty string is returned.
    """
    marker = "'''"
    if marker not in comment[:4]:
        return ""

    tag_text = comment.split(marker)[1]
    piece_count = len(tag_text.split(" "))
    return tag_text if piece_count > 5 else ""

# Derive the two tag columns from the raw comment text
df["CommentTags"] = df["Comment"].map(getting_tags_list)
df["NumberCommentTag"] = df["Comment"].map(getting_tags_number)

# Print a sample (positions 250 to 269) of the distinct leading tags
for tag_text in df["CommentTags"].unique()[250:270]:
    print(tag_text)