In [114]:
'''
    Simple analysis of revision data.

    Written by Jackie Chan.
'''

import pandas as pd
import os
import sys
import datetime as dt
import matplotlib.pyplot as pyplot

from static_helpers import *

titles = get_titles()
titles_plus_talk = add_talk_pages(titles)
file_names = [title.replace(":", "-") + ".csv" for title in titles_plus_talk]

file_names.remove('Death of Luo Changqing.csv')
file_names.remove('Talk-Death of Luo Changqing.csv')

directory = "10 Year Revision Data"
df_dict = {}
desc_col = ['num_editors', 'num_anon_editors', 'num_reg_editors', 'num_edits',
    'average_edit_size']
desc = pd.DataFrame(columns=desc_col)

for file in file_names:
    complete_path = os.path.join(directory, file)

    if file_exists(complete_path):
        df_dict[file] = pd.read_csv(complete_path)

        df_dict[file]['pythontime'] = pd.to_datetime(df_dict[file]['timestamp'])
        df_dict[file].sort_values(by='pythontime', inplace=True)

if df_dict != {}:
    print("Dataframe dictionary completed successfully.")

desc['titles'] = file_names

Dataframe dictionary completed successfully.


In [115]:

# print(df_dict['12 June 2019 Hong Kong protest.csv']['user'].nunique())

for i, row in desc.iterrows():

    file_df = df_dict[row['titles']]
    # print(row['titles'])

    # Calculating the number of unique editors.
    desc.at[i, 'num_editors'] = file_df['user'].nunique()

    # Calculating the number of anonymous editors.
    # Articles with no anonymous editors do not contain an 'anon' column.
    if 'anon' in file_df.columns:
        desc.at[i, 'num_anon_editors'] = \
            file_df[file_df['anon'] == True]['user'].nunique()
    else:
        desc.at[i, 'num_anon_editors'] = 0

    # Calculating the number of registered editors.
    if 'anon' in file_df.columns:
        desc.at[i, 'num_reg_editors'] = \
            file_df[file_df['anon'] != True]['user'].nunique()
    else: # If 'anon' column does not exist, then all users are registered.
        desc.at[i, 'num_reg_editors'] = file_df['user'].nunique()

    # Sanity check for anonymous and registered users.
    assert(desc.at[i, 'num_editors'] == \
        desc.at[i, 'num_anon_editors'] + description.at[i, 'num_reg_editors'])

    # Calculating number of edits.
    desc.at[i, 'num_edits'] = file_df.shape[0]

    # Calculating average edit size.
    desc.at[i, 'average_edit_size'] = file_df['size'].mean()

    # Calculating median edit size.
    desc.at[i, 'median_edit_size'] = file_df['size'].median()

print(desc[['titles', 'num_anon_editors']])

titles num_anon_editors
0                      2019–20 Hong Kong protests.csv              399
1                                       Hong Kong.csv              150
2                                      Carrie Lam.csv              154
3                 2019 Hong Kong extradition bill.csv               54
4                  2019 Hong Kong local elections.csv               40
..                                                ...              ...
73  Talk-List of November 2019 Hong Kong protests.csv                1
74  Talk-List of December 2019 Hong Kong protests.csv                0
75                        Talk-Glory to Hong Kong.csv                1
76                   Talk-Lennon Wall (Hong Kong).csv                3
77                                Talk-HKmap.live.csv                1

[78 rows x 2 columns]


In [125]:
all_df = pd.read_csv("10 Year Updated.csv")

all_desc = pd.DataFrame(columns=desc_col)

all_desc[0, 'num_editors'] = all_df['user'].nunique()

all_desc[0, 'num_anon_editors'] = all_df[all_df['anon'] == True]['user'].nunique()

all_desc[0, 'num_reg_editors'] = all_df[all_df['anon'] != True]['user'].nunique()

assert(all_desc[0, 'num_editors'] == \
    all_desc[0, 'num_anon_editors'] + all_desc[0, 'num_reg_editors'])

all_desc[0, 'num_edits'] = all_df.shape[0]

all_desc[0, 'average_edit_size'] = all_df['size'].mean()

all_desc[0, 'median_edit_size'] = all_df['size'].median()

print(all_desc)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [116]:
# Export to CSV
desc.to_csv("desc_page.csv", encoding="utf-8")