In [1]:
# List the processed artefacts available to this notebook (shell escape).
! ls ../data/processed

2007-1-enwiki-projection-user.csv
2007-1-enwiki-projection-user-roles.csv
2007-2-enwiki-projection-user.csv
2007-2-enwiki-projection-user-roles.csv
2007-3-enwiki-projection-user.csv
2007-3-enwiki-projection-user-roles.csv
2007-4-enwiki-projection-user.csv
2007-4-enwiki-projection-user-roles.csv
all_article_features.csv
all_user_features.csv
enwiki-meta-compact
enwiki-meta-parquet
kcore-2007-1.csv


In [2]:
import pandas as pd

# Pull the raw admin list into memory, one list element per line of the
# file (trailing newlines are still attached at this point).
with open("../data/raw/admin.txt", 'r') as f:
    data = list(f)

In [3]:
# Keep the text after the last "User:" marker on each line, trimmed of
# surrounding whitespace and the trailing newline (presumably every line
# carries such a marker -- verify against the raw file).
# Lines that yield no name (e.g. blank lines) are dropped: the previous
# `if data` guard tested the whole list, which is always truthy while it is
# being iterated, so it never filtered anything.
admins = [
    name
    for name in (line.split("User:")[-1].strip() for line in data)
    if name
]
len(admins)

997

In [4]:
from pyspark.sql import SparkSession

# Attach to the running Spark session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Load the compact enwiki metadata table stored as parquet.
df = spark.read.format("parquet").load("../data/processed/enwiki-meta-compact")

# Number of distinct usernames present in the whole table.
(
    df.select(df["username"])
    .distinct()
    .count()
)

11058849

In [5]:
# One-column lookup table of admin usernames, used as the right-hand side of
# the joins below.
#  * Names are de-duplicated first: a repeated name would make the inner
#    joins emit every matching revision more than once and inflate counts.
#  * Rows are plain tuples with an explicit schema, because inferring the
#    schema from dicts is deprecated and cannot work on an empty list.
admin_df = spark.createDataFrame(
    [(admin,) for admin in sorted(set(admins))],
    "admin: string",
)



In [6]:
# How many names from the admin list occur as a username in the table?
(
    df.select("username")
    .distinct()
    .join(admin_df, on=admin_df.admin == df.username, how="inner")
    .count()
)

714

In [7]:
# Narrow the table down to the first quarter of 2007.
sample = df.filter("year=2007 and quarter=1")
# Rows of that window whose username matches an entry of the admin list.
sample_admins = sample.join(
    admin_df,
    on=admin_df.admin == df.username,
    how="inner",
)

# Row counts: whole window vs. admin-only subset.
print(sample.count(), sample_admins.count())

(13740491, 335535)


In [22]:
# Peek at usernames containing the substring "bot".
# NOTE: this is a plain substring match, so it also returns names that merely
# contain those letters (see e.g. "Turbothy" in the output below).
(
    df.filter("username like '%bot%'")
    .select("username")
    .distinct()
    .show(n=5)
)

+------------+
|    username|
+------------+
|    Turbothy|
|    Darkboth|
|Autobotm8rix|
|   Gilbotron|
| Philbot5000|
+------------+
only showing top 5 rows



In [8]:
# Show the columns of the joined frame; the trailing `admin` column is the
# one contributed by admin_df in the join.
sample_admins.printSchema()

root
 |-- article_id: integer (nullable = true)
 |-- rev_id: integer (nullable = true)
 |-- article_title: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- username: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- category: string (nullable = true)
 |-- minor: boolean (nullable = true)
 |-- textdata: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- quarter: integer (nullable = true)
 |-- admin: string (nullable = true)



In [9]:
# Number of distinct user ids among the admin revisions of the window.
(
    sample_admins
    .select("user_id")
    .dropDuplicates()
    .count()
)

546

In [12]:
# Distinct admin user ids whose username contains the substring "bot".
(
    sample_admins
    .filter("username like '%bot%'")
    .select("user_id")
    .dropDuplicates()
    .count()
)

0

In [10]:
from pyspark.sql import functions as F

# Per-admin activity in the window: distinct articles touched and the
# integer-cast sum of log(textdata), listed with the widest-ranging users first.
per_admin = sample_admins.groupBy("user_id").agg(
    F.expr("count(distinct article_id) as n_articles"),
    F.expr("cast(sum(log(textdata)) as int)"),
)
per_admin.sort(F.col("n_articles").desc()).show()

+-------+----------+-------------------------------+
|user_id|n_articles|CAST(sum(log(textdata)) AS INT)|
+-------+----------+-------------------------------+
| 509520|      9897|                          63662|
| 379407|      5168|                          35748|
| 296765|      4783|                          46719|
|1089346|      4205|                          21210|
| 429249|      3406|                          33826|
| 659090|      3161|                          21070|
| 319061|      2816|                          26194|
| 271376|      2773|                          14598|
|  97078|      2693|                          20160|
|   7402|      2589|                          31334|
| 921400|      2469|                          23864|
| 657950|      2415|                          40352|
| 889851|      2352|                          37896|
|  59986|      2267|                          18601|
|  44020|      2263|                          24403|
| 266416|      2122|                          

In [41]:
def stats(df):
    """Print per-user activity summary statistics for a set of revisions.

    Rows are first collapsed to one record per ``user_id`` holding the
    number of distinct articles and the sum of ``log(textdata)``. The mean
    and population standard deviation of both per-user measures, together
    with the number of users, are then displayed via ``show()``.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Frame exposing ``user_id``, ``article_id`` and ``textdata`` columns.

    Returns
    -------
    None
        The one-row summary is printed, not returned.
    """
    per_user = df.groupBy("user_id").agg(
        F.expr("count(distinct article_id) as n_articles"),
        # NOTE(review): log() of a non-positive textdata is presumably NULL in
        # Spark SQL and would then be skipped by sum() -- confirm.
        F.expr("sum(log(textdata)) as sumlog_textdata"),
    )
    summary = per_user.selectExpr(
        "avg(n_articles) as avg_n_articles",
        "avg(sumlog_textdata) as avg_sumlog_textdata",
        "stddev_pop(n_articles) as std_n_articles",
        # Previously aliased `sumlog_textdata`, which mislabelled the standard
        # deviation as the quantity itself in the printed header.
        "stddev_pop(sumlog_textdata) as std_sumlog_textdata",
        "count(*) as n_samples",
    )
    summary.show()

# Same summary for three cohorts, in order: everyone in the window, the
# listed admins, and users whose name contains the substring "bot".
for cohort in (sample, sample_admins, sample.where("username like '%bot%'")):
    stats(cohort)

+-----------------+-------------------+------------------+------------------+---------+
|   avg_n_articles|avg_sumlog_textdata|    std_n_articles|   sumlog_textdata|n_samples|
+-----------------+-------------------+------------------+------------------+---------+
|4.140579194386872|  51.25286066062268|147.13882687455467|1114.4734118786153|  1820874|
+-----------------+-------------------+------------------+------------------+---------+

+-----------------+-------------------+-----------------+-----------------+---------+
|   avg_n_articles|avg_sumlog_textdata|   std_n_articles|  sumlog_textdata|n_samples|
+-----------------+-------------------+-----------------+-----------------+---------+
|367.3369963369963|    4175.1472832393|737.1397698399015|6777.760328942174|      546|
+-----------------+-------------------+-----------------+-----------------+---------+

+-----------------+-------------------+-----------------+-----------------+---------+
|   avg_n_articles|avg_sumlog_textdata|   