In [20]:
import os

# Later cells use paths relative to the project root ("data/raw/...",
# "data/processed/..."). Step up one directory only when "data" is not already
# visible, so re-running this cell does not keep climbing the directory tree.
# Plain os.chdir is used instead of the bare `cd` automagic, which IPython only
# honours in single-line cells and which can be switched off.
if not os.path.isdir("data"):
    os.chdir(os.pardir)
os.getcwd()

/Users/amiyaguchi/wikipedia-retention


In [21]:
import pandas as pd

# Read the raw admin listing. The encoding is pinned so that non-ASCII
# usernames decode the same way on every platform instead of depending on the
# locale default (assumes the file is UTF-8 -- TODO confirm against its source).
with open("data/raw/admins.txt", "r", encoding="utf-8") as f:
    data = f.readlines()

# For every non-blank line keep the text after the last "User:" marker,
# trimmed of surrounding whitespace; lines without the marker are kept whole.
admins = [x.split("User:")[-1].strip() for x in data if x.strip()]
len(admins)

997

In [22]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Load the processed parquet dataset that the rest of the notebook queries.
df = spark.read.parquet("data/processed/enwiki-meta-compact")

# Number of distinct usernames in the full dataset.
(
    df
    .select("username")
    .distinct()
    .count()
)

11058849

In [44]:
# One single-column ("admin") row per parsed name, so the Python list can be
# joined against the Spark dataset.
admin_df = spark.createDataFrame([dict(admin=name) for name in admins])

In [24]:
# How many names from the admin list match a distinct username in the dataset.
(
    df
    .select("username")
    .distinct()
    .join(admin_df, on=admin_df.admin == df.username, how="inner")
    .count()
)

716

In [25]:
# Restrict the dataset to the 2007 Q1 partition values.
sample = df.filter("year=2007 and quarter=1")

# Keep only the sample rows whose username appears in the admin list.
sample_admins = sample.join(
    admin_df,
    on=admin_df.admin == df.username,
    how="inner",
)

# Row counts: whole sample vs. rows attributed to listed admins.
print(sample.count(), sample_admins.count())

13740491 336535


In [26]:
# Peek at usernames containing the substring "bot". This is a plain substring
# match, so it also picks up ordinary names such as "Turbothy" (see output).
(
    df
    .filter("username like '%bot%'")
    .select("username")
    .distinct()
    .show(5)
)

+-----------+
|   username|
+-----------+
|   Turbothy|
|   Mummybot|
|  Reaverbot|
|  Gilbotron|
|Philbot5000|
+-----------+
only showing top 5 rows



In [27]:
# Show the columns of the joined sample; the trailing `admin` column is the one
# contributed by the join with admin_df.
sample_admins.printSchema()

root
 |-- article_id: integer (nullable = true)
 |-- rev_id: integer (nullable = true)
 |-- article_title: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- username: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- category: string (nullable = true)
 |-- minor: boolean (nullable = true)
 |-- textdata: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- quarter: integer (nullable = true)
 |-- admin: string (nullable = true)



In [28]:
# Number of distinct user_id values among the admin rows in the sample.
(
    sample_admins
    .select("user_id")
    .distinct()
    .count()
)

548

In [29]:
# Number of distinct admin user_ids whose username contains the substring "bot".
(
    sample_admins
    .filter("username like '%bot%'")
    .select("user_id")
    .distinct()
    .count()
)

0

In [30]:
from pyspark.sql import functions as F

# Per admin user_id: number of distinct articles touched and the integer-cast
# sum of log(textdata); display the five users with the most articles.
per_admin = sample_admins.groupBy("user_id").agg(
    F.expr("count(distinct article_id)").alias("n_articles"),
    F.expr("cast(sum(log(textdata)) as int)"),
)
per_admin.orderBy("n_articles", ascending=False).show(5)

+-------+----------+-------------------------------+
|user_id|n_articles|CAST(sum(log(textdata)) AS INT)|
+-------+----------+-------------------------------+
| 509520|      9897|                          63662|
| 379407|      5168|                          35748|
| 296765|      4783|                          46719|
|1089346|      4205|                          21210|
| 429249|      3406|                          33826|
+-------+----------+-------------------------------+
only showing top 5 rows



In [31]:
def stats(df):
    """Print summary statistics of per-user activity for a Spark DataFrame.

    Rows are first grouped by ``user_id`` to compute, for each user:

    * ``n_articles``      -- count of distinct ``article_id`` values
    * ``sumlog_textdata`` -- sum of ``log(textdata)`` over the user's rows

    Those per-user values are then reduced to a single record holding the mean
    and population standard deviation of each metric, plus ``n_samples`` (the
    number of user groups). The record is shown vertically and untruncated.

    Args:
        df: DataFrame with ``user_id``, ``article_id`` and ``textdata`` columns.

    Returns:
        None. The summary is printed via ``DataFrame.show``.
    """
    per_user = df.groupBy("user_id").agg(
        F.expr("count(distinct article_id) as n_articles"),
        F.expr("sum(log(textdata)) as sumlog_textdata"),
    )
    summary = per_user.selectExpr(
        "avg(n_articles) as avg_n_articles",
        "avg(sumlog_textdata) as avg_sumlog_textdata",
        "stddev_pop(n_articles) as std_n_articles",
        # Labelled std_* like its n_articles counterpart; it was previously
        # aliased "sumlog_textdata", which made the std look like the raw metric.
        "stddev_pop(sumlog_textdata) as std_sumlog_textdata",
        "count(*) as n_samples",
    )
    summary.show(truncate=False, vertical=True)

# Print the same summary for three cohorts, in order: the whole sample, the
# listed admins, and accounts whose username contains the substring "bot".
for _cohort in (
    sample,
    sample_admins,
    sample.where("username like '%bot%'"),
):
    stats(_cohort)

-RECORD 0--------------------------------
 avg_n_articles      | 4.140579194386872 
 avg_sumlog_textdata | 51.25286066062268 
 std_n_articles      | 147.1388268745544 
 sumlog_textdata     | 1114.473411878615 
 n_samples           | 1820874           

-RECORD 0---------------------------------
 avg_n_articles      | 366.74817518248176 
 avg_sumlog_textdata | 4173.4952657453705 
 std_n_articles      | 735.9015763095758  
 sumlog_textdata     | 6766.63074421905   
 n_samples           | 548                

-RECORD 0---------------------------------
 avg_n_articles      | 818.3851508120649  
 avg_sumlog_textdata | 5673.566176088171  
 std_n_articles      | 5732.564404330988  
 sumlog_textdata     | 40603.633061714536 
 n_samples           | 431                



In [47]:
import shutil
import glob

# Spark writes a directory of part files rather than a single file. Write one
# partition into a scratch directory, copy the lone CSV part out under a stable
# name, then remove the scratch directory.
name = "admins"
interim_path = "data/interim/" + name
processed_file = "data/processed/" + name + ".csv"

admin_df.repartition(1).write.csv(interim_path, mode="overwrite")

# The first *.csv match is taken as the part file written above.
interim_file = glob.glob(interim_path + "/*.csv")[0]
shutil.copy(interim_file, processed_file)
shutil.rmtree(interim_path)