In [1]:
cd ..

/Users/amiyaguchi/wikipedia-retention


In [2]:
from pyspark.sql import SparkSession, functions as F

# Get the active SparkSession, creating one if none exists yet.
spark = SparkSession.builder.getOrCreate()

In [3]:
# Tab-separated user-network edge list, read with an explicit (src, dst, weight) schema.
snapshot = (
    spark.read
    .schema("src INT, dst INT, weight INT")
    .option("sep", "\t")
    .csv("data/processed/2007-1-user-network-v3.csv")
)

In [4]:
import pandas as pd

# Load the role matrix into pandas and preview its first rows.
role_matrix_path = "data/processed/roles/2007-1-nmf-G.csv"
roleG = pd.read_csv(role_matrix_path)
roleG.head()

Unnamed: 0,1,0,1.1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,539223,484.283468,1667.401144,0.0,2461.202676,0.0,0.0009818808,2501.258566,0.0,0.0,0.0,221.465514,722.726328,0.0,5.462993e-12,0.0,0.0
1,1763181,787.890693,1637.814231,0.0,836.992552,0.0,2.166074e-08,1270.925027,0.0,0.0,429.834227,439.724518,2.459119,0.0,5.963829e-08,0.0,0.0
2,801279,2949.869317,1308.1853,0.0,0.0,0.0,97.25508,81.469747,362.0337,293.685155,384.428207,260.73344,0.0,0.0,4.299851e-05,0.0,0.0
3,1538132,833.037423,1581.649669,0.0,602.264808,0.0,0.0,1109.625973,4.779822e-17,243.296308,182.271166,423.575266,1.439515,232.76051,0.0,646.237003,0.0
4,16168,548.378772,1582.472685,0.0,2242.648154,0.0,0.217531,2858.066792,6.902702e-35,344.935846,353.149723,58.826487,0.038381,0.0,0.0001733111,560.72011,23.23672


In [97]:
# Per-column totals, skipping the first column.
roleG.sum().iloc[1:].tolist()

[656893420.8306816,
 443078715.6885329,
 181042555.45666343,
 15678726.686227245,
 57583900.54039875,
 76453937.72426715,
 21178923.422441352,
 44566394.67794203,
 52426182.64425079,
 76548019.66653797,
 22541214.434084028,
 37060870.040500335,
 50220983.4150533,
 22989849.323926896,
 48813985.29541604,
 48328388.038031854]

In [42]:
from pyspark.sql import Row


def _to_role_row(record):
    """Convert one row of ``roleG`` into a Spark ``Row``.

    The first column is used as the user id and every remaining column
    becomes an entry of the ``vec`` list.
    """
    # Use .iloc for positional access: an integer key passed to Series[...]
    # on a label-indexed row only works through a deprecated pandas fallback.
    return Row(
        user_id=int(record.iloc[0]),
        vec=record.iloc[1:].astype(float).tolist(),
    )


# One Row per roleG row, kept under the name `x` as before.
x = roleG.apply(_to_role_row, axis=1).values

rolx_df = spark.createDataFrame(list(x))

In [43]:
# Print the first 20 rows, truncating long cell values.
rolx_df.show(n=20, truncate=True)

+-------+--------------------+
|user_id|                 vec|
+-------+--------------------+
| 539223|[484.283467546698...|
|1763181|[787.890693000982...|
| 801279|[2949.86931714851...|
|1538132|[833.037423332417...|
|  16168|[548.378771838368...|
| 643450|[580.260310467623...|
| 799415|[420.275031112874...|
|1381068|[545.184834329284...|
|1613101|[381.471759631519...|
|3134723|[362.520644362796...|
|  15126|[619.971114535132...|
|  82432|[199.047176867927...|
| 506179|[159.682027369028...|
|2852297|[1280.19528441979...|
| 146986|[229.480719223374...|
| 186131|[1005.31390198585...|
|  97190|[710.492212005573...|
|1224427|[318.161185644503...|
|  84417|[298.464526370839...|
| 115732|[1079.96298984609...|
+-------+--------------------+
only showing top 20 rows



In [90]:
import numpy as np
from pyspark.sql import types as T

# src only needs to be the set of new users
# Append the reverse of every edge so each pair is present in both directions,
# then drop rows that are exact duplicates.
reversed_edges = snapshot.selectExpr("dst as src", "src as dst", "weight")
edgelist = snapshot.union(reversed_edges).distinct()

@F.udf(T.ArrayType(T.FloatType()))
def norm_weight(weight, vec):
    """Rescale ``vec`` so that its entries sum to ``weight``.

    Args:
        weight: scalar the normalised vector is multiplied by.
        vec: sequence of numbers.

    Returns:
        ``vec / sum(vec) * weight`` as a list of floats. When ``vec`` sums to
        zero that ratio is undefined, so a list of zeros of the same length is
        returned instead of NaNs.
    """
    a = np.array(vec)
    total = a.sum()
    if total == 0:
        # A zero total would turn every entry into NaN, and a single NaN
        # vector would then propagate through any later sum or average.
        return [0.0] * len(a)
    return (a / total * weight).astype(float).tolist()

@F.udf(T.ArrayType(T.FloatType()))
def average_vec(vecs):
    """Return the element-wise mean of the vectors in ``vecs`` as a list of floats.

    Assumes every vector in ``vecs`` has the same length.
    """
    stacked = np.array(vecs)
    column_totals = stacked.sum(axis=0)
    mean_vec = column_totals / len(vecs)
    return mean_vec.astype(float).tolist()

@F.udf()
def to_csv(src, *vecs):
    """Format an id and one or more vectors as a single tab-separated line.

    Args:
        src: identifier written as the first field.
        *vecs: sequences whose elements are appended, in order, after ``src``.

    Returns:
        ``str(src)`` followed by the string form of every element of every
        vector, all joined by tab characters.
    """
    # Flatten in a single pass; sum(vecs, []) re-copies the accumulated list
    # once per vector.
    fields = [str(value) for vec in vecs for value in vec]
    return str(src) + "\t" + "\t".join(fields)

# Attach the destination user's role vector to every edge and rescale it by
# the edge weight.
weighted_edges = (
    edgelist
    .join(rolx_df, on=edgelist.dst == rolx_df.user_id)
    .drop("user_id")
    .withColumn("vec", norm_weight("weight", "vec"))
)

# Average the rescaled neighbour vectors for each source user.
neighborhood_avg = (
    weighted_edges
    .groupby("src")
    .agg(F.collect_list("vec").alias("vec_list"))
    .select("src", average_vec("vec_list").alias("vec"))
)

# Render every row as one tab-separated string column.
averaged = neighborhood_avg.select(to_csv("src", "vec"))

# Collapse to one partition before writing; any existing output at this path
# is overwritten.
averaged_output_dir = "data/processed/roles/2007-1-averaged"
single_partition = averaged.repartition(1)
single_partition.write.csv(averaged_output_dir, header=False, mode="overwrite")

In [121]:
enwiki = spark.read.parquet("data/processed/enwiki-meta-compact")

# Keep the rows for year 2007, quarter 1, then aggregate
# sum(log(textdata+1)) per user under the column name edit_count.
quarter_rows = enwiki.where("year=2007 and quarter=1")
log_text_total = F.expr("sum(log(textdata+1)) as edit_count")
user_text = quarter_rows.groupby("user_id").agg(log_text_total)

In [115]:
@F.udf(T.ArrayType(T.FloatType()))
def average_contribution(total, self, n):
    """Weight the mean of all other users' vectors by this user's own mix.

    Args:
        total: element-wise sum of the vectors of all ``n`` users.
        self: this user's own vector. The name is kept for existing callers;
            it is an ordinary argument, not an instance.
        n: number of users that went into ``total``.

    Returns:
        ``(total - self) / (n - 1) * (self / sum(self))`` as a list of floats.
        A list of zeros is returned when that expression is undefined, i.e.
        when there are no other users (``n <= 1``) or ``self`` sums to zero.
    """
    a = np.array(self)
    own_sum = a.sum()
    if n <= 1 or own_sum == 0:
        # Either divisor would be zero and fill the result with NaN/inf.
        return [0.0] * len(a)
    without_self = np.array(total) - a
    weighted_avg = without_self / (n - 1) * (a / own_sum)
    return weighted_avg.tolist()

# NOTE: this rebinds the name to_csv, which an earlier cell of this notebook
# also defines with the same signature.
@F.udf()
def to_csv(src, *vecs):
    """Format an id and one or more vectors as a single tab-separated line.

    Args:
        src: identifier written as the first field.
        *vecs: sequences whose elements are appended, in order, after ``src``.

    Returns:
        ``str(src)`` followed by the string form of every element of every
        vector, all joined by tab characters.
    """
    # Flatten in a single pass; sum(vecs, []) re-copies the accumulated list
    # once per vector.
    fields = [str(value) for vec in vecs for value in vec]
    return str(src) + "\t" + "\t".join(fields)

# Literal array column holding the per-column totals of roleG (first column skipped).
role_totals = roleG.sum()[1:].astype(float)
total_vec = F.array([F.lit(column_total) for column_total in role_totals])

# Maximum number of edges fed into the pipeline. limit() on an unordered
# DataFrame keeps an arbitrary subset, so this is only suitable for quick
# iteration; set to None to process every edge.
EDGE_LIMIT = 100

edges = edgelist if EDGE_LIMIT is None else edgelist.limit(EDGE_LIMIT)
neighbor_roles = rolx_df.selectExpr("user_id as dst", "vec")
own_roles = rolx_df.selectExpr("user_id as src", "vec as user_vec")

averaged = (
    edges
    .join(neighbor_roles, on="dst")
    # NOTE: neighbour vectors are used as-is here; norm_weight is not applied.
    .groupby("src")
    .agg(F.collect_list("vec").alias("vec_list"))
    .withColumn("neighborhood_avg", average_vec("vec_list"))
    # Bring in each source user's own vector next to the neighbourhood average.
    .join(own_roles, on="src")
    .withColumn("total", total_vec)
    .withColumn("total_avg", average_contribution("total", "user_vec", F.lit(roleG.shape[0])))
    # One tab-separated line per user: id, own vector, neighbourhood average, total average.
    .select(to_csv("src", "user_vec", "neighborhood_avg", "total_avg"))
)

# Write the result from a single partition, replacing whatever is already
# stored at the target path.
averaged_target = "data/processed/roles/2007-1-averaged"
averaged_single = averaged.repartition(1)
averaged_single.write.csv(averaged_target, header=False, mode="overwrite")

In [116]:
# Move the CSV file(s) inside the output directory to one flat .csv path next to it.
! mv data/processed/roles/2007-1-averaged/*.csv data/processed/roles/2007-1-averaged.csv

In [117]:
# Recursively delete the output directory now that its CSV has been moved out.
! rm -r data/processed/roles/2007-1-averaged/

In [118]:
# Show the first two lines of the resulting file as a quick sanity check.
! head -n2 data/processed/roles/2007-1-averaged.csv

1848815	721.1487944577817	1565.1254656825863	0.0	638.7963725258836	0.0	0.012248734005855662	998.1587474975853	2.309059539013848e-17	342.5567122421984	256.9569142020714	217.99676527954952	0.10622208861539033	113.35707452920236	9.083574128738857e-06	604.2349922580771	0.0	391.2308044433594	1548.2999267578125	0.0	4622.04833984375	0.0	4.3153493767168316e-32	4539.908203125	0.0	171.63815307617188	0.0	0.0	0.0	0.0	1.7009691784248663e-23	450.11517333984375	842.1399536132812	253.41065979003906	370.9659729003906	0.0	5.357486724853516	0.0	0.0005009525921195745	11.308073043823242	5.504884391600985e-19	9.606904029846191	10.522002220153809	2.6286256313323975	0.002105891704559326	3.045358419418335	1.1171164970846803e-07	15.77794075012207	0.0
64406	4397.854096949706	999.5221030401756	0.0	0.0	227.19271989601867	0.0	0.0	1035.2808406462607	0.0	233.69645261853745	0.0	462.90096153315585	0.06170855727049063	0.0	0.0	1204.972045502596	3155.646240234375	1356.6268310546875	0.0	0.0	563.840087890625	0.0	0.0	713.23