# Newcomer Data Gathering

Aaron has a labeled dataset of newcomers to English Wikipedia on datasets.wikimedia.org. These newcomers are labeled as good-faith or bad-faith based on Wikipedians' judgements of the newcomers' edits in their first edit session. We will get ORES damaging, goodfaith, and reverted scores for the first n=50 revisions made by each newcomer in the dataset. We will use these labeled revisions to build a model for predicting whether a newcomer is acting in good faith in the "Good Faith Newcomer Prediction" notebook.



In [1]:
% matplotlib inline
import pandas as pd
import urllib
import requests
import os
import concurrent.futures
import time

### Download newcomer quality data

In [2]:
# `import urllib` on its own does not load the `urllib.request` submodule;
# import it explicitly so the download does not rely on another library
# having imported it as a side effect.
import urllib.request

# Location of the labeled newcomer sample (tab-separated).
DATA_URL = "https://datasets.wikimedia.org/public-datasets/enwiki/rise-and-decline/newbie_quality_sample.tsv"
# Save a local copy in the working directory; the return value is not needed.
_ = urllib.request.urlretrieve(DATA_URL, "newbie_quality_sample.tsv")

In [3]:
# Load the downloaded sample (tab-separated).
labels_df = pd.read_csv("newbie_quality_sample.tsv", sep = '\t')
# Index the rows by user id; the user_id column itself is kept.
labels_df.index = labels_df["user_id"]
# Derive a boolean label: True where the category value is greater than 2.
labels_df["goodfaith_label"] = labels_df["category"].gt(2)

In [4]:
# Show how many newcomers fall on each side of the derived label.
labels_df.goodfaith_label.value_counts()

True     875
False    188
Name: goodfaith_label, dtype: int64

In [5]:
# The index was assigned from the user_id column and therefore carries the
# same name. Recent pandas versions refuse to sort by a key that is both an
# index level and a column label, so clear the index name first; the sort
# key then refers unambiguously to the column.
labels_df = labels_df.rename_axis(None).sort_values("user_id")

In [6]:
# Make sure the output directory exists so the write does not fail
# when the notebook is run from a fresh checkout.
os.makedirs("../data", exist_ok = True)
labels_df.to_csv("../data/newcomer_labels.csv", index = False)

### Score early newcomer contribution history

In [7]:
def get_user_name(userid):
    """
    Query the mw api for the username belonging to userid.
    We need this because the contribs query below is made
    by username and Aaron's data only has the userid.

    Returns the username, or None (after printing a message)
    if the request fails, times out, or the response does
    not contain a name for this userid.
    """
    url = 'https://en.wikipedia.org/w/api.php?action=query&format=json&list=users&usprop=&ususerids=%d'
    try:
        # timeout: without one, a stalled connection blocks the calling
        # worker thread indefinitely
        r = requests.get(url % userid, timeout = 30).json()
        return r["query"]['users'][0]['name']
    except (requests.RequestException, ValueError, LookupError, TypeError):
        # RequestException: network/HTTP failure or timeout
        # ValueError: body is not valid JSON
        # LookupError/TypeError: JSON does not have the expected shape,
        # or userid could not be formatted with %d
        print("Error getting username: ", userid)
        return None

def get_user_contribs(username, uclimit = 50):
    """
    Get meta data (ids, timestamp, title) for the first
    uclimit revisions from user username, oldest first.

    Returns the list of contribution dicts from the mw api,
    or an empty list (after printing a message) if the
    request fails, times out, or the response does not have
    the expected shape.
    """
    params = {'action': 'query',
           'format': 'json',
           'list' : 'usercontribs',
           'uclimit': uclimit,
           'ucdir': 'newer',
           'ucprop': 'ids|timestamp|title',
           'ucuser': username
          }
    try:
        # timeout: without one, a stalled connection blocks the calling
        # worker thread indefinitely
        r = requests.get('https://en.wikipedia.org/w/api.php', params=params, timeout = 30).json()
        return r["query"]['usercontribs']
    except (requests.RequestException, ValueError, LookupError, TypeError):
        # RequestException: network/HTTP failure or timeout
        # ValueError: body is not valid JSON
        # LookupError/TypeError: JSON does not have the expected shape
        print("Error getting contribs: ", username)
        return []
    
def get_ores_score(model, revid):
    """
    Get the ORES edit quality score from
    model for revid.

    Returns the probability ORES assigns to the 'true'
    class, or None (after printing a message) if the request
    fails, times out, or the response does not contain a
    score for this revision.
    """
    url = 'https://ores.wikimedia.org/v2/scores/enwiki/%s/%d'
    try:
        # timeout: without one, a stalled connection blocks the calling
        # worker thread indefinitely
        r = requests.get(url % (model, revid), timeout = 30).json()
        return r["scores"]['enwiki'][model]['scores'][str(revid)]['probability']['true']
    except (requests.RequestException, ValueError, LookupError, TypeError):
        # RequestException: network/HTTP failure or timeout
        # ValueError: body is not valid JSON
        # LookupError/TypeError: no score at the expected path (e.g. an
        # error entry instead of a probability), or revid is not %d-formattable
        print("Error getting scores: ", revid)
        return None
    
def get_ores_labeled_contribs(userid, uclimit = 50):
    """
    Fetch the first uclimit edits made by the user with
    userid and attach an ORES score from each of the three
    edit quality models to every edit.

    Returns the list of contribution dicts, each extended
    with 'reverted', 'goodfaith' and 'damaging' entries, or
    None when the username lookup returned None.
    """
    t0 = time.time()

    # the contribs query is made by username, so resolve the userid first
    username = get_user_name(userid)
    contribs = None if username is None else get_user_contribs(username, uclimit = uclimit)

    # one ORES request per (model, revision); scores are stored on the
    # contribution dict under the model's name
    if contribs is not None:
        for model_name in ('reverted', 'goodfaith', 'damaging'):
            for rev in contribs:
                rev[model_name] = get_ores_score(model_name, rev['revid'])

    elapsed = int(time.time() - t0)
    print("Finished User: ", userid, " in ", elapsed, "seconds")

    return contribs

def save_ores_labeled_contribs(userid):
    """
    Fetch the ORES-scored contributions for userid and write
    them to ../data/<userid>.csv.

    Nothing is fetched if that file already exists, so an
    interrupted run can be resumed without repeating finished
    users. Nothing is written if the contributions come back
    as None.
    """
    user_file = "../data/%d.csv" % userid
    if os.path.isfile(user_file):
        return

    contribs = get_ores_labeled_contribs(userid)
    if contribs is not None:
        # user_file is already fully formatted; applying "% userid" to it
        # again raises TypeError (not all arguments converted)
        pd.DataFrame(contribs).to_csv(user_file, index = False)


def thread_function(helper_func, args_list, n_threads = 10):
    """
    Apply helper_func to every element of args_list using a
    pool of n_threads worker threads.

    Returns the results as a list, in the same order as
    args_list.
    """
    pool = concurrent.futures.ThreadPoolExecutor(n_threads)
    with pool:
        # consume the result iterator while the pool is still open
        results = [result for result in pool.map(helper_func, args_list)]
        return results

In [8]:
# Fetch and save scored contributions for every labeled newcomer,
# using 6 worker threads; the per-user return values are not needed.
_ = thread_function(
    save_ores_labeled_contribs,
    list(labels_df["user_id"]),
    n_threads = 6,
)

In [9]:
# Collect the per-user revision files into one table.
dfs = []

for uid in labels_df.user_id:
    user_file = "../data/%d.csv" % uid
    try:
        df = pd.read_csv(user_file)
    except (OSError, ValueError):
        # OSError: the file is missing or unreadable.
        # ValueError: the file cannot be parsed (pandas' empty-data and
        # parser errors are ValueError subclasses).
        print("Error reading: ", user_file)
    else:
        dfs.append(df)

# Drop revisions with any missing value (e.g. a score that could not be fetched).
revisions_df = pd.concat(dfs).dropna()
revisions_df.to_csv("../data/newcomer_revisions.csv", index = False)

Error reading:  ../data/36282.csv
Error reading:  ../data/54943.csv
Error reading:  ../data/70358.csv
Error reading:  ../data/141314.csv
Error reading:  ../data/219629.csv
Error reading:  ../data/285599.csv
Error reading:  ../data/531275.csv
Error reading:  ../data/559047.csv
Error reading:  ../data/593695.csv
Error reading:  ../data/667147.csv
Error reading:  ../data/679526.csv
Error reading:  ../data/684238.csv
Error reading:  ../data/691240.csv
Error reading:  ../data/717021.csv
Error reading:  ../data/775985.csv
Error reading:  ../data/881279.csv
Error reading:  ../data/1005136.csv
Error reading:  ../data/1153106.csv
Error reading:  ../data/1201426.csv
Error reading:  ../data/1218423.csv
Error reading:  ../data/1223674.csv
Error reading:  ../data/1261094.csv
Error reading:  ../data/1389088.csv
Error reading:  ../data/1591094.csv
Error reading:  ../data/1679226.csv
Error reading:  ../data/1768117.csv
Error reading:  ../data/1956584.csv
Error reading:  ../data/1996887.csv
Error readi