In [18]:
import os, time, datetime, csv
import pprint
from collections import defaultdict, Counter
import pandas as pd
import matplotlib.pyplot as plt   # Matplotlib for plotting
from dateutil import parser

# Language codes of the Wikipedia editions in scope for this study.
languages = ["fr","es","ru","pt","pl","nl","fa","he","ar","sv","cs","hu","fi","ro","et","tr"]

# Study cutoff: midnight on 1 November 2017, as a naive datetime (no tzinfo).
# This is the single source of truth for the cutoff.
cutoff_date_dt = datetime.datetime(2017, 11, 1, 0, 0, 0)
# The same cutoff as a 14-digit YYYYMMDDHHMMSS integer (20171101000000), the
# form the editors' `user_registration` values are compared against. Derived
# from cutoff_date_dt so the two representations cannot drift apart.
cutoff_date_wiki = int(cutoff_date_dt.strftime("%Y%m%d%H%M%S"))

# Create Per-Language Dataset of Users Who Hadn't Received Thanks or Love by Nov 2017
The purpose of this script is to create a per-language dataset of users who hadn't received thanks or love by Nov 2017. We do this by taking a dataset of all historical thanks and love per language, looking at all thanks/love that had been received by the end of October 2017, and then creating the subset of user ids that made at least one revision in November 2017 and weren't in that dataset, per language.

This project relies on the following PAWS scripts:
* [Querying Unique Editors for a Given Month in Wikipedia](https://paws.wmflabs.org/paws/user/Rubberpaw/notebooks/Accessing%20A%20Month%20of%20Unique%20Editors%20from%20a%20Wikipedia.ipynb#)
* Max's code...

### Load Per-Language Dataset of Editors

In [95]:
## SET UP THE DICT OF EDITORS
# nov_editors[language][rev_user] -> the CSV row (a dict of strings) for that
# editor, with an extra boolean 'newcomer' key added below.
nov_editors = {}
for lang in languages:
    nov_editors[lang]  = {}

rows_processed = 0
rows_discarded = 0

# newline="" is the csv module's documented way to open files for csv.reader /
# csv.DictReader, so that newlines inside quoted fields are handled correctly.
with open("data/unique_editors_by_language_nov-2017.csv", newline="") as f:
    for row in csv.DictReader(f):
        # The 'language' column carries a "wiki_p" marker (presumably a database
        # name such as "eswiki_p" -- TODO confirm); strip it to get the bare code.
        language = row['language'].replace("wiki_p", "")
        row['newcomer'] = False
        if(language in languages):
            # The registration value is cleaned of a b'...' wrapper before being
            # read as an integer timestamp. Blank, malformed, or absent (None)
            # values are deliberately tolerated: such editors simply stay
            # flagged as non-newcomers. Only those two failure modes are
            # caught; anything else (e.g. a missing column, Ctrl-C) propagates.
            try:
                registered = int(row['user_registration'].replace("b'","").replace("'",""))
            except (ValueError, AttributeError):
                registered = None
            # A newcomer is an account registered strictly after the cutoff.
            if registered is not None and registered > cutoff_date_wiki:
                row['newcomer'] = True
            nov_editors[language][row['rev_user']] = row
            rows_processed += 1
        else:
            rows_discarded += 1
print("Processed {0} rows, discarded {1} rows".format(rows_processed, rows_discarded))

Processed 81343 rows, discarded 0 rows


### Load Per-Language Dataset of Thanks and Love

In [97]:
# For each language, the sets of sender / receiver ids (strings, as read from
# the CSV) that appear in at least one thanks or love event dated strictly
# before the cutoff.
thanks_dataset_sent = {}
thanks_dataset_received = {}
love_dataset_sent = {}
love_dataset_received = {}
for lang in languages:
    thanks_dataset_sent[lang] = set()
    thanks_dataset_received[lang] = set()
    love_dataset_sent[lang] = set()
    love_dataset_received[lang] = set()

rows_processed = 0
rows_discarded = 0
# In-scope languages that actually occur in the gratitude file. Later cells
# iterate over this set and index the per-language dicts with it, so it must
# never contain a language outside `languages`.
languages_processed = set()

# newline="" is the csv module's documented way to open files for csv.DictReader.
with open("data/gratitude_20180629.csv", newline="") as f:
    for row in csv.DictReader(f):
        language = row['lang']
        if(language in languages):
            # Record the language only once it is known to be in scope;
            # recording it before this check would let an unexpected language
            # through and break the per-language lookups downstream.
            languages_processed.add(language)
            # Events at or after the cutoff are skipped and are counted as
            # neither processed nor discarded.
            # NOTE(review): assumes timestamps parse to naive datetimes like
            # cutoff_date_dt; a timezone-aware value would fail to compare.
            if(parser.parse(row['timestamp'])>= cutoff_date_dt):
                continue
            if(row['thanklove']=='thank'):
                thanks_dataset_sent[language].add(row['sender_id'])
                thanks_dataset_received[language].add(row['receiver_id'])
            else:
                # Every value other than 'thank' is treated as love.
                love_dataset_sent[language].add(row['sender_id'])
                love_dataset_received[language].add(row['receiver_id'])
            rows_processed += 1
        else:
            rows_discarded += 1
print("Processed {0} rows, discarded {1} rows.".format(rows_processed, rows_discarded))
print("Languages processed: {0}".format(languages_processed))

Processed 557258 rows, discarded 0 rows.
Languages processed: {'es', 'ar', 'pt', 'tr', 'hu', 'sv', 'fa', 'he'}


### Create a list of November accounts eligible for receiving thanks or love

In [98]:
# eligible_accounts[lang][bucket] -> list of November editor rows.
# A "<kind>_not_<role>" bucket holds editors whose id is absent from the
# matching pre-cutoff sender/receiver set; its "_newcomers" twin holds the
# subset of those rows flagged row['newcomer'].
eligible_accounts = {}
editors_processed = 0

# Bucket names in the order their keys are inserted into each per-language dict.
bucket_order = ('love_not_sent', 'love_not_received',
                'thanks_not_sent', 'thanks_not_received')

for lang in list(languages_processed):
    buckets = {}
    for bucket in bucket_order:
        buckets[bucket] = []
        buckets[bucket + '_newcomers'] = []
    eligible_accounts[lang] = buckets

    # Ids already seen in each role before the cutoff, keyed by bucket name.
    seen_before_cutoff = {
        'thanks_not_sent': thanks_dataset_sent[lang],
        'thanks_not_received': thanks_dataset_received[lang],
        'love_not_sent': love_dataset_sent[lang],
        'love_not_received': love_dataset_received[lang],
    }

    for user_id, editor in nov_editors[lang].items():
        for bucket, seen_ids in seen_before_cutoff.items():
            if user_id in seen_ids:
                continue
            buckets[bucket].append(editor)
            if editor['newcomer']:
                buckets[bucket + '_newcomers'].append(editor)
        editors_processed += 1

print("{0} editors processed".format(editors_processed))

39404 editors processed


### Create a single dataframe with the sample for all available languages

In [99]:
# One summary record per processed language: the language code, the size of
# every eligibility bucket, and the total number of November editors.
all_language_thanks_love = []
for lang in list(languages_processed):
    row = dict(lang=lang)
    # Replace each bucket's list of editor rows with its length.
    row.update((bucket, len(accounts))
               for bucket, accounts in eligible_accounts[lang].items())
    row['total_editors'] = len(nov_editors[lang])
    all_language_thanks_love.append(row)

In [100]:
# Persist the per-language counts: one CSV row per language, written with
# pandas' default options (so the DataFrame index is included as a column).
eligible_counts_df = pd.DataFrame(all_language_thanks_love)
eligible_counts_df.to_csv("data/gratitude_study_eligible_account_counts-Nov-2017.csv")

In [101]:
# Disabled debugging aid: tallies the newcomer flags of the editors for
# whichever language `lang` was last bound to.
#Counter([x['newcomer'] for x in nov_editors[lang].values()])
# Show the per-language summary records as this cell's output.
all_language_thanks_love

[{'lang': 'es',
  'love_not_received': 16563,
  'love_not_received_newcomers': 7253,
  'love_not_sent': 16751,
  'love_not_sent_newcomers': 7253,
  'thanks_not_received': 14346,
  'thanks_not_received_newcomers': 7253,
  'thanks_not_sent': 14953,
  'thanks_not_sent_newcomers': 7253,
  'total_editors': 16946},
 {'lang': 'ar',
  'love_not_received': 3861,
  'love_not_received_newcomers': 2241,
  'love_not_sent': 3946,
  'love_not_sent_newcomers': 2241,
  'thanks_not_received': 3533,
  'thanks_not_received_newcomers': 2241,
  'thanks_not_sent': 3694,
  'thanks_not_sent_newcomers': 2241,
  'total_editors': 4138},
 {'lang': 'pt',
  'love_not_received': 5511,
  'love_not_received_newcomers': 2137,
  'love_not_sent': 5636,
  'love_not_sent_newcomers': 2137,
  'thanks_not_received': 4576,
  'thanks_not_received_newcomers': 2137,
  'thanks_not_sent': 5033,
  'thanks_not_sent_newcomers': 2137,
  'total_editors': 5893},
 {'lang': 'tr',
  'love_not_received': 709,
  'love_not_received_newcomers': 