## Goal 
### Glue together many DataFrames.

### Stats
+ What are summary statistics of each feature?
+ How has thanking changed over time?
+ How is gratitude distributed among users?
+ What do the distributions of each feature look like?
+ For multi-window features how do they look together?

In [1]:
import pyarrow as pa
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import os
# from collections import defaultdict
%pylab inline
import json
from datetime import datetime as dt
nat_filler = dt(1970,1,1)

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Read the language codes from the ORES config. The context manager closes the
# file handle, which a bare open() call leaves to the garbage collector.
with open('configs/ores.json', 'r') as config_file:
    langcodes = json.load(config_file)['langcodes']

In [3]:
# Directory holding the per-wiki CSV exports.
outputs_dir = 'outputs'
# os.listdir returns entries in arbitrary order; sorting makes the load order
# (and so the row order of the combined frame) the same on every run.
file_list = sorted(os.listdir(outputs_dir))

In [4]:
# Read every per-wiki export into `files`, keyed by file name.
# Names are expected to look like 'wiki<thank|love>_<lang>_<date>.csv'.
files = dict()
for f in file_list:
    parts = f.split('_')
    # Skip directory entries that do not follow the naming scheme; without this
    # guard the split('wiki')[1] / parts[1] lookups below raise IndexError.
    if not (f.startswith('wiki') and f.endswith('.csv') and len(parts) >= 2):
        continue
    thanklove = parts[0].split('wiki')[1]
    lang = parts[1]

    tdf = pd.read_csv(os.path.join(outputs_dir, f), parse_dates=['timestamp', 'receiver_first_edit', 'sender_first_edit'])

    if thanklove == 'love':
        # i will fix this upstream so this shouldn't have to be done in the future.
        # Removes the b'...' wrapper (and any other apostrophes) from each value.
        tdf['wll_type'] = tdf['wll_type'].apply(lambda x: x.replace("b'","").replace("'",""))

    #feather requires homogenous datatypes
    tdf['receiver'] = tdf['receiver'].astype(str)
    tdf['sender'] = tdf['sender'].astype(str)

    # Tag each row with the kind of gratitude and the wiki language it came from.
    tdf['thanklove'] = thanklove
    tdf['lang'] = lang
    files[f] = tdf

In [5]:
# Stack the per-file frames into one frame with a fresh 0..n-1 index.
# sort=False keeps the columns in first-seen order and opts in to the behaviour
# the pandas FutureWarning asks about when the frames' columns are not aligned
# (presumably because only some exports carry a wll_type column).
df = pd.concat(list(files.values()), ignore_index=True, sort=False)
# Flag rows where the first-edit timestamp is missing for the sender or the receiver.
df['probably_deleted'] = df['sender_first_edit'].isna() | df['receiver_first_edit'].isna()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [6]:
# Date stamp (YYYYMMDD) used in the export file names.
today = dt.today().strftime("%Y%m%d")
# Make sure the export directory exists; to_feather/to_csv do not create it.
os.makedirs('transfer', exist_ok=True)
df.to_feather(f'transfer/gratitude_{today}.feather')
df.to_csv(f'transfer/gratitude_{today}.csv', index=False)

In [8]:
# Report the on-disk size of every entry in transfer/ in human-readable units.
!du -h transfer/*

121M	transfer/gratitude_20180628.csv
135M	transfer/gratitude_20180629.csv
146M	transfer/gratitude_20180629.feather


In [9]:
def frac_nan(df):
    """Return the fraction of rows whose first-edit timestamps are missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'sender_first_edit' and 'receiver_first_edit'.

    Returns
    -------
    dict
        Keys 'sender_nan_frac', 'receiver_nan_frac' and 'either_nan_frac': the
        share of rows in which the sender's value, the receiver's value, or at
        least one of the two is missing. All three are NaN for an empty frame
        instead of raising ZeroDivisionError.
    """
    sender_missing = df['sender_first_edit'].isna()
    receiver_missing = df['receiver_first_edit'].isna()
    either_missing = sender_missing | receiver_missing
    n_rows = len(df)

    def _share(mask):
        # Count flagged rows directly rather than materialising a filtered copy.
        return int(mask.sum()) / n_rows if n_rows else float('nan')

    return {'sender_nan_frac': _share(sender_missing),
            'receiver_nan_frac': _share(receiver_missing),
            'either_nan_frac': _share(either_missing)}

In [38]:
# Missing-first-edit fractions for the frame currently bound to `df`.
frac_nan(df)

{'either_nan_frac': 0.017737483418911914,
 'receiver_nan_frac': 0.0038653056285516633,
 'sender_nan_frac': 0.014721494797933329}

In [10]:
# Missing-first-edit fractions per input file, one row per file name.
nan_fracs = {}
# The loop variable is deliberately not named `df`: reusing that name rebinds
# the combined frame to the last per-file frame for every cell that runs later.
for fname, file_df in files.items():
    nan_fracs[fname] = frac_nan(file_df)
nan_frac_df = pd.DataFrame.from_dict(nan_fracs, orient='index')

In [41]:
# Rank the input files by the share of rows with either first-edit value missing.
nan_frac_df.sort_values('either_nan_frac')

Unnamed: 0,sender_nan_frac,receiver_nan_frac,either_nan_frac
wikithank_he_20180627.csv,0.001004,0.000854,0.001858
wikilove_he_20180626.csv,0.005771,0.003463,0.00831
wikilove_tr_20180622.csv,0.0,0.01,0.01
wikithank_es_20180626.csv,0.013939,0.003094,0.016594
wikithank_pt_20180627.csv,0.017571,0.002614,0.019997
wikilove_ar_20180623.csv,0.008521,0.018583,0.024101
wikilove_es_20180625.csv,0.01837,0.008183,0.025718
wikithank_ar_20180623.csv,0.023409,0.004375,0.027191
wikithank_tr_20180622.csv,0.04,0.0,0.04
wikilove_pt_20180627.csv,0.059546,0.037672,0.070213


In [12]:
# Unweighted mean of each fraction across files (every file counts equally, regardless of row count).
nan_frac_df.mean()

sender_nan_frac      0.018813
receiver_nan_frac    0.008884
either_nan_frac      0.024398
dtype: float64

In [13]:
# Replace missing first-edit timestamps with the epoch placeholder. Only the two
# datetime columns are filled: a blanket df.fillna(nat_filler) would also write
# the datetime into every other column that has gaps.
df_filled = df.fillna({'sender_first_edit': nat_filler,
                       'receiver_first_edit': nat_filler})

In [126]:
# Sanity check: after filling, no first-edit value should be missing.
frac_nan(df_filled)

{'either_nan_frac': 0.0, 'receiver_nan_frac': 0.0, 'sender_nan_frac': 0.0}

In [122]:
# NOTE(review): `df1` is not assigned in any visible cell, so this depends on
# leftover kernel state — TODO define it or remove the cell.
len(df1)

2306

In [123]:
# Preview the first rows of `df1`.
# NOTE(review): `df1` is not assigned in any visible cell — TODO confirm its origin.
df1.head()

Unnamed: 0,timestamp,receiver,receiver_id,sender,sender_id,receiver_prev_received,receiver_prev_sent,sender_prev_received,sender_prev_sent,receiver_prev_received_indicator,sender_prev_received_indicator,sender_prev_sent_indicator,receiver_prev_sent_indicator,receiver_prev_edits,sender_prev_edits,sender_first_edit,receiver_first_edit,sender_edits_1d_after,sender_edits_30d_after,sender_edits_90d_after,sender_edits_180d_after,receiver_edits_1d_after,receiver_edits_30d_after,receiver_edits_90d_after,receiver_edits_180d_after,receiver_thank_another_1d_after,receiver_thank_another_30d_after,receiver_thank_another_90d_after,receiver_thank_another_180d_after,thanklove,lang
2,2013-09-17 19:11:18,Fabrice Florin,2234967,Fabrice Florin (WMF),2756867,0,0,0,0,False,False,False,False,1.0,0.0,NaT,2013-09-17 19:10:11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,thank,es
5,2013-09-17 20:13:36,JEissfeldt (WMF),2617236,Fabrice Florin (WMF),2756867,0,0,0,1,False,False,True,False,102.0,0.0,NaT,2013-04-27 22:59:33,0.0,0.0,0.0,0.0,3.0,12.0,18.0,21.0,0,0,0,0,thank,es
11,2013-09-18 02:25:27,La Avatar Korra,3042121,Nickelodeon-fan,2129864,0,0,0,0,False,False,False,False,0.0,7656.0,2012-02-21 23:25:03,NaT,11.0,288.0,611.0,804.0,0.0,0.0,0.0,0.0,0,0,0,0,thank,es
25,2013-09-18 18:26:33,Moraleh~eswiki,3083522,Gauri,1371126,0,0,1,1,False,True,True,False,0.0,30870.0,2010-02-10 21:15:03,NaT,64.0,924.0,1951.0,1981.0,0.0,0.0,0.0,0.0,0,0,0,0,thank,es
26,2013-09-18 18:26:50,Moraleh~eswiki,3083522,Gauri,1371126,1,0,1,2,True,True,True,False,0.0,30870.0,2010-02-10 21:15:03,NaT,64.0,924.0,1951.0,1981.0,0.0,0.0,0.0,0.0,0,0,0,0,thank,es


In [31]:
# Count rows per gratitude type. The bare column lookup would print the whole
# column; the output recorded for this cell is a per-value count.
df1['thanklove'].value_counts()

thank    72550
love       298
Name: thanklove, dtype: int64

In [33]:
# Pull one loaded frame out of `files` by its file name for closer inspection.
ardf = files['wikithank_ar_20180623.csv']

In [37]:
# (rows, columns) of the subset with no receiver first-edit timestamp. Reporting
# the shape matches the recorded output and avoids dumping the filtered frame.
ardf[pd.isna(ardf['receiver_first_edit'])].shape

(317, 31)

In [23]:
# Keep the per-column dtypes of `df` for inspection.
ddtypes = df.dtypes

In [20]:
# Write every column except wll_type to a feather file.
# NOTE(review): the recorded output for this cell is an ArrowInvalid error
# (a str value in a column being converted to Int64), and the target path is
# inside 'outputs', the directory the loader lists for input files — TODO
# confirm this cell is still needed.
df[[c for c in df.columns if c!= 'wll_type']].to_feather('outputs/gratitude.feather')

ArrowInvalid: Error converting from Python objects to Int64: Got Python object of type str but can only handle these types: integer

In [None]:
# Preview the last rows of `df`.
df.tail()

In [None]:
# Number of rows in `df`.
len(df)

In [None]:
# Rows of `df` whose wll_type value is missing.
df[pd.isnull(df['wll_type'])]

In [None]:
# Convert `trdf` to an Arrow table.
# NOTE(review): among the visible cells `trdf` is only assigned in the cell
# after this one, so a top-to-bottom run on a fresh kernel raises NameError
# here — TODO reorder.
table = pa.Table.from_pandas(trdf)

In [None]:
# Build a scratch copy of one loaded frame with an extra bytes-typed column 'a'.
# .assign returns a new frame, so the frame stored in `files` is not modified
# (assigning trdf['a'] directly added the column to the shared frame in place).
trdf = (
    files['wikithank_tr_20180622.csv']
    .assign(a='a')
    .astype({'a': '|S'})
)

In [None]:
# Per-column dtypes of `trdf`.
trdf.dtypes

In [None]:
# NOTE(review): identical to the previous cell — TODO remove one of the two.
trdf.dtypes

In [None]:
# Number of rows per wiki language.
df['lang'].value_counts()

In [None]:
# Per-column dtypes of `df`.
df.dtypes

In [None]:
# Copy of `df` with the index replaced by a fresh 0..n-1 range (old index discarded).
odf = df.reset_index(drop=True)

In [None]:
# Per-column dtypes of `odf`.
odf.dtypes

In [None]:
# Show the lang column cast to str; the result is only displayed, `odf` itself is not changed.
odf['lang'].astype(str)

In [None]:
# Write the re-indexed frame to feather. The previous line did not parse
# (missing '.' before to_feather) and called astype() without a dtype, which
# raises TypeError; no cast is specified, so the frame is written as is.
odf.to_feather('outputs/gratitude.feather')

In [None]:
#num observations

In [None]:
# Histogram of event timestamps. The method has to be called: a bare `.hist`
# only displays the bound method and draws nothing.
# NOTE(review): `thankdf` is not assigned in any visible cell — TODO confirm its origin.
thankdf['timestamp'].hist()