## Goal 
### Glue together many DataFrames into one.

### Stats
+ What are summary statistics of each feature?
+ How has thanking changed over time?
+ How is gratitude distributed among users?
+ What do the distributions of each feature look like?
+ For multi-window features, how do they look together?

In [1]:
# Dependencies and display configuration for the whole notebook.
import pyarrow as pa
import pandas as pd
# Raise pandas' display limits so wide frames are not truncated when shown.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import os
# from collections import defaultdict
# NOTE(review): %pylab star-imports numpy and matplotlib names into the global
# namespace; confirm which of those names later cells actually rely on before
# swapping it for explicit imports.
%pylab inline
import json
from datetime import datetime as dt
# Sentinel datetime (1970-01-01) passed to fillna further down.
nat_filler = dt(1970,1,1)

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Read the 'langcodes' entry from the ORES config file.
# The context manager closes the file handle as soon as the JSON is parsed.
with open('configs/ores.json', 'r') as config_file:
    langcodes = json.load(config_file)['langcodes']

In [3]:
# Directory whose entries are loaded one by one in the next cell.
outputs_dir = 'outputs'
# os.listdir returns entries in arbitrary order; sorting makes the load order
# (and therefore the row order of the combined table) the same on every run.
file_list = sorted(os.listdir(outputs_dir))

In [4]:
# Load every export in `file_list` into a dict of DataFrames keyed by file name.
# The file name is parsed as "wiki<type>_<lang>_<rest>.csv": the text after
# "wiki" in the first "_"-separated part becomes `thanklove`, the second part
# becomes `lang`.
files = dict()
for f in file_list:
    parts = f.split('_')
    # Skip entries that cannot be parsed as an export (a stray .feather file, a
    # hidden file, ...). Without this guard the indexing below raises IndexError.
    if not f.endswith('.csv') or len(parts) < 2 or 'wiki' not in parts[0]:
        continue
    thanklove = parts[0].split('wiki')[1]
    lang = parts[1]

    tdf = pd.read_csv(os.path.join(outputs_dir, f), parse_dates=['timestamp', 'receiver_first_edit', 'sender_first_edit'])

    if thanklove == 'love':
        # i will fix this upstream so this shouldn't have to be done in the future.
        # Removes the b'...' wrapper and any remaining single quotes from the
        # value. Non-string entries (e.g. NaN for a missing value) are passed
        # through untouched instead of raising AttributeError.
        tdf['wll_type'] = tdf['wll_type'].apply(
            lambda x: x.replace("b'","").replace("'","") if isinstance(x, str) else x)

    #feather requires homogenous datatypes
    # NOTE(review): astype(str) turns a missing name into the literal string
    # 'nan' — confirm that is acceptable downstream.
    tdf['receiver'] = tdf['receiver'].astype(str)
    tdf['sender'] = tdf['sender'].astype(str)

    # Tag each row with where it came from so the frames can be concatenated.
    tdf['thanklove'] = thanklove
    tdf['lang'] = lang
    files[f] = tdf

In [5]:
# Stack the per-file frames into one table with a fresh 0..n-1 index.
# `sort` is passed explicitly because the frames do not all share the same
# columns and pandas is changing its default for that case; the column order
# is set explicitly in a later cell, so no sorting is needed here.
df = pd.concat(list(files.values()), sort=False, ignore_index=True)
# Flag rows where either first-edit date is missing. The missing dates are left
# as NaT here rather than being filled with a sentinel.
df['probably_deleted'] = df['sender_first_edit'].isna() | df['receiver_first_edit'].isna()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [6]:
# Old -> new column names: 'thanklove' becomes 'gratitude', and each
# 'receiver_thank_another_<N>d_after' window column gets 'gratitude' in place
# of 'thank'.
mapper = {'thanklove': 'gratitude'}
mapper.update({
    f'receiver_thank_another_{days}d_after': f'receiver_gratitude_another_{days}d_after'
    for days in (1, 30, 90, 180)
})
# Apply the column renames; keys of `mapper` that are not present are ignored.
df = df.rename(columns=mapper)

In [7]:
# Column layout for the combined table: identifying columns first, then the
# receiver_* columns, then the sender_* columns.
new_order = (
    ['lang', 'timestamp', 'gratitude',
     'receiver', 'sender', 'receiver_id', 'sender_id',
     'wll_type', 'probably_deleted']
    # receiver: the *_prev_* columns, then the 1/30/90/180-day *_after windows
    + ['receiver_prev_' + stat
       for stat in ('edits', 'received', 'received_indicator', 'sent', 'sent_indicator')]
    + [f'receiver_edits_{days}d_after' for days in (1, 30, 90, 180)]
    + [f'receiver_gratitude_another_{days}d_after' for days in (1, 30, 90, 180)]
    + ['receiver_first_edit']
    # sender: same layout, without the gratitude_another windows
    + ['sender_prev_' + stat
       for stat in ('edits', 'received', 'received_indicator', 'sent', 'sent_indicator')]
    + [f'sender_edits_{days}d_after' for days in (1, 30, 90, 180)]
    + ['sender_first_edit']
)

# Keep only the columns named in new_order, in that order.
df = df[new_order]

In [8]:
# Date stamp (YYYYMMDD) embedded in the export file names.
today = dt.today().strftime("%Y%m%d")

# Create the destination directory if it is missing so the writes below do not
# fail on a fresh checkout.
os.makedirs('transfer', exist_ok=True)
# Write the combined table in both formats; the CSV omits the row index.
df.to_feather(f'transfer/gratitude_{today}.feather')
df.to_csv(f'transfer/gratitude_{today}.csv', index=False)

In [9]:
# Shell out to list the on-disk size of every file in transfer/.
!du -h transfer/*

121M	transfer/gratitude_20180628.csv
135M	transfer/gratitude_20180629.csv
146M	transfer/gratitude_20180629.feather
354M	transfer/gratitude_20180717.csv
384M	transfer/gratitude_20180717.feather
465M	transfer/gratitude_20180718.csv
503M	transfer/gratitude_20180718.feather


In [10]:
def frac_nan(df):
    """Return the share of rows with a missing first-edit date.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns 'sender_first_edit' and 'receiver_first_edit'.

    Returns
    -------
    dict
        'sender_nan_frac', 'receiver_nan_frac' and 'either_nan_frac', each the
        fraction of rows where the sender's, the receiver's, or at least one of
        the two values is missing. For a frame with no rows the fractions are
        undefined and every value is NaN.
    """
    sender_missing = df['sender_first_edit'].isna()
    receiver_missing = df['receiver_first_edit'].isna()
    either_missing = sender_missing | receiver_missing

    denom = len(df)
    if denom == 0:
        # Avoid ZeroDivisionError on an empty frame; NaN marks "undefined".
        undefined = float('nan')
        return {'sender_nan_frac': undefined,
                'receiver_nan_frac': undefined,
                'either_nan_frac': undefined}

    # Count on the boolean masks directly instead of materialising three
    # filtered copies of the frame.
    return {'sender_nan_frac': int(sender_missing.sum()) / denom,
            'receiver_nan_frac': int(receiver_missing.sum()) / denom,
            'either_nan_frac': int(either_missing.sum()) / denom}

In [11]:
# Missing first-edit fractions computed over df.
frac_nan(df)

{'either_nan_frac': 0.011387090296091773,
 'receiver_nan_frac': 0.0020621767066922986,
 'sender_nan_frac': 0.009625260647281085}

In [12]:
# Missing-value fractions per input file, one row per file name.
# The per-file frame is bound to `file_df` inside a comprehension so that the
# notebook-level `df` (the combined table) is not overwritten by the loop.
nan_fracs = {fname: frac_nan(file_df) for fname, file_df in files.items()}
nan_frac_df = pd.DataFrame.from_dict(nan_fracs, orient='index')

In [13]:
# Order the per-file rows from the lowest to the highest 'either_nan_frac'.
nan_frac_df.sort_values('either_nan_frac')

Unnamed: 0,sender_nan_frac,receiver_nan_frac,either_nan_frac
wikithank_he_20180627.csv,0.001004,0.000854,0.001858
wikithank_de_20180712.csv,0.002218,0.000606,0.002805
wikithank_et_20180704.csv,0.002881,0.000206,0.003087
wikithank_no_20180703.csv,0.002247,0.001324,0.00317
wikithank_hu_20180627.csv,0.003014,0.000328,0.003343
wikithank_sv_20180628.csv,0.003561,0.000838,0.00415
wikilove_uk_20180703.csv,0.001439,0.002878,0.004317
wikithank_cs_20180704.csv,0.003793,0.000722,0.004392
wikithank_hr_20180702.csv,0.004275,0.00057,0.004845
wikithank_uk_20180703.csv,0.004158,0.000715,0.00486


In [14]:
# Column means over the per-file rows. Every file counts equally regardless of
# how many rows it has, so this is not the same quantity as calling frac_nan on
# one pooled frame.
nan_frac_df.mean()

sender_nan_frac      0.017730
receiver_nan_frac    0.006114
either_nan_frac      0.022254
dtype: float64

In [15]:
# Replace missing first-edit dates with the sentinel. The fill is restricted to
# the two date columns: a scalar fillna would also write the sentinel datetime
# into every other column that has missing values.
df_filled = df.fillna({'sender_first_edit': nat_filler,
                       'receiver_first_edit': nat_filler})

In [16]:
# Check: after filling, none of the first-edit values should be missing.
frac_nan(df_filled)

{'either_nan_frac': 0.0, 'receiver_nan_frac': 0.0, 'sender_nan_frac': 0.0}

In [17]:
# FIXME(review): `df1` is not assigned in any cell shown in this notebook, so
# this raises NameError on a fresh kernel — define it or remove the cell.
len(df1)

NameError: name 'df1' is not defined

In [None]:
# FIXME(review): `df1` is not assigned in any cell shown in this notebook, so
# this raises NameError on a fresh kernel — define it or remove the cell.
df1.head()

In [None]:
# FIXME(review): `df1` is not assigned in any cell shown in this notebook, so
# this raises NameError on a fresh kernel. Also note that 'thanklove' exists on
# the per-file frames only; the combined frame has it renamed to 'gratitude'.
df1['thanklove']

In [None]:
# Pick one per-file frame out of `files` for inspection.
# NOTE(review): the file name is hard-coded; this raises KeyError if that exact
# export is not among the loaded files.
ardf = files['wikithank_ar_20180623.csv']

In [None]:
# Rows of ardf where receiver_first_edit is missing.
ardf[pd.isna(ardf['receiver_first_edit'])]

In [None]:
# Keep the column dtypes of df in a variable for inspection.
ddtypes = df.dtypes

In [None]:
# Write every column except 'wll_type' to a feather file.
# NOTE(review): the target sits inside 'outputs', the directory the loading
# cell lists with os.listdir and parses file by file — confirm a re-run of the
# notebook tolerates this extra, differently named file.
df[[c for c in df.columns if c!= 'wll_type']].to_feather('outputs/gratitude.feather')

In [None]:
# Show the last rows of df.
df.tail()

In [None]:
# Number of rows in df.
len(df)

In [None]:
# Rows of df where wll_type is missing.
df[pd.isnull(df['wll_type'])]

In [None]:
# Convert trdf to a pyarrow Table.
# FIXME(review): `trdf` is first assigned in the cell that follows this one, so
# this line raises NameError when the notebook is run top to bottom.
table = pa.Table.from_pandas(trdf)

In [None]:
# Scratch frame for a dtype experiment: one per-file frame plus a constant
# column 'a' cast to a bytes dtype.
# assign() returns a new frame, so the frame cached in `files` is not given an
# extra column (which would otherwise leak into any later re-concatenation).
# NOTE(review): the file name is hard-coded; KeyError if it was not loaded.
trdf = files['wikithank_tr_20180622.csv'].assign(a='a')
trdf = trdf.astype({'a':'|S'})

In [None]:
# Column dtypes of trdf.
trdf.dtypes

In [None]:
# Column dtypes of trdf (same expression as the preceding cell).
trdf.dtypes

In [None]:
# Number of rows per value of 'lang'.
df['lang'].value_counts()

In [None]:
# Column dtypes of df.
df.dtypes

In [None]:
# New frame with a fresh 0..n-1 index; the previous index is discarded.
odf = df.reset_index(drop=True)

In [None]:
# Column dtypes of odf.
odf.dtypes

In [None]:
# Displays 'lang' cast to str; the result is not assigned, so odf is unchanged.
odf['lang'].astype(str)

In [None]:
# Cast and write odf to feather. The original line was missing the "." before
# to_feather and called astype() with no dtype, so it could not run at all.
# NOTE(review): casting 'lang' to str is inferred from the preceding cell —
# confirm this is the intended dtype mapping.
odf.astype({'lang': str}).to_feather('outputs/gratitude.feather')

In [None]:
# number of observations

In [None]:
# Histogram of the timestamps. `hist` has to be called; a bare attribute
# reference only displays the bound method and draws nothing.
# FIXME(review): `thankdf` is not assigned in any cell shown in this notebook —
# define it before this cell.
thankdf['timestamp'].hist()