In [71]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy as sc
import matplotlib.pyplot as plt
import seaborn as sns
import keras

In [275]:
# in case a person has several applications on the same day,
# we need to agg to a single line per day and person by using the following functions
day_agg_dict = {'Dept.No.' : lambda x : x.mode(),
               'Faculty.No.': lambda x : x.mode(),
               'With.PHD' : lambda x : x.max(),
                'years_in_uni' : lambda x : x.max(),
                'Number.of.Successful.Grant' : lambda x : x.max(),
                'Number.of.Unsuccessful.Grant' : lambda x : x.max(),
                'A.':lambda x : x.max(),
                'A': lambda x : x.max(),
                'B':lambda x : x.max(),
                'C': lambda x : x.max()}

# the cummulative max will be applied on the following time series
apply_max = ['With.PHD','years_in_uni','Number.of.Successful.Grant','Number.of.Unsuccessful.Grant','A.','A','B','C']

#
input_type = 'train'

In [276]:

df_p_dyn = pd.read_csv( '../data/'+ input_type + '_person_dyn_raw.csv',low_memory=False, parse_dates=['date'])

df_p_dyn.loc[:,'Person.ID'] = df_p_dyn.loc[:,'Person.ID'].astype(str)
df_p_dyn.loc[:,'Dept.No.'] = df_p_dyn.loc[:,'Dept.No.'].astype(str)
df_p_dyn.loc[:,'Faculty.No.'] = df_p_dyn.loc[:,'Faculty.No.'].astype(str)

In [277]:
# first step is to replace all 'nan' strings to np.nan
df_p_dyn.loc[df_p_dyn.loc[:,'Dept.No.'] == 'nan','Dept.No.'] = np.nan
df_p_dyn.loc[df_p_dyn.loc[:,'Faculty.No.'] == 'nan','Faculty.No.'] = np.nan
df_p_dyn.loc[df_p_dyn.loc[:,'With.PHD'] == 'nan','With.PHD'] = np.nan

In [278]:
# we set the phd coloumn to numerical Yes == 1
df_p_dyn.loc[:,'With.PHD'] = df_p_dyn.loc[:,'With.PHD'].str.strip()
df_p_dyn = df_p_dyn.replace({'With.PHD' : {'Yes' : '1'}})
df_p_dyn.loc[:,'With.PHD'] = df_p_dyn.loc[:,'With.PHD'].astype(float)

In [279]:
df_p_dyn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9741 entries, 0 to 9740
Data columns (total 12 columns):
date                            9741 non-null datetime64[ns]
Person.ID                       9741 non-null object
Dept.No.                        8969 non-null object
Faculty.No.                     9063 non-null object
With.PHD                        5819 non-null float64
years_in_uni                    8442 non-null float64
Number.of.Successful.Grant      9741 non-null float64
Number.of.Unsuccessful.Grant    9741 non-null float64
A.                              9741 non-null float64
A                               9741 non-null float64
B                               9741 non-null float64
C                               9739 non-null float64
dtypes: datetime64[ns](1), float64(8), object(3)
memory usage: 913.3+ KB


In [280]:
# we generate a dataframe with the uniqe key pairs ( date, person Id).
# this df will be filled and returned as the mod frame

df_p_dyn_mod = df_p_dyn.loc[:,['date', 'Person.ID']].drop_duplicates()
df_p_dyn_mod.shape

(8030, 2)

In [281]:
tmp = df_p_dyn.loc[df_p_dyn.loc[:,'Person.ID'].isin(['79192','10002','48497']) ,:]

In [282]:
# 
for col, fun in day_agg_dict.items():
 
    # we apply the aggregation function to the coloumn
    tmp2 = df_p_dyn.groupby(['date', 'Person.ID'])[col].agg(fun)
    
    # in case the aggregation function returns a list and not a single element, we take the first one
    # if there were only NaN for this day and persion ID, an empty np array is returned-> we chagne it to a NaN
    tmp2 = tmp2.apply(lambda x : x[0] if (isinstance(x, np.ndarray) and len(x) > 0 ) else \
               (np.nan if (isinstance(x, np.ndarray) and len(x) == 0) else x  ))
    
    # we merge the series with the dataframe that stores all the outcome
    df_p_dyn_mod = pd.merge(df_p_dyn_mod, tmp2.to_frame(name = col).reset_index(),how='left',on = ['date', 'Person.ID'])

In [283]:
# we apply the cummulative maximum on all in the list apply_max
tmp2 = df_p_dyn_mod.loc[:,['Person.ID','date'] + apply_max].set_index(['Person.ID','date']).sort_index().\
    groupby(['Person.ID']).cummax().reset_index()

# we now have to replace the original columns in df_p_dyn_mod with these mnodified columns, keeping the columns not in apply_max
df_p_dyn_mod = pd.merge(df_p_dyn_mod.loc[:, ~df_p_dyn_mod.columns.isin(apply_max)], tmp2,how='outer', on=['Person.ID','date'])

In [284]:
# we save the output as a csv
df_p_dyn_mod.to_csv('../data/' + input_type + '_person_dyn_mod.csv')

In [285]:
#tmp = df_p_dyn_mod.loc[df_p_dyn_mod.loc[:,'Person.ID'].isin(['79192','10002','48497']) ,:]
df_p_dyn_mod.info()
#tmp2.loc[tmp2.loc[:,'Person.ID'].isin(['79192','10002','48497']) ,:]
#tmp2

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8030 entries, 0 to 8029
Data columns (total 12 columns):
date                            8030 non-null datetime64[ns]
Person.ID                       8030 non-null object
Faculty.No.                     7395 non-null object
Dept.No.                        7327 non-null object
With.PHD                        4722 non-null float64
years_in_uni                    6847 non-null float64
Number.of.Successful.Grant      8030 non-null float64
Number.of.Unsuccessful.Grant    8030 non-null float64
A.                              8030 non-null float64
A                               8030 non-null float64
B                               8030 non-null float64
C                               8028 non-null float64
dtypes: datetime64[ns](1), float64(8), object(3)
memory usage: 815.5+ KB


In [115]:
df_p_dyn_mod.loc[:,col] = tmp.groupby(['date', 'Person.ID']).agg(fun).\
    loc[:,col].apply(lambda x : x[0] if isinstance(x, np.ndarray) else x)

In [164]:
tmp2.loc[:,'Dept.No.'].apply(lambda x : x[0] if (isinstance(x, np.ndarray) and len(x) > 0 ) else \
                             (np.nan if (isinstance(x, np.ndarray) and len(x) == 0) else x  ))
                       #loc[:,col].apply(lambda x : x[0] if isinstance(x, np.ndarray) else x)

date        Person.ID
2005-11-19  79192        2523.0
2006-01-01  79192        2523.0
2006-01-06  79192        2523.0
2006-01-24  79192        2523.0
2006-02-11  79192        2523.0
2006-02-20  79192        2523.0
2006-04-04  79192        2523.0
2006-04-27  79192        2523.0
2006-05-12  79192        2523.0
2006-05-23  79192        2523.0
2006-05-25  79192        2523.0
2006-05-31  79192        2523.0
2006-07-20  79192        2523.0
2006-08-02  79192        2523.0
2006-08-16  79192        2523.0
2006-08-24  79192        2523.0
2006-08-31  10002           NaN
2006-09-22  79192        2523.0
2006-09-27  79192        2523.0
2006-10-31  79192        2523.0
2006-11-10  79192        2523.0
2006-11-21  79192        2523.0
2006-11-24  79192        2523.0
2006-11-27  79192        2523.0
2006-11-28  79192        2523.0
2006-12-21  79192        2523.0
2006-12-22  79192        2523.0
2007-01-01  79192        2523.0
2007-04-06  10002           NaN
2007-05-23  79192        2523.0
2007-07-30  79192 

In [160]:
len(tmp2.loc[:,'Dept.No.'][-2])

0

In [161]:
len(2)

TypeError: object of type 'int' has no len()

In [287]:
df_p_dyn_mod.set_index(['Person.ID','date']).sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,Faculty.No.,Dept.No.,With.PHD,years_in_uni,Number.of.Successful.Grant,Number.of.Unsuccessful.Grant,A.,A,B,C
Person.ID,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10002,2006-08-31,,,,,0.0,0.0,0.0,0.0,0.0,0.0
10002,2007-04-06,,,,,0.0,0.0,0.0,0.0,0.0,0.0
100062,2006-06-14,,,,,0.0,0.0,0.0,0.0,2.0,0.0
10052,2005-02-12,31.0,3028.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
10052,2006-05-10,31.0,3028.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
10072,2006-08-09,25.0,2813.0,,1.0,0.0,0.0,4.0,8.0,7.0,11.0
100772,2006-04-07,25.0,2728.0,1.0,1.0,0.0,0.0,4.0,3.0,1.0,0.0
100772,2007-09-22,25.0,2728.0,1.0,1.0,0.0,0.0,6.0,3.0,3.0,0.0
101042,2007-10-08,7.0,653.0,,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1012,2005-11-19,31.0,3123.0,1.0,2.0,0.0,0.0,8.0,1.0,11.0,0.0


In [70]:
df_p_dyn.loc[df_p_dyn.loc[:,'Dept.No.'].isnull(),:].sort_values(['Person.ID','date'])

Unnamed: 0,date,Person.ID,Dept.No.,Faculty.No.,With.PHD,years_in_uni,Number.of.Successful.Grant,Number.of.Unsuccessful.Grant,A.,A,B,C
3841,2006-08-31,10002,,,,,0.0,0.0,0.0,0.0,0.0,0.0
6903,2007-04-06,10002,,,,,0.0,0.0,0.0,0.0,0.0,0.0
2783,2006-06-14,100062,,,,,0.0,0.0,0.0,0.0,2.0,0.0
5574,2006-11-27,10157,,,,,0.0,0.0,0.0,5.0,2.0,0.0
5865,2007-01-01,10157,,,,,0.0,0.0,0.0,5.0,2.0,0.0
4190,2006-09-15,101802,,,,,0.0,0.0,0.0,0.0,0.0,0.0
2260,2006-03-28,102602,,,,,0.0,0.0,0.0,1.0,0.0,0.0
3130,2006-08-16,103187,,,,,0.0,0.0,0.0,0.0,0.0,0.0
7762,2007-10-08,103187,,,,,0.0,0.0,0.0,0.0,0.0,0.0
950,2005-11-19,103747,,,,,0.0,0.0,0.0,0.0,0.0,0.0
