In [235]:
import numpy as np
import pandas as pd
import os.path
import matplotlib.pyplot as plt
%matplotlib inline

# Data directory and file name of the grant applications table.
base = '../data/'
path = 'train_apps_mod.csv'
# low_memory=False: parse the file in a single pass rather than in chunks,
# which avoids mixed-dtype inference within a column.
apps = pd.read_csv(os.path.join(base,path), low_memory=False)

In [207]:
def split_sponsor_and_grant_category(apps):
    """Split the ``sponsor`` and ``grant_category`` codes into letter and number parts.

    For each of the two columns, two new columns are added to ``apps`` in place:

    * ``<column>_c`` -- the first upper-case letter found in the code
    * ``<column>_n`` -- the first run of digits found in the code, kept as a string

    For example a sponsor code ``'97A'`` yields ``sponsor_c == 'A'`` and
    ``sponsor_n == '97'``. Entries that are missing or do not match a pattern
    become NaN in the corresponding new column.

    Parameters
    ----------
    apps : pandas.DataFrame
        Frame with string columns ``sponsor`` and ``grant_category``.

    Returns
    -------
    None
        ``apps`` is modified in place.
    """
    for column in ('sponsor', 'grant_category'):
        codes = apps[column].str
        # Raw strings keep the regex escapes (``\d``) from being parsed as
        # (invalid) Python string escapes.
        apps[column + '_c'] = codes.extract(r'([A-Z])', expand=False)
        apps[column + '_n'] = codes.extract(r'(\d+)', expand=False)

def fill_nan_by_key(apps, key, rng=None):
    """Fill missing values of ``apps[key]`` by sampling from the observed values.

    Each missing entry is replaced by a random draw from the non-missing values
    of the column, with probability proportional to how often each value
    occurs. ``apps`` is modified in place.

    Parameters
    ----------
    apps : pandas.DataFrame
        Frame to modify.
    key : str
        Name of the column whose missing entries are filled.
    rng : optional
        Source of randomness exposing a NumPy-style ``choice(a, size, p=...)``
        method, e.g. ``numpy.random.RandomState(seed)``. When omitted, NumPy's
        global random state (``np.random``) is used, as before.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the column has missing entries but no observed value to sample from.
    """
    mask = apps[key].isnull()
    n_missing = int(mask.sum())
    if n_missing == 0:
        # Nothing to fill.
        return

    # Sort by value so the order of candidates (and hence the draw for a given
    # seed) does not depend on value_counts' frequency ordering.
    counts = apps[key].value_counts().sort_index()
    if counts.empty:
        raise ValueError(
            "cannot fill column %r: it contains no observed values" % (key,)
        )

    frequencies = counts.values
    probabilities = frequencies / frequencies.sum()

    sampler = np.random if rng is None else rng
    apps.loc[mask, key] = sampler.choice(counts.index, n_missing, p=probabilities)
    
def fill_nans(apps):
    """Fill the missing entries of the application columns that contain gaps.

    Runs ``fill_nan_by_key`` on ``grant_value``, ``sponsor`` and
    ``grant_category``, in that order. ``apps`` is modified in place and
    nothing is returned.
    """
    for column in ('grant_value', 'sponsor', 'grant_category'):
        fill_nan_by_key(apps, column)
        
def rebin_values(apps, threshold=400001, cap=500000):
    """Collapse large ``grant_value`` entries into a single top bin.

    Every ``grant_value`` strictly greater than ``threshold`` is overwritten
    with ``cap``. Missing values compare as not greater and are left untouched.
    ``apps`` is modified in place.

    Parameters
    ----------
    apps : pandas.DataFrame
        Frame with a numeric ``grant_value`` column.
    threshold : number, optional
        Values above this are re-binned. Defaults to 400001.
    cap : number, optional
        Value written to the re-binned rows. Defaults to 500000.

    Returns
    -------
    None
    """
    above_threshold = apps['grant_value'] > threshold
    apps.loc[above_threshold, 'grant_value'] = cap

In [236]:
# Summarise column dtypes and non-null counts of `apps`.
apps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6633 entries, 0 to 6632
Data columns (total 6 columns):
id                6633 non-null int64
granted           6633 non-null int64
sponsor           6633 non-null object
grant_category    6633 non-null object
grant_value       6633 non-null float64
date              6633 non-null object
dtypes: float64(1), int64(2), object(3)
memory usage: 311.0+ KB


In [239]:
# Preview the first 100 rows of `apps`.
apps.head(100)

Unnamed: 0,id,granted,sponsor,grant_category,grant_value,date
0,1,1,97A,10A,5000.0,8/11/05
1,2,1,2B,10A,10000.0,11/11/05
2,3,1,29A,10B,5000.0,14/11/05
3,4,1,40D,10B,20000.0,15/11/05
4,5,0,59C,10A,5000.0,16/11/05
5,6,1,4D,10A,5000.0,19/11/05
6,7,0,2B,10A,20000.0,19/11/05
7,8,0,28D,30B,5000.0,19/11/05
8,9,1,2B,10A,200000.0,19/11/05
9,10,1,2B,10A,5000.0,19/11/05


In [209]:
# Preprocessing pipeline; each helper modifies `apps` in place.
# fill_nans samples its replacements with np.random, so results differ between
# runs unless the global seed is fixed beforehand.
fill_nans(apps)
split_sponsor_and_grant_category(apps)
rebin_values(apps)

In [210]:
# Re-check dtypes and non-null counts, including the four columns added by
# split_sponsor_and_grant_category.
apps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6633 entries, 0 to 6632
Data columns (total 10 columns):
id                  6633 non-null int64
granted             6633 non-null int64
sponsor             6633 non-null object
grant_category      6633 non-null object
grant_value         6633 non-null float64
date                6633 non-null object
sponsor_c           6633 non-null object
sponsor_n           6633 non-null object
grant_category_c    6633 non-null object
grant_category_n    6633 non-null object
dtypes: float64(1), int64(2), object(7)
memory usage: 518.3+ KB


In [165]:
# Relative frequency of every grant_value, with missing values counted as
# their own category and rows ordered by value.
foo = (
    apps['grant_value']
    .value_counts(dropna=False)
    .sort_index()
    .pipe(lambda counts: counts / counts.values.sum())
)
foo

 5000.0        0.281471
 10000.0       0.072968
 20000.0       0.053219
 30000.0       0.055329
 40000.0       0.034826
 50000.0       0.029248
 100000.0      0.044475
 200000.0      0.010855
 300000.0      0.002111
 400000.0      0.002864
 500000.0      0.000754
 600000.0      0.000302
 700000.0      0.000151
 900000.0      0.000302
 1000000.0     0.000302
 10000000.0    0.000905
NaN            0.409920
Name: grant_value, dtype: float64

In [162]:
# Impute the missing grant values, then recompute the relative frequencies.
# The helper defined in this notebook is `fill_nan_by_key`; the name
# `fill_nan_in_values` used here before is not defined in the visible notebook
# and raises NameError on a fresh kernel.
fill_nan_by_key(apps,'grant_value')
foo = apps['grant_value'].value_counts(dropna=False).sort_index()
foo = foo/foo.values.sum()
foo

5000.0        0.471883
10000.0       0.123624
20000.0       0.087743
30000.0       0.096186
40000.0       0.058043
50000.0       0.051259
100000.0      0.077642
200000.0      0.020805
300000.0      0.003618
400000.0      0.004824
500000.0      0.001206
600000.0      0.000452
700000.0      0.000151
900000.0      0.000603
1000000.0     0.000452
10000000.0    0.001508
Name: grant_value, dtype: float64

In [87]:
# Scratch computation of the sampling inputs for grant_value: the observed
# values and their relative frequencies (the same steps fill_nan_by_key
# performs, minus its sort_index call).
values_agg = apps['grant_value'].value_counts()
values = values_agg.index
prob   = values_agg.values
# Turn the counts into probabilities that sum to 1.
prob = prob/int(prob.sum())

In [216]:
# One-hot encode grant_value and correlate each indicator column with the
# `granted` column; results are sorted ascending.
db = pd.get_dummies(apps['grant_value'])
db.corrwith(apps['granted']).sort_values()

10000.0    -0.063227
400000.0   -0.041863
300000.0   -0.014970
200000.0   -0.014510
5000.0     -0.009563
20000.0    -0.001464
500000.0    0.002058
100000.0    0.020379
30000.0     0.025168
50000.0     0.031895
40000.0     0.051221
dtype: float64

In [214]:
# Same correlation check for the numeric part of the sponsor code:
# one indicator column per sponsor_n value, correlated with `granted`.
db = pd.get_dummies(apps['sponsor_n'])
db.corrwith(apps['granted']).sort_values()

24    -0.113810
6     -0.068481
89    -0.060379
62    -0.052710
205   -0.042128
33    -0.041879
90    -0.038669
166   -0.038656
252   -0.035653
269   -0.033479
101   -0.033473
215   -0.032331
40    -0.032193
60    -0.030642
2     -0.030323
247   -0.029341
77    -0.027758
103   -0.027750
174   -0.026783
150   -0.026783
210   -0.026783
97    -0.025629
143   -0.024918
188   -0.024046
176   -0.023954
59    -0.023776
172   -0.022753
173   -0.022405
158   -0.020743
245   -0.020743
         ...   
235    0.017810
48     0.017810
84     0.018707
203    0.020824
179    0.020875
36     0.020888
69     0.021413
219    0.021814
233    0.021814
239    0.021814
120    0.021814
253    0.021814
39     0.024042
208    0.024042
213    0.025191
194    0.028166
67     0.028166
148    0.029607
52     0.030857
266    0.030857
132    0.035636
5      0.036514
163    0.037801
9      0.041899
47     0.043480
55     0.043677
75     0.062645
51     0.064657
34     0.092906
21     0.120450
Length: 245, dtype: float64

In [212]:
# Same correlation check for the full sponsor code (number plus letter):
# one indicator column per sponsor value, correlated with `granted`.
db = pd.get_dummies(apps['sponsor'])
db.corrwith(apps['granted']).sort_values()

24D    -0.113810
6B     -0.068481
89A    -0.060379
62B    -0.052710
205A   -0.042128
33A    -0.041879
90B    -0.038669
166B   -0.038656
252D   -0.035653
269A   -0.033479
101A   -0.033473
215C   -0.032331
40D    -0.032193
60D    -0.030642
2B     -0.030323
247C   -0.029341
77A    -0.027758
103C   -0.027750
174B   -0.026783
150B   -0.026783
210B   -0.026783
97A    -0.025629
143C   -0.024918
188D   -0.024046
176D   -0.023954
59C    -0.023776
172D   -0.022753
173A   -0.022405
158B   -0.020743
245A   -0.020743
          ...   
235C    0.017810
48D     0.017810
84D     0.018707
203C    0.020824
179C    0.020875
36D     0.020888
69A     0.021413
219C    0.021814
233A    0.021814
239C    0.021814
120D    0.021814
253A    0.021814
39C     0.024042
208D    0.024042
213A    0.025191
194B    0.028166
67C     0.028166
148D    0.029607
52D     0.030857
266B    0.030857
132D    0.035636
5A      0.036514
163C    0.037801
9A      0.041899
47C     0.043480
55C     0.043677
75C     0.062645
51C     0.064657

In [222]:
# Number of distinct sponsor_n values that occur fewer than 5 times.
(apps['sponsor_n'].value_counts()<5).sum()

141

In [224]:
# Occurrences of each sponsor_n value (value_counts orders by descending count).
foo = apps['sponsor_n'].value_counts()

In [225]:
# Keep only the entries of foo whose count is greater than 5.
# NOTE(review): an earlier cell counts sponsor_n values seen fewer than 5 times,
# so values with exactly 5 occurrences fall in neither group -- confirm the
# intended cutoff.
foo[foo>5]

4      1678
2      1525
21      809
34      203
24      140
40      131
32      100
5        92
59       86
29       81
97       76
6        64
62       58
60       56
75       51
149      49
47       47
7        38
51       37
65       36
33       35
77       34
20       34
89       33
166      27
36       26
90       24
205      23
137      22
1        22
       ... 
100       9
86        9
113       9
234       8
132       8
94        8
53        8
39        7
222       7
13        7
173       7
112       7
15        7
208       7
241       7
105       6
247       6
78        6
138       6
266       6
161       6
27        6
135       6
26        6
203       6
68        6
54        6
144       6
52        6
91        6
Name: sponsor_n, Length: 95, dtype: int64

In [226]:
# Occurrences of each grant_category code; note this reuses the name `foo`.
foo = apps['grant_category'].value_counts()

In [229]:
# Show the labels (index) of the counts currently stored in foo.
foo.index

Index(['10A', '30B', '50A', '20C', '10B', '30C', '30D', '20A', '30G', '30E',
       '40C', '30A', '30F'],
      dtype='object')

In [231]:
# NOTE(review): displaying the bare module only prints where pandas was
# imported from; this looks like a leftover check and does not affect the
# analysis -- confirm it can be dropped.
pd

<module 'pandas' from '/home/gelo/software/miniconda3/lib/python3.6/site-packages/pandas/__init__.py'>

In [232]:
# Summarise column dtypes and non-null counts of `apps`.
apps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6633 entries, 0 to 6632
Data columns (total 6 columns):
id                6633 non-null int64
granted           6633 non-null int64
sponsor           5801 non-null object
grant_category    5801 non-null object
grant_value       3914 non-null float64
date              6633 non-null object
dtypes: float64(1), int64(2), object(3)
memory usage: 311.0+ KB


In [242]:
# Load the externals table from the same data directory.
# NOTE(review): this rebinds `base`, `path` and `apps`, so the applications
# frame loaded earlier is no longer reachable under `apps` -- consider a
# separate name for this table.
base = '../data/'
path = 'train_externals_raw.csv'
apps = pd.read_csv(os.path.join(base,path), low_memory=False)

In [245]:
# Replace every missing value in the frame with 0, modifying `apps` in place.
apps.fillna(0, inplace=True)

In [246]:
# Display the frame (a bare expression ending a cell is rendered as its output).
apps

Unnamed: 0,id,EXTERNAL_ADVISOR,EXT_CHIEF_INVESTIGATOR,STUDRES,STUD_CHIEF_INVESTIGATOR
0,3,0.0,3.0,0.0,0.0
1,4,0.0,3.0,0.0,1.0
2,7,0.0,1.0,0.0,0.0
3,8,0.0,1.0,0.0,0.0
4,9,0.0,2.0,0.0,0.0
5,10,0.0,3.0,0.0,0.0
6,12,0.0,0.0,0.0,1.0
7,14,0.0,2.0,0.0,0.0
8,15,0.0,2.0,0.0,0.0
9,19,0.0,2.0,0.0,0.0
