### Convert to "utf-8" encoded

In [1]:
! in2csv -e iso-8859-1 'RePORTER_PRJ_C_FY2016.csv' > sandbox.csv

### Get a numbered list of the columns

In [2]:
! csvcut -n sandbox.csv

  1: APPLICATION_ID
  2: ACTIVITY
  3: ADMINISTERING_IC
  4: APPLICATION_TYPE
  5: ARRA_FUNDED
  6: AWARD_NOTICE_DATE
  7: BUDGET_START
  8: BUDGET_END
  9: CFDA_CODE
 10: CORE_PROJECT_NUM
 11: ED_INST_TYPE
 12: FOA_NUMBER
 13: FULL_PROJECT_NUM
 14: FUNDING_ICs
 15: FUNDING_MECHANISM
 16: FY
 17: IC_NAME
 18: NIH_SPENDING_CATS
 19: ORG_CITY
 20: ORG_COUNTRY
 21: ORG_DEPT
 22: ORG_DISTRICT
 23: ORG_DUNS
 24: ORG_FIPS
 25: ORG_NAME
 26: ORG_STATE
 27: ORG_ZIPCODE
 28: PHR
 29: PI_IDS
 30: PI_NAMEs
 31: PROGRAM_OFFICER_NAME
 32: PROJECT_START
 33: PROJECT_END
 34: PROJECT_TERMS
 35: PROJECT_TITLE
 36: SERIAL_NUMBER
 37: STUDY_SECTION
 38: STUDY_SECTION_NAME
 39: SUBPROJECT_ID
 40: SUFFIX
 41: SUPPORT_YEAR
 42: DIRECT_COST_AMT
 43: INDIRECT_COST_AMT
 44: TOTAL_COST
 45: TOTAL_COST_SUB_PROJECT


### Extract just the grant type (activity), investigator IDs, name and total cost

In [3]:
! csvcut -c 2,29,30,44 sandbox.csv > temp && mv temp sandbox-cut.csv

### Take a look at the grant types

In [8]:
! csvcut -c 1 sandbox-cut.csv | csvlook | head

|------------|
|  ACTIVITY  |
|------------|
|  K23       |
|  R01       |
|  D43       |
|  R01       |
|  R01       |
|  P41       |
|  U01       |


### Remove any grant type that is not RO1

In [11]:
! csvgrep -c 1 -m 'R01' sandbox-cut.csv > sandbox-R01.csv

### Take a look at the grant type

In [13]:
! csvcut -c 1 sandbox-R01.csv | csvlook | head

|------------|
|  ACTIVITY  |
|------------|
|  R01       |
|  R01       |
|  R01       |
|  R01       |
|  R01       |
|  R01       |
|  R01       |


### Load pandas

In [14]:
import pandas

### Load the data into a dataframe

In [16]:
nih_df = pandas.read_csv("./sandbox-R01.csv")
nih_df.head(10)

Unnamed: 0,ACTIVITY,PI_IDS,PI_NAMEs,TOTAL_COST
0,R01,7017365;,"POLSTER, BRIAN M;",335781
1,R01,2275890 (contact); 8742217; 6139020;,"DULAC, CATHERINE G (contact); REGEV, AVIV ; ZH...",1593922
2,R01,9868481; 2063694 (contact);,"CHEN, XUESONG ; GEIGER, JONATHAN DAVID (contact);",294755
3,R01,9335858;,"QI, LING ;",202611
4,R01,10940848; 1897156 (contact);,"LOZANO, ANDRES M.; LYKETSOS, CONSTANTINE G (co...",472366
5,R01,1874169;,"CLARKE, GREGORY N;",679780
6,R01,1879326;,"FISHER, STEVEN K;",384375
7,R01,1901372; 3150937 (contact);,"IADECOLA, COSTANTINO ; MANFREDI, GIOVANNI (co...",370781
8,R01,8769887 (contact); 1880468;,"CICHEWICZ, ROBERT HENRY (contact); MOOBERRY, S...",555984
9,R01,6444105;,"FREEDLAND, KENNETH E;",698539


### Drop any rows that don't have a total cost

In [17]:
nih_df2 = nih_df.dropna()
nih_df2.head()

Unnamed: 0,ACTIVITY,PI_IDS,PI_NAMEs,TOTAL_COST
0,R01,7017365;,"POLSTER, BRIAN M;",335781
1,R01,2275890 (contact); 8742217; 6139020;,"DULAC, CATHERINE G (contact); REGEV, AVIV ; ZH...",1593922
2,R01,9868481; 2063694 (contact);,"CHEN, XUESONG ; GEIGER, JONATHAN DAVID (contact);",294755
3,R01,9335858;,"QI, LING ;",202611
4,R01,10940848; 1897156 (contact);,"LOZANO, ANDRES M.; LYKETSOS, CONSTANTINE G (co...",472366


### Collect all of the names into one long string and remove the word '(contact)'

In [18]:
names = ""
for index, row in nih_df2.iterrows():
    names += row['PI_NAMEs']
    
noContact = names.replace("(contact)","")

### Split the string into a list on each semicolon

In [19]:
listOfNames = noContact.split(";")

### Remove traling whitespace from each name

In [20]:
listOfNames = [x.strip(" ") for x in listOfNames]

### Remove duplicates

In [21]:
uniqueNames = [];
for item in listOfNames:
    if item not in uniqueNames:
        if len(item) > 1:
            uniqueNames.append(item)

### Create a new data frame to store our records

In [22]:
total_df = pandas.DataFrame({
        'name': uniqueNames
    })

total_df['total-amount'] = 0
total_df['number-of-grants'] = 0

total_df.head(10)

Unnamed: 0,name,total-amount,number-of-grants
0,"POLSTER, BRIAN M",0,0
1,"DULAC, CATHERINE G",0,0
2,"REGEV, AVIV",0,0
3,"ZHUANG, XIAOWEI",0,0
4,"CHEN, XUESONG",0,0
5,"GEIGER, JONATHAN DAVID",0,0
6,"QI, LING",0,0
7,"LOZANO, ANDRES M.",0,0
8,"LYKETSOS, CONSTANTINE G",0,0
9,"CLARKE, GREGORY N",0,0


### Total up the cost for each instance of the names

In [23]:
for i, totalElem in total_df.iterrows():
    thisName = totalElem['name']
    if thisName != 'nan':
        name_correct_index = nih_df2['PI_NAMEs'].str.contains(thisName, case=True)
        name_correct = nih_df2[name_correct_index]
        summed = name_correct['TOTAL_COST'].sum()
        number = len(name_correct['PI_NAMEs'])
        total_df.set_value(i, 'total-amount', summed)
        total_df.set_value(i, 'number-of-grants', number)


total_df.head(10)



Unnamed: 0,name,total-amount,number-of-grants
0,"POLSTER, BRIAN M",335781,1
1,"DULAC, CATHERINE G",3506620,4
2,"REGEV, AVIV",2806468,2
3,"ZHUANG, XIAOWEI",3114521,3
4,"CHEN, XUESONG",607505,2
5,"GEIGER, JONATHAN DAVID",607505,2
6,"QI, LING",1260150,5
7,"LOZANO, ANDRES M.",472366,1
8,"LYKETSOS, CONSTANTINE G",4397431,3
9,"CLARKE, GREGORY N",2191656,3


### Count rows in new dataframe

In [24]:
len(total_df['name'])

21594

### Sort by total-amount

In [25]:
sorted_df = total_df.sort('total-amount', ascending=False)
sorted_df.head(10)

Unnamed: 0,name,total-amount,number-of-grants
589,"BURNS, JEFFREY MURRAY",8049330,3
16133,"WANG, XIAO",7824555,19
11711,"JOHNSTON, LLOYD D",6991146,2
14894,"LI, JI",6668040,13
3358,"WANG, YI",6568160,12
6328,"SASTRY, NARAYAN",6561002,5
1501,"BATEMAN, RANDALL J",6517026,3
14641,"JOHNSON, KEITH A.",6274848,4
10994,"AISEN, PAUL S.",5949179,3
8439,"CHEN, XI",5900198,17


### Export to a new csv

In [26]:
sorted_df.to_csv('sandbox-output-r01.csv', sep=',')

### Count the number of lines in this new file

In [27]:
! wc -l sandbox-output-r01.csv

   21595 sandbox-output-r01.csv


### Drop the first column

In [30]:
! csvcut -c 2,3,4 sandbox-output-r01.csv > temp && mv temp sandbox-output-r01.csv