## Initialize TensorFlow

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Load Data

In [3]:
import pandas as pd

In [4]:
data = pd.read_csv('grantGov_raw.csv')

In [5]:
data.head()

Unnamed: 0.1,Unnamed: 0,Name,Opportunity Number,Eligibility,Agency Name,Description
0,0,Transitions to Excellence in Molecular and Cel...,21-508,"Unrestricted (i.e., open to any type of entity...",National Science Foundation,The Division of Molecular and Cellular Bioscie...
1,1,Plant Genome Research Program,21-507,"Unrestricted (i.e., open to any type of entity...",National Science Foundation,\n\n\n\nThe Plant Genome Research Program (PGR...
2,2,Division of Integrative Organismal Systems Cor...,21-506,"Unrestricted (i.e., open to any type of entity...",National Science Foundation,\n\nThe Division of Integrative Organismal Sys...
3,3,Algebra and Number Theory,PD-20-1264,"Unrestricted (i.e., open to any type of entity...",National Science Foundation,The Algebra and Number Theory program supports...
4,4,Successor-in-Interest (Type 6 Parent Clinical ...,PA-20-275,Public housing authorities/Indian housing auth...,National Institutes of Health,The National Institutes of Health (NIH) hereby...


Applicable grant index:
31, 51, 75, 160, 184, 216, 271, 330, 355, 388, 395, 531

In [6]:
test = [31, 51, 75, 160, 184, 216, 271, 330, 355, 388, 395, 531] 

In [7]:
filteredTest = []
[filteredTest.append(data.loc[x]) for x in test]

[None, None, None, None, None, None, None, None, None, None, None, None]

In [8]:
filteredDf = data.loc[test]

In [87]:
texts = filteredDf['Description'].to_list()

# TFIDF selecting keywords

### TF_IDF (Term frequency inverse document frequency): 
* https://en.wikipedia.org/wiki/Tf–idf 
* TF_IDF(w,d,D) = Term_frequency(w,d) * Inverse_document_frequency(w,D)
    * w = word (term)
    * d = document
    * D = set of all documents
    * Term_frequency(w,d) = d.count(w)
    * Inverse_document_frequency(w,D) = log(|D|/|docs containing w|)
    
    
### TextBlob
* https://pythonprogramming.net/part-of-speech-tagging-nltk-tutorial/

## GRANTS.GOV

In [157]:
# Text cleaning
texts = [x.replace('\n', ' ') for x in texts]
texts = [x.replace("’", ' ') for x in texts]

# Remove punctuation
texts = [''.join(c for c in x if c not in string.punctuation) for x in texts]
# Remove numbers
texts = [''.join(c for c in x if c not in '0123456789') for x in texts]
# clean out capitalization
texts = [txt.lower() for txt in texts]

In [158]:
# texts

In [159]:
# # make bag of words
# blob = [txt.split() for txt in texts]

# Use TextBlob
# TFIDF with textblob: https://gist.github.com/sloria/6407257
import math
from textblob import TextBlob as tb
bloblist = [tb(x) for x in texts]

In [160]:
#first few tags
bloblist[0].tags[:6]

[('the', 'DT'),
 ('advancing', 'VBG'),
 ('informal', 'JJ'),
 ('stem', 'NN'),
 ('learning', 'VBG'),
 ('aisl', 'JJ')]

In [161]:
def tf(word, blob):
    return blob.words.count(word) / len(blob.words)

def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob)

def idf(word, bloblist):
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)

In [162]:
num = []
wrd = []
scr = []
for i, blob in enumerate(bloblist):
    print("Top words in document {}".format(i + 1))
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:20]:
        print("Word: {}, TF-IDF: {}".format(word, round(score, 5)))
        num.append(i)
        wrd.append(word)
        scr.append(round(score, 5))

Top words in document 1
Word: aisl, TF-IDF: 0.03772
Word: informal, TF-IDF: 0.03686
Word: learning, TF-IDF: 0.02269
Word: evidencebased, TF-IDF: 0.01886
Word: learningopportunities, TF-IDF: 0.01886
Word: forthe, TF-IDF: 0.01886
Word: assessment, TF-IDF: 0.01886
Word: environmentsthe, TF-IDF: 0.01886
Word: six, TF-IDF: 0.01886
Word: pilots, TF-IDF: 0.01886
Word: feasibility, TF-IDF: 0.01886
Word: literature, TF-IDF: 0.01886
Word: reviews, TF-IDF: 0.01886
Word: syntheses, TF-IDF: 0.01886
Word: metaanalyses, TF-IDF: 0.01886
Word: environments, TF-IDF: 0.01843
Word: public, TF-IDF: 0.01459
Word: access, TF-IDF: 0.01459
Word: engagement, TF-IDF: 0.01459
Word: practice, TF-IDF: 0.01459
Top words in document 2
Word: graduate, TF-IDF: 0.03194
Word: ige, TF-IDF: 0.02531
Word: hub, TF-IDF: 0.02477
Word: implementation, TF-IDF: 0.01917
Word: centers, TF-IDF: 0.01651
Word: testing, TF-IDF: 0.01278
Word: potentially, TF-IDF: 0.01013
Word: transformative, TF-IDF: 0.01013
Word: knowledge, TF-IDF: 0.0

In [163]:
#save data to csv
# dictionary of lists   
dict = {'Grant#': num, 'Word': wrd, 'TFIDF Score': scr}         
df = pd.DataFrame(dict)      
# saving the dataframe  
df.to_csv('Top20Words.csv')  

## GRANTFWD

In [None]:
#TBD

# TFIDF with Tensorflow

In [63]:
#McClure, Nick. Tensorflow Machine Learning Cookbook. Packt Publishing, 2017.

import tensorflow as tf
import matplotlib.pyplot as plt
import csv
import numpy as np
import os
import string
import requests
import io
import nltk
from zipfile import ZipFile
from sklearn.feature_extraction.text import TfidfVectorizer

In [79]:
sess = tf.compat.v1.Session()
batch_size= 200
max_features = 1000

In [None]:
# label
texts = [x[1] for x in text_data]
target = [x[0] for x in text_data]
# Relabel 'yes' as 1, 'no' as 0
target = [1. if x=='yes' else 0. for x in target]

In [80]:
#clean symbols
texts = filteredDf['Description'].to_list()
texts = [x.replace('\n', '') for x in texts]
# Lower case
texts = [x.lower() for x in texts]
# Remove punctuation
texts = [''.join(c for c in x if c not in string.punctuation) for x in texts]
# Remove numbers
texts = [''.join(c for c in x if c not in '0123456789') for x in texts]
# Trim extra whitespace
texts = [' '.join(x.split()) for x in texts]

In [81]:
len(texts)

12

In [84]:
def tokenizer(text):
    words = nltk.word_tokenize(text)
    return words
# Create TF-IDF of texts
tfidf = TfidfVectorizer(tokenizer=tokenizer, stop_words='english', max_features=max_features)
sparse_tfidf_texts = tfidf.fit_transform(texts)

In [86]:
#break data set in to test and train set
#label filtered data TBD
train_indices = np.random.choice(sparse_tfidf_texts.shape[0],round(0.8*sparse_tfidf_texts.shape[0]), replace=False)
test_indices = np.array(list(set(range(sparse_tfidf_texts.shape[0])) - set(train_indices)))
texts_train = sparse_tfidf_texts[train_indices]
texts_test = sparse_tfidf_texts[test_indices]
target_train = np.array([x for ix, x in enumerate(target) if ix in train_indices])
target_test = np.array([x for ix, x in enumerate(target) if ix in test_indices])

NameError: name 'target' is not defined