# Tokens to Keywords
This notebook contains functions that apply the TF-IDF technique to extract keywords and their scores.
***

### Functions

1.  Extract Keywords
    * get_TF_IDF(tokens)
    * getKeywords(pathToPDF, pathToCommonWordsCsv)
    * saveKeywordsAsCsv(keywords_df, filename, pathToKeywordsFolder)

2. Test Keywords
    * getCorrectKeywords(pathToFile)
    * testFilename(pathToFile)


***

#### 1. Extract Keywords

Extract keywords with TF-IDF.

In [None]:
def get_TF_IDF(tokens: list):
    """
    Run the tf-idf algorithm on the list of tokens and return a dataframe
    with tokens and scores.

    Args:
        tokens: Strings to fit the vectorizer on; each entry is handed to
            ``TfidfVectorizer.fit`` as one document.

    Returns:
        A ``pandas.DataFrame`` with the columns ``'Keywords'`` (the
        vectorizer's vocabulary terms) and ``'TF-IDF'`` (the fitted ``idf_``
        weight of each term), sorted by score in ascending order with a
        fresh 0..n-1 index.
    """
    # Fit the vectorizer to learn the vocabulary and the idf weights
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_vectorizer.fit(tokens)

    scores = list(tfidf_vectorizer.idf_)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and only fall back on older installs.
    if hasattr(tfidf_vectorizer, "get_feature_names_out"):
        features = list(tfidf_vectorizer.get_feature_names_out())
    else:
        features = list(tfidf_vectorizer.get_feature_names())

    # Store results in dataframe
    scores_df = pd.DataFrame(list(zip(features, scores)),
                             columns=['Keywords', 'TF-IDF'])

    # Sort by score in ascending - small to large - order. A smaller idf
    # weight means the term occurs in more of the fitted entries.
    scores_df = scores_df.sort_values('TF-IDF').reset_index(drop=True)

    return scores_df

In [None]:
def getKeywords(pathToPDF, pathToCommonWordsCsv=""):
    """
    Combine all functions to get the keywords dataframe from just a path
    to a file.

    Args:
        pathToPDF: Path of the document; passed to ``getText``.
        pathToCommonWordsCsv: Optional path forwarded to ``getTokens``
            (presumably a csv of common words to exclude - confirm against
            ``getTokens``). Defaults to an empty string.

    Returns:
        The dataframe produced by ``get_TF_IDF`` for the document's tokens.
    """
    # The 'en' shortcut link was removed in spaCy 3.0, so load the model by
    # its package name first and keep 'en' as a fallback for older installs
    # that only have the shortcut available.
    try:
        nlp = spacy.load('en_core_web_sm')
    except OSError:
        nlp = spacy.load('en')

    # Run process for extracting keywords
    rawText = getText(pathToPDF)
    preprocessedText = preprocessText(rawText)
    sentences = getSentences(preprocessedText, nlp)
    tokens = getTokens(sentences, nlp, pathToCommonWordsCsv)

    # Get dataframe with keywords and scores
    keywords_df = get_TF_IDF(tokens)

    return keywords_df

In [None]:
def saveKeywordsAsCsv(keywords_df, filename, pathToKeywordsFolder):
    """
    Save the dataframe with keywords and tf-idf scores into a .csv file,
    using the same base name as the source file
    (e.g. SanJose1.pdf => SanJose1.csv).

    Args:
        keywords_df: Dataframe to save; the result of getKeywords().
        filename: Name of the source file; its extension (of any length)
            is replaced by ".csv".
        pathToKeywordsFolder: Folder to write the csv into, with or without
            a trailing path separator.

    Returns:
        The name of the csv file that was written (without the folder).
    """
    # Local import so this cell does not depend on another cell's imports
    import os

    # Create csv file name; splitext handles extensions that are not
    # exactly three characters long, and names without an extension
    name, _ = os.path.splitext(filename)
    csvFilename = name + ".csv"

    # Save file in folder; join inserts the separator only when it is missing
    pathToFile = os.path.join(pathToKeywordsFolder, csvFilename)
    keywords_df.to_csv(pathToFile, index=False)
    print(f"Saved as {csvFilename}")

    return csvFilename

***
#### 2. Test Keywords

Compare extracted keywords with manually selected keywords.


In [None]:
def getCorrectKeywords(pathToFile: str) -> list:
    """
    Extract the words from the file name, which are separated by an
    underscore _

    Args:
        pathToFile: Path to the file, using '/' as separator; only the last
            path component is used.

    Returns:
        The lower-cased words of the file name, without the ".pdf"
        extension.
    """
    filename = pathToFile.split('/')[-1]

    # Strip only a trailing ".pdf", ignoring case, so that "X.PDF" does not
    # leave the extension glued to the last keyword and a ".pdf" occurring
    # in the middle of the name is left untouched.
    extension = ".pdf"
    if filename.lower().endswith(extension):
        filename = filename[:-len(extension)]

    return [word.lower() for word in filename.split('_')]

In [None]:
def testFilename(pathToFile: str) -> float:
    """
    Test one file.

    Compare the correct keywords (taken from the file name) with the first
    10 keywords of the dataframe returned by getKeywords().

    Args:
        pathToFile: Path to the file to test.

    Returns:
        Percentage of correct keywords that appear among those 10 keywords,
        rounded to two decimals. The same figure is also printed.
    """
    # First 10 keywords of the tf-idf dataframe
    keywords_df = getKeywords(pathToFile)
    topTen = keywords_df['Keywords'].to_list()[:10]

    # Actual keywords, taken from the file name
    expected = getCorrectKeywords(pathToFile)

    # Number of correct words present among the extracted keywords
    hits = sum(1 for word in expected if word in topTen)

    # Print percent of correct words found
    correctPercentage = round((hits / len(expected)) * 100, 2)
    print(f"{correctPercentage}% of keywords were found.")

    return correctPercentage