# Find Stop Words
This notebook contains the functions to find the most common words across all SanJose# files. 
***

### Functions

1.  Find Stop Words
    * getListKeywords(pathToFile)
    * updateDict(words, wordsDict)
    * findCommonWords(pathToKeywordsFolder) 
    * saveCommonWords(pathToStopWordsFolder, pathToKeywordsFolder)
    * getCommonWords(pathToCommonWordsCsv)  

***

In [1]:
"""
Get list of words from keywords csv
"""
def getListKeywords(pathToFile: str) -> list:
    """Return the non-missing entries of the 'Keywords' column of a CSV file.

    Args:
        pathToFile: Path to a CSV file that has a 'Keywords' column.

    Returns:
        The values of the 'Keywords' column in file order, with missing
        (NaN) cells removed.

    Raises:
        KeyError: If the file has no 'Keywords' column.
    """
    df = pd.read_csv(pathToFile)
    # Empty cells are parsed as NaN by read_csv; they are not words and
    # would otherwise be counted as one, so drop them here.
    words = list(df['Keywords'].dropna())
    return words

In [2]:
"""
Update frequency of words in hash map
"""
def updateDict(words, wordsDict):
    """Add one occurrence to wordsDict for every item in words.

    Args:
        words: Iterable of hashable items to count.
        wordsDict: Dict mapping item -> frequency; it is modified in place.

    Returns:
        The same wordsDict object that was passed in, after the update.
    """
    for token in words:
        # Unseen tokens start from 0, so their first occurrence stores 1.
        wordsDict[token] = wordsDict.get(token, 0) + 1
    return wordsDict

In [3]:
"""
Find the most common words in all SanJose# files
"""
def findCommonWords(pathToKeywordsFolder):
    """Count how often each keyword occurs across the CSV files in a folder.

    Args:
        pathToKeywordsFolder: Folder holding the keyword CSV files. A
            trailing path separator is optional.

    Returns:
        A DataFrame with columns 'Word' and 'Frequency', sorted by
        'Frequency' in descending order with a fresh 0..n-1 index. Words
        with equal frequency keep the order in which they were first seen.
        The frame is empty when the folder holds no CSV files.
    """
    # os.listdir returns entries in arbitrary order and also lists non-CSV
    # entries (e.g. notebook checkpoint folders). Keep only CSV files and
    # sort the names so the result is reproducible.
    files = sorted(
        name for name in os.listdir(pathToKeywordsFolder)
        if name.lower().endswith('.csv')
    )

    # Count frequency of words across all files
    wordsDict = {}  # hash map with key=word, value=frequency
    for filename in files:
        # os.path.join works with or without a trailing separator on the
        # folder, unlike plain string concatenation.
        pathToFile = os.path.join(pathToKeywordsFolder, filename)
        words = getListKeywords(pathToFile)
        wordsDict = updateDict(words, wordsDict)

    # Create dataframe from dict
    wordsFrequency = pd.DataFrame(list(wordsDict.items()),
                                  columns=['Word', 'Frequency'])
    # mergesort is stable, so ties keep their first-seen order.
    wordsFrequency = wordsFrequency.sort_values('Frequency', ascending=False,
                                                kind='mergesort')
    wordsFrequency = wordsFrequency.reset_index(drop=True)

    return wordsFrequency

In [4]:
"""
Save file with words and their frequency
"""
def saveCommonWords(pathToStopWordsFolder, pathToKeywordsFolder):
    """Compute the word frequencies and save them as 'CommonWords.csv'.

    Args:
        pathToStopWordsFolder: Folder in which 'CommonWords.csv' is written.
            A trailing path separator is optional.
        pathToKeywordsFolder: Folder passed on to findCommonWords.

    Returns:
        The DataFrame returned by findCommonWords, which is also the
        content written to the CSV file (without the index column).
    """
    import os  # local import so this cell does not depend on another cell

    filename = "CommonWords.csv"
    # Joining with os.path.join avoids writing to e.g. 'folderCommonWords.csv'
    # when the folder path is given without a trailing separator.
    pathToFile = os.path.join(pathToStopWordsFolder, filename)
    df_commonWords = findCommonWords(pathToKeywordsFolder)
    # Save df as .csv file
    df_commonWords.to_csv(pathToFile, index=False)
    print(f"Saved as {filename}")
    return df_commonWords

In [5]:
"""
Get list of words in descending order of frequency (more to less)
"""
def getCommonWords(pathToCommonWordsCsv):
    """Return the 'Word' column of the common-words CSV as a list.

    The words are returned in the order in which they appear in the file.
    This is a best-effort reader: when the file cannot be read or parsed,
    or has no 'Word' column, a message is printed and an empty list is
    returned instead of raising.

    Args:
        pathToCommonWordsCsv: Path to a CSV file with a 'Word' column.

    Returns:
        List of the values in the 'Word' column, or [] on failure.
    """
    try:
        df = pd.read_csv(pathToCommonWordsCsv)
        commonWords = list(df['Word'])
    except (OSError, ValueError, KeyError):
        # OSError: missing or unreadable file. ValueError: pandas
        # empty-data and parser errors, or an invalid path object.
        # KeyError: no 'Word' column. A bare except would also swallow
        # KeyboardInterrupt/SystemExit and hide genuine bugs.
        print("Could not find file for common words.")
        print(f"Given file path was {pathToCommonWordsCsv}.")
        return []
    return commonWords