# In [1]:
import re
from collections import Counter

import pandas as pd
from nltk.corpus import stopwords as stopwords
from nltk.stem import WordNetLemmatizer

# In [2]:
# Returns, for each transcript, the fraction (0-1, not a percentage) of its words that are occurrences of its n most frequent words.
def word_frequency(data, n, clean_data=True, remove_stopwords=False):
    """Measure how much of each transcript consists of its n most frequent words.

    Every entry of ``data['Transcript']`` is optionally cleaned, optionally
    stripped of English stopwords, split on whitespace and lemmatized. The
    value computed for a transcript is the summed count of its ``n`` most
    frequent lemmas divided by its total number of remaining tokens.

    Args:
        data: Object indexable by ``'Transcript'`` (e.g. a DataFrame) that
            yields an iterable of transcript strings.
        n: Number of most frequent words to include. Values of 0 or less
            select no words, so the resulting fraction is 0.0.
        clean_data: If true, lower-case the text and delete every character
            that is not an ASCII letter, digit or whitespace.
        remove_stopwords: If true, drop tokens whose lower-cased form is in
            NLTK's English stopword list before counting.

    Returns:
        A ``pandas.Series`` with one float per transcript, in iteration
        order and with a default integer index. A transcript that has no
        tokens left after preprocessing yields NaN, because the fraction is
        undefined for it.
    """
    # Loop-invariant setup: build these once rather than once per transcript.
    non_alnum = re.compile(r'[^a-zA-Z0-9\s]')
    lemmatizer = WordNetLemmatizer()
    # A set gives O(1) membership tests; only load the corpus when needed.
    stopword_set = set(stopwords.words('english')) if remove_stopwords else None
    # Clamp so that n == 0 selects nothing (a bare [-0:] slice would select
    # everything) and negative n cannot slice from the wrong end.
    top_n = max(n, 0)

    fractions = []
    for transcript in data['Transcript']:
        if clean_data:
            transcript = non_alnum.sub('', transcript.lower())

        words = transcript.split()
        if remove_stopwords:
            words = [w for w in words if w.lower() not in stopword_set]

        # Lemmatize words to remove tense inconsistencies.
        lemmas = [lemmatizer.lemmatize(w) for w in words]

        if not lemmas:
            # Nothing left to count: avoid dividing by zero.
            fractions.append(float('nan'))
            continue

        # Single-pass frequency count instead of list.count() per unique word.
        counts = Counter(lemmas)
        top_counts = sorted(counts.values(), reverse=True)[:top_n]
        fractions.append(sum(top_counts) / len(lemmas))

    # Convert list to series and return.
    return pd.Series(fractions)