In [1]:
import pandas as pd
import string
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
# Load the DataFrame of the top 10 companies (2007-2017) together with their raw MDA text.
# Later cells iterate over the columns and index every cell as a pair: [0] is kept as-is
# and [1] is the MDA text that gets cleaned.
# NOTE(review): unpickling can execute arbitrary code - only load this file from a trusted source.
df = pd.read_pickle('Top10_final.pkl')

In [3]:
# Shared NLP resources, built once and reused by clean_message:
# the English stop-word set (a set gives fast membership tests) and the WordNet lemmatizer.
english_stopwords = {word for word in stopwords.words('english')}
wordnet_lemmatizer = WordNetLemmatizer()

## Cleaning Text for Sentiment Analysis

In [4]:
def clean_message_sentence(msg):
    '''Clean raw MDA text for sentiment analysis.

    The text is lower-cased and tokenized with nltk.word_tokenize. Inside
    every token, backslash-x-digits escape residue and every run of characters
    other than ASCII letters or periods is replaced by a space, and the result
    is re-split on whitespace. Periods are kept so the text can later be
    analyzed by sentence or by word.

    Parameters
    ----------
    msg : object
        Uncleaned MDA text; it is converted with str() before processing.

    Returns
    -------
    list of str
        Cleaned tokens in their original order, without the stray fragments
        'x', 'k', 's', 'ex' and 'htm'.
    '''
    # The previous character class was [^A-Za-z|\.]: inside [...] the "|" is a
    # literal pipe, not alternation, so pipe characters survived the cleaning.
    pattern = r'\\x[0-9]*|[^A-Za-z.]+'
    # Leftover fragments to drop (presumably remnants such as "10-k" or ".htm"
    # - TODO confirm against the raw filings).
    letters = {'x', 'k', 's', 'ex', 'htm'}

    tokens = nltk.word_tokenize(str(msg).lower())
    # Join and re-split so a token broken apart by the substitution yields
    # several separate words.
    pieces = ' '.join(re.sub(pattern, ' ', token) for token in tokens).split()

    return [piece for piece in pieces if piece not in letters]

In [5]:
# Work on a separate copy so the sentiment-analysis cleaning does not modify `df`.
df1 = df.copy(deep=True)

In [6]:
# Clean the MDA text of every cell for sentiment analysis.
# Each cell is indexed as a pair: item [0] is carried over untouched and item [1]
# (the MDA text) is replaced by its cleaned token list.
# Whole columns are assigned at once: `df1[year].iloc[i] = ...` is chained indexing,
# which can write to a temporary object and leave df1 unchanged (it never updates
# df1 when pandas Copy-on-Write is enabled).
# Cells are read from `df` rather than `df1`, so re-running this cell does not
# clean already-cleaned text as long as `df` still holds the raw text.
for year in df:
    df1[year] = df[year].map(
        lambda cell: (cell[0], clean_message_sentence(cell[1]))
    )

In [7]:
# Persist the sentiment-analysis version of the cleaned MDA text (periods retained).
df1.to_pickle(path='Cleaned_MDA_sentences.pkl')

## Cleaning Text for Word Vectorization

In [8]:
def clean_message(msg):
    '''Clean raw MDA text into lemmatized word tokens for vectorizing.

    The text is lower-cased and tokenized with nltk.word_tokenize. Stop words
    are dropped, backslash-x-digits escape residue and every run of characters
    other than ASCII letters is replaced by a space, the result is re-split on
    whitespace, and the surviving words are lemmatized.

    Parameters
    ----------
    msg : object
        Uncleaned MDA text; it is converted with str() before processing.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order, without stop words and
        without the stray fragments 'x', 'k', 's', 'ex' and 'htm'.
    '''
    pattern = r'\\x[0-9]*|[^A-Za-z]+'
    # Leftover fragments to drop (presumably remnants such as "10-k" or ".htm"
    # - TODO confirm against the raw filings).
    letters = {'x', 'k', 's', 'ex', 'htm'}

    tokens = nltk.word_tokenize(str(msg).lower())
    non_stop_tokens = (token for token in tokens if token not in english_stopwords)
    # Join and re-split so a token broken apart by the substitution yields
    # several separate words.
    words = ' '.join(re.sub(pattern, ' ', token) for token in non_stop_tokens).split()

    # Stop words must be filtered again here: stripping punctuation can expose
    # them (e.g. "and/or" -> "and", "or"; "year-to-date" -> "year", "to", "date"),
    # and filtering only before the split let those through.
    return [
        wordnet_lemmatizer.lemmatize(word)
        for word in words
        if word not in letters and word not in english_stopwords
    ]

In [9]:
# Clean the MDA text of every cell for word vectorizing.
# Each cell is indexed as a pair: item [0] is carried over untouched and item [1]
# (the MDA text) is replaced by its list of lemmatized tokens.
# Whole columns are assigned at once: `df[year].iloc[i] = ...` is chained indexing,
# which can write to a temporary object and leave df unchanged (it never updates
# df when pandas Copy-on-Write is enabled).
# NOTE: this overwrites `df` in place. Do not re-run this cell without reloading `df`,
# because a second pass would push the token lists back through str() in clean_message.
for year in df:
    df[year] = df[year].map(
        lambda cell: (cell[0], clean_message(cell[1]))
    )

In [10]:
# Persist the word-vectorizing version of the cleaned MDA text (all punctuation removed).
df.to_pickle(path='Cleaned_MDA.pkl')