# All Data - NLP EDA

In [1]:
# Location of the sample CSV -- edit this so it points at the file on your machine
sample_path = r'~\Box\STAT 222 Capstone\Intermediate Data\all_data_sample.csv'
# Dataset switch:
#   True  -> sample dataset, 100 earnings calls (transcripts included)
#   False -> full dataset, 4532 earnings calls (transcripts included)
sample = False

In [2]:
# Packages
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud

In [3]:
# Read the dataset chosen by the `sample` flag: the CSV at `sample_path` when it is set,
# otherwise the full parquet file at ~\Box\STAT 222 Capstone\Intermediate Data\all_data.parquet
df = (
    pd.read_csv(sample_path)
    if sample
    else pd.read_parquet(r'~\Box\STAT 222 Capstone\Intermediate Data\all_data.parquet')
)
df

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\ijyli\\Box\\STAT 222 Capstone\\Intermediate Data\\all_data.parquet'

In [None]:
# Basic Statistics: number of rows and mean transcript size in characters
num_records, avg_length = len(df), df['transcript'].str.len().mean()

# A single print call with a newline separator emits the same two lines of output
print(
    f"Number of records: {num_records}",
    f"Average transcript length: {avg_length:.2f} characters",
    sep="\n",
)

In [None]:
# Download the NLTK resources this cell needs before any tokenizing happens.
# Newer NLTK releases look the tokenizer up under 'punkt_tab' rather than 'punkt',
# so both are requested; a resource that is unavailable is skipped without raising.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource, quiet=True)

# Rows without a transcript cannot be tokenized, so leave them out of the text statistics
transcripts = df['transcript'].dropna()

# Tokenize the text
tokens = transcripts.apply(word_tokenize)

# Remove stop words (compared case-insensitively) and anything that is not purely alphabetic
stop_words = set(stopwords.words('english'))
tokens = tokens.apply(
    lambda words: [word for word in words if word.isalpha() and word.lower() not in stop_words]
)

# Number of sentences
sentences = transcripts.apply(nltk.sent_tokenize)
num_sentences = sentences.apply(len)
print('average number of sentences:', num_sentences.mean())

# Sentence length, in tokens per sentence
sentence_lengths = sentences.apply(
    lambda transcript_sentences: [len(word_tokenize(sentence)) for sentence in transcript_sentences]
)
# explode() yields an object-dtype Series (NaN for a transcript with no sentences);
# convert it to a numeric dtype before averaging
sentence_lengths = pd.to_numeric(sentence_lengths.explode())
print('average sentence length:', sentence_lengths.mean())

# Word Frequency Analysis (case-folded so "Growth" and "growth" count together)
all_words = [word.lower() for token_list in tokens for word in token_list]
fdist = FreqDist(all_words)
top_words = fdist.most_common(10)
print("Top 10 most common words:")
for word, freq in top_words:
    print(f"{word}: {freq}")

# Plot Word Frequency Distribution
plt.figure(figsize=(10, 6))
fdist.plot(30, title='Top 30 Most Common Words');  # trailing ';' hides the returned Axes repr

In [None]:
# Histogram of transcript sizes, measured in characters
transcript_lengths = df['transcript'].str.len()

fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(transcript_lengths, bins=20, color='skyblue', edgecolor='black')
ax.set_title('Transcript Length Distribution')
ax.set_xlabel('Transcript Length (characters)')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

In [None]:
# Distribution of Number of Words in earning calls 
def count_words(tokens):
    """Return how many entries of `tokens` consist solely of alphabetic characters.

    Punctuation, numbers and mixed tokens such as "Q3" are not counted.
    """
    alphabetic_total = 0
    for word in tokens:
        if word.isalpha():
            alphabetic_total += 1
    return alphabetic_total

# Tokenize every transcript, then count its alphabetic tokens with count_words
word_tokens = df['transcript'].apply(word_tokenize)
num_words = word_tokens.apply(count_words)

fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(num_words, bins=20, color='skyblue', edgecolor='black')
ax.set_title('Number of Words Distribution')
ax.set_xlabel('Number of Words')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

print('Average transcript length in words:', num_words.mean())

In [None]:
# Generate word clouds for each credit rating
# (WordCloud is imported in the notebook's import cell.)
ratings = ["AA", "BBB", "CCC"]

# The stop-word set does not depend on the rating, so build it once outside the loop
stop_words = set(stopwords.words('english'))

for rating in ratings:

    rating_df = df[df['Rating'] == rating]
    # Rows without a transcript cannot be tokenized
    tokens = rating_df['transcript'].dropna().apply(word_tokenize)
    # Keep alphabetic tokens that are not stop words (compared case-insensitively)
    tokens = tokens.apply(
        lambda words: [word for word in words if word.isalpha() and word.lower() not in stop_words]
    )

    all_words = [word.lower() for token_list in tokens for word in token_list]
    fdist = FreqDist(all_words)

    # generate_from_frequencies raises ValueError when given no words, so skip a
    # rating that has no transcripts (or no usable words) instead of aborting the loop
    if not fdist:
        print(f'No words found for credit rating {rating}; skipping word cloud.')
        continue

    # Generate Word Cloud
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(fdist)

    # Plot Word Cloud
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Word Cloud for Credit Rating {rating}')
    plt.show()

We can see from the word cloud for credit rating AA that positive words like "growth" appear larger than they do in the word cloud for credit rating CCC.

In [None]:
# Bigrams and trigrams
# Generate for each credit rating
ratings = ["AA", "BBB", "CCC"]

# n-gram sizes to chart, mapped to the label used in the plot title.
# Only bigrams are charted; add `3: 'Trigrams'` to chart trigrams as well.
ngram_labels = {2: 'Bigrams'}

# Built here so the cell does not rely on a variable left behind by an earlier cell
stop_words = set(stopwords.words('english'))


def top_ngrams(texts, n, stop_words, top_k=10):
    """Return the `top_k` most frequent n-grams of size `n` found in `texts`.

    Parameters
    ----------
    texts : iterable of str
        Documents to count n-grams in.
    n : int
        n-gram size (2 for bigrams, 3 for trigrams).
    stop_words : set of str
        n-grams made up exclusively of these words are discarded.
    top_k : int, default 10
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns "frequency" and "ngram", ordered by descending frequency
        (ties broken by descending n-gram text).

    Raises
    ------
    ValueError
        Propagated from CountVectorizer when `texts` yields an empty vocabulary.
    """
    cv = CountVectorizer(ngram_range=(n, n))
    counts = cv.fit_transform(texts)
    # Sum the columns on the sparse matrix itself; converting it to a dense array first
    # would allocate a full (documents x distinct n-grams) array just to add it up
    count_values = np.asarray(counts.sum(axis=0)).ravel()
    ngram_freq = pd.DataFrame(
        sorted(((count_values[idx], ngram) for ngram, idx in cv.vocabulary_.items()), reverse=True),
        columns=["frequency", "ngram"],
    )
    # toss out items if they only contain stop words
    only_stop_words = ngram_freq['ngram'].apply(
        lambda ngram: all(word in stop_words for word in ngram.split())
    )
    return ngram_freq[~only_stop_words].head(top_k)


for rating in ratings:

    rating_df = df[df['Rating'] == rating]
    # Rows without a transcript cannot be vectorized
    rating_texts = rating_df['transcript'].dropna()
    if rating_texts.empty:
        # CountVectorizer raises on an empty corpus; skip the rating instead of aborting
        print(f'No transcripts found for rating {rating}; skipping n-gram plots.')
        continue

    for n, label in ngram_labels.items():
        top = top_ngrams(rating_texts, n, stop_words)
        sns.barplot(data=top, x='frequency', y='ngram')
        plt.title(f'Top 10 Most Frequently Occuring {label} for Rating ' + rating)
        plt.show()

The bigram "thank you" becomes far less common as the credit rating deteriorates!