In [14]:
import requests
import numpy as np
import os
import json
import plotly.express as px
import pandas as pd
pd.set_option('display.max_colwidth', 400)

import re

# NLP
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')   # Required for tokenization
nltk.download('wordnet') # Required for lemmatization
nltk.download('stopwords') # Required for lemmatization

from nltk.corpus import stopwords

from wordcloud import WordCloud
from collections import Counter

import warnings
warnings.filterwarnings("ignore")

from tqdm.notebook import tqdm
tqdm.pandas()

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# Instantiate
lemmatizer = WordNetLemmatizer()

# Exclusion list of punctuations and numbers
exclist = string.punctuation + string.digits


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\daire\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\daire\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\daire\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
metaData = pd.read_excel("MetaOfEarningCallsV2.xlsx")

In [12]:
# Create a Function
def clean_texts(text):
    """ Function to perform preprocessing """

    #remove tags
    text=re.sub("</?.*?>"," <> ",text)
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    # Convert to lower cases
    text = text.lower()
    
    # Remove punctuations and numbers
    text = text.translate(str.maketrans("", "", exclist))
    
    # Replace certain words
    text = text.replace("leased", "lease")
    
    # Tokenization
    tokens = word_tokenize(text)
        
    # Lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Remove stop words
    tokens = [token for token in tokens if token not in stopwords.words('english')]
    
    # Join tokens
    clean_text = " ".join(tokens)
    
    # Return the output
    return clean_text


In [15]:
metaData["cleanText"] = metaData["Text"].progress_apply(clean_texts)

  0%|          | 0/7379 [00:00<?, ?it/s]

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

def get_stop_words(stop_file_path):
    """load stop words """
    
    with open(stop_file_path, 'r', encoding="utf-8") as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        return frozenset(stop_set)

#load a set of stop words
stopwords=get_stop_words("../resources/stopwords.txt")
docs=metaData["cleanText"].tolist()

#create a vocabulary of words, 
#ignore words that appear in 85% of documents, 
#eliminate stop words
cv=CountVectorizer(max_df=0.8,stop_words=list(stopwords))
word_count_vector=cv.fit_transform(docs)

In [17]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

In [18]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []

    for idx, score in sorted_items:
        fname = feature_names[idx]
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results


In [21]:
#generate tf-idf for all documents in your list. docs_test has 500 documents
tf_idf_vector=tfidf_transformer.transform(cv.transform(metaData["cleanText"].tolist()))
feature_names=cv.get_feature_names_out()

results=[]
for i in range(tf_idf_vector.shape[0]):
    
    # get vector for a single document
    curr_vector=tf_idf_vector[i]
    
    #sort the tf-idf vector by descending order of scores
    sorted_items=sort_coo(curr_vector.tocoo())

    #extract only the top n; n here is 10
    keywords=extract_topn_from_vector(feature_names,sorted_items,20)
    
    
    results.append(keywords)

df=pd.DataFrame(zip(docs,results),columns=['cleanText','keywords_vals'])
data=metaData.merge(df, on="cleanText", how = "inner")


In [23]:
data["Keywords"] = data["keywords_vals"].progress_apply(lambda x: list(x.keys()))
flat_list = [item for sublist in list(data["Keywords"]) for item in sublist]

common_words = pd.DataFrame(Counter(flat_list).most_common(60))
common_words.columns = ('Word', 'Count')

  0%|          | 0/7379 [00:00<?, ?it/s]

In [24]:
data

Unnamed: 0.1,Unnamed: 0,gltr_0,gltr_1,gltr_2,gltr_3,gltr_Label,gpt_zero_Burtiness,gpt_zero_Label,gpt_zero_Perplexity,roberta_Label,roberta_large_Label,roberta_large_score_0,roberta_large_score_1,roberta_score_Fake,roberta_score_Real,Text,cleanText,keywords_vals,Keywords
0,0,0.748011,0.188329,0.055703,0.007958,1,70,1,16,0,0,0.932329,0.067671,0.000259,0.999741,"Thank you. Good afternoon and thanks for joining us. Speaking today is Apple CFO Peter Oppenheimer and he’ll be joined by Apple COO Tim Cook and Treasurer Gary Wipfler for the Q&A session with analysts. Please note that some of the information you’ll hear during our discussion today may consist of forward looking statements regarding revenue, gross margin, operating expenses, other income and ...",thank good afternoon thanks joining u speaking today apple cfo peter oppenheimer joined apple coo tim cook treasurer gary wipfler q session analyst please note information hear discussion today may consist forward looking statement regarding revenue gross margin operating expense income expense stock based compensation expense tax earnings actual result trend could differ materially forecast i...,"{'apple': 0.338, 'oppenheimer': 0.242, 'peter': 0.196, 'revenue': 0.187, 'thank': 0.162, 'december': 0.142, 'wa': 0.14, 'today': 0.139, 'form': 0.136, 'joining': 0.134, 'margin': 0.134, 'gross': 0.131, 'wipfler': 0.127, 'treasurer': 0.124, 'expense': 0.122, 'attached': 0.122, 'statement': 0.121, 'earnings': 0.119, 'gary': 0.113, 'breaking': 0.109}","[apple, oppenheimer, peter, revenue, thank, december, wa, today, form, joining, margin, gross, wipfler, treasurer, expense, attached, statement, earnings, gary, breaking]"
1,1,0.692090,0.206215,0.076271,0.025424,0,240,0,22,0,0,0.997073,0.002927,0.000172,0.999828,"approximately two times net income. I’d like to first talk about our Mac products and services, which represented 47% of total quarterly revenue. We are extremely pleased to have shipped 2.32 million Macs, exceeding the previous December quarter shipments by over 700,000 units and representing 44% year over year growth. That’s more than 2.5 times the overall market rate of growth for the Decem...",approximately two time net income like first talk mac product service represented total quarterly revenue extremely please shipped million mac exceeding previous december quarter shipment unit representing year year growth time overall market rate growth december quarter based latest forecast published idc sale new imac announced august continued robust helping drive year year growth desktop s...,"{'ipod': 0.498, 'leopard': 0.365, 'mac': 0.26, 'extremely': 0.145, 'music': 0.144, 'tiger': 0.138, 'began': 0.131, 'channel': 0.119, 'sold': 0.119, 'wifi': 0.117, 'december': 0.117, 'inventory': 0.113, 'mainstream': 0.113, 'reviewer': 0.108, 'ended': 0.108, 'total': 0.105, 'revenue': 0.103, 'million': 0.1, 'surpassing': 0.099, 'imac': 0.099}","[ipod, leopard, mac, extremely, music, tiger, began, channel, sold, wifi, december, inventory, mainstream, reviewer, ended, total, revenue, million, surpassing, imac]"
2,2,0.708333,0.213889,0.061111,0.016667,1,229,1,21,0,0,0.993504,0.006496,0.056833,0.943167,"mobile applications. Because of the higher cost associated with the large touch screen and more powerful processor required to run applications like Safari, this was the most expensive iPod we’ve brought to market for some time. So we had the challenge of establishing a completely new type of iPod at the top of the line at a price point above where we’ve been for quite some time, and we succee...",mobile application higher cost associated large touch screen powerful processor required run application like safari wa expensive ipod brought market time challenge establishing completely new type ipod top line price point quite time succeeded addition selling successfully ipod touch wa responsible overall increase ipod asp drove revenue year year last week announced major software upgrade ip...,"{'ipod': 0.495, 'iphone': 0.245, 'touch': 0.208, 'application': 0.187, 'according': 0.171, 'screen': 0.159, 'iphones': 0.145, 'wa': 0.138, 'million': 0.124, 'gorgeous': 0.124, 'rental': 0.119, 'succeeded': 0.117, 'rave': 0.117, 'december': 0.116, 'safari': 0.115, 'store': 0.115, 'apple': 0.111, 'market': 0.107, 'mobile': 0.102, 'revenue': 0.102}","[ipod, iphone, touch, application, according, screen, iphones, wa, million, gorgeous, rental, succeeded, rave, december, safari, store, apple, market, mobile, revenue]"
3,3,0.726064,0.164894,0.077128,0.031915,1,1699,0,23,0,0,0.995742,0.004258,0.000207,0.999793,"year ago quarter. We opened our third store in Manhattan on West 14th street, which is off to a great start and devotes an entire floor to the Genius Bar, personal training, and pro lab. We opened six other new stores during the quarter, ending with 204 stores. With an average of 201 stores open during the quarter, average revenue per store was $8.5 million compared to $6.6 million in the year...",year ago quarter opened third store manhattan west th street great start devotes entire floor genius bar personal training pro lab opened six new store quarter ending store average store open quarter average revenue per store wa million compared million year ago quarter store sold record mac quarter representing year year growth customer buying mac store quarter new mac store traffic hit new r...,"{'store': 0.639, 'visitor': 0.232, 'training': 0.225, 'genius': 0.2, 'million': 0.17, 'mac': 0.166, 'personal': 0.162, 'opened': 0.135, 'traffic': 0.127, 'devotes': 0.118, 'new': 0.116, 'software': 0.111, 'representing': 0.111, 'concierge': 0.109, 'manhattan': 0.104, 'leopard': 0.104, 'wa': 0.098, 'stronger': 0.095, 'customer': 0.094, 'capitalized': 0.091}","[store, visitor, training, genius, million, mac, personal, opened, traffic, devotes, new, software, representing, concierge, manhattan, leopard, wa, stronger, customer, capitalized]"
4,0,0.770889,0.159030,0.064690,0.005391,1,69,1,15,0,0,0.979467,0.020533,0.002525,0.997475,"Thank you. Good afternoon and thanks to everyone for joining us. Speaking today is Apple CFO Peter Oppenheimer and he’ll be joined by Apple COO Tim Cook and Treasurer Gary Wipfler for the Q&A session with analysts. Please note that some of the information you will hear during our discussion today may consist of forward looking statements regarding revenue, gross margin, operating expenses, oth...",thank good afternoon thanks everyone joining u speaking today apple cfo peter oppenheimer joined apple coo tim cook treasurer gary wipfler q session analyst please note information hear discussion today may consist forward looking statement regarding revenue gross margin operating expense income expense stock based compensation expense tax earnings future product actual result trend could diff...,"{'march': 0.327, 'apple': 0.325, 'oppenheimer': 0.233, 'form': 0.197, 'peter': 0.189, 'revenue': 0.18, 'thank': 0.156, 'wa': 0.135, 'today': 0.134, 'joining': 0.129, 'margin': 0.129, 'gross': 0.127, 'wipfler': 0.122, 'treasurer': 0.12, 'expense': 0.117, 'attached': 0.117, 'statement': 0.116, 'earnings': 0.115, 'popularity': 0.115, 'gary': 0.109}","[march, apple, oppenheimer, form, peter, revenue, thank, wa, today, joining, margin, gross, wipfler, treasurer, expense, attached, statement, earnings, popularity, gary]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7374,1,0.646900,0.194070,0.123989,0.035040,0,1954,0,41,0,0,0.999348,0.000652,0.000299,0.999701,"with you information about the company and our plans on more of a real time basis. With that, I'll hand it over to Darren. Thanks Jennifer. Good morning and thanks for joining us today. Before covering our 2022 results, I want to start by recognizing our people. Their hard work and commitment, not only to the company, but to meeting the critical needs of society, are what drove the strong resu...",information company plan real time basis hand darren thanks jennifer good morning thanks joining u today covering result want start recognizing people hard work commitment company meeting critical need society drove strong result reported morning work easy whether achieving industry leading safety driving record level environmental performance increasing production offsetting run away inflatio...,"{'leaned': 0.282, 'organizational': 0.205, 'leading': 0.17, 'profit': 0.161, 'undersupplied': 0.157, 'bucking': 0.157, 'industry': 0.15, 'work': 0.147, 'wisdom': 0.145, 'pandemic': 0.145, 'result': 0.142, 'expropriation': 0.141, 'cyclically': 0.138, 'delivered': 0.117, 'morning': 0.116, 'darren': 0.114, 'storm': 0.114, 'jennifer': 0.112, 'recovering': 0.111, 'thanks': 0.109}","[leaned, organizational, leading, profit, undersupplied, bucking, industry, work, wisdom, pandemic, result, expropriation, cyclically, delivered, morning, darren, storm, jennifer, recovering, thanks]"
7375,2,0.664850,0.198910,0.103542,0.032698,0,204,0,36,0,0,0.936033,0.063967,0.000908,0.999092,"priorities. Importantly, we’ve continued to strengthen our industry leading portfolio and increased production from high return, advantaged assets in Guyana and the Permian at a time when the world needed it most. We implemented a series of organizational changes to further leverage our scale and integration, improve effectiveness, and better serve our customers. We combined our downstream and...",priority importantly continued strengthen industry leading portfolio increased production high return advantaged asset guyana permian time world needed implemented series organizational change leverage scale integration improve effectiveness better serve customer combined downstream chemical company form product solution world largest fuel chemical lubricant business new integrated business fo...,"{'emission': 0.425, 'sustainability': 0.184, 'advantaged': 0.181, 'carbon': 0.164, 'capture': 0.144, 'operation': 0.139, 'permian': 0.138, 'plan': 0.136, 'incentivizes': 0.134, 'high': 0.127, 'chemical': 0.123, 'opportunity': 0.122, 'passage': 0.119, 'advantage': 0.118, 'portfolio': 0.117, 'decarbonize': 0.117, 'permanently': 0.115, 'greenhouse': 0.11, 'business': 0.109, 'hydrogen': 0.108}","[emission, sustainability, advantaged, carbon, capture, operation, permian, plan, incentivizes, high, chemical, opportunity, passage, advantage, portfolio, decarbonize, permanently, greenhouse, business, hydrogen]"
7376,3,0.673025,0.196185,0.100817,0.029973,0,251,1,34,0,0,0.999045,0.000955,0.000248,0.999752,"approximately $5 billion in divestments of non core assets. CapEx was in line with our guidance. To further increase transparency, in 2022, we introduced three new reports; The Lobbying Report, which provides additional disclosure of our lobbying activities and expenditures; The Climate Lobbying Report, which provides details on our US activities at the federal and state level; and our Investi...",approximately billion divestments non core asset capex wa line guidance increase transparency introduced three new report lobbying report provides additional disclosure lobbying activity expenditure climate lobbying report provides detail u activity federal state level investing people supplement addition updated sustainability report lastly mentioned hard work people underpinned success past ...,"{'lobbying': 0.448, 'report': 0.25, 'industry': 0.19, 'billion': 0.166, 'strategy': 0.165, 'structural': 0.164, 'nearly': 0.148, 'activity': 0.135, 'fortifying': 0.134, 'cyclically': 0.132, 'roce': 0.127, 'success': 0.122, 'honor': 0.12, 'investing': 0.12, 'attracting': 0.111, 'underpinned': 0.11, 'confirmed': 0.11, 'leading': 0.108, 'past': 0.107, 'shareholder': 0.107}","[lobbying, report, industry, billion, strategy, structural, nearly, activity, fortifying, cyclically, roce, success, honor, investing, attracting, underpinned, confirmed, leading, past, shareholder]"
7377,4,0.689840,0.189840,0.088235,0.032086,0,1294,0,30,0,0,0.998707,0.001293,0.002208,0.997792,"growing dividend. In 2022, we boosted the quarterly dividend by more than 3% and marked the 40th consecutive annual increase. Additionally, we increased our share repurchase program twice during the year. In total, we returned $30 billion to shareholders in 2022, including about $15 billion in dividends, which also led peers. These actions reflect the confidence we have in our strategy, the pe...",growing dividend boosted quarterly dividend marked th consecutive annual increase additionally increased share repurchase program twice year total returned billion shareholder including billion dividend also led peer action reflect confidence strategy performance seen across business strength company future proud people work meet evolving need society advance strategy equation committed sharin...,"{'emission': 0.41, 'company': 0.226, 'scope': 0.181, 'zero': 0.167, 'approach': 0.162, 'dividend': 0.157, 'society': 0.155, 'includes': 0.143, 'updated': 0.142, 'mackenzie': 0.125, 'avoids': 0.125, 'cycle': 0.123, 'specific': 0.12, 'iea': 0.116, 'reduce': 0.115, 'life': 0.115, 'recycling': 0.115, 'report': 0.113, 'plastic': 0.111, 'equation': 0.111}","[emission, company, scope, zero, approach, dividend, society, includes, updated, mackenzie, avoids, cycle, specific, iea, reduce, life, recycling, report, plastic, equation]"
