In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords
from PIL import Image
import requests
import numpy as np
import re
import warnings
import nltk
import collections
from collections import Counter
from collections import Counter, OrderedDict
import itertools
from nltk.stem import WordNetLemmatizer, PorterStemmer, SnowballStemmer
warnings.filterwarnings('ignore')

In [None]:
#load doc
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook only
# runs where this exact file exists. skiprows = 2 makes read_csv ignore the first two
# lines of the file before it reads the header row.
df = pd.read_csv(r"C:\Users\fero9\OneDrive\Desktop\Babson\Spring\Advanced Programming\Final Project\Data - Final Copy.csv", skiprows = 2)

In [None]:
# Replace missing values in the first row with 0 and write the result back into df.
# Calling fillna(inplace = True) on the temporary slice df[0:1] only mutates that
# slice object, so the change is not guaranteed to reach df itself.
df.iloc[0:1] = df.iloc[0:1].fillna(0)

In [None]:
#cast the first row to integers and write it back into df
df[:1] = df[:1].astype(int)

#transpose so that columns become rows, then keep only the rows whose value under
#label 0 is non-zero, i.e. drop every original column whose entry in the row
#labelled 0 equals 0
df = df.T[df.T[0] != 0]

#transpose back so the surviving columns are columns again
df = df.T

#drop the first row (the one used above as the keep/drop flag)
df = df.iloc[1:,:]

# Word Processor for Word Cloud

In [None]:
#word processor
# Download the NLTK stop-word corpus, then load its English list as a set.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Extra words to exclude, merged into the STOPWORDS set imported from wordcloud.
more_stopwords = {
    'know', 'would', 'going', 'get', 'things', 'need',
    'want', 're', 't', 's', 'n', 've', 'really', 'one', 'two', 'three',
}
STOPWORDS = STOPWORDS | more_stopwords

def preprocess(raw_text):
    """Clean free text for word counting and word clouds.

    Every non-letter character is replaced by a space, the text is lower-cased
    and split on whitespace, and each token found in either of the module-level
    stop-word sets (``stop_words`` or ``STOPWORDS``) is dropped.

    Args:
        raw_text: The string to clean.

    Returns:
        The remaining tokens, in their original order, joined by single spaces
        (an empty string when no token survives).
    """
    #regular expression keeping only letters
    letters_only_text = re.sub("[^a-zA-Z]", " ", raw_text)

    #convert to lower case and split into words ( 'hello world' -> ['hello', 'world'])
    words = letters_only_text.lower().split()

    # A token is kept only when it is absent from BOTH stop-word sets. Filtering in
    # a single pass (rather than appending the output of two separate passes to the
    # same list) emits each surviving word once and stops a word that appears in
    # just one of the sets from slipping through.
    cleaned_words = [
        word for word in words
        if word not in stop_words and word not in STOPWORDS
    ]

    #converting list back to string
    return " ".join(cleaned_words)

# Number of Employees

In [None]:
# Head-count frame: leave out the other three metadata columns, keep only rows that
# have a head count, strip every ',' from that column and cast it to int.
df2 = (
    df.drop(columns=['Organization', 'Revenue', 'Realized_Returns'])
    .loc[lambda frame: frame['Number_Of_Employees'].notna()]
    .assign(
        Number_Of_Employees=lambda frame: (
            frame['Number_Of_Employees'].replace(',', '', regex=True).astype(int)
        )
    )
)

In [None]:
#label a row with its company size bucket
def z(row):
    """Return the size bucket for a row based on ``row['Number_Of_Employees']``.

    Returns:
        'Small' when the head count is 1000 or less, 'Large' when it is 3000 or
        more, and 'Medium' otherwise.
    """
    headcount = row['Number_Of_Employees']
    if headcount <= 1000:
        return 'Small'
    if headcount >= 3000:
        return 'Large'
    return 'Medium'

# Run z on every row and store the resulting bucket in a new 'Company Size' column.
df2['Company Size'] = df2.apply(z, axis = 'columns')

In [None]:
# Reposition 'Company Size' as the second column: pop removes it from df2 and
# returns it, insert puts it back at position 1.
companysize = df2.pop('Company Size')
df2.insert(1, 'Company Size', companysize)

### Small Size

In [None]:
# Rows labelled 'Small', without the grouping columns; missing cells become the
# string "no" and every cell is converted to text.
df_small_size = (
    df2.loc[df2['Company Size'] == 'Small']
    .drop(columns=['Number_Of_Employees', 'Company Size'])
    .fillna("no")
    .astype(str)
)

In [None]:
# Stack every column on top of the previous one into a single Series. A single
# concat over the whole list avoids re-copying the growing Series on each pass.
small_size_rows = pd.concat(
    [df_small_size.iloc[:, i] for i in range(len(df_small_size.columns))]
)

In [None]:
# Clean each stacked answer with preprocess.
small_size_rows = small_size_rows.map(preprocess)

In [None]:
# Pair each word with its successor inside every answer, count the pairs, and
# tabulate the 20 most frequent.
bigrams_list_small_size = [
    pair
    for words in (text.split(" ") for text in small_size_rows)
    for pair in zip(words, words[1:])
]
bigram_counts_small_size = collections.Counter(bigrams_list_small_size)

bigram_small_size = pd.DataFrame(
    bigram_counts_small_size.most_common(20), columns=['bigram', 'count']
)
bigram_small_size

In [None]:
# The 15 most frequent words across all answers, as (word, count) pairs.
counter_list_small_size = Counter(
    word for text in small_size_rows for word in text.split()
).most_common(15)
counter_list_small_size

In [None]:
# Bar chart of the most common words (word -> count, in ranked order).
small_size_results = OrderedDict(counter_list_small_size)
fig, ax = plt.subplots(figsize=(8, 8))
ax.bar(small_size_results.keys(), small_size_results.values(), color='#FF0000')
ax.set_title('Most common words used by CEOs of small size companies',
             fontsize=15, fontweight='bold', loc='center')
plt.xticks(rotation=90)
plt.show()

In [None]:
#word cloud
# Collapse whitespace inside each answer and append the answers, each followed by a
# single space, into one string.
small_size = "".join(" ".join(arg.split()) + " " for arg in small_size_rows)

def word_cloud(data, title):
    """Draw a 700x700 word cloud of ``data`` with ``title`` above it.

    Args:
        data: Text handed to ``WordCloud.generate``.
        title: Heading shown over the image.
    """
    # Generate from the ``data`` argument rather than a notebook-level variable,
    # so the function renders whatever text it is called with.
    wordcloud = WordCloud(width = 700,
                          height = 700,
                          stopwords = STOPWORDS,
                          background_color = 'black',
                          min_font_size = 10).generate(data)

    #plot the WordCloud image
    plt.figure(figsize = (4.5, 4.5), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title(title, fontsize = 13)
    plt.tight_layout(pad = 0)
    plt.show()

word_cloud(small_size, 'Most common words used by CEOs of small size companies')

### Medium Size

In [None]:
# Rows labelled 'Medium', without the grouping columns; missing cells become the
# string "no" and every cell is converted to text.
df_medium_size = (
    df2.loc[df2['Company Size'] == 'Medium']
    .drop(columns=['Number_Of_Employees', 'Company Size'])
    .fillna("no")
    .astype(str)
)

In [None]:
# Stack every column on top of the previous one into a single Series. A single
# concat over the whole list avoids re-copying the growing Series on each pass.
medium_size_rows = pd.concat(
    [df_medium_size.iloc[:, i] for i in range(len(df_medium_size.columns))]
)

In [None]:
# Clean each stacked answer with preprocess.
medium_size_rows = medium_size_rows.map(preprocess)

In [None]:
# Pair each word with its successor inside every answer, count the pairs, and
# tabulate the 20 most frequent.
bigrams_list_medium_size = [
    pair
    for words in (text.split(" ") for text in medium_size_rows)
    for pair in zip(words, words[1:])
]
bigram_counts_medium_size = collections.Counter(bigrams_list_medium_size)

bigram_medium_size = pd.DataFrame(
    bigram_counts_medium_size.most_common(20), columns=['bigram', 'count']
)
bigram_medium_size

In [None]:
# The 15 most frequent words across all answers, as (word, count) pairs.
counter_list_medium_size = Counter(
    word for text in medium_size_rows for word in text.split()
).most_common(15)
counter_list_medium_size

In [None]:
# Bar chart of the most common words (word -> count, in ranked order).
medium_size_results = OrderedDict(counter_list_medium_size)
fig, ax = plt.subplots(figsize=(8, 8))
ax.bar(medium_size_results.keys(), medium_size_results.values(), color='#FF0000')
ax.set_title('Most common words used by CEOs of medium size companies',
             fontsize=15, fontweight='bold', loc='center')
plt.xticks(rotation=90)
plt.show()

In [None]:
#word cloud
# Collapse whitespace inside each answer and append the answers, each followed by a
# single space, into one string.
medium_size = "".join(" ".join(arg.split()) + " " for arg in medium_size_rows)

def word_cloud(data, title):
    """Draw a 700x700 word cloud of ``data`` with ``title`` above it.

    Args:
        data: Text handed to ``WordCloud.generate``.
        title: Heading shown over the image.
    """
    # Generate from the ``data`` argument rather than a notebook-level variable,
    # so the function renders whatever text it is called with.
    wordcloud = WordCloud(width = 700,
                          height = 700,
                          stopwords = STOPWORDS,
                          background_color = 'black',
                          min_font_size = 10).generate(data)

    #plot the WordCloud image
    plt.figure(figsize = (4.5, 4.5), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title(title, fontsize = 13)
    plt.tight_layout(pad = 0)
    plt.show()

word_cloud(medium_size, 'Most common words used by CEOs of medium size companies')

### Large Size

In [None]:
# Rows labelled 'Large', without the grouping columns; missing cells become the
# string "no" and every cell is converted to text.
df_large_size = (
    df2.loc[df2['Company Size'] == 'Large']
    .drop(columns=['Number_Of_Employees', 'Company Size'])
    .fillna("no")
    .astype(str)
)

In [None]:
# Stack every column on top of the previous one into a single Series. A single
# concat over the whole list avoids re-copying the growing Series on each pass.
large_size_rows = pd.concat(
    [df_large_size.iloc[:, i] for i in range(len(df_large_size.columns))]
)

In [None]:
# Clean each stacked answer with preprocess.
large_size_rows = large_size_rows.map(preprocess)

In [None]:
# Pair each word with its successor inside every answer, count the pairs, and
# tabulate the 20 most frequent.
bigrams_list_large_size = [
    pair
    for words in (text.split(" ") for text in large_size_rows)
    for pair in zip(words, words[1:])
]
bigram_counts_large_size = collections.Counter(bigrams_list_large_size)

bigram_large_size = pd.DataFrame(
    bigram_counts_large_size.most_common(20), columns=['bigram', 'count']
)
bigram_large_size

In [None]:
# The 15 most frequent words across all answers, as (word, count) pairs.
counter_list_large_size = Counter(
    word for text in large_size_rows for word in text.split()
).most_common(15)
counter_list_large_size

In [None]:
# Bar chart of the most common words (word -> count, in ranked order).
large_size_results = OrderedDict(counter_list_large_size)
fig, ax = plt.subplots(figsize=(8, 8))
ax.bar(large_size_results.keys(), large_size_results.values(), color='#FF0000')
ax.set_title('Most common words used by CEOs of large size companies',
             fontsize=15, fontweight='bold', loc='center')
plt.xticks(rotation=90)
plt.show()

In [None]:
#word cloud
# Collapse whitespace inside each answer and append the answers, each followed by a
# single space, into one string.
large_size = "".join(" ".join(arg.split()) + " " for arg in large_size_rows)

def word_cloud(data, title):
    """Draw a 700x700 word cloud of ``data`` with ``title`` above it.

    Args:
        data: Text handed to ``WordCloud.generate``.
        title: Heading shown over the image.
    """
    # Generate from the ``data`` argument rather than a notebook-level variable,
    # so the function renders whatever text it is called with.
    wordcloud = WordCloud(width = 700,
                          height = 700,
                          stopwords = STOPWORDS,
                          background_color = 'black',
                          min_font_size = 10).generate(data)

    #plot the WordCloud image
    plt.figure(figsize = (4.5, 4.5), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title(title, fontsize = 13)
    plt.tight_layout(pad = 0)
    plt.show()

word_cloud(large_size, 'Most common words used by CEOs of large size companies')

# Revenue

In [None]:
# Revenue frame: leave out the other three metadata columns, keep only rows that
# have a revenue value, and cast that column to int.
df3 = (
    df.drop(columns=['Organization', 'Number_Of_Employees', 'Realized_Returns'])
    .loc[lambda frame: frame['Revenue'].notna()]
    .assign(Revenue=lambda frame: frame['Revenue'].astype(int))
)

In [None]:
#label a row with its revenue bucket
def y(row):
    """Return the revenue bucket for a row based on ``row['Revenue']``.

    Returns:
        'Small' when revenue is 500 or less, 'Large' when it is 1000 or more,
        and 'Medium' otherwise.
    """
    revenue = row['Revenue']
    if revenue <= 500:
        return 'Small'
    if revenue >= 1000:
        return 'Large'
    return 'Medium'

# Run y on every row and store the resulting bucket in a new 'Revenue Size' column.
df3['Revenue Size'] = df3.apply(y, axis = 'columns')

In [None]:
# Reposition 'Revenue Size' as the second column: pop removes it from df3 and
# returns it, insert puts it back at position 1.
revenuesize = df3.pop('Revenue Size')
df3.insert(1, 'Revenue Size', revenuesize)

### Small Revenue

In [None]:
# Rows labelled 'Small', without the grouping columns; missing cells become the
# string "no" and every cell is converted to text.
df_small_revenue = (
    df3.loc[df3['Revenue Size'] == 'Small']
    .drop(columns=['Revenue', 'Revenue Size'])
    .fillna("no")
    .astype(str)
)

In [None]:
# Stack every column on top of the previous one into a single Series. A single
# concat over the whole list avoids re-copying the growing Series on each pass.
small_revenue_rows = pd.concat(
    [df_small_revenue.iloc[:, i] for i in range(len(df_small_revenue.columns))]
)

In [None]:
# Clean each stacked answer with preprocess.
small_revenue_rows = small_revenue_rows.map(preprocess)

In [None]:
# Pair each word with its successor inside every answer, count the pairs, and
# tabulate the 20 most frequent.
bigrams_list_small_revenue = [
    pair
    for words in (text.split(" ") for text in small_revenue_rows)
    for pair in zip(words, words[1:])
]
bigram_counts_small_revenue = collections.Counter(bigrams_list_small_revenue)

bigram_small_revenue = pd.DataFrame(
    bigram_counts_small_revenue.most_common(20), columns=['bigram', 'count']
)
bigram_small_revenue

In [None]:
# The 15 most frequent words across all answers, as (word, count) pairs.
counter_list_small_revenue = Counter(
    word for text in small_revenue_rows for word in text.split()
).most_common(15)
counter_list_small_revenue

In [None]:
# Bar chart of the most common words (word -> count, in ranked order).
small_revenue_results = OrderedDict(counter_list_small_revenue)
fig, ax = plt.subplots(figsize=(8, 8))
ax.bar(small_revenue_results.keys(), small_revenue_results.values(), color='#FF0000')
ax.set_title('Most common words used by CEOs of small revenue companies',
             fontsize=15, fontweight='bold', loc='center')
plt.xticks(rotation=90)
plt.show()

In [None]:
#word cloud
# Collapse whitespace inside each answer and append the answers, each followed by a
# single space, into one string.
small_revenue = "".join(" ".join(arg.split()) + " " for arg in small_revenue_rows)

def word_cloud(data, title):
    """Draw a 700x700 word cloud of ``data`` with ``title`` above it.

    Args:
        data: Text handed to ``WordCloud.generate``.
        title: Heading shown over the image.
    """
    # Generate from the ``data`` argument rather than a notebook-level variable,
    # so the function renders whatever text it is called with.
    wordcloud = WordCloud(width = 700,
                          height = 700,
                          stopwords = STOPWORDS,
                          background_color = 'black',
                          min_font_size = 10).generate(data)

    #plot the WordCloud image
    plt.figure(figsize = (4.5, 4.5), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title(title, fontsize = 13)
    plt.tight_layout(pad = 0)
    plt.show()

word_cloud(small_revenue, 'Most common words used by CEOs of small revenue companies')

### Medium Revenue

In [None]:
# Rows labelled 'Medium', without the grouping columns; missing cells become the
# string "no" and every cell is converted to text.
df_medium_revenue = (
    df3.loc[df3['Revenue Size'] == 'Medium']
    .drop(columns=['Revenue', 'Revenue Size'])
    .fillna("no")
    .astype(str)
)

In [None]:
# Stack every column on top of the previous one into a single Series. A single
# concat over the whole list avoids re-copying the growing Series on each pass.
medium_revenue_rows = pd.concat(
    [df_medium_revenue.iloc[:, i] for i in range(len(df_medium_revenue.columns))]
)

In [None]:
# Clean each stacked answer with preprocess.
medium_revenue_rows = medium_revenue_rows.map(preprocess)

In [None]:
# Pair each word with its successor inside every answer, count the pairs, and
# tabulate the 20 most frequent.
bigrams_list_medium_revenue = [
    pair
    for words in (text.split(" ") for text in medium_revenue_rows)
    for pair in zip(words, words[1:])
]
bigram_counts_medium_revenue = collections.Counter(bigrams_list_medium_revenue)

bigram_medium_revenue = pd.DataFrame(
    bigram_counts_medium_revenue.most_common(20), columns=['bigram', 'count']
)
bigram_medium_revenue

In [None]:
# The 15 most frequent words across all answers, as (word, count) pairs.
counter_list_medium_revenue = Counter(
    word for text in medium_revenue_rows for word in text.split()
).most_common(15)
counter_list_medium_revenue

In [None]:
# Bar chart of the most common words (word -> count, in ranked order).
medium_revenue_results = OrderedDict(counter_list_medium_revenue)
fig, ax = plt.subplots(figsize=(8, 8))
ax.bar(medium_revenue_results.keys(), medium_revenue_results.values(), color='#FF0000')
ax.set_title('Most common words used by CEOs of medium revenue companies',
             fontsize=15, fontweight='bold', loc='center')
plt.xticks(rotation=90)
plt.show()

In [None]:
#word cloud
# Collapse whitespace inside each answer and append the answers, each followed by a
# single space, into one string.
medium_revenue = "".join(" ".join(arg.split()) + " " for arg in medium_revenue_rows)

def word_cloud(data, title):
    """Draw a 700x700 word cloud of ``data`` with ``title`` above it.

    Args:
        data: Text handed to ``WordCloud.generate``.
        title: Heading shown over the image.
    """
    # Generate from the ``data`` argument rather than a notebook-level variable,
    # so the function renders whatever text it is called with.
    wordcloud = WordCloud(width = 700,
                          height = 700,
                          stopwords = STOPWORDS,
                          background_color = 'black',
                          min_font_size = 10).generate(data)

    #plot the WordCloud image
    plt.figure(figsize = (4.5, 4.5), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title(title, fontsize = 13)
    plt.tight_layout(pad = 0)
    plt.show()

word_cloud(medium_revenue, 'Most common words used by CEOs of medium revenue companies')

### Large Revenue

In [None]:
# Rows labelled 'Large', without the grouping columns; missing cells become the
# string "no" and every cell is converted to text.
df_large_revenue = (
    df3.loc[df3['Revenue Size'] == 'Large']
    .drop(columns=['Revenue', 'Revenue Size'])
    .fillna("no")
    .astype(str)
)

In [None]:
# Stack every column on top of the previous one into a single Series. A single
# concat over the whole list avoids re-copying the growing Series on each pass.
large_revenue_rows = pd.concat(
    [df_large_revenue.iloc[:, i] for i in range(len(df_large_revenue.columns))]
)

In [None]:
# Clean each stacked answer with preprocess.
large_revenue_rows = large_revenue_rows.map(preprocess)

In [None]:
# Pair each word with its successor inside every answer, count the pairs, and
# tabulate the 20 most frequent.
bigrams_list_large_revenue = [
    pair
    for words in (text.split(" ") for text in large_revenue_rows)
    for pair in zip(words, words[1:])
]
bigram_counts_large_revenue = collections.Counter(bigrams_list_large_revenue)
bigram_large_revenue = pd.DataFrame(
    bigram_counts_large_revenue.most_common(20), columns=['bigram', 'count']
)
bigram_large_revenue

In [None]:
# The 15 most frequent words across all answers, as (word, count) pairs.
counter_list_large_revenue = Counter(
    word for text in large_revenue_rows for word in text.split()
).most_common(15)
counter_list_large_revenue

In [None]:
# Bar chart of the most common words (word -> count, in ranked order).
large_revenue_results = OrderedDict(counter_list_large_revenue)
fig, ax = plt.subplots(figsize=(8, 8))
ax.bar(large_revenue_results.keys(), large_revenue_results.values(), color='#FF0000')
ax.set_title('Most common words used by CEOs of large revenue companies',
             fontsize=15, fontweight='bold', loc='center')
plt.xticks(rotation=90)
plt.show()

In [None]:
#word cloud
# Collapse whitespace inside each answer and append the answers, each followed by a
# single space, into one string.
large_revenue = "".join(" ".join(arg.split()) + " " for arg in large_revenue_rows)

def word_cloud(data, title):
    """Draw a 700x700 word cloud of ``data`` with ``title`` above it.

    Args:
        data: Text handed to ``WordCloud.generate``.
        title: Heading shown over the image.
    """
    # Generate from the ``data`` argument rather than a notebook-level variable,
    # so the function renders whatever text it is called with.
    wordcloud = WordCloud(width = 700,
                          height = 700,
                          stopwords = STOPWORDS,
                          background_color = 'black',
                          min_font_size = 10).generate(data)

    #plot the WordCloud image
    plt.figure(figsize = (4.5, 4.5), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title(title, fontsize = 13)
    plt.tight_layout(pad = 0)
    plt.show()

word_cloud(large_revenue, 'Most common words used by CEOs of large revenue companies')

# Healthcare?

In [None]:
# Industry frame: leave out the other three metadata columns, keep only rows that
# have an 'Organization' value, and flag each row 'Yes' when that value is exactly
# 'Healthcare' and 'No' otherwise.
df4 = (
    df.drop(columns=['Revenue', 'Number_Of_Employees', 'Realized_Returns'])
    .loc[lambda frame: frame['Organization'].notna()]
    .assign(**{
        'Healthcare?': lambda frame: np.where(
            frame['Organization'] == 'Healthcare', 'Yes', 'No'
        )
    })
)

In [None]:
# Reposition 'Healthcare?' as the second column: pop removes it from df4 and
# returns it, insert puts it back at position 1.
healthcare = df4.pop('Healthcare?')
df4.insert(1, 'Healthcare?', healthcare)

### Yes 

In [None]:
# Rows flagged 'Yes', without the grouping columns; missing cells become the
# string "no" and every cell is converted to text.
df_healthcare_yes = (
    df4.loc[df4['Healthcare?'] == 'Yes']
    .drop(columns=['Organization', 'Healthcare?'])
    .fillna("no")
    .astype(str)
)

In [None]:
# Stack every column on top of the previous one into a single Series. A single
# concat over the whole list avoids re-copying the growing Series on each pass.
healthcare_yes_rows = pd.concat(
    [df_healthcare_yes.iloc[:, i] for i in range(len(df_healthcare_yes.columns))]
)

In [None]:
# Clean each stacked answer with preprocess.
healthcare_yes_rows = healthcare_yes_rows.map(preprocess)

In [None]:
# Pair each word with its successor inside every answer, count the pairs, and
# tabulate the 20 most frequent.
bigrams_list_healthcare_yes = [
    pair
    for words in (text.split(" ") for text in healthcare_yes_rows)
    for pair in zip(words, words[1:])
]
bigram_counts_healthcare_yes = collections.Counter(bigrams_list_healthcare_yes)
bigram_healthcare_yes = pd.DataFrame(
    bigram_counts_healthcare_yes.most_common(20), columns=['bigram', 'count']
)
bigram_healthcare_yes

In [None]:
# The 15 most frequent words across all answers, as (word, count) pairs.
counter_list_healthcare_yes = Counter(
    word for text in healthcare_yes_rows for word in text.split()
).most_common(15)
counter_list_healthcare_yes

In [None]:
# Bar chart of the most common words (word -> count, in ranked order).
healthcare_yes_results = OrderedDict(counter_list_healthcare_yes)
fig, ax = plt.subplots(figsize=(8, 8))
ax.bar(healthcare_yes_results.keys(), healthcare_yes_results.values(), color='#FF0000')
ax.set_title('Most common words used by CEOs in the healthcare industry',
             fontsize=15, fontweight='bold', loc='center')
plt.xticks(rotation=90)
plt.show()

In [None]:
#word cloud
# Collapse whitespace inside each answer and append the answers, each followed by a
# single space, into one string.
healthcare_yes = "".join(" ".join(arg.split()) + " " for arg in healthcare_yes_rows)

def word_cloud(data, title):
    """Draw a 700x700 word cloud of ``data`` with ``title`` above it.

    Args:
        data: Text handed to ``WordCloud.generate``.
        title: Heading shown over the image.
    """
    # Generate from the ``data`` argument rather than a notebook-level variable,
    # so the function renders whatever text it is called with.
    wordcloud = WordCloud(width = 700,
                          height = 700,
                          stopwords = STOPWORDS,
                          background_color = 'black',
                          min_font_size = 10).generate(data)

    #plot the WordCloud image
    plt.figure(figsize = (4.5, 4.5), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title(title, fontsize = 13)
    plt.tight_layout(pad = 0)
    plt.show()

word_cloud(healthcare_yes, 'Most common words used by CEOs of companies in the healthcare industry')

### No

In [None]:
# Rows flagged 'No', without the grouping columns; missing cells become the
# string "no" and every cell is converted to text.
df_healthcare_no = (
    df4.loc[df4['Healthcare?'] == 'No']
    .drop(columns=['Organization', 'Healthcare?'])
    .fillna("no")
    .astype(str)
)

In [None]:
# Stack every column on top of the previous one into a single Series. A single
# concat over the whole list avoids re-copying the growing Series on each pass.
healthcare_no_rows = pd.concat(
    [df_healthcare_no.iloc[:, i] for i in range(len(df_healthcare_no.columns))]
)

In [None]:
# Clean each stacked answer with preprocess.
healthcare_no_rows = healthcare_no_rows.map(preprocess)

In [None]:
# Pair each word with its successor inside every answer, count the pairs, and
# tabulate the 20 most frequent.
bigrams_list_healthcare_no = [
    pair
    for words in (text.split(" ") for text in healthcare_no_rows)
    for pair in zip(words, words[1:])
]
bigram_counts_healthcare_no = collections.Counter(bigrams_list_healthcare_no)
bigram_healthcare_no = pd.DataFrame(
    bigram_counts_healthcare_no.most_common(20), columns=['bigram', 'count']
)
bigram_healthcare_no

In [None]:
# The 15 most frequent words across all answers, as (word, count) pairs.
counter_list_healthcare_no = Counter(
    word for text in healthcare_no_rows for word in text.split()
).most_common(15)
counter_list_healthcare_no

In [None]:
# Bar chart of the most common words (word -> count, in ranked order).
healthcare_no_results = OrderedDict(counter_list_healthcare_no)
fig, ax = plt.subplots(figsize=(8, 8))
ax.bar(healthcare_no_results.keys(), healthcare_no_results.values(), color='#FF0000')
ax.set_title('Most common words used by CEOs in other industries',
             fontsize=15, fontweight='bold', loc='center')
plt.xticks(rotation=90)
plt.show()

In [None]:
#word cloud
# Collapse whitespace inside each answer and append the answers, each followed by a
# single space, into one string.
healthcare_no = "".join(" ".join(arg.split()) + " " for arg in healthcare_no_rows)

def word_cloud(data, title):
    """Draw a 700x700 word cloud of ``data`` with ``title`` above it.

    Args:
        data: Text handed to ``WordCloud.generate``.
        title: Heading shown over the image.
    """
    # Generate from the ``data`` argument rather than a notebook-level variable,
    # so the function renders whatever text it is called with.
    wordcloud = WordCloud(width = 700,
                          height = 700,
                          stopwords = STOPWORDS,
                          background_color = 'black',
                          min_font_size = 10).generate(data)

    #plot the WordCloud image
    plt.figure(figsize = (4.5, 4.5), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title(title, fontsize = 13)
    plt.tight_layout(pad = 0)
    plt.show()

word_cloud(healthcare_no, 'Most common words used by CEOs of companies in other industries')

# Realized Returns

In [None]:
# Realized-returns frame: leave out the other three metadata columns and keep only
# rows that have a 'Realized_Returns' value.
df5 = (
    df.drop(columns=['Organization', 'Revenue', 'Number_Of_Employees'])
    .loc[lambda frame: frame['Realized_Returns'].notna()]
)

In [None]:
#flag a row 'Yes' when it lists more than one realized return, 'No' otherwise
def z(row):
    """Return 'Yes' if ``row['Realized_Returns']`` contains a newline, else 'No'.

    A newline inside the value is what this notebook treats as the sign of more
    than one realized return. Note that this definition reuses the name ``z``
    given earlier in the notebook to the company-size function.
    """
    return 'Yes' if '\n' in row['Realized_Returns'] else 'No'

# Run z on every row and store the result in 'More than One Realized Return?'.
df5['More than One Realized Return?'] = df5.apply(z, axis = 'columns')

In [None]:
# Reposition 'More than One Realized Return?' as the second column: pop removes it
# from df5 and returns it, insert puts it back at position 1.
realizedreturns = df5.pop('More than One Realized Return?')
df5.insert(1, 'More than One Realized Return?', realizedreturns)

### 1 Realized Return

In [None]:
# Rows flagged 'No' (a single realized return), without the grouping columns;
# missing cells become the string "no" and every cell is converted to text.
df_one_realized_return = (
    df5.loc[df5['More than One Realized Return?'] == 'No']
    .drop(columns=['Realized_Returns', 'More than One Realized Return?'])
    .fillna("no")
    .astype(str)
)

In [None]:
# Stack every column on top of the previous one into a single Series. A single
# concat over the whole list avoids re-copying the growing Series on each pass.
one_realized_return_rows = pd.concat(
    [df_one_realized_return.iloc[:, i] for i in range(len(df_one_realized_return.columns))]
)

In [None]:
# Clean each stacked answer with preprocess.
one_realized_return_rows = one_realized_return_rows.map(preprocess)

In [None]:
# Pair each word with its successor inside every answer, count the pairs, and
# tabulate the 20 most frequent.
bigrams_list_one_realized_return = [
    pair
    for words in (text.split(" ") for text in one_realized_return_rows)
    for pair in zip(words, words[1:])
]
bigram_counts_one_realized_return = collections.Counter(bigrams_list_one_realized_return)

bigram_one_realized_return = pd.DataFrame(
    bigram_counts_one_realized_return.most_common(20), columns=['bigram', 'count']
)
bigram_one_realized_return

In [None]:
# The 15 most frequent words across all answers, as (word, count) pairs.
counter_list_one_realized_return = Counter(
    word for text in one_realized_return_rows for word in text.split()
).most_common(15)
counter_list_one_realized_return

In [None]:
# Bar chart of the most common words (word -> count, in ranked order).
one_realized_returns_result = OrderedDict(counter_list_one_realized_return)
fig, ax = plt.subplots(figsize=(8, 8))
ax.bar(one_realized_returns_result.keys(), one_realized_returns_result.values(),
       color='#FF0000')
ax.set_title('Most common words used by CEOs who have had 1 realized return',
             fontsize=10, fontweight='bold', loc='center')
plt.xticks(rotation=90)
plt.show()

In [None]:
#word cloud
# Collapse whitespace inside each answer and append the answers, each followed by a
# single space, into one string.
one_realized_return = "".join(" ".join(arg.split()) + " " for arg in one_realized_return_rows)

def word_cloud(data, title):
    """Draw a 700x700 word cloud of ``data`` with ``title`` above it.

    Args:
        data: Text handed to ``WordCloud.generate``.
        title: Heading shown over the image.
    """
    # Generate from the ``data`` argument rather than a notebook-level variable,
    # so the function renders whatever text it is called with.
    wordcloud = WordCloud(width = 700,
                          height = 700,
                          stopwords = STOPWORDS,
                          background_color = 'black',
                          min_font_size = 10).generate(data)

    #plot the WordCloud image
    plt.figure(figsize = (4.5, 4.5), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title(title, fontsize = 13)
    plt.tight_layout(pad = 0)
    plt.show()

word_cloud(one_realized_return, 'Most common words used by CEOs who have had 1 realized return')

### More than 1 Realized Return

In [None]:
# Rows flagged 'Yes' (more than one realized return), without the grouping columns;
# missing cells become the string "no" and every cell is converted to text.
df_realized_returns = (
    df5.loc[df5['More than One Realized Return?'] == 'Yes']
    .drop(columns=['Realized_Returns', 'More than One Realized Return?'])
    .fillna("no")
    .astype(str)
)

In [None]:
# Stack every column on top of the previous one into a single Series. A single
# concat over the whole list avoids re-copying the growing Series on each pass.
realized_returns_rows = pd.concat(
    [df_realized_returns.iloc[:, i] for i in range(len(df_realized_returns.columns))]
)

In [None]:
# Clean each stacked answer with preprocess.
realized_returns_rows = realized_returns_rows.map(preprocess)

In [None]:
# Pair each word with its successor inside every answer, count the pairs, and
# tabulate the 20 most frequent.
bigrams_list_realized_returns = [
    pair
    for words in (text.split(" ") for text in realized_returns_rows)
    for pair in zip(words, words[1:])
]
bigram_counts_realized_returns = collections.Counter(bigrams_list_realized_returns)

bigram_realized_returns = pd.DataFrame(
    bigram_counts_realized_returns.most_common(20), columns=['bigram', 'count']
)
bigram_realized_returns

In [None]:
# The 15 most frequent words across all answers, as (word, count) pairs.
counter_list_realized_returns = Counter(
    word for text in realized_returns_rows for word in text.split()
).most_common(15)
counter_list_realized_returns

In [None]:
# Bar chart of the most common words (word -> count, in ranked order).
realized_returns_results = OrderedDict(counter_list_realized_returns)
fig, ax = plt.subplots(figsize=(8, 8))
ax.bar(realized_returns_results.keys(), realized_returns_results.values(),
       color='#FF0000')
ax.set_title('Most common words used by CEOs who have had more than 1 realized return',
             fontsize=10, fontweight='bold', loc='center')
plt.xticks(rotation=90)
plt.show()

In [None]:
#word cloud
# Collapse whitespace inside each answer and append the answers, each followed by a
# single space, into one string.
realized_returns = "".join(" ".join(arg.split()) + " " for arg in realized_returns_rows)

def word_cloud(data, title):
    """Draw a 700x700 word cloud of ``data`` with ``title`` above it.

    Args:
        data: Text handed to ``WordCloud.generate``.
        title: Heading shown over the image.
    """
    # Generate from the ``data`` argument rather than a notebook-level variable,
    # so the function renders whatever text it is called with.
    wordcloud = WordCloud(width = 700,
                          height = 700,
                          stopwords = STOPWORDS,
                          background_color = 'black',
                          min_font_size = 10).generate(data)

    #plot the WordCloud image
    plt.figure(figsize = (4.5, 4.5), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title(title, fontsize = 13)
    plt.tight_layout(pad = 0)
    plt.show()

word_cloud(realized_returns, 'Most common words used by CEOs who have had more than 1 realized return')