In [13]:
import re
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

In [14]:
# Load the group-member CSV export into a DataFrame. The cells shown below read
# its `occupation` column.
df = pd.read_csv(filepath_or_buffer='Linkedin_Group_Members-03__Aug_2021-15_00_04.csv')

In [15]:
# Terms searched for in the `occupation` column: role keywords first, then buzz words.
job_list = [
    'Student', 'Full', 'Back', 'Front', 'Mobile', 'Hardware', 'Software',
    'Web', 'BI', 'DBA', 'Analyst', 'UI', 'UX', 'Graphics', 'QA',
    'HR', 'CEO', 'CTO', 'Founder',
]

buzz_list = [
    'Data', 'AI', 'ML',
    'Junior', 'Senior', 'Lead',
    'Looking', 'Seeking',
]


def _ranked_counts(label, terms, totals):
    """Return a frame with columns `label` and 'count', sorted by descending count."""
    table = pd.DataFrame({label: terms, 'count': totals})
    return table.sort_values(by='count', ascending=False).reset_index(drop=True)


# Series.str.count treats each term as a regular expression and counts every
# case-sensitive occurrence, including occurrences inside longer words.
# NOTE(review): short acronyms can therefore match inside other words
# (e.g. 'ML' in 'HTML') -- confirm that substring matching is intended.
job_count = [df.occupation.str.count(term).sum() for term in job_list]
buzz_count = [df.occupation.str.count(term).sum() for term in buzz_list]

df_i = _ranked_counts('job', job_list, job_count)
df_b = _ranked_counts('buzz_word', buzz_list, buzz_count)


In [16]:
# Horizontal bar chart of the job-term counts. The figure is written to disk and
# closed, so nothing is rendered inline.
fig, ax = plt.subplots(figsize=(10, 5))
ax.grid()
ax.barh(y=df_i['job'], width=df_i['count'], color='green', align='center')

ax.set_xlabel("Count")
ax.set_ylabel("Term")
ax.set_title("Terms Included in Job Title")

# Flip the y-axis so the first row of df_i is drawn at the top.
ax.invert_yaxis()

fig.savefig('top_terms_job.png', bbox_inches='tight')
plt.close(fig)

In [17]:
# Horizontal bar chart of the buzz-word counts. The figure is written to disk and
# closed, so nothing is rendered inline.
fig, ax = plt.subplots(figsize=(10, 5))
ax.grid()
ax.barh(y=df_b['buzz_word'], width=df_b['count'], color='green', align='center')

ax.set_xlabel("Count")
ax.set_ylabel("Buzz Word")
ax.set_title("Buzz Words Included in Job Title")

# Flip the y-axis so the first row of df_b is drawn at the top.
ax.invert_yaxis()

fig.savefig('top_terms_buzz.png', bbox_inches='tight')
plt.close(fig)

In [22]:
# Join every non-missing occupation into one text blob for the word cloud.
# dropna()/astype(str) keep str.join from raising TypeError on NaN or non-string cells.
text = " ".join(df.occupation.dropna().astype(str))

# len(text) is a character count, so the message reports characters rather than words.
print("There are {} characters in the combination of all occupations.".format(len(text)))

# Stop-word set taken from the wordcloud package. This rebinds the name `stopwords`
# that the import cell bound to nltk.corpus.stopwords; the token-filtering cell
# further down reads this set.
stopwords = set(STOPWORDS)

# Generate the word cloud; a fixed random_state makes the layout reproducible
# across runs.
wordcloud = WordCloud(
    stopwords=stopwords,
    background_color="white",
    random_state=42,
).generate(text)

# Render without axes, save to disk, and close the figure (nothing is shown inline).
# The figure is created only after the cloud has been generated successfully.
fig, ax = plt.subplots(figsize=(7, 7))
ax.axis("off")
ax.imshow(wordcloud, interpolation='bilinear')
fig.savefig('top_terms_wc1.png', bbox_inches='tight')
plt.close(fig)

There are 129559 words in the combination of all review.


In [23]:
# Non-missing occupation values as a NumPy array. Selecting the column and calling
# dropna() states the intent directly, instead of relying on DataFrame.stack()
# to discard missing values as a side effect of reshaping.
mylist = df['occupation'].dropna().to_numpy()

In [32]:
# Merge all occupation strings into one space-separated line.
line = " ".join(mylist)

# Delete punctuation and tabs outright. A raw string keeps "\-" from being parsed
# as an invalid string escape; the character class itself is unchanged.
line = re.sub(r"[.@,\-|/()'\"\t!*;:_]", '', line)

# Ampersands become spaces so the words on either side stay separate tokens.
line = line.replace('&', ' ')

# Drop non-ASCII characters. Decode back to str: calling str() on the bytes object
# would embed the literal b'...' wrapper in the text and corrupt the first and
# last tokens.
line = line.encode('ascii', errors='ignore').decode('ascii')

# Collapse runs of spaces left behind by the removals above.
line = re.sub(' +', ' ', line)

In [26]:
# Populate the list of lower-cased words that are not in the stop-word set.
# The membership test uses the lower-cased token: the stop-word entries are
# lower-case, so testing the original casing would let capitalised stop words
# such as "The", "And" or "At" slip through.
# `stopwords` is expected to be the set built from wordcloud's STOPWORDS in an
# earlier cell, and `line` the cleaned text from the previous cell.
tokenWithoutStopwords = [
    token
    for token in (word.lower() for word in line.split())
    if token not in stopwords
]

# Re-join the surviving tokens into a single space-separated string.
stringWithoutStopwords = ' '.join(tokenWithoutStopwords)

In [29]:
# Split the filtered text back into tokens on whitespace.
docSplit = stringWithoutStopwords.split()

# Frequency distribution over the tokens (token -> number of occurrences).
freq = nltk.FreqDist(docSplit)

# The 30 most frequent tokens as (word, count) pairs, most common first.
topWords = freq.most_common(30)



In [37]:
## Top words as a pandas Series (index = word, value = count) for plotting
all_fdist = pd.Series(dict(topWords))

## Figure and axes the bar plot is drawn on
fig, ax = plt.subplots(figsize=(7, 7))

## Horizontal seaborn bar plot: words on the y-axis, counts on the x-axis
all_plot = sns.barplot(x=all_fdist.values, y=all_fdist.index, palette="Blues_d", ax=ax)
ax.grid()
ax.set_title('Count of Top Words in Titles')

## Save to disk and close the figure; nothing is rendered inline
fig.savefig('top_words_word_count.png', bbox_inches='tight')
plt.close(fig)

In [36]:
## Stop-word set from the wordcloud package (rebinds the name `stopwords`)
stopwords = set(STOPWORDS)

## Word cloud over the already-filtered text. collocations=False keeps the cloud
## to single words (no bigrams); a fixed random_state makes the layout
## reproducible across runs.
wordcloud = WordCloud(
    stopwords=stopwords,
    background_color="white",
    collocations=False,
    random_state=42,
).generate(stringWithoutStopwords)

## Render without axes, save to disk, and close the figure (nothing is shown inline)
fig, ax = plt.subplots(figsize=(7, 7))
ax.axis("off")
ax.imshow(wordcloud, interpolation='bilinear')
fig.savefig('top_words_wc2_single_words.png', bbox_inches='tight')
plt.close(fig)
