# Exploratory Data Analysis on Job Posts

In [1]:
import pandas as pd
import sqlite3
import cufflinks as cf
from textblob import TextBlob
# Switch cufflinks to offline mode for this session (used by the .iplot() calls below).
cf.go_offline()
# NOTE(review): offline=False looks at odds with go_offline() above; presumably this
# only updates cufflinks' saved configuration — confirm which mode is intended.
cf.set_config_file(offline=False, world_readable=True)

# Reading the data  

In [2]:
# Load every row of the job_post table into a DataFrame.
con = sqlite3.connect("collectors/data.sqlite3")
try:
    df = pd.read_sql_query("SELECT * from job_post", con)
finally:
    # Close the connection even if the query fails (e.g. the table is missing),
    # so the database file handle is not leaked in the notebook session.
    con.close()

In [3]:
# Preview the first 10 rows to check the loaded columns and sample values.
df.head(10)

Unnamed: 0,id,title,company,location,description,source,search_kw
0,1,Data Scientist,Aquatic Informatics,"Vancouver, BC",Do you want a meaningful role in a company tha...,indeed.com,data scientist
1,2,Business Intelligence Analyst,GLENTEL,"Burnaby, BC",Brand: Glentel Corporate\nLocation: Burnaby Of...,indeed.com,data scientist
2,3,Human Resources Data Scientist,Rio Tinto,Canada,2 x newly created Data Scientist opportunities...,indeed.com,data scientist
3,4,Lead - Human Resource Data Scientist,Rio Tinto,Canada,Newly created data science lead embedded withi...,indeed.com,data scientist
4,5,Machine Learning Engineer,Skycope Technologies Inc,"Vancouver, BC","Who We are\nFounded in 2016, Skycope Technolog...",indeed.com,data scientist
5,6,Data Scientist,BrainStation,"Vancouver, BC",BrainStation is a global leader in digital ski...,indeed.com,data scientist
6,7,Associate Data Scientist,TrainTurf Academic Solutions Limited,"Vancouver, BC",We are looking for a Data Scientist to support...,indeed.com,data scientist
7,8,Data Scientist I,"AMZN CAN Fulfillment Svcs, ULC","Vancouver, BC","Master or PhD in Computer Science, Machine Lea...",indeed.com,data scientist
8,9,"Data Scientist, AI@Unity",Unity Technologies,"Vancouver, BC",Data is the foundation of our business in AI @...,indeed.com,data scientist
9,10,Data Scientist,Providence Health Care,"Vancouver, BC","Reporting to the Technical Manager, Digital Pr...",indeed.com,data scientist


# EDA Task 1: Find the top 20 words in the job descriptions before removing stop words

In [4]:
# top 20 words in description before removing stop keyword
from sklearn.feature_extraction.text import CountVectorizer

def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent single words in ``corpus``.

    Stop words are kept; tokenization uses ``CountVectorizer``'s defaults.

    Args:
        corpus: Iterable of text documents (e.g. a pandas Series of strings).
        n: Number of entries to return; ``None`` returns every word.

    Returns:
        List of ``(word, count)`` tuples sorted by descending count, where
        ``count`` is the word's total occurrences across all documents.
    """
    vec = CountVectorizer()
    # fit_transform learns the vocabulary and builds the document-term matrix
    # in one pass, instead of tokenizing the whole corpus twice.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums: one total per vocabulary entry (a 1 x n_terms matrix).
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]



# Tabulate the 20 most frequent words (stop words included) and plot them as bars.
common_words = get_top_n_words(df['description'], 20)
df1 = pd.DataFrame(common_words, columns=['description', 'count'])
(
    df1.groupby('description')['count']
    .sum()
    .sort_values(ascending=False)
    .iplot(
        kind='bar',
        yTitle='Count',
        linecolor='black',
        colors='Red',
        title='Top 20 words in job-description before removing stop words',
    )
)

# EDA Task 2: Find the top 20 words in the job descriptions after removing stop words

In [5]:
# top 20 words in description after removing stop keyword
def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent single words in ``corpus``, ignoring English stop words.

    NOTE: this shadows the ``get_top_n_words`` defined in an earlier cell,
    which kept stop words.

    Args:
        corpus: Iterable of text documents (e.g. a pandas Series of strings).
        n: Number of entries to return; ``None`` returns every word.

    Returns:
        List of ``(word, count)`` tuples sorted by descending count, where
        ``count`` is the word's total occurrences across all documents.
    """
    vec = CountVectorizer(stop_words='english')
    # fit_transform learns the vocabulary and builds the document-term matrix
    # in one pass, instead of tokenizing the whole corpus twice.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums: one total per vocabulary entry (a 1 x n_terms matrix).
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
# Tabulate the 20 most frequent non-stop words and plot them as bars.
common_words = get_top_n_words(df['description'], 20)
df2 = pd.DataFrame(common_words, columns=['description', 'count'])
(
    df2.groupby('description')['count']
    .sum()
    .sort_values(ascending=False)
    .iplot(
        kind='bar',
        yTitle='Count',
        linecolor='black',
        colors='Green',
        title='Top 20 words in job-description after removing stop words',
    )
)

# EDA Task 3 : To find the distribution of top bigrams before removing stop words

In [6]:
#The distribution of top bigrams before removing stop words
def get_top_n_bigram(corpus, n=None):
    """Return the ``n`` most frequent bigrams (two-word sequences) in ``corpus``.

    Stop words are kept.

    Args:
        corpus: Iterable of text documents (e.g. a pandas Series of strings).
        n: Number of entries to return; ``None`` returns every bigram.

    Returns:
        List of ``(bigram, count)`` tuples sorted by descending count, where
        ``count`` is the bigram's total occurrences across all documents.
    """
    vec = CountVectorizer(ngram_range=(2, 2))
    # fit_transform learns the vocabulary and builds the document-term matrix
    # in one pass, instead of tokenizing the whole corpus twice.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums: one total per vocabulary entry (a 1 x n_terms matrix).
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
# Tabulate the 20 most frequent bigrams (stop words included) and plot them as bars.
common_words = get_top_n_bigram(df['description'], 20)
df3 = pd.DataFrame(common_words, columns=['description', 'count'])
(
    df3.groupby('description')['count']
    .sum()
    .sort_values(ascending=False)
    .iplot(
        kind='bar',
        yTitle='Count',
        linecolor='black',
        colors='Red',
        title='Top 20 bigrams in the job-description before removing stop words',
    )
)

# EDA Task 4 : To find the distribution of top bigrams after removing stop words

In [7]:
#The distribution of top bigrams after removing stop words
def get_top_n_bigram(corpus, n=None):
    """Return the ``n`` most frequent bigrams in ``corpus``, with English stop words filtered out.

    NOTE: this shadows the ``get_top_n_bigram`` defined in an earlier cell,
    which kept stop words.

    Args:
        corpus: Iterable of text documents (e.g. a pandas Series of strings).
        n: Number of entries to return; ``None`` returns every bigram.

    Returns:
        List of ``(bigram, count)`` tuples sorted by descending count, where
        ``count`` is the bigram's total occurrences across all documents.
    """
    vec = CountVectorizer(ngram_range=(2, 2), stop_words='english')
    # fit_transform learns the vocabulary and builds the document-term matrix
    # in one pass, instead of tokenizing the whole corpus twice.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums: one total per vocabulary entry (a 1 x n_terms matrix).
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
# Tabulate the 20 most frequent stop-word-free bigrams and plot them as bars.
common_words = get_top_n_bigram(df['description'], 20)
df4 = pd.DataFrame(common_words, columns=['description', 'count'])
(
    df4.groupby('description')['count']
    .sum()
    .sort_values(ascending=False)
    .iplot(
        kind='bar',
        yTitle='Count',
        linecolor='black',
        colors='Green',
        title='Top 20 bigrams in the job-description after removing stop words',
    )
)

# EDA Task 5 : To find the distribution of top trigrams before removing stop words

In [8]:
#The distribution of Top trigrams before removing stop words

def get_top_n_trigram(corpus, n=None):
    """Return the ``n`` most frequent trigrams (three-word sequences) in ``corpus``.

    Stop words are kept.

    Args:
        corpus: Iterable of text documents (e.g. a pandas Series of strings).
        n: Number of entries to return; ``None`` returns every trigram.

    Returns:
        List of ``(trigram, count)`` tuples sorted by descending count, where
        ``count`` is the trigram's total occurrences across all documents.
    """
    vec = CountVectorizer(ngram_range=(3, 3))
    # fit_transform learns the vocabulary and builds the document-term matrix
    # in one pass, instead of tokenizing the whole corpus twice.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums: one total per vocabulary entry (a 1 x n_terms matrix).
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
# Tabulate the 20 most frequent trigrams (stop words included) and plot them as bars.
common_words = get_top_n_trigram(df['description'], 20)
df5 = pd.DataFrame(common_words, columns=['description', 'count'])
(
    df5.groupby('description')['count']
    .sum()
    .sort_values(ascending=False)
    .iplot(
        kind='bar',
        yTitle='Count',
        linecolor='black',
        colors='Red',
        title='Top 20 trigrams in the job-description before removing stop words',
    )
)


# EDA Task 6 : To find the distribution of top trigrams after removing stop words

In [9]:
#The distribution of Top trigrams after removing stop words

def get_top_n_trigram(corpus, n=None):
    """Return the ``n`` most frequent trigrams in ``corpus``, with English stop words filtered out.

    NOTE: this shadows the ``get_top_n_trigram`` defined in an earlier cell,
    which kept stop words.

    Args:
        corpus: Iterable of text documents (e.g. a pandas Series of strings).
        n: Number of entries to return; ``None`` returns every trigram.

    Returns:
        List of ``(trigram, count)`` tuples sorted by descending count, where
        ``count`` is the trigram's total occurrences across all documents.
    """
    vec = CountVectorizer(ngram_range=(3, 3), stop_words='english')
    # fit_transform learns the vocabulary and builds the document-term matrix
    # in one pass, instead of tokenizing the whole corpus twice.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums: one total per vocabulary entry (a 1 x n_terms matrix).
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
# Tabulate the 20 most frequent stop-word-free trigrams and plot them as bars.
common_words = get_top_n_trigram(df['description'], 20)
df6 = pd.DataFrame(common_words, columns=['description', 'count'])
(
    df6.groupby('description')['count']
    .sum()
    .sort_values(ascending=False)
    .iplot(
        kind='bar',
        yTitle='Count',
        linecolor='black',
        colors='Green',
        title='Top 20 trigrams in the job-description after removing stop words',
    )
)

In [10]:
# The distribution of top part-of-speech tags of the job-description corpus
import nltk
# Tokenizer and tagger models that TextBlob's .tags relies on.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Build the corpus from the actual description texts. str(df['description'])
# would yield pandas' abbreviated *display* of the Series (index labels,
# "..."-truncated cells, the Name/dtype footer), so the tagger would see only
# fragments of the data plus display artifacts.
corpus_text = "\n".join(df['description'].dropna().astype(str))
blob = TextBlob(corpus_text)

# One row per tagged token, then count how often each POS tag occurs.
pos_df = pd.DataFrame(blob.tags, columns=['word', 'pos'])
# value_counts() sorts by descending frequency; keep the 20 most common tags.
pos_df = pos_df['pos'].value_counts().head(20)
pos_df.iplot(
    kind='bar',
    xTitle='POS',
    yTitle='count',
    colors='Blue',
    title='Top 20 Part-of-speech tagging for description corpus')

[nltk_data] Downloading package punkt to /home/sumukha21/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/sumukha21/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
