In [1]:
import pandas as pd
#import nltk
from matplotlib import pyplot as plt
from scrape_data import *
from process_text import *
from helper import *

In [2]:
# Job titles whose postings are profiled.
titles = ['Data Scientist', 'Machine Learning Engineer', 'Data Engineer']

# Map each job title to whatever plot_profile returns when asked for the
# text list of its first 120 postings (return_text_list=True).
text_lists = {
    job_title: plot_profile(title=job_title, first_n_postings=120, return_text_list=True)
    for job_title in titles
}

In [3]:
# Skills to look for in the postings, grouped by category.
# Each list holds the display names that are passed to check_freq (via
# dict_to_check below); a misspelled name can never match a posting, so the
# spellings here must be the ones employers actually use.

languages = ['Python', 'R', 'SQL', 'Java', 'C', 'C++', 'C#', 'Scala', 'Perl', 'Julia', 
             'Javascript', 'HTML', 'CSS', 'PHP', 'Ruby', 'Lua', 'MATLAB', 'SAS'] 

# 'Azkaban' (the workflow scheduler) was previously misspelled 'Azkabhan'.
big_data = ['Hadoop', 'MapReduce', 'Hive', 'Pig', 'Cascading', 'Scalding', 'Cascalog', 'HBase', 'Sqoop', 
            'Mahout', 'Oozie', 'Flume', 'ZooKeeper', 'Spark', 'Storm', 'Shark', 'Impala', 'Elasticsearch', 
            'Kafka', 'Flink', 'Kinesis', 'Presto', 'Hume', 'Airflow', 'Azkaban', 'Luigi', 'Cassandra']

dl = ['TensorFlow', 'Keras', 'PyTorch', 'Theano', 'Deeplearning4J', 'Caffe', 'TFLearn', 'Torch', 
      'OpenCV', 'MXNet', 'Microsoft Cognitive Toolkit', 'Lasagne']

cloud = ['AWS', 'GCP', 'Azure']

# 'Object Detection' was previously misspelled 'Object Dectection'.
ml = ['Natural Language Processing', 'Computer Vision', 'Speech Recognition', 'Fraud Detection',
      'Recommender System', 'Image Recognition', 'Object Detection', 'Chatbot',  'Sentiment Analysis']

visualization = ['Dimple', 'D3.js', 'Ggplot', 'Shiny', 'Plotly', 'Matplotlib', 'Seaborn', 
                'Bokeh', 'Tableau']

other = ['Pandas', 'Numpy', 'Scipy', 'Sklearn', 'Scikit-Learn', 'Docker', 'Git', 'Jira', 'Kaggle']

# Category label -> list of skill names in that category.
dict_to_check = {'Programming Languages': languages,
                 'Big Data Technologies': big_data,
                 'Deep Learning Frameworks': dl,
                 'Cloud Computing Platforms': cloud,
                 'Machine Learning Application': ml,
                 'Visualization Tools': visualization,
                 'Other': other}

In [4]:
# For every job title, run check_freq over that title's text list and keep
# the result keyed by the title.
freq_dict = {
    job_title: check_freq(dict_to_check=dict_to_check, text_list=job_texts)
    for job_title, job_texts in text_lists.items()
}

In [5]:
# Flatten the nested {title: {category: counts}} mapping into a DataFrame
# with one row per (title, category) pair; the pair becomes the row index.
df = pd.DataFrame.from_dict(
    {
        (job_title, category): counts
        for job_title, per_category in freq_dict.items()
        for category, counts in per_category.items()
    },
    orient='index',
)
df.head()

Unnamed: 0,Unnamed: 1,Python,R,SQL,Java,C,C++,C#,Scala,Perl,Julia,...,Tableau,Pandas,Numpy,Scipy,Sklearn,Scikit-Learn,Docker,Git,Jira,Kaggle
Data Engineer,Big Data Technologies,,,,,,,,,,,...,,,,,,,,,,
Data Engineer,Cloud Computing Platforms,,,,,,,,,,,...,,,,,,,,,,
Data Engineer,Deep Learning Frameworks,,,,,,,,,,,...,,,,,,,,,,
Data Engineer,Machine Learning Application,,,,,,,,,,,...,,,,,,,,,,
Data Engineer,Other,,,,,,,,,,,...,,3.0,0.0,2.0,0.0,0.0,8.0,29.0,1.0,0.0


In [6]:
# Move the (title, category) index levels into ordinary columns
# (they show up as 'level_0' and 'level_1' in the output below).
df = df.reset_index(drop=False)
df.head()

Unnamed: 0,level_0,level_1,Python,R,SQL,Java,C,C++,C#,Scala,...,Tableau,Pandas,Numpy,Scipy,Sklearn,Scikit-Learn,Docker,Git,Jira,Kaggle
0,Data Engineer,Big Data Technologies,,,,,,,,,...,,,,,,,,,,
1,Data Engineer,Cloud Computing Platforms,,,,,,,,,...,,,,,,,,,,
2,Data Engineer,Deep Learning Frameworks,,,,,,,,,...,,,,,,,,,,
3,Data Engineer,Machine Learning Application,,,,,,,,,...,,,,,,,,,,
4,Data Engineer,Other,,,,,,,,,...,,3.0,0.0,2.0,0.0,0.0,8.0,29.0,1.0,0.0


In [7]:
# Give the two former index columns meaningful names.
df = df.rename(columns={'level_0': 'title', 'level_1': 'category'})
df.head()

Unnamed: 0,title,category,Python,R,SQL,Java,C,C++,C#,Scala,...,Tableau,Pandas,Numpy,Scipy,Sklearn,Scikit-Learn,Docker,Git,Jira,Kaggle
0,Data Engineer,Big Data Technologies,,,,,,,,,...,,,,,,,,,,
1,Data Engineer,Cloud Computing Platforms,,,,,,,,,...,,,,,,,,,,
2,Data Engineer,Deep Learning Frameworks,,,,,,,,,...,,,,,,,,,,
3,Data Engineer,Machine Learning Application,,,,,,,,,...,,,,,,,,,,
4,Data Engineer,Other,,,,,,,,,...,,3.0,0.0,2.0,0.0,0.0,8.0,29.0,1.0,0.0


In [8]:
# Every column after the first two ('title', 'category') is a skill column.
value_vars = list(df.columns[2:])
# Wide -> long for plotting: one row per (title, category, skill column).
df = df.melt(id_vars=['title', 'category'], value_vars=value_vars)
df.head()

Unnamed: 0,title,category,variable,value
0,Data Engineer,Big Data Technologies,Python,
1,Data Engineer,Cloud Computing Platforms,Python,
2,Data Engineer,Deep Learning Frameworks,Python,
3,Data Engineer,Machine Learning Application,Python,
4,Data Engineer,Other,Python,


In [9]:
# Replace melt's default column names with descriptive ones.
df = df.rename(columns={'variable': 'skill', 'value': 'frequency'})
df.head()

Unnamed: 0,title,category,skill,frequency
0,Data Engineer,Big Data Technologies,Python,
1,Data Engineer,Cloud Computing Platforms,Python,
2,Data Engineer,Deep Learning Frameworks,Python,
3,Data Engineer,Machine Learning Application,Python,
4,Data Engineer,Other,Python,


In [10]:
# Keep only the rows that actually carry a frequency; rows where the
# frequency is null are discarded. The original row labels are retained.
df = df.dropna(subset=['frequency'])
df.head()

Unnamed: 0,title,category,skill,frequency
5,Data Engineer,Programming Languages,Python,52.0
12,Data Scientist,Programming Languages,Python,103.0
19,Machine Learning Engineer,Programming Languages,Python,71.0
26,Data Engineer,Programming Languages,R,5.0
33,Data Scientist,Programming Languages,R,19.0


In [11]:
# Renumber the rows 0..n-1 and discard the old, gappy row labels.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,title,category,skill,frequency
0,Data Engineer,Programming Languages,Python,52.0
1,Data Scientist,Programming Languages,Python,103.0
2,Machine Learning Engineer,Programming Languages,Python,71.0
3,Data Engineer,Programming Languages,R,5.0
4,Data Scientist,Programming Languages,R,19.0


In [12]:
# No nulls remain in 'frequency' after the filtering step, so the float
# counts can be cast to integers.
df = df.assign(frequency=df['frequency'].astype(int))
df.dtypes

title        object
category     object
skill        object
frequency     int32
dtype: object

In [13]:
# Persist the long-format table. index=True is the pandas default, spelled
# out here: the row index is written as a leading, header-less column.
df.to_csv('skill_frequencies.csv', index=True)