In [None]:
# import libraries here; add more as necessary
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.options.display.max_rows = 4000
pd.options.display.max_columns = 4000
from collections import Counter

# magic word for producing visualizations in notebook
%matplotlib inline

In [None]:
# Load the general 2018 survey results. dtype=str reads every column as text,
# so numeric-looking answers need an explicit conversion before any arithmetic.
survey_2018 = pd.read_csv("survey_results_public_2018.csv", dtype=str)

In [None]:
# Preview the first rows. A bare last expression is rendered as a rich table by the
# notebook, which stays readable for a wide frame where print() would wrap plain text.
survey_2018.head()

In [None]:
# Report the (rows, columns) shape, then display describe()'s per-column summary.
print(survey_2018.shape)
survey_2018.describe()

In [None]:
# Perform an assessment of how much missing data there is in each column of the
# dataset.
def count_outliers(dataset):
    """Print, for every column of ``dataset``, the percentage of rows that are missing.

    Despite the name, nothing here detects outliers: the figure reported is the
    share of null entries per column. The result is printed, not returned.
    """
    null_counts = dataset.isnull().sum()
    row_total = dataset.shape[0]
    print(100 * null_counts / row_total)

In [None]:
# Print the percentage of missing values for every column of the full survey.
count_outliers(survey_2018)

For the purposes of this analysis, I will not initially remove any columns or rows, nor impute any missing values. I may, however, remove certain rows or columns as I progress through the questions.


# Question 1: What types of languages, databases, platforms and frameworks were used in 2018 and what will be the trend for 2019?

Let us take a look at what the survey said about the technologies used in 2018 and what technologies people are looking at utilizing in 2019. We will be analyzing the following columns:

- LanguageWorkedWith
- LanguageDesireNextYear
- DatabaseWorkedWith
- DatabaseDesireNextYear
- PlatformWorkedWith
- PlatformDesireNextYear
- FrameworkWorkedWith
- FrameworkDesireNextYear

In [None]:
#Subset of the survey holding only the 8 technology columns listed above
temp_dataframe = survey_2018[[
    'LanguageWorkedWith',
    'LanguageDesireNextYear',
    'DatabaseWorkedWith',
    'DatabaseDesireNextYear',
    'PlatformWorkedWith',
    'PlatformDesireNextYear',
    'FrameworkWorkedWith',
    'FrameworkDesireNextYear',
]]

In [None]:
#Showing the shape and the first 5 rows of the selected columns.
#head() is left as the last expression so the notebook renders it as a table
#instead of wrapped plain text.
print(temp_dataframe.shape)
temp_dataframe.head()

In [None]:
# Inspect the distinct raw answers in LanguageWorkedWith.
temp_dataframe['LanguageWorkedWith'].unique()

As we can see, many of the rows hold multiple values in a single cell. We are going to split those cells so that we can count each occurrence of the technologies listed.

In [None]:
# Keep only respondents who answered all eight technology questions. Rebinding the result
# (rather than dropna(inplace=True) on a frame sliced from survey_2018) avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
temp_dataframe = temp_dataframe.dropna(subset=[
    'LanguageWorkedWith',
    'LanguageDesireNextYear',
    'DatabaseWorkedWith',
    'DatabaseDesireNextYear',
    'PlatformWorkedWith',
    'PlatformDesireNextYear',
    'FrameworkWorkedWith',
    'FrameworkDesireNextYear',
])

In [None]:
# Display the first rows of temp_dataframe.
temp_dataframe.head()

In [None]:
def clean_and_plot(df, title, plot=True):
    '''
    Count how often each technology appears in a multi-answer survey column
    and, optionally, draw the counts as a bar chart.

    INPUT
        df - a pandas Series whose cells hold ';'-separated technology names
             (missing cells are ignored)
        title - string, the title of the plot
        plot - bool, whether or not to draw the bar chart

    OUTPUT
        tech_counts - a pandas Series indexed by technology name holding the
        number of times each one was mentioned, sorted from most to least common.
        When plot is True a bar chart of these counts is also displayed.
    '''
    # Missing answers contribute no technologies; dropping them also keeps
    # the comprehension below from iterating over a float NaN.
    answers = df.dropna().str.split(';')

    # Flatten the per-respondent lists. Iterating the split result (not the raw
    # strings) is what makes each item a technology name rather than a character.
    flat_list = [item.strip() for sublist in answers for item in sublist if item.strip()]

    # most_common() orders by descending count, keeping first-seen order for ties.
    ranked = Counter(flat_list).most_common()
    tech_counts = pd.Series(
        [count for _, count in ranked],
        index=[name for name, _ in ranked],
        dtype='int64',
    )

    if plot:
        positions = range(len(tech_counts))
        plt.figure(figsize=(20,6))
        plt.title(title)
        plt.bar(positions, tech_counts.values, align="center")
        # Rotate the labels so long technology names do not overlap.
        plt.xticks(positions, list(tech_counts.index), rotation=45, ha='right')
        plt.xlabel('Technologies')
        plt.ylabel('Count')
        plt.show()

    return tech_counts

In [None]:
# Languages respondents worked with (LanguageWorkedWith column).
clean_and_plot(temp_dataframe['LanguageWorkedWith'], '2018 Languages')

In [None]:
# Languages respondents want to work with next year (LanguageDesireNextYear column).
clean_and_plot(temp_dataframe['LanguageDesireNextYear'], '2019 Languages')

In [None]:
# Databases respondents worked with (DatabaseWorkedWith column).
clean_and_plot(temp_dataframe['DatabaseWorkedWith'], '2018 Databases')

In [None]:
# Databases respondents want to work with next year (DatabaseDesireNextYear column);
# whatever clean_and_plot returns is bound to props_df.
props_df = clean_and_plot(temp_dataframe['DatabaseDesireNextYear'], '2019 Database')

In [None]:
# Platforms respondents worked with (PlatformWorkedWith column);
# whatever clean_and_plot returns is bound to props_df.
props_df = clean_and_plot(temp_dataframe['PlatformWorkedWith'], '2018 Platforms')

In [None]:
# Platforms respondents want to work with next year (PlatformDesireNextYear column);
# whatever clean_and_plot returns is bound to props_df.
props_df = clean_and_plot(temp_dataframe['PlatformDesireNextYear'], '2019 Platforms')

In [None]:
# Frameworks respondents worked with (FrameworkWorkedWith column);
# whatever clean_and_plot returns is bound to props_df.
props_df = clean_and_plot(temp_dataframe['FrameworkWorkedWith'], '2018 Frameworks')

In [None]:
# Frameworks respondents want to work with next year (FrameworkDesireNextYear column);
# whatever clean_and_plot returns is bound to props_df.
props_df = clean_and_plot(temp_dataframe['FrameworkDesireNextYear'], '2019 Frameworks')

# Question 2: What do people tend to look for more in a potential job?

Let us take a look at what people tend to look for in a potential job. We will be utilizing the following columns and their associated descriptions to answer this question (1 is the most important and 10 is the least important):

- AssessJob1: The industry that I'd be working in
- AssessJob2: The financial performance or funding status of the company or organization
- AssessJob3: The specific department or team I'd be working on
- AssessJob4: The languages, frameworks, and other technologies I'd be working with
- AssessJob5: The compensation and benefits offered
- AssessJob6: The office environment or company culture
- AssessJob7: The opportunity to work from home/remotely
- AssessJob8: Opportunities for professional development
- AssessJob9: The diversity of the company or organization
- AssessJob10: How widely used or impactful the product or service I'd be working on is

In [None]:
#Subset of the survey holding only the 10 AssessJob columns listed above
#(AssessJob1 through AssessJob10, in that order)
temp_dataframe = survey_2018[['AssessJob' + str(rank) for rank in range(1, 11)]]

In [None]:
#Showing the shape and the first 5 rows of the selected columns.
#head() is left as the last expression so the notebook renders it as a table
#instead of wrapped plain text.
print(temp_dataframe.shape)
temp_dataframe.head()

In [None]:
#Determining what percentage of rows is NaN in each AssessJob column
count_outliers(temp_dataframe)

It is very interesting that every single column has exactly the same percentage of NaN values. Let us look into that further to see what we can learn.

In [None]:
#Get a preliminary view of the rows that have no answer for AssessJob1.
#The filter looks at AssessJob1 only; presumably the other AssessJob columns are
#null in the same rows - TODO confirm.
survey_2018[survey_2018['AssessJob1'].isnull()]

In [None]:
#Get a feel for the rows that do have an answer for AssessJob1
#(the filter looks at AssessJob1 only, not at the other AssessJob columns)
survey_2018[survey_2018['AssessJob1'].notnull()]

In [None]:
#At this point, I am only interested in people that reported results.  I will make a note of the NaN in the findings
#report.
#NOTE(review): this rebinds survey_2018 itself, so every later cell that reads survey_2018
#only sees respondents with a non-null AssessJob1. Confirm that is intended for later questions.
survey_2018 = survey_2018.dropna(subset=['AssessJob1'])

In [None]:
#Narrowing the survey down to just the Job Assessment columns for job seekers
#(AssessJob1 through AssessJob10, in that order)
temp_dataframe = survey_2018[['AssessJob' + str(rank) for rank in range(1, 11)]]

In [None]:
#Quick spot check that the first rows no longer contain NaN (head() only shows 5 rows,
#so this is not a full validation). Left as the last expression for rich table rendering.
temp_dataframe.head()

In [None]:
#Giving the AssessJob columns readable names before they are used as plot titles below
temp_dataframe = temp_dataframe.rename(columns={
    'AssessJob1': 'Type of Industry',
    'AssessJob2': 'Financial Performance',
    'AssessJob3': 'Department_Team',
    'AssessJob4': 'Type of Technology',
    'AssessJob5': 'Compensation',
    'AssessJob6': 'Culture',
    'AssessJob7': 'Remote Work',
    'AssessJob8': 'Professional Development',
    'AssessJob9': 'Diversity',
    'AssessJob10': 'Use of Product',
})

In [None]:
#Visualizing all of the Job Assessment columns to see what potential job seekers are looking for.
#Ranking order is 1 (highest) to 10 (lowest)
job_factors = ['Type of Industry', 'Financial Performance',  'Department_Team', 'Type of Technology', 'Compensation','Culture',  'Remote Work', 'Professional Development', 'Diversity', 'Use of Product']
for y in job_factors:
    # Rankings may be stored as text; convert them to integers so the bars are ordered
    # 1..10 instead of in first-seen order. Missing or unparseable entries are dropped.
    rankings = pd.to_numeric(temp_dataframe[y], errors='coerce').dropna().astype(int)
    fig, ax = plt.subplots(figsize=(20,6))
    # Pass the data by keyword: the first positional parameter of countplot is
    # `data` in recent seaborn releases, not `x`.
    sns.countplot(x=rankings, ax=ax)
    # Label after drawing so seaborn's automatic axis labels do not overwrite these.
    ax.set_title(y)
    ax.set_xlabel('Rank (1 = most important, 10 = least important)')
    ax.set_ylabel('Number of respondents')
    plt.show()