# Exploratory Data Analysis (EDA) 
- Descriptive Statistics
- Top Data Related Job Titles in Demand
- Leading Job Platforms for Data Careers
- Essential Skills for Data Jobs
- Comparing Salaries Across Data Roles

In [None]:
import pandas as pd
# Read the cleaned version of the dataset into the DataFrame used by every cell below.
df = pd.read_csv(filepath_or_buffer='0_cleaned_data.csv')

In [None]:
# Display the loaded DataFrame (the notebook renders the cell's last expression).
df

## Descriptive Statistics
Generate basic statistical summaries like mean, median, standard deviation and range to understand the fundamental tendencies and dispersion in the dataset.

In [None]:
 # Summary statistics from DataFrame.describe(), rounded to two decimals for readability.
 df.describe().round(decimals=2)

## Top Data Related Job Titles in Demand: What are the Most Popular Job Titles?
Identify the most frequently advertised job titles in the data field to determine which positions are currently most sought-after by employers.

In [None]:
# Number of postings per job title; show the first ten rows of the frequency table.
df['job_title'].value_counts().head(n=10)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Posting counts for the first ten job titles of the frequency table.
title_counts = df['job_title'].value_counts().head(10)

plt.figure(figsize=(12, 8))

# Horizontal bar chart: counts on the x-axis, job titles on the y-axis,
# coloured with a 10-step "coolwarm" palette.
barplot = sns.barplot(
    x=title_counts.values,
    y=title_counts.index,
    palette=sns.color_palette("coolwarm", n_colors=10),
)

# sns.barplot returns the Axes it drew on, so the labels are set on it directly.
barplot.set_title('Top 10 Job Titles in Data Jobs')
barplot.set_xlabel('Number of Postings')

plt.show()

## Leading Job Platforms for Data Careers: What are the Most Commonly Used Job Platforms?
Analyze which online platforms are most commonly used for posting data-related job vacancies, indicating where job seekers are most likely to find these opportunities.

In [None]:
# Number of postings per job platform; show the first ten rows of the frequency table.
df['job_platform'].value_counts().head(n=10)

In [None]:
# Posting counts for the first ten platforms of the frequency table.
platform_counts = df['job_platform'].value_counts().head(10)

# NOTE(review): 12 shades are requested although at most 10 bars are drawn —
# presumably to keep the palest blues off the chart; confirm this is intended.
palette = sns.color_palette("Blues_r", n_colors=12)

plt.figure(figsize=(12, 8))

# Horizontal bar chart: counts on the x-axis, platform names on the y-axis.
barplot = sns.barplot(
    x=platform_counts.values,
    y=platform_counts.index,
    palette=palette,
)

# sns.barplot returns the Axes it drew on, so the labels are set on it directly.
barplot.set_title('Top 10 Job Platforms in Data Jobs')
barplot.set_xlabel('Number of Postings')

plt.show()

## Essential Skills for Data Jobs: What are the Most In-Demand Skills in Data Jobs?
Extract and prioritize the key skills and qualifications mentioned across job listings to highlight the most in-demand competencies for data professionals.

In [None]:
# Rename the 'description_tokens' column to 'skills' in place; the cells below read df['skills'].
df.rename(mapper={'description_tokens': 'skills'}, axis='columns', inplace=True)

In [None]:
# Tally how often each skill occurs across all postings and print the ten most frequent.
from collections import Counter
import ast

# Each non-missing 'skills' cell holds the string form of a list; parse it with
# ast.literal_eval and pool the items of every posting into one flat list.
all_skills = []
for sublist in df['skills'].dropna():
    all_skills.extend(ast.literal_eval(sublist))

# Occurrences of each skill over the pooled list.
skill_counts = Counter(all_skills)

# (skill, count) pairs for the ten most common skills, most frequent first.
top_skills = skill_counts.most_common(10)

# Print one "skill: count" line per top skill.
for skill, count in top_skills:
    print(f"{skill}: {count}")

### Most Popular Skills in Data Jobs

In [None]:
# Split the (skill, count) pairs into two parallel tuples for plotting.
skills, counts = zip(*top_skills)

plt.figure(figsize=(12, 8))

# Horizontal bar chart: occurrence counts on the x-axis, skill names on the y-axis.
barplot = sns.barplot(
    x=list(counts),
    y=list(skills),
    palette=sns.color_palette("rocket", 10),
)

# sns.barplot returns the Axes it drew on, so the labels are set on it directly.
barplot.set_title('Top 10 Most Frequent Skills in Data Jobs')
barplot.set_xlabel('Frequency')

plt.show()

### Most Popular Skills in Data Jobs as Percentage

In [None]:
# Share of job listings (in percent) that mention each of the top 10 skills.
import ast

# Denominator: every row of df, so listings without a skills entry count as
# "does not mention the skill".
total_rows = len(df)

# Parse each listing's skills once. `skill_lists` was previously referenced
# without ever being defined, which made this cell fail with a NameError.
# One set per listing keeps the membership tests below cheap.
skill_lists = [set(ast.literal_eval(raw)) for raw in df['skills'].dropna()]

# For each top skill: number of listings containing it, as a percentage of all rows.
# The inner loop variable is deliberately not called `skills`, to avoid
# shadowing the notebook-level name of the same spelling.
skill_percentages = [
    (skill, (sum(skill in listing_skills for listing_skills in skill_lists) / total_rows) * 100)
    for skill, _ in top_skills
]
sorted_skills = sorted(skill_percentages, key=lambda x: x[1], reverse=True)

skills, percentages = zip(*sorted_skills)  # Sorted skill names and their percentages

plt.figure(figsize=(12, 8))
barplot = sns.barplot(x=list(percentages), y=list(skills), palette=sns.color_palette("Paired", 10))
plt.title('Frequency of Top 10 Skills in Data Job Listings')
plt.xlabel('Percentage')

plt.show()

### Most Popular Skills in Data Jobs as Percentage Relative to Total Number of Skills

In [None]:
# Total number of skill mentions across all postings (sum of every skill's count).
total_skills = sum(skill_counts.values())

# Each top skill's share of all skill mentions, in percent, in top_skills order.
percentages = [(occurrences / total_skills) * 100 for _, occurrences in top_skills]

# Matching skill names, in the same order as the percentages.
skills = [name for name, _ in top_skills]

plt.figure(figsize=(12, 8))

# Horizontal bar chart: percentage on the x-axis, skill names on the y-axis.
barplot = sns.barplot(
    x=percentages,
    y=skills,
    palette=sns.color_palette("Greens_r", 10),
)

# sns.barplot returns the Axes it drew on, so the labels are set on it directly.
barplot.set_title('Top 10 Most Frequent Skills by Percentage')
barplot.set_xlabel('Percentage')

plt.show()


## Comparing Salaries Across Data Roles
A comparative analysis of salary offerings for different data-related job titles to understand compensation trends in the field.

### Average Salaries for Data Analyst, Data Scientist, Business Analyst and Data Engineer Roles

In [None]:
# Job titles whose mean salary is compared.
selected_titles = ['Data Analyst', 'Data Scientist', 'Data Engineer', 'Business Analyst']

# Keep only rows with one of those titles, average 'salary_standardized'
# per title, and order the result from highest to lowest mean.
avg_salaries = (
    df.loc[df['job_title'].isin(selected_titles)]
    .groupby('job_title')['salary_standardized']
    .mean()
    .sort_values(ascending=False)
)
avg_salaries

In [None]:
plt.figure(figsize=(12, 8))

# Horizontal bar chart: mean salary on the x-axis, job title on the y-axis,
# all bars in a single colour.
barplot = sns.barplot(
    x=avg_salaries.values,
    y=avg_salaries.index,
    color='#005b96',
)

# sns.barplot returns the Axes it drew on, so the labels are set on it directly.
barplot.set_title('Average Salaries for Data Analyst, Data Scientist, Data Engineer and Business Analyst')
barplot.set_xlabel('Average Salary (USD)')

plt.show()

### Average Salaries for Senior Data Job Titles

In [None]:
# Senior-level job titles whose mean salary is compared.
senior_titles = ['Senior Data Analyst', 'Senior Data Scientist', 'Senior Data Engineer', 'Senior Business Analyst']

# Keep only rows with one of those titles, average 'salary_standardized'
# per title, and order the result from highest to lowest mean.
senior_avg_salaries = (
    df.loc[df['job_title'].isin(senior_titles)]
    .groupby('job_title')['salary_standardized']
    .mean()
    .sort_values(ascending=False)
)
senior_avg_salaries

In [None]:
plt.figure(figsize=(10, 6))

# Horizontal bar chart: mean salary on the x-axis, senior job title on the y-axis,
# all bars in a single colour.
barplot = sns.barplot(
    x=senior_avg_salaries.values,
    y=senior_avg_salaries.index,
    color='#005b96',
)

# sns.barplot returns the Axes it drew on, so the labels are set on it directly.
barplot.set_title('Average Salaries for Senior Data Analyst, Senior Data Scientist, Senior Data Engineer and Senior Business Analyst')
barplot.set_xlabel('Average Salary (USD)')
barplot.set_ylabel('Job Title')
plt.show()