# WHAT ARE THE MOST OPTIMAL SKILLS TO LEARN AS A DATA PERSON 
- Most Required Skills to Learn as a Data Analyst
- Most Required Skills to Learn as a Data Scientist
- Most Required Skills to Learn as a Data Engineer
- Most Optimal Skills to Learn for Data Jobs Combined

In [None]:
import ast
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Load the cleaned job-postings dataset that the analyses below read from.
df = pd.read_csv(filepath_or_buffer='0_cleaned_data.csv')

In [None]:
# Rename the 'description_tokens' column to 'skills', rebinding `df` to the
# renamed frame rather than mutating it in place.
df = df.rename(columns={'description_tokens': 'skills'})

## Most Required Skills to Learn as a Data Analyst
Utilize frequency analysis on job postings to identify and rank the top technical and analytical skills essential for a Data Analyst role, such as SQL, Excel, and data visualization tools.

In [None]:
# Restrict the postings to the 'Data Analyst' title; the row count is the
# denominator for the percentages computed below.
df_analyst = df[df['job_title'] == 'Data Analyst']
total_data_analyst_positions = len(df_analyst)

# Each non-missing 'skills' entry is the string form of a Python list: parse it
# with ast.literal_eval and flatten everything into one list of skills.
analyst_skills = [
    skill
    for sublist in df_analyst['skills'].dropna()
    for skill in ast.literal_eval(sublist)
]

# Counting the frequency of each skill for Data Analysts
analyst_skill_counts = Counter(analyst_skills)

# Getting the top 10 most common skills for Data Analysts
top_analyst_skills = analyst_skill_counts.most_common(10)

# Expressing each count as a percentage of all Data Analyst postings
top_analyst_skills_percentages = [
    (skill, (count / total_data_analyst_positions) * 100)
    for skill, count in top_analyst_skills
]

# Extracting skills and their percentages
skills_analyst, percentages_analyst = zip(*top_analyst_skills_percentages)

plt.figure(figsize=(14, 8))
# Every bar shares one colour, so it is passed through `color`. `palette` is
# intended for mapping a `hue` variable, and a fixed-length colour list need
# not match the number of bars actually drawn.
barplot = sns.barplot(x=list(percentages_analyst), y=list(skills_analyst), orient='h', color='#ff6f69')
plt.title('Most Required Skills for Data Analyst Roles')
plt.xlabel('Percentage')

# Adding the text labels on the bars (x = bar length, y = bar position)
for index, value in enumerate(percentages_analyst):
    plt.text(value, index, f'{value:.2f}%')

plt.show()


## Most Required Skills to Learn as a Data Scientist
Analyze the data to determine the most frequently demanded skills for Data Scientists, focusing on areas like machine learning, statistical analysis, and programming languages like Python and R.

In [None]:
# Restrict the postings to the 'Data Scientist' title; the row count is the
# denominator for the percentages computed below.
df_scientist = df[df['job_title'] == 'Data Scientist']
total_data_scientist_positions = len(df_scientist)

# Each non-missing 'skills' entry is the string form of a Python list: parse it
# with ast.literal_eval and flatten everything into one list of skills.
scientist_skills = [
    skill
    for sublist in df_scientist['skills'].dropna()
    for skill in ast.literal_eval(sublist)
]

# Counting the frequency of each skill for Data Scientists
scientist_skill_counts = Counter(scientist_skills)

# Getting the top 10 most common skills for Data Scientists
top_scientist_skills = scientist_skill_counts.most_common(10)

# Expressing each count as a percentage of all Data Scientist postings
top_scientist_skills_percentages = [
    (skill, (count / total_data_scientist_positions) * 100)
    for skill, count in top_scientist_skills
]

# Extracting skills and their percentages
skills_scientist, percentages_scientist = zip(*top_scientist_skills_percentages)

plt.figure(figsize=(14, 8))
# Every bar shares one colour, so it is passed through `color`. The previous
# 6-entry `palette` list did not match the up-to-10 bars drawn here, and
# `palette` is intended for mapping a `hue` variable.
sns.barplot(x=list(percentages_scientist), y=list(skills_scientist), orient='h', color='#005b96')
plt.title('Most Required Skills for Data Scientist Roles')
plt.xlabel('Percentage')

# Adding the text labels on the bars (x = bar length, y = bar position)
for index, value in enumerate(percentages_scientist):
    plt.text(value, index, f'{value:.2f}%')

plt.show()

## Most Required Skills to Learn as a Data Engineer
Analyze the data to identify the key skills needed for Data Engineers, such as expertise in database management, ETL processes, and big data technologies like Spark.

In [None]:
# Restrict the postings to the 'Data Engineer' title; the row count is the
# denominator for the percentages computed below.
df_engineer = df[df['job_title'] == 'Data Engineer']
total_data_engineer_positions = len(df_engineer)

# Each non-missing 'skills' entry is the string form of a Python list: parse it
# with ast.literal_eval and flatten everything into one list of skills.
engineer_skills = [
    skill
    for sublist in df_engineer['skills'].dropna()
    for skill in ast.literal_eval(sublist)
]

# Counting the frequency of each skill for Data Engineers
engineer_skill_counts = Counter(engineer_skills)

# Getting the top 10 most common skills for Data Engineers
top_engineer_skills = engineer_skill_counts.most_common(10)

# Expressing each count as a percentage of all Data Engineer postings
top_engineer_skills_percentages = [
    (skill, (count / total_data_engineer_positions) * 100)
    for skill, count in top_engineer_skills
]

# Extracting skills and their percentages
skills_engineer, percentages_engineer = zip(*top_engineer_skills_percentages)

plt.figure(figsize=(14, 8))
# Every bar shares one colour, so it is passed through `color`. The previous
# 6-entry `palette` list did not match the up-to-10 bars drawn here, and
# `palette` is intended for mapping a `hue` variable.
sns.barplot(x=list(percentages_engineer), y=list(skills_engineer), orient='h', color='#ffcc5c')
plt.title('Most Required Skills for Data Engineer Roles')
plt.xlabel('Percentage')

# Adding the text labels on the bars (x = bar length, y = bar position)
for index, value in enumerate(percentages_engineer):
    plt.text(value, index, f'{value:.2f}%')

plt.show()

## Most Optimal Skills to Learn for Data Jobs
Perform a comparative analysis of skill requirements across the three data-centric roles to isolate the skills that are valuable in all of them.

In [None]:
# NOTE: The skill percentage data was meticulously recorded in an Excel file before being exported as a CSV file named 'skill_percentages.csv'. Rigorous checks were conducted to ensure the accuracy and reliability of the data.
# Load the per-role skill percentages; the bare name on the last line makes the
# notebook render the table as the cell output.
df2 = pd.read_csv(filepath_or_buffer='skill_percentages.csv')
df2

In [None]:
# Plotting the skill percentages for 3 main data jobs
role_columns = ['Data Analyst', 'Data Scientist', 'Data Engineer']

# Drop incomplete rows and take an explicit copy, so the column assignments
# below modify an independent frame instead of a possible view of df2
# (this is what triggers pandas' SettingWithCopyWarning).
df2_cleaned = df2.dropna().copy()

# Converting percentage strings to floats. Casting to str first keeps this
# working when pandas has already parsed a column as numeric.
for col in role_columns:
    df2_cleaned[col] = df2_cleaned[col].astype(str).str.rstrip('%').astype('float')

# Melting the dataframe to long format (one row per skill/job-title pair)
# so seaborn can group the bars by job title.
long_df2 = df2_cleaned.melt(
    id_vars=['Skills'],
    value_vars=role_columns,
    var_name='Job Title',
    value_name='Percentage',
)

plt.figure(figsize=(14, 8))
# One colour per job title, matching the colours of the per-role charts.
sns.barplot(
    x='Percentage',
    y='Skills',
    hue='Job Title',
    data=long_df2,
    orient='h',
    palette=['#ff6f69', '#005b96', '#ffcc5c'],
)

plt.title('Most Popular Skills by Job Title')
plt.xlabel('Percentage')
plt.legend(title='Job Title')

plt.show()

## Top 10 Skills for Data Careers
The most in-demand skills across the three data roles (Data Analyst, Data Scientist, and Data Engineer) combined.

In [None]:
# Extracting and Plotting Most Frequent Skills by Specific Job Titles
# Keep only the postings whose title is one of the three roles of interest.
selected_titles = ['Data Analyst', 'Data Scientist', 'Data Engineer']
df_selected = df.loc[df['job_title'].isin(selected_titles)]

from collections import Counter
import ast

# Function to count skills
def count_skills(data):
    """Count how often each skill appears in ``data['skills']``.

    Args:
        data: Mapping-like object (e.g. a DataFrame) whose ``'skills'`` entry
            iterates over the string representation of a list of skills,
            such as ``"['sql', 'python']"``.

    Returns:
        A ``Counter`` mapping each skill to its number of occurrences.

    Raises:
        ValueError, SyntaxError: If a string entry is not a valid Python
            literal (propagated from ``ast.literal_eval``).
    """
    counts = Counter()
    for item in data['skills']:
        # Missing values (NaN / None) are not strings and cannot be parsed;
        # skip them, mirroring the `.dropna()` used by the per-role cells.
        if not isinstance(item, str):
            continue
        # Convert string representation of list to actual list
        counts.update(ast.literal_eval(item))
    return counts

# Tally every skill mentioned across the selected postings
skills_count = count_skills(df_selected)

# Keep the ten most frequent skills, ordered from most to least common
top_skills = dict(skills_count.most_common(10))
skill_names = list(top_skills)
skill_totals = list(top_skills.values())

# Horizontal bar chart: one bar per skill, bar length = its count
plt.figure(figsize=(12, 8))
sns.barplot(x=skill_totals, y=skill_names, orient='h', palette='Blues_r')
plt.title('Top 10 Skills for Data Analysts, Data Scientists, and Data Engineers')
plt.xlabel('Number of Job Postings')
plt.show()