In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
data = pd.read_csv('clean_data.csv', sep=',')
df = data.copy()
df = df.drop('Unnamed: 0', axis=1)
df

Unnamed: 0,Intitulé du poste,Date de publication,lieu,competences,salaire_minimum,salaire_maximum,Nom de la société,Type de contrat
0,data analyst,2022-12-31,maisons alfort,"support, sql, data management, business object...",45000.0,50000.0,fed it,cdd
1,data analyst,2022-12-18,paris,"data management, business intelligence, bases ...",60000.0,62000.0,fed it,cdi
2,data scientist,2022-12-18,rueil malmaison,"marketing, data management, support",60000.0,65000.0,fed it,cdi
3,consultant,2023-01-13,paris,"big data, oracle, cloud, sap, python",60000.0,70000.0,selescope,
4,chef de projet,2023-01-10,gennevilliers,"marketing, outils, digital, ux, mobile",60000.0,70000.0,hays france,cdi
5,autres,2023-01-15,boulogne-billancourt,"si, data management, data quality, méthodologi...",55000.0,60000.0,michael page,cdi
6,business analyst,2022-12-17,suresnes,"business intelligence, crm, intelligence artif...",70000.0,100000.0,selescope,
7,business analyst,2022-12-27,suresnes,"intelligence artificielle, business intelligen...",70000.0,100000.0,selescope,
8,business analyst,2013-01-14,rueil malmaison,"business intelligence, marketing, intelligence...",35000.0,45000.0,fed it,cdi
9,data scientist,2022-12-17,paris,"machine learning, méthodologie, agile",45000.0,70000.0,key consulting,cdi


In [4]:
df.isnull().mean() * 100
df['Intitulé du poste'].value_counts()

engineer            17
data analyst         4
data scientist       4
business analyst     4
chef de projet       3
autres               3
developer            3
consultant           2
architect            2
expert               1
Name: Intitulé du poste, dtype: int64

In [10]:
cv = CountVectorizer()
comp_tf = cv.fit_transform(df['competences']).toarray()
comp_tf = pd.DataFrame(data = comp_tf, columns = cv.get_feature_names_out()) # permet d'obtenir un df avec les bons noms de colonne
names_comp = comp_tf.columns
comp_tf['Intitulé du poste'] = df['Intitulé du poste']
comp_tf.columns

Index(['agile', 'angularjs', 'artificielle', 'bases', 'big', 'business', 'cft',
       'cisco', 'cloud', 'communication', 'crm', 'data', 'de', 'devops',
       'digital', 'données', 'framework', 'hadoop', 'hp', 'iaas',
       'infrastructure', 'intelligence', 'j2ee', 'java', 'javascript', 'jboss',
       'jenkins', 'lan', 'langage', 'learning', 'linux', 'machine',
       'maintenance', 'management', 'marketing', 'microsoft', 'middleware',
       'mobile', 'méthodologie', 'nosql', 'objects', 'optimisation', 'oracle',
       'outils', 'paas', 'poste', 'python', 'quality', 'recette', 'réseau',
       'réseaux', 'saas', 'sap', 'securité', 'server', 'si', 'spring', 'sql',
       'support', 'système', 'sécurité', 'tomcat', 'travail', 'unix', 'ux',
       'vba', 'vmware', 'wan', 'web', 'weblogic', 'windows',
       'Intitulé du poste'],
      dtype='object')

In [6]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

fig = make_subplots(rows=2, cols=1)

fig.append_trace(go.Bar(
    x=(df["Nom de la société"]).astype(str),
    y=(df["salaire_maximum"]).astype(int), name="salaire_maximum"
), row=1, col=1)

fig.append_trace(go.Bar(
    x=(df["Nom de la société"]).astype(str),
    y=(df["salaire_minimum"]).astype(int), name="salaire_minimum"
), row=2, col=1)


fig.update_layout(height=800, width=1000, title_text="Salary", showlegend=True)
fig.show()

In [7]:
postes = df['Intitulé du poste'].unique() # donne la liste des valeurs possibles
df_gb = comp_tf.groupby('Intitulé du poste')[names_comp].sum()

In [8]:
# caculate the occurrence of competences
occurrence_comp = comp_tf.sum(axis=0)

df_occ_comp = pd.DataFrame(occurrence_comp)
df_occ_comp["competences"] = df_occ_comp.index
df_occ_comp.rename(columns={0 : "occurrence"}, inplace = True)

#df_occ_comp.drop(['Intitulé du poste'], axis=0, inplace=True)

fig = px.bar(df_occ_comp, x="competences", y='occurrence')
fig.show()

In [9]:

fig = px.scatter(df_gb, title="Répartition des compétences pour les différents Intitulés du poste")
fig.update_traces(marker={'size': 15})
fig.show()