In [1]:
import numpy as np
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from wordcloud import WordCloud
from textblob import Word

import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("WTT_offers.csv")

df = df[["name", "description"]]
df

Unnamed: 0,name,description
0,Manager - Data & Analytics Engineering,The Data team at Welcome to the Jungle: Part o...
1,Margo Analytics - Data Engineer - H/F,Margo Analytics est l'entité experte de Margo...
2,Data Analyst,Être Data Analyst chez Wewyse c’est : Intégr...
3,Data Analyst - Stage de 6 mois,Afin de mieux comprendre nos clients et leurs ...
4,Data Analyst - Stage - Paris,Travailler chez papernest : définition. Cer...
...,...,...
57,DATA ANALYST EXPERT POWER BI,En raison de l'augmentation de l'appétences de...
58,DATA ANALYST MARKETING DIGITAL F/H,REJOIGNEZ UN COLLECTIF ANIME PAR LE GOUT DU DE...
59,Data Analyst Appui au Pilotage (F/H),Contexte Vous souhaitez rejoindre une entrepri...
60,Sénior data scientist/analyst,Intégré(e) en tant que collaborateur et data s...


In [3]:
df["description"] = df["description"].apply(lambda x: " ".join(x.lower() for x in x.split()))
df["description"] = df["description"].str.replace('[^\w\s]', ' ')
# df["description"] = df["description"].str.replace('\d+', '')
stop = stopwords.words("french")
df["description"] = df["description"].apply(lambda x: ' '.join(x for x in x.split() if x not in stop))
df["description"] = df["description"].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

df.head()

  df["description"] = df["description"].str.replace('[^\w\s]', ' ')


Unnamed: 0,name,description
0,Manager - Data & Analytics Engineering,the data team at welcome to the jungle part of...
1,Margo Analytics - Data Engineer - H/F,margo analytics entité experte margo group pro...
2,Data Analyst,être data analyst chez wewyse intégrer communa...
3,Data Analyst - Stage de 6 mois,afin mieux comprendre client leurs besoins aus...
4,Data Analyst - Stage - Paris,travailler chez papernest définition certains ...


In [4]:
df["name"] = df["name"].apply(lambda x: " ".join(x.lower() for x in x.split()))
df["name"] = df["name"].str.replace('[^\w\s]', ' ')
stop = stopwords.words("french")
df["name"] = df["name"].apply(lambda x: ' '.join(x for x in x.split() if x not in stop))
df["name"] = df["name"].apply(lambda x: ' '.join([Word(word).lemmatize() for word in x.split()]))
df["name"] = df["name"].str.replace('h', '').str.replace('f', '')
df.head()

  df["name"] = df["name"].str.replace('[^\w\s]', ' ')


Unnamed: 0,name,description
0,manager data analytics engineering,the data team at welcome to the jungle part of...
1,margo analytics data engineer,margo analytics entité experte margo group pro...
2,data analyst,être data analyst chez wewyse intégrer communa...
3,data analyst stage 6 mois,afin mieux comprendre client leurs besoins aus...
4,data analyst stage paris,travailler chez papernest définition certains ...


In [5]:
df["name"][df["name"].str.contains("engineer")] = "Data Engineer"
df["name"][df["name"].str.contains("analyst")] = "Data Analyst"
df["name"][df["name"].str.contains("scientist")] = "Data Scientist"

In [6]:
df.name.unique()

array(['Data Engineer', 'Data Analyst'], dtype=object)

In [7]:
df

Unnamed: 0,name,description
0,Data Engineer,the data team at welcome to the jungle part of...
1,Data Engineer,margo analytics entité experte margo group pro...
2,Data Analyst,être data analyst chez wewyse intégrer communa...
3,Data Analyst,afin mieux comprendre client leurs besoins aus...
4,Data Analyst,travailler chez papernest définition certains ...
...,...,...
57,Data Analyst,raison augmentation appétences nicomaticiens d...
58,Data Analyst,rejoignez collectif anime gout defi comme copi...
59,Data Analyst,contexte souhaitez rejoindre entreprise dynami...
60,Data Analyst,intégré e tant collaborateur data scientist rô...


In [8]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df.description)
y = df.name

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=109)

clf =  MultinomialNB()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

In [9]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\royde\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\royde\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [11]:
from textblob import TextBlob

technical_skills = ['python', 'c','r', 'c++','java','hadoop','scala','flask','pandas','spark','scikit-learn',
                    'numpy','php','sql','mysql','css','mongdb','nltk', 'fastai', 'keras', 'pytorch','tensorflow',
                   'linux','Ruby','JavaScript','django','react','reactjs','ai','ui','tableau', 'bi', 'powerbi']

feature_array = vectorizer.get_feature_names_out()
features_numbers = len(feature_array)
n_max = int(features_numbers*0.1)

output=pd.DataFrame()
for i in range(0, len(clf.classes_)):
    print("\n***", clf.classes_[i], "***\n")
    class_prob_indices_sorted = clf.feature_log_prob_[i, :].argsort()[::-1]
    raw_skills = np.take(feature_array, class_prob_indices_sorted[:n_max])
    print("list of unprocessed skills :")
    print(raw_skills)

    top_technical_skills = list(set(technical_skills).intersection(raw_skills))[:6]

    txt = " ".join(raw_skills)
    blob = TextBlob(txt)

    top_adjectives = [w for (w, pos) in TextBlob(txt).pos_tags if pos.startswith("JJ")][:6]

    output = output.append({"job_title" : clf.classes_[i],
                            "technical_skills" : top_technical_skills,
                            "soft_skills" : top_adjectives },
                            ignore_index = True)


*** Data Analyst ***

list of unprocessed skills :
['data' 'données' 'the' 'and' 'équipe' 'to' 'our' 'équipes' 'client' 'of'
 'analyse' 'mise' 'outils' 'mission' 'analyst' 'besoins' 'métiers' 'suivi'
 'sein' 'analysis' 'bi' 'you' 'place' 'pilotage' 'analyser' 'performance'
 'service' 'qualité' 'solution' 'reporting' 'tf1' 'gestion' 'business'
 'team' 'tableau' 'with' 'projets' 'produits' 'power' 'indicateurs'
 'accompagner' 'charge' 'entreprise' 'direction' 'afin' 'nouveaux'
 'ensemble' 'développement' 'via' 'réaliser' 'être' 'participer' 'poste'
 'cadre' 'recherchons' 'product' 'traitement' 'plus' 'marketing' 'produit'
 'nouvelles' 'protection' 'we' 'analytics' 'application' 'quotidien'
 'assurer' 'leurs' 'recommandation' 'technique' 'interne' 'mettre'
 'dashboard' 'bord' 'production' 'optimisation' 'science' 'métier'
 'groupe' 'différents' 'activité' 'développer' 'méthodologie' 'cohérence'
 'jours' 'définition' 'bien' 'in' 'exploiter' 'rattaché' 'faire'
 'construire' 'mener' 'tous' 

  output = output.append({"job_title" : clf.classes_[i],
  output = output.append({"job_title" : clf.classes_[i],


In [12]:
print(output.T)

                                                                  0  \
job_title                                              Data Analyst   
technical_skills                [bi, tableau, sql, powerbi, python]   
soft_skills       [analyse, sein, pilotage, qualité, tf1, entrep...   

                                                                  1  
job_title                                             Data Engineer  
technical_skills      [java, scala, hadoop, spark, tableau, python]  
soft_skills       [objectif, fondateurs, établir, interprète, gr...  
