In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import webbrowser
from scipy.spatial import distance
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
import pyLDAvis.sklearn
from sklearn.model_selection import GridSearchCV
from pymongo.mongo_client import MongoClient
from bson import ObjectId
from pickle import dump

<h1> Partie 1: </h1>
<h1> Traitement des données </h1>

<h3> chargement des données : </h3>

In [2]:
# Load the profile feature table; the "id" column is used as the row index.
df = pd.read_csv("wevioo6.csv",index_col="id")
# Preview the first rows.
df.head()

Unnamed: 0_level_0,languages,organizations,projects,certifs,recommendations,foll_mean,foll_sd,emp_mean,emp_sd,lk_emp_mean,...,pylons,magneto,ntask,looker,knime,waterfall,rackspace,squarespace,codeship,ikoula
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5e20582f2d080b22a8f778ad,0,0,0,0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5e20582f2d080b22a8f778ae,3,1,0,0,2,155167.666667,197516.032994,3500.333333,4601.195521,5742.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5e20582f2d080b22a8f778af,3,1,5,0,0,169.285714,257.728381,78.571429,172.910263,26.714286,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5e20582f2d080b22a8f778b6,0,0,0,0,0,68.833333,153.916012,168.333333,371.950564,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5e20582f2d080b22a8f778b0,4,6,5,4,0,59964.0,58812.438123,2887.5,4120.129701,3177.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<h2> connexion à la base de données : </h2>
<p>
    il faut avoir les 2 collections :
    <ul> 
        <li>
            <strong>"dataBrut"</strong> qui contient les données du départ
        </li>    
        <br>
        <li>
            <strong>"final"</strong> qui contient les données finales sous forme de JSON
        </li>    
    </ul>
</p>

In [3]:
# No host/port is passed, so pymongo's default connection settings are used.
client = MongoClient()
# NOTE: despite its name, `db` is the "dataBrut" *collection* of the "pi" database,
# not a database handle. It is queried by get_info / get_us.
db = client.get_database('pi').get_collection('dataBrut')
# "final" collection of the same database; queried by get_us_final.
final = client.get_database('pi').get_collection('final')

<h2> traitement des données : </h2>
    <p> Cette étape consiste à attribuer des coefficients aux "features"</p>
    <p> Il suffit de changer les coefficients de chaque catégorie dans la liste <code>coeff</code> ci-dessous :</p>
    <p> <strong> Si une catégorie de features a un coefficient égal à 0, elle sera éliminée du dataframe. </strong></p>

In [4]:
# Column-name groups. Each group receives its own coefficient in `coeff` below.
# NOTE(review): what each group represents is inferred from the column names only.
sums = ["languages","organizations","projects","certifs","recommendations"]
dates = ["date_range","date_mean"]
educations = ["eng","mast","lic","phd"]
others = [
"foll_mean",
"foll_sd",
"emp_mean",
"emp_sd",
"lk_emp_mean",
"lk_emp_sd",
"Société civile/Société commerciale/Autres types de sociétés",
"Administration publique",
"Établissement éducatif",
"Société de personnes (associés)",
"Société cotée en bourse",
"Non lucratif",
"Entreprise individuelle",
"Travailleur indépendant ou profession libérale",
]
# Every column whose name starts with "sk".
# NOTE(review): the displayed columns use an "sk_" prefix; a bare "sk" prefix would also
# capture any other column that happens to start with "sk" — confirm this is intended.
skills = [c for c in df.columns if c.startswith("sk")]
# Every remaining column, i.e. those that belong to none of the groups above.
pro_skills = [c for c in df.columns if c not in sums+dates+educations+others+skills]


# (columns, coefficient) pairs consumed by update_coef:
# a coefficient of 0 drops the group's columns, any other value multiplies them.
coeff = [
    (sums,0),
    (dates,0),
    (educations,0),
    (others,0),
    (skills,0.5),
    (pro_skills,3)
    ]

In [5]:
def get_info(id):
    """Return ``(url, name, headline, search)`` for one raw profile.

    The document is fetched from the module-level ``db`` collection by its
    ``_id`` (``id`` is wrapped in ``ObjectId``). ``name`` and ``headline``
    are read from the document's ``personal_info`` sub-document.
    """
    document = db.find_one({'_id': ObjectId(id)})
    return (
        document["url"],
        document["personal_info"]["name"],
        document["personal_info"]["headline"],
        document["search"],
    )

def get_us(id):
    """Return the document of the ``db`` collection whose ``_id`` is ``ObjectId(id)``."""
    query = {'_id': ObjectId(id)}
    return db.find_one(query)

def get_us_final(id):
    """Return the document of the ``final`` collection whose ``id`` field equals ``id``."""
    query = {'id': id}
    return final.find_one(query)

def show_topic(profile, topics_list):
    """Return the entry of ``topics_list`` matching the profile's strongest LDA topic.

    Relies on the module-level ``lda`` model and ``lda_df`` frame, which are
    created further down in the notebook.
    """
    features = lda_df.loc[profile].values.reshape(1, -1)
    topic_weights = lda.transform(features)
    strongest = np.argmax(topic_weights)
    return topics_list[strongest]

def open_browser(link):
    """Open ``link`` using the standard ``webbrowser`` module."""
    webbrowser.open(link)
    
def update_coef(df, coeff):
    """Return a weighted copy of ``df``; ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table.
    coeff : iterable of (list of column labels, number)
        For each pair, a weight of 0 removes the listed columns from the
        result; any other weight multiplies every listed column by it.

    Returns
    -------
    pandas.DataFrame
        The copy with dropped / rescaled columns.

    Raises
    ------
    KeyError
        If a listed column is not present in the frame.
    """
    weighted = df.copy()
    for group, weight in coeff:
        if weight == 0:
            weighted = weighted.drop(group, axis=1)
        else:
            for column in group:
                # Vectorised multiplication instead of a per-element apply(lambda).
                weighted[column] = weighted[column] * weight
    return weighted

def unit_pro_skills(df, columns=None):
    """Return a copy of ``df`` whose selected columns are turned into 0/1 flags.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; it is not modified.
    columns : list of column labels, optional
        Columns to binarise. Defaults to the module-level ``pro_skills``
        list, which is what the function always used before this parameter
        existed.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` where each selected column holds 1 where the original
        value was different from 0 and 0 elsewhere (integer dtype).
    """
    if columns is None:
        columns = pro_skills
    binarised = df.copy()
    for column in columns:
        # Vectorised "!= 0" test instead of a per-element apply(lambda).
        binarised[column] = (binarised[column] != 0).astype("int64")
    return binarised

<p> Filtrage et traitement des features selon la liste de coefficients ci-dessus
    (on peut remarquer que le nombre de features a diminué, puisqu'il y a des catégories de features dont le coefficient est égal à 0).</p>

In [6]:
# Binarise the pro-skill columns first, then apply the per-group coefficients
# (groups with a coefficient of 0 are dropped by update_coef).
df_updated = update_coef(unit_pro_skills(df), coeff)
# Preview the first rows of the weighted table.
df_updated.head()

Unnamed: 0_level_0,sk_kubernetes,sk_elasticsearch,sk_html,sk_vagrant,sk_oracle,sk_kibana,sk_c\+\+,sk_mysql,sk_logstash,sk_apache,...,pylons,magneto,ntask,looker,knime,waterfall,rackspace,squarespace,codeship,ikoula
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5e20582f2d080b22a8f778ad,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
5e20582f2d080b22a8f778ae,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0,0,0,0,0,0,0,0,0,0
5e20582f2d080b22a8f778af,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.5,...,0,0,0,0,0,0,0,0,0,0
5e20582f2d080b22a8f778b6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
5e20582f2d080b22a8f778b0,0.0,0.5,0.5,0.0,0.0,0.5,0.0,0.5,0.0,0.5,...,0,0,0,0,0,0,0,0,0,0


<h1> Partie 2 : </h1> 
<h1> Étude Statistique </h1>

<h2> Applications du clustering </h2>
<br>
<p> L'objectif de cette étape est de réduire la dimensionnalité du dataframe, puisqu'on a plus de 500 colonnes. La meilleure façon est d'appliquer une LSA ou une LDA, qui permettent de résumer les données dans une dimension très inférieure. Mais tout d'abord, il faut connaître le nombre de catégories présentes dans les données. On sait d'après une étude préliminaire qu'on a plus de 10 catégories, mais on n'en connaît pas le nombre exact. L'une des méthodes est donc d'appliquer un clustering pour mieux estimer le nombre de catégories (topics).</p>

In [None]:
# Silhouette score for every candidate number of clusters (7 to 19).
# The feature matrix is sliced once, and KMeans is seeded so that re-running
# the notebook yields the same scores.
skills_matrix = df_updated[pro_skills]
tmp_arr = []
for i in range(7, 20):
    km = KMeans(n_clusters=i, random_state=19)
    # fit_predict fits the model and returns the cluster label of every row.
    results = km.fit_predict(skills_matrix)
    tmp_arr.append(silhouette_score(skills_matrix, results))

In [None]:
# Silhouette score against the number of clusters tried in the KMeans sweep.
fig, ax = plt.subplots(figsize=(15,10))
# The x values must stay in sync with the range used when filling tmp_arr (7 to 19).
ax.plot(range(7,20), tmp_arr, marker="o")
ax.set_xlabel("Nombre de clusters (k)")
ax.set_ylabel("Score de silhouette")
ax.set_title("Score de silhouette en fonction du nombre de clusters KMeans")
plt.show()

<h2> Applications du LDA </h2>
<br>
<p> L'objectif ici est de visualiser les categories qu'on a : </p>   
<p> Après des tests, on a remarqué que le nombre optimal de catégories est aux alentours de 16 (le modèle ci-dessous est entraîné avec 12 topics).</p>

In [7]:
# LDA topic model with 12 topics; the seed is fixed for reproducibility.
lda = LatentDirichletAllocation(n_components=12, random_state=19)

In [8]:
# The topic model is trained on the pro-skill columns only.
lda_df = df_updated[pro_skills]
# Fit in place; as the last expression, the fitted estimator's repr is displayed.
lda.fit(lda_df)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=12, n_jobs=None,
                          perp_tol=0.1, random_state=19, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [9]:
# One row per LDA topic, one column per pro-skill feature (values are lda.components_).
topics = pd.DataFrame(lda.components_, columns=lda_df.columns)

In [10]:
# For every topic, keep (and print) its 15 highest-weighted features.
topics_list = []
for i in range(len(topics)):
    tmp = topics.iloc[i].sort_values(ascending=False).head(15)
    print(tmp)
    print("\n***********************************\n")
    topics_list.append(tmp)

python            1720.216405
learning          1541.513080
deep               576.083327
classification     540.705761
network            454.940530
mining             369.402057
django             334.360430
tensorflow         267.083330
clustering         254.879047
scrap              202.969359
rest               202.478154
statistica         189.076974
flask              179.726072
scikit             175.796109
keras              174.083330
Name: 0, dtype: float64

***********************************

java          3683.324103
jee           2390.886952
spring        2228.409664
angular       1895.973456
hibernate     1166.904950
jsf            689.500300
rest           646.698032
maven          564.620420
mysql          547.290715
scrum          368.350706
git            322.158913
postgresql     280.387647
ips            192.794331
bootstrap      181.385622
cloud          174.338321
Name: 1, dtype: float64

***********************************

cloud         1661.335354
ember     

In [12]:
# Human-readable label for each topic number (position in the list = topic number).
# NOTE(review): 13 labels are defined while `lda` is built with 12 components,
# so label 12 ("databases") can never be produced by the current model — confirm.
topics_dict = dict(enumerate([
    "python/ml",
    "java/jee/angular",
    "cloud/linux",
    "php/ symfony",
    "scrum/agile",
    "spring java",
    "c / c++ / embarque",
    "testing",
    "management",
    "web .net",
    "big data",
    "networking",
    "databases",
]))

In [13]:
def get_cat_test(id):
    """Print a quick sanity check of the topics assigned to one profile.

    Prints the ``topics_dict`` labels of every LDA topic whose weight for the
    profile is above 0.1, then the profile's headline and search string
    (the last two values returned by ``get_info``), then a separator line.
    """
    features = df_updated[pro_skills].loc[id].values.reshape(1, -1)
    weights = lda.transform(features)[0]
    labels = [topics_dict[i] for i in range(len(weights)) if weights[i] > 0.1]
    print(labels)
    print(get_info(id)[2:])
    print("******************************************************************************")

<h2> LSA </h2>
<p> L'application de la LSA nous permet de réduire la dimension de notre dataframe de plus de 500 features à environ 16 (le code ci-dessous conserve 50 composantes). </p>

In [11]:
# LSA: project the weighted features onto 50 SVD components.
# The seed is pinned because TruncatedSVD's default solver is randomized; without it
# the embedding (and every similarity computed from it) changes from run to run.
svd = TruncatedSVD(n_components=50, random_state=19)
# Keep the profile ids as index so rows can still be looked up by id.
df_optim = pd.DataFrame(svd.fit_transform(df_updated), index=df_updated.index)
df_optim

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5e20582f2d080b22a8f778ad,-5.089116e-18,-1.084344e-14,-1.238055e-16,-2.202485e-14,-2.268344e-14,-4.836109e-14,-1.785017e-14,-1.839822e-14,9.064901e-14,-5.173591e-14,...,-3.133736e-14,-1.446075e-14,1.414854e-14,-2.418163e-14,4.285441e-14,2.797524e-14,8.282156e-15,-8.169278e-16,3.837207e-14,2.133906e-14
5e20582f2d080b22a8f778ae,4.921136e+00,-4.533432e-01,9.956072e-01,2.839638e+00,-2.825841e+00,-7.650696e-01,-6.439435e-01,1.314160e+00,-2.770777e-01,-2.316935e+00,...,6.239871e-03,-2.472030e+00,-1.824420e+00,-1.523505e-01,-1.856718e+00,-2.097323e-01,-7.362980e-02,-1.140248e+00,7.812292e-01,9.506693e-01
5e20582f2d080b22a8f778af,2.135068e-01,-1.048440e-01,5.954049e-02,-4.511847e-02,-2.254960e-01,-1.250694e-02,5.152063e-02,1.777751e-02,-1.174244e-01,-7.338148e-02,...,-3.250907e-01,9.272272e-02,-6.285319e-02,3.931224e-01,-3.419186e-01,-2.454405e-02,1.677632e-01,-2.362742e-01,-2.424538e-01,4.194508e-02
5e20582f2d080b22a8f778b6,8.395725e-02,8.409246e-02,2.160605e-01,-4.286242e-02,1.909983e-02,-1.163251e-01,-8.029908e-02,-2.019155e-01,-5.290549e-02,1.037444e-01,...,2.888663e-01,-2.263893e-01,5.206424e-01,9.705414e-01,8.671337e-02,3.718435e-01,5.119506e-01,8.655475e-02,7.382682e-01,-4.125966e-02
5e20582f2d080b22a8f778b0,4.980552e+00,2.369446e+00,1.857211e+00,2.668695e+00,-9.876541e-01,1.901342e+00,-2.577967e-03,-3.458022e+00,1.378077e+00,-2.711166e-01,...,5.632219e-01,5.712368e-01,-8.653373e-01,-1.319804e+00,-1.185450e+00,1.592850e-01,2.527420e-01,4.140520e-01,-3.434753e-01,2.084769e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5e2058312d080b22a8f79fb9,1.192645e-01,6.345781e-03,1.124764e-01,-1.667187e-02,-1.519659e-01,-5.671496e-02,1.023575e-02,-1.356153e-02,-4.668997e-02,1.675943e-02,...,-2.208482e-01,5.434269e-02,-3.835747e-02,2.636075e-01,-2.490862e-01,-7.329530e-03,1.450665e-01,-1.293733e-01,-1.454870e-01,1.050718e-02
5e2058312d080b22a8f79fb3,5.455564e+00,-3.577141e+00,-1.173258e+00,-4.200504e-01,-1.183675e+00,-1.070937e+00,2.140451e-01,6.320309e-01,2.985868e-01,-5.680187e-01,...,-3.351268e-01,-1.469546e-01,4.040353e-01,9.530620e-01,4.291482e-01,2.656300e-01,-2.126751e-01,2.998752e-01,-1.020787e+00,-8.865480e-02
5e2058312d080b22a8f79fb8,2.727150e+00,4.253238e-01,-1.055128e+00,-5.219450e-01,-2.027698e+00,2.805398e+00,-1.079050e+00,3.348360e-01,9.569605e-01,1.091202e-01,...,-3.635874e-02,3.409656e-01,2.246599e-01,-2.406236e-01,-2.965887e-01,2.090489e-01,-2.117369e-01,-4.688651e-01,3.806038e-01,-5.420523e-01
5e2058312d080b22a8f79fba,4.892920e-01,-1.766028e-01,2.871989e-01,-4.322785e-01,-6.673975e-02,-2.525399e-01,5.299641e-01,-6.251204e-01,1.255586e-01,-5.548108e-01,...,-4.736571e-01,6.037907e-02,-3.938470e-03,4.647835e-01,-9.596098e-02,-7.885698e-02,2.684656e-01,-3.168026e-01,-3.510725e-01,3.750955e-02


In [12]:
# Total share of the variance retained by the kept SVD components.
sum(svd.explained_variance_ratio_)

0.753343940874106

In [17]:
def get_cat(id, threshold=0.4):
    """Return the numbers of the LDA topics that dominate one profile.

    Parameters
    ----------
    id : index label
        Row label of the profile in ``df_updated``.
    threshold : float, optional
        A topic is kept when its weight for the profile is strictly greater
        than this value. Defaults to 0.4, the value that used to be
        hard-coded.

    Returns
    -------
    list of int
        Topic numbers in increasing order; empty when no topic passes the
        threshold.
    """
    features = df_updated[pro_skills].loc[id].values.reshape(1, -1)
    weights = lda.transform(features)[0]
    return [topic for topic, weight in enumerate(weights) if weight > threshold]

In [18]:
# Topic numbers per profile, in the row order of df_optim.
# A single batched lda.transform replaces one transform call per profile: every row is
# inferred independently, so the weights are the same but the cell runs far faster.
doc_topic_weights = lda.transform(df_updated[pro_skills].loc[df_optim.index])
# Same rule as get_cat: keep the topics whose weight is strictly above 0.4.
tmp = [
    [topic for topic, weight in enumerate(weights) if weight > 0.4]
    for weights in doc_topic_weights
]

In [19]:
# Profile id -> list of topic numbers retained for that profile.
cat = pd.Series(tmp,index=df_optim.index)

In [20]:
# Invert `cat`: topic number -> ids of the profiles assigned to that topic.
# NOTE(review): 13 topic numbers are scanned while `lda` is built with 12 components,
# so the entry for topic 12 is expected to stay empty — confirm the intended count.
dict_final = {}
for i in range(13):
    tmp = [index for index, value in cat.items() if i in value]
    dict_final[i] = tmp

In [21]:
# Display the topic-number -> label mapping for reference.
topics_dict

{0: 'python/ml',
 1: 'java/jee/angular',
 2: 'cloud/linux',
 3: 'php/ symfony',
 4: 'scrum/agile',
 5: 'spring java',
 6: 'c / c++ / embarque',
 7: 'testing',
 8: 'management',
 9: 'web .net',
 10: 'big data',
 11: 'networking',
 12: 'databases'}

In [None]:
# Disabled snippet kept as a bare string literal: it would pickle df_optim to 'df.pckl'.
"""
with open('df.pckl','wb') as file:
    dump(df_optim,file)
"""

In [150]:
# Disabled snippet kept as a bare string literal: it would pickle `cat` to 'cat.pckl'.
"""
with open('cat.pckl','wb') as file:
    dump(cat, file)
"""

In [19]:
# Persist the topic -> profile ids mapping with pickle (binary mode is required).
with open('cat_dict.pckl','wb') as file:
    dump(dict_final, file)

<h1> Partie 3 : </h1> 

<h1> Algorithmes de similarité : </h1>

In [13]:
def euclid(us1, us2):
    """Euclidean distance between two 1-D vectors, computed by SciPy."""
    dist = distance.euclidean(us1, us2)
    return dist

def cosine(us1, us2):
    """Pairwise cosine similarity between the rows of ``us1`` and the rows of ``us2``.

    Thin wrapper around scikit-learn's ``cosine_similarity``: both arguments
    are 2-D (one vector per row) and the result is the similarity matrix.
    """
    similarity_matrix = cosine_similarity(us1, us2)
    return similarity_matrix

In [14]:
def euclid_near(id,dfu,n):
    """Return the ``n`` rows of ``dfu`` closest to row ``id`` (Euclidean distance).

    Parameters
    ----------
    id : index label
        Label of the reference row; it is excluded from the results.
    dfu : pandas.DataFrame
        Numeric feature table, one row per profile. It is not modified.
    n : int
        Maximum number of neighbours returned.

    Returns
    -------
    list of dict
        ``{"id": label, "distance": value}`` entries sorted by increasing
        distance; rows at equal distance keep their order in ``dfu``.

    Raises
    ------
    KeyError
        If ``id`` is not in the index of ``dfu``.
    """
    base_user = dfu.loc[id].values
    others = dfu.drop(id, axis=0)
    # One vectorised norm over all candidate rows instead of a Python-level loop.
    distances = np.linalg.norm(others.values - base_user, axis=1)
    results = [
        {"id": index, "distance": dist}
        for index, dist in zip(others.index, distances)
    ]
    # sorted() is stable, so ties keep the frame's row order.
    return sorted(results, key=lambda x: x["distance"])[:n]


def cosine_near(id,dfu,n):
    """Return the ``n`` rows of ``dfu`` most similar to row ``id`` (cosine similarity).

    Parameters
    ----------
    id : index label
        Label of the reference row; it is excluded from the results.
    dfu : pandas.DataFrame
        Numeric feature table, one row per profile. It is not modified.
    n : int
        Maximum number of neighbours returned.

    Returns
    -------
    list of dict
        ``{"id": label, "distance": value}`` entries sorted by decreasing
        value. Despite the key name, the value is a cosine *similarity*
        (higher means closer). Rows with equal similarity keep their order
        in ``dfu``.

    Raises
    ------
    KeyError
        If ``id`` is not in the index of ``dfu``.
    """
    base_user = dfu.loc[id].values
    others = dfu.drop(id, axis=0)
    if len(others) == 0:
        # Nothing to compare with; a pairwise call on an empty matrix would fail.
        return []
    # One pairwise call: row 0 holds the similarity of base_user to every other row.
    similarities = cosine([base_user], others.values)[0]
    results = [
        {"id": index, "distance": similarity}
        for index, similarity in zip(others.index, similarities)
    ]
    return sorted(results, key=lambda x: x["distance"], reverse=True)[:n]

<h1> Partie 4 : </h1>

<h1> testing </h1>

In [35]:
# Id of the reference profile used for the manual checks below.
profile = "5e20582f2d080b22a8f77e62"


In [36]:
# 10 most similar profiles in the SVD-reduced space (cosine similarity).
cosine_near(profile,df_optim,10)

[{'id': '5e2058312d080b22a8f79c7a', 'distance': 0.9495684422547691},
 {'id': '5e2058312d080b22a8f79cb3', 'distance': 0.9495684422547691},
 {'id': '5e2058312d080b22a8f79262', 'distance': 0.9493364748036738},
 {'id': '5e2058312d080b22a8f78eca', 'distance': 0.8912213142292954},
 {'id': '5e2058312d080b22a8f78cf7', 'distance': 0.8573812017649863},
 {'id': '5e2058312d080b22a8f797d7', 'distance': 0.8467909650825087},
 {'id': '5e2058312d080b22a8f79748', 'distance': 0.8395043919046465},
 {'id': '5e2058302d080b22a8f78a18', 'distance': 0.8383428127760214},
 {'id': '5e2058312d080b22a8f78dd2', 'distance': 0.8373907144775034},
 {'id': '5e20582f2d080b22a8f77db2', 'distance': 0.8187010589084209}]

In [38]:
# Show the reference profile, then its 8 nearest neighbours.
# Each neighbour's profile URL is also opened in the web browser.
print(get_info(profile))
print("results : ")

for item in cosine_near(profile, df_optim, 8):
    neighbour_id = item['id']
    obj = get_info(neighbour_id)
    print(neighbour_id)
    print(obj)
    # obj[0] is the profile URL returned by get_info.
    open_browser(obj[0])

('https://www.linkedin.com/in/wael-dinari-b82685a0', 'Wael Dinari', 'Ingénieur Développeur web PHP | Symfony', 'Développeur symfony php')
results : 
5e2058312d080b22a8f79c7a
('https://www.linkedin.com/in/hassene-riahi', 'Hassene Riahi', 'Consultant PHP Drupal 7 et 8 / Symfony chez Business & Décision Tunisie', 'angularjs')
5e2058312d080b22a8f79cb3
('https://www.linkedin.com/in/hassene-riahi-17b06b150', 'Hassene Riahi', 'Consultant PHP Drupal 7 et 8 / Symfony chez Business & Décision Tunisie', '(Angular | AngularJs)  Business and Decision ')
5e2058312d080b22a8f79262
('https://www.linkedin.com/in/marwenhlaoui', 'Marwen Hlaoui', 'Web Developer', ' Framework Symfony (2|3)  Git ')
5e2058312d080b22a8f78eca
('https://www.linkedin.com/in/syrine-mrad-24b816155', 'Syrine MRAD', 'Développeur web sur Carte Blanche Conseil chez ITGWANA', ' FIS  Proxym ')
5e2058312d080b22a8f78cf7
('https://www.linkedin.com/in/imene-ben-salem-62243897', 'IMENE Ben salem', 'Développeuse Full Stack at EVOLUTIVE GROUP',

In [37]:
# Display the processed ("final" collection) document of the reference profile.
get_us_final(profile)

{'_id': ObjectId('5e9c6af5950fb59048c7c18f'),
 'id': '5e20582f2d080b22a8f77e62',
 'url': 'https://www.linkedin.com/in/wael-dinari-b82685a0',
 'search': 'Développeur symfony php',
 'skills': ['css',
  'c++',
  'html',
  'wordpress',
  'ajax',
  'drupal',
  'php',
  'javascript',
  'symfony',
  'java',
  'jquery',
  'bootstrap',
  'hibernate',
  'git',
  'jira',
  'spring',
  'codeigniter',
  'angular',
  ' c ',
  'scrum'],
 'experiences': {'skills': {'php': 70,
   'drupal': 53,
   'symfony': 70,
   'jquery': 48,
   'bootstrap': 48,
   'css': 48,
   'javascript': 48,
   'html': 48,
   'mysql': 48,
   'git': 13,
   'jira': 13,
   'codeigniter': 13},
  'companies': {'foll_mean': 37464.333333333336,
   'foll_sd': 73975.60013424847,
   'emp_mean': 1733.5,
   'emp_sd': 3698.4201018813424,
   'lk_emp_mean': 1679.1666666666667,
   'lk_emp_sd': 3688.8436497393354},
  'date_range': 5,
  'date_mean': 1.3333333333333333,
  'companies_type': {'Société cotée en bourse': 1,
   'Société civile/Société 