In [1]:
import pandas  as pd #Data manipulation
from pymongo import MongoClient
import yaml

# lib ML
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

# Importation des données

In [2]:
# Load the notebook settings from config.yaml (the MongoDB connection itself is opened in the next cell).
# The encoding is given explicitly so parsing does not depend on the platform's default locale;
# safe_load only builds plain Python objects from the YAML content.
with open('config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

In [3]:
# Pick the MongoDB settings block; change the key here to switch between TEST and PROD
mg = config['MONGODB_TEST']
# Connection URI built from the configured host and port
connexion = "mongodb://{}:{}".format(mg['ip'], mg['port'])
client = MongoClient(connexion)
# Config naming quirk: the 'client' entry is used as the database name
# and the 'db' entry as the collection name
db = client[mg['client']]
collec = db[mg['db']]

In [4]:
# Fetch every document of the collection (find() without a filter) and
# materialise the result as the working DataFrame
df = pd.DataFrame(list(collec.find()))
# Disabled offline alternative: df = pd.read_csv('results.csv', encoding = "UTF-8")

In [5]:
# Preview the first rows of the data as loaded from MongoDB
df.head()

Unnamed: 0,_id,job_title,company,job_location,clean_location,salary,per_hour,per_day,per_week,per_month,per_year,min_salary,max_salary,mean_salary,monthly_salary,annual_salary,summary,post_date
0,3150bb96f5dc39e7,Software Engineer - AR / VR (London),Oculus,Télétravail,,,0,0,0,0,0,0,0,0.0,0,0,We are looking for those with the ability to b...,Postedil y a 30+ jours
1,9528bcde2fb94a69,#AGGP – Graduate On Board Computer Simulation ...,Airbus,31300 Toulouse,,,0,0,0,0,0,0,0,0.0,0,0,You will be on a development pathway to prepar...,Postedil y a 2 jours
2,546e7b016e55a5ad,Product Manager,Facebook App,Télétravail,,,0,0,0,0,0,0,0,0.0,0,0,The Platform and Developer tools PM's are resp...,Postedil y a 5 jours
3,0c783b1550c11546,Business Developer (H/F),MICHELIN,Clermont-Ferrand (63),Clermont,37 000 € - 55 000 € par an,0,0,0,0,1,37000,55000,46000.0,0,0,En charge d'un portefeuille clients de 10 à 15...,Postedil y a 10 jours
4,c4c7fd2b6af9e98d,Business Developer H/F,OUIHELP,Tours (37),Tours,32 000 € - 40 000 € par an,0,0,0,0,1,32000,40000,36000.0,0,0,Prospecter et constituer un portefeuille de no...,EmployerDernière activité : il y a 14 jours


In [6]:
# (number of rows, number of columns) of the loaded data
df.shape

(6534, 18)

# Preprocessing

In [7]:
# Keep only the location feature and the salary target; every other column is discarded.
# Note: DataFrame.filter silently skips labels that are absent from the frame.
df = df.filter(items=['clean_location', 'mean_salary'])

In [8]:
# Preview after the column selection (only clean_location and mean_salary remain)
df.head()

Unnamed: 0,clean_location,mean_salary
0,,0.0
1,,0.0
2,,0.0
3,Clermont,46000.0
4,Tours,36000.0


In [9]:
# Discard rows whose mean_salary is exactly 0 (in the previewed rows these are offers with no salary text).
# NOTE(review): the remaining values appear to mix pay scales (e.g. 35.5 and 600.0 next to 46000.0
# in the displayed rows) — confirm whether they should be normalised before modelling.
df = df.loc[df['mean_salary'].ne(0)]

In [10]:
# Display the filtered frame (rows with a non-zero mean_salary)
df

Unnamed: 0,clean_location,mean_salary
3,Clermont,46000.0
4,Tours,36000.0
10,Paris,32500.0
12,Nantes,35.5
15,Tours lieu,37500.0
...,...,...
6501,Charenton,600.0
6505,Tremblay,41000.0
6513,Télétravail lyon,27500.0
6528,Télétravail mayenne,35500.0


Colonne catégorie

In [11]:
# Columns handed to the one-hot encoding branch of the ColumnTransformer
categorical_columns = ['clean_location']
# Only referenced by a commented-out transformer entry further down; despite its name it
# currently holds just the target column (the trailing comment shows the earlier df.columns variant)
all_columns = ['mean_salary']#df.columns

In [12]:
# Show the list of categorical feature columns
categorical_columns

['clean_location']

Features preprocessing

In [13]:
# Feature matrix: the frame without the regression target column
X = df.drop(labels='mean_salary', axis='columns')
X.head()

Unnamed: 0,clean_location
3,Clermont
4,Tours
10,Paris
12,Nantes
15,Tours lieu


In [14]:
# Regression target: the mean_salary column as a Series
y = df.loc[:, 'mean_salary']
y.head()

3     46000.0
4     36000.0
10    32500.0
12       35.5
15    37500.0
Name: mean_salary, dtype: float64

In [15]:
class transformToPredict(BaseEstimator, TransformerMixin):
    """Pipeline entry step that guarantees the next step receives a DataFrame.

    DataFrame input is passed through as a copy. Any other input (a single
    value such as ``'Nantes'``, or a list-like of values) is wrapped into a
    one-column DataFrame so the same pipeline can be used for training and
    for ad-hoc predictions.

    Parameters
    ----------
    column : str, default='clean_location'
        Name of the column created when the input is not a DataFrame.
    """

    def __init__(self, column='clean_location'):
        # Constructor arguments are stored unchanged under the same name so
        # that BaseEstimator.get_params() / clone() can rebuild the object.
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` as a DataFrame.

        Parameters
        ----------
        X : pandas.DataFrame, single value, or list-like of values
            A DataFrame is copied as is. A single value (strings and bytes
            count as single values) gives a one-row frame. A list-like gives
            one row per element.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X``, or a new frame with the single column ``self.column``.
        """
        if isinstance(X, pd.DataFrame):
            # Copy so later steps can never modify the caller's frame.
            return X.copy()

        if isinstance(X, (str, bytes)):
            # A string is iterable, but it represents ONE value, not a
            # sequence of characters.
            values = [X]
        else:
            try:
                values = list(X)
            except TypeError:
                # Non-iterable scalar (number, None, ...): a single row.
                values = [X]

        # No fixed index=[0]: the frame gets one row per provided value.
        return pd.DataFrame({self.column: values})

In [16]:
# First pipeline: normalises the raw input into a DataFrame.
# The step must be an estimator *instance*; passing the bare class would fail
# as soon as the pipeline is fitted or cloned.
transfo_to_df = Pipeline(steps=[
    ('base', transformToPredict())
])

In [17]:
# Categorical branch: one-hot encode, returning a dense array; categories unseen
# during fit are ignored rather than raising.
# A most-frequent SimpleImputer step was considered for missing values but is not enabled yet.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output` —
# confirm against the installed version before upgrading.
transfo_cat = Pipeline([
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore')),
])

In [18]:
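# Draft of a numeric scaling branch, currently disabled
# (note: MinMaxScaler is not among the imports at the top of the notebook)
#transfo_df = Pipeline(steps=[
#    ('scaling', MinMaxScaler())
#])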

In [19]:
# Send the categorical columns through the one-hot branch. Columns that are not
# listed in any transformer are dropped (ColumnTransformer's default remainder).
preparation = ColumnTransformer([
    ('data_cat', transfo_cat, categorical_columns),
    # ('data_df', transfo_df, all_columns)  <- numeric branch, currently disabled
])

# Model

In [20]:
# End-to-end pipeline: input normalisation -> feature encoding -> linear regression
model = LinearRegression()
pipe_model = Pipeline([
    ('base', transformToPredict()),
    ('preparation', preparation),
    ('model', model),
])
pipe_model

Pipeline(steps=[('base', transformToPredict()),
                ('preparation',
                 ColumnTransformer(transformers=[('data_cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['clean_location'])])),
                ('model', LinearRegression())])

# Train test split

In [21]:
# Hold out 20 % of the rows for testing; the fixed random_state keeps the split
# reproducible (the stratify=y option is left disabled)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=2,
)

In [22]:
# Fit every step of the pipeline on the training split
pipe_model.fit(X_train,y_train)

Pipeline(steps=[('base', transformToPredict()),
                ('preparation',
                 ColumnTransformer(transformers=[('data_cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['clean_location'])])),
                ('model', LinearRegression())])

# Model evaluation

In [23]:
# Score of the fitted pipeline on the TRAINING split only (R² for LinearRegression).
# NOTE(review): X_test / y_test produced by the split are never scored in the visible cells —
# consider also reporting pipe_model.score(X_test, y_test) for a held-out estimate.
pipe_model.score(X_train, y_train)

0.3574568407359783

In [28]:
# Predict for a single location passed as a plain string; the 'base' step wraps
# non-DataFrame input into a one-row DataFrame before it reaches the encoder
pipe_model.predict('Nantes')

array([18118.])