Dans ce notebook nous entrainerons nos données sur les restes dees facultées et ensuite continuerons après

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns #for beatiful visualizations
%matplotlib inline 
import scipy.stats as scs #for statistics
import operator
from scipy.stats import chi2_contingency
import matplotlib.ticker as ticker
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.preprocessing import *

In [67]:
from sklearn.linear_model import Ridge,Lasso,ElasticNet,LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.svm import LinearSVR
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.externals import joblib

In [174]:
# %load ../codes/predictiveModelClass.py
class PredictiveModelByilding(object):
  """docstring for PredictiveModelByilding
this class will handle all pipeline for a preeictive modèle building 
in the chapter of my machine learning project ,
it will train differents modele, encode data,scale data, and so on
  """
  def __init__(self,dataset,encoderFunction):
    if isinstance(dataset,pd.DataFrame):
      self.dataset=dataset
      self.training_set=pd.DataFrame()
      self.test_set=pd.DataFrame()
      self.predictiveModels={}
      self.X_train=pd.DataFrame()
      self.X_test=pd.DataFrame()
      self.Y_train=pd.Series()
      self.Y_test=pd.Series()
      self.dataset_bin=encoderFunction(dataset,catCol=['SCHOOL_RIGHT', 'OPTION_RIGHT'],numCol=['DIPPERC','CGPA','EchecRatio'])
      self.dataset_bin.reset_index(inplace=True)
      ### init all models
      ridge_reg=Ridge(alpha=1, solver="cholesky")
      linSVM_reg=LinearSVR(dual=False,fit_intercept=False,loss='squared_epsilon_insensitive' )
      rbfSVM_reg=SVR(verbose=True)
      lasso_reg=Lasso(alpha=1e-05,max_iter=10000)
      elastic_reg=ElasticNet(alpha=1e-05,max_iter=10000,l1_ratio=1)
      self.predictiveModels[ridge_reg.__class__.__name__]=ridge_reg
      self.predictiveModels[linSVM_reg.__class__.__name__]=linSVM_reg
      self.predictiveModels[rbfSVM_reg.__class__.__name__]=rbfSVM_reg
      self.predictiveModels[lasso_reg.__class__.__name__]=lasso_reg
      self.predictiveModels[elastic_reg.__class__.__name__]=elastic_reg

    else:
      raise TypeError('need only a DataFrame')
  def scale(self,numCols):

    """this function will scale the values of GPA and DIP percentage by divide them by 100"""
    self.dataset_bin.loc[:,numCols[0]] = self.dataset_bin[numCols[0]]/100
    self.dataset_bin.loc[:,numCols[1]] = self.dataset_bin[numCols[1]]/100
  def split(self):
    """the function will split the dataset into a train and a test one" and return X_train and X_Test"""
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(self.dataset_bin, self.dataset_bin.EchecRatio):
      self.training_set=self.dataset_bin.loc[train_index]
      self.test_set=self.dataset_bin.loc[test_index]
      self.training_set.set_index(keys='ID',inplace=True)
      self.test_set.set_index(keys='ID',inplace=True)
    self.Y_train=self.training_set.CGPA
    self.X_train=self.training_set.drop(labels=['CGPA','EchecRatio'],axis=1)
    self.Y_test= self.test_set.CGPA
    self.X_test= self.test_set.drop(labels=['CGPA','EchecRatio'],axis=1)  
    return self.training_set.describe()[['DIPPERC','CGPA']],self.test_set.describe()[['DIPPERC','CGPA']]
  def train(self):
    """will train diverents models with X , Y pass in parametes"""
    predictions={}
    for clf in self.predictiveModels.values():
      clf.fit(self.X_train, self.Y_train)
      predictions[clf.__class__.__name__]= clf.predict(self.X_train)
    predictedVal=pd.DataFrame.from_dict(predictions,dtype=np.float)
    predictedVal.set_index(self.Y_train.index,inplace=True)
    predictedVal.loc[:,'RealValue']=self.Y_train
    return predictedVal
  def predictTest(self):
    """evaluate the model on test set"""
    predictions={}
    for clf in self.predictiveModels.values():
      predictions[clf.__class__.__name__]= clf.predict(self.X_test)
    predictedVal=pd.DataFrame.from_dict(predictions,dtype=np.float)
    predictedVal.set_index(self.Y_test.index,inplace=True)
    predictedVal.loc[:,'RealValue']=self.Y_test
    return predictedVal
  def evaluate (self,model,on):
    """ this function will first do a evaluation of a mdels and return the RMSE score of it and some datat and their labesl
    """
    if on=='train':
      some_data = self.X_train.iloc[:5]
      some_labels = self.Y_train.iloc[:5]
      print("Predictions:\t", self.predictiveModels[model].predict(some_data))
      print("Labels:\t\t", list(some_labels))
      CGPA_predictions = self.predictiveModels[model].predict(self.X_train)
      lin_mse = mean_squared_error(self.Y_train , CGPA_predictions)
      lin_rmse=np.sqrt(lin_mse)
      return lin_rmse
    elif on=='test':
      some_data = self.X_test.iloc[:5]
      some_labels = self.Y_test.iloc[:5]
      print("Predictions:\t", self.predictiveModels[model].predict(some_data))
      print("Labels:\t\t", list(some_labels))
      CGPA_predictions = self.predictiveModels[model].predict(self.X_test)
      lin_mse = mean_squared_error(self.Y_test , CGPA_predictions)
      lin_rmse=np.sqrt(lin_mse)
      return lin_rmse

  def crossEvaluate(self,model):
    """this one will perfom a cross validation of the model"""
    scores = cross_val_score(self.predictiveModels[model], self.X_train, self.Y_train,scoring="neg_mean_squared_error", cv=10)
    rmse_scores = np.sqrt(-scores)
    return rmse_scores,rmse_scores.std(),rmse_scores.mean()
  def ensembelMethods(self,predictedValues):
    """ this method will get a dataframe of predicted values by diffrents classifier and will return 
    the value compute by  a linear regression between the 3 values and RMSE
    """
    stacker= LinearRegression(normalize=True)
    stacker.fit(predictedValues.drop(labels="RealValue",axis=1), predictedValues.RealValue)
    finalPredict=stacker.predict(predictedValues.drop(labels="RealValue",axis=1))
    predictedValues.loc[:,'finalPredict']=finalPredict
    rmseEnsemble=np.sqrt(mean_squared_error(predictedValues.RealValue, finalPredict))
    return predictedValues, rmseEnsemble
  def saveModels(departement,self):
    """after all job we will save the models"""
    for reg in self.predictiveModels.values():
      name=departement+reg.__class__.__name__
      joblib.dump(reg, "../predictivesModels/"+name+".pkl")

In [18]:
def ConvertCat(dataset, catCol,numCol):
    """this function will binarize a dataset given in parametrer and return the dataset with categorical columns binarise by one-hot encoding"""
    enc= LabelBinarizer()
    X_train_1=dataset[catCol]
    X=dataset[numCol]
    catCol=X_train_1.columns
    for col in catCol:
        data=dataset[[col]]
        enc.fit(data)
        # Fitting One Hot Encoding on train data
        temp = enc.transform(dataset[[col]])
        # Changing the encoded features into a data frame with new column names
        temp=pd.DataFrame(temp,columns=[ col + '-' +i for i in data[col].value_counts().index])
        # In side by side concatenation index values should be same
        # Setting the index values similar to the X_train data frame
        temp=temp.set_index(dataset.index)
        # adding the new One Hot Encoded varibales to the train data frame
        X=pd.merge(temp,X,right_index=True,left_index=True)
    return X

In [7]:
dataset=pd.read_csv("../dataset/DatasetFinalV1.csv",index_col="Unnamed: 0")
dataset.set_index(keys='ID',inplace=True)
datasetCGPA=dataset[['DIPPERC', u'SCHOOL_RIGHT', u'OPTION_RIGHT', u'FAC' , u'CGPA','DistinctionRatio','EchecRatio','Pass1stSessionRatio']]

In [69]:
droit=datasetCGPA.loc[datasetCGPA.FAC=='FD']

In [108]:
droitModel=PredictiveModelByilding(dataset=droit,encoderFunction=ConvertCat)

In [109]:
droitModel.scale(['CGPA','DIPPERC'])

In [110]:
describeTrain,describeTest=droitModel.split()

In [111]:
droitModel.X_test.shape

(180, 297)

In [112]:
describeTest

Unnamed: 0,DIPPERC,CGPA
count,716.0,716.0
mean,0.560793,0.580058
std,0.05189,0.069535
min,0.5,0.364
25%,0.52,0.55
50%,0.55,0.591
75%,0.59,0.6215
max,0.78,0.781


In [113]:
describeTrain

Unnamed: 0,DIPPERC,CGPA
count,716.0,716.0
mean,0.560793,0.580058
std,0.05189,0.069535
min,0.5,0.364
25%,0.52,0.55
50%,0.55,0.591
75%,0.59,0.6215
max,0.78,0.781


In [114]:
predictedValues=droitModel.train()

[LibSVM]

In [79]:
droitModel.predictiveModels.keys()

['ElasticNet', 'SVR', 'LinearSVR', 'Ridge', 'Lasso']

In [115]:
predictedValues.head(10)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10709,0.56916,0.56916,0.553602,0.563864,0.554185,0.562
4026,0.533646,0.533646,0.516206,0.543356,0.550057,0.597
8307,0.548312,0.548312,0.545496,0.555175,0.569755,0.638
7713,0.649854,0.649854,0.680102,0.625937,0.556364,0.61775
8335,0.564797,0.564797,0.572836,0.560507,0.531529,0.6035
12196,0.561756,0.561756,0.560619,0.56063,0.548114,0.42
9004,0.54959,0.54959,0.546518,0.552067,0.54499,0.48
10192,0.585291,0.585291,0.582199,0.576354,0.544221,0.59
12000,0.496352,0.496352,0.482219,0.513912,0.537509,0.42
11373,0.561155,0.561155,0.579594,0.552852,0.549012,0.606


In [116]:
RMSE={}
for reg in droitModel.predictiveModels.keys():
    rmse=droitModel.evaluate(model=reg,on='train') #RMSE of each model
    RMSE[reg]=[rmse,rmse*100/droitModel.dataset_bin.CGPA.mean()]

('Predictions:\t', array([ 0.56915998,  0.53364647,  0.54831246,  0.64985428,  0.56479681]))
('Labels:\t\t', [0.56199998855600009, 0.59700000762899996, 0.63799999237100002, 0.61774999618500004, 0.60350000381499991])
('Predictions:\t', array([ 0.55418524,  0.55005711,  0.56975511,  0.55636392,  0.53152906]))
('Labels:\t\t', [0.56199998855600009, 0.59700000762899996, 0.63799999237100002, 0.61774999618500004, 0.60350000381499991])
('Predictions:\t', array([ 0.5536023 ,  0.51620636,  0.54549554,  0.68010249,  0.57283626]))
('Labels:\t\t', [0.56199998855600009, 0.59700000762899996, 0.63799999237100002, 0.61774999618500004, 0.60350000381499991])
('Predictions:\t', array([ 0.56386402,  0.54335583,  0.55517475,  0.62593662,  0.56050673]))
('Labels:\t\t', [0.56199998855600009, 0.59700000762899996, 0.63799999237100002, 0.61774999618500004, 0.60350000381499991])
('Predictions:\t', array([ 0.56915998,  0.53364647,  0.54831246,  0.64985428,  0.56479681]))
('Labels:\t\t', [0.56199998855600009, 0.597

In [117]:
RMSE

{'ElasticNet': [0.048511380800375606, 8.3608563879177407],
 'Lasso': [0.048511380800375606, 8.3608563879177407],
 'LinearSVR': [0.051960828817335478, 8.9553630585496737],
 'Ridge': [0.052126970689201822, 8.9839973358630818],
 'SVR': [0.066878676691107516, 11.526429510000717]}

Let try cross validation

In [118]:
CVScore={}
for reg in droitModel.predictiveModels.keys():
    scores,Sstd,Smean=droitModel.crossEvaluate(model=reg) #RMSE of each model
    CVScore[reg]=[Smean,Smean*100/droitModel.dataset_bin.CGPA.mean()]

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

In [119]:
CVScore

{'ElasticNet': [0.070013030408387078, 12.066630198308225],
 'Lasso': [0.070013030408387078, 12.066630198308225],
 'LinearSVR': [0.075690657044111043, 13.045159775127384],
 'Ridge': [0.066934950459060144, 11.536128200998506],
 'SVR': [0.068529975364412049, 11.811028110025546]}

Appres les scrores sur la validation croisée essayons de combinner les methodes par une rgression lineaire

In [54]:
from sklearn.model_selection import GridSearchCV
param_grid ={'alpha':[1e-5,1e-4, 1e-3,1e-2,0, 1, 5, 10],'selection':['cyclic','random']}
grid_search = GridSearchCV(droitModel.predictiveModels['Lasso'], param_grid, cv=5,scoring='neg_mean_squared_error',verbose=5)
grid_search.fit(droitModel.X_train,droitModel.Y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] alpha=1e-05, selection=cyclic ...................................
[CV] ... alpha=1e-05, selection=cyclic, score=-0.005322, total=   0.3s
[CV] alpha=1e-05, selection=cyclic ...................................
[CV] ... alpha=1e-05, selection=cyclic, score=-0.004703, total=   0.1s
[CV] alpha=1e-05, selection=cyclic ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s


[CV] ... alpha=1e-05, selection=cyclic, score=-0.005221, total=   0.3s
[CV] alpha=1e-05, selection=cyclic ...................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.7s remaining:    0.0s


[CV] ... alpha=1e-05, selection=cyclic, score=-0.005055, total=   0.2s
[CV] alpha=1e-05, selection=cyclic ...................................
[CV] ... alpha=1e-05, selection=cyclic, score=-0.004535, total=   0.1s
[CV] alpha=1e-05, selection=random ...................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.0s remaining:    0.0s


[CV] ... alpha=1e-05, selection=random, score=-0.005322, total=   0.4s
[CV] alpha=1e-05, selection=random ...................................
[CV] ... alpha=1e-05, selection=random, score=-0.004703, total=   0.3s
[CV] alpha=1e-05, selection=random ...................................
[CV] ... alpha=1e-05, selection=random, score=-0.005229, total=   0.2s
[CV] alpha=1e-05, selection=random ...................................
[CV] ... alpha=1e-05, selection=random, score=-0.005055, total=   0.1s
[CV] alpha=1e-05, selection=random ...................................
[CV] ... alpha=1e-05, selection=random, score=-0.004535, total=   0.2s
[CV] alpha=0.0001, selection=cyclic ..................................
[CV] .. alpha=0.0001, selection=cyclic, score=-0.004921, total=   0.0s
[CV] alpha=0.0001, selection=cyclic ..................................
[CV] .. alpha=0.0001, selection=cyclic, score=-0.004540, total=   0.1s
[CV] alpha=0.0001, selection=cyclic ..................................
[CV] .

  estimator.fit(X_train, y_train, **fit_params)
  positive)


[CV] ....... alpha=0, selection=cyclic, score=-0.005376, total=   4.1s
[CV] alpha=0, selection=cyclic .......................................
[CV] ....... alpha=0, selection=cyclic, score=-0.004736, total=   5.0s
[CV] alpha=0, selection=cyclic .......................................
[CV] ....... alpha=0, selection=cyclic, score=-0.005348, total=   5.0s
[CV] alpha=0, selection=cyclic .......................................
[CV] ....... alpha=0, selection=cyclic, score=-0.005173, total=   4.8s
[CV] alpha=0, selection=cyclic .......................................
[CV] ....... alpha=0, selection=cyclic, score=-0.004637, total=   4.7s
[CV] alpha=0, selection=random .......................................
[CV] ....... alpha=0, selection=random, score=-0.005372, total=   3.8s
[CV] alpha=0, selection=random .......................................
[CV] ....... alpha=0, selection=random, score=-0.004732, total=   4.9s
[CV] alpha=0, selection=random .......................................
[CV] .

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:   51.7s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=10000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': [1e-05, 0.0001, 0.001, 0.01, 0, 1, 5, 10], 'selection': ['cyclic', 'random']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=5)

In [55]:
cvres = grid_search.cv_results_
for mean_score, params in zip(sorted(cvres["mean_test_score"],reverse=True), cvres["params"]):
    print(np.sqrt(-mean_score), params)

(0.0672890169473533, {'alpha': 1e-05, 'selection': 'cyclic'})
(0.067290284332038222, {'alpha': 1e-05, 'selection': 'random'})
(0.067831595702809055, {'alpha': 0.0001, 'selection': 'cyclic'})
(0.067831710365960446, {'alpha': 0.0001, 'selection': 'random'})
(0.06957552507642617, {'alpha': 0.001, 'selection': 'cyclic'})
(0.06957552507642617, {'alpha': 0.001, 'selection': 'random'})
(0.06957552507642617, {'alpha': 0.01, 'selection': 'cyclic'})
(0.06957552507642617, {'alpha': 0.01, 'selection': 'random'})
(0.06957552507642617, {'alpha': 0, 'selection': 'cyclic'})
(0.06957552507642617, {'alpha': 0, 'selection': 'random'})
(0.06957552507642617, {'alpha': 1, 'selection': 'cyclic'})
(0.06957552507642617, {'alpha': 1, 'selection': 'random'})
(0.070481873651514551, {'alpha': 5, 'selection': 'cyclic'})
(0.070493598520100439, {'alpha': 5, 'selection': 'random'})
(0.071009492129496574, {'alpha': 10, 'selection': 'cyclic'})
(0.071094610279614737, {'alpha': 10, 'selection': 'random'})


Nous venons de voir qu'avec un alpha de 1e-05 notre modele dispose d'un bon score

In [56]:
from sklearn.model_selection import GridSearchCV
param_grid ={'alpha':[1e-5,1e-4, 1e-3,1e-2,0, 1, 5, 10],'selection':['cyclic','random'],'l1_ratio':[1,0.5,1.0/3.0,0.25,0.2]}
grid_search = GridSearchCV(droitModel.predictiveModels['ElasticNet'], param_grid, cv=5,scoring='neg_mean_squared_error',verbose=5)
grid_search.fit(droitModel.X_train,droitModel.Y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV] alpha=1e-05, selection=cyclic, l1_ratio=1 .......................
[CV]  alpha=1e-05, selection=cyclic, l1_ratio=1, score=-0.005322, total=   0.4s
[CV] alpha=1e-05, selection=cyclic, l1_ratio=1 .......................
[CV]  alpha=1e-05, selection=cyclic, l1_ratio=1, score=-0.004703, total=   0.1s
[CV] alpha=1e-05, selection=cyclic, l1_ratio=1 .......................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


[CV]  alpha=1e-05, selection=cyclic, l1_ratio=1, score=-0.005221, total=   0.3s
[CV] alpha=1e-05, selection=cyclic, l1_ratio=1 .......................
[CV]  alpha=1e-05, selection=cyclic, l1_ratio=1, score=-0.005055, total=   0.1s
[CV] alpha=1e-05, selection=cyclic, l1_ratio=1 .......................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.9s remaining:    0.0s


[CV]  alpha=1e-05, selection=cyclic, l1_ratio=1, score=-0.004535, total=   0.2s
[CV] alpha=1e-05, selection=random, l1_ratio=1 .......................
[CV]  alpha=1e-05, selection=random, l1_ratio=1, score=-0.005322, total=   0.4s
[CV] alpha=1e-05, selection=random, l1_ratio=1 .......................
[CV]  alpha=1e-05, selection=random, l1_ratio=1, score=-0.004703, total=   0.2s
[CV] alpha=1e-05, selection=random, l1_ratio=1 .......................
[CV]  alpha=1e-05, selection=random, l1_ratio=1, score=-0.005173, total=   0.1s
[CV] alpha=1e-05, selection=random, l1_ratio=1 .......................
[CV]  alpha=1e-05, selection=random, l1_ratio=1, score=-0.005055, total=   0.2s
[CV] alpha=1e-05, selection=random, l1_ratio=1 .......................
[CV]  alpha=1e-05, selection=random, l1_ratio=1, score=-0.004530, total=   0.2s
[CV] alpha=1e-05, selection=cyclic, l1_ratio=0.5 .....................
[CV]  alpha=1e-05, selection=cyclic, l1_ratio=0.5, score=-0.005359, total=   0.3s
[CV] alpha=1

[CV]  alpha=0.0001, selection=random, l1_ratio=1, score=-0.004118, total=   0.1s
[CV] alpha=0.0001, selection=random, l1_ratio=1 ......................
[CV]  alpha=0.0001, selection=random, l1_ratio=1, score=-0.004132, total=   0.0s
[CV] alpha=0.0001, selection=cyclic, l1_ratio=0.5 ....................
[CV]  alpha=0.0001, selection=cyclic, l1_ratio=0.5, score=-0.005064, total=   0.0s
[CV] alpha=0.0001, selection=cyclic, l1_ratio=0.5 ....................
[CV]  alpha=0.0001, selection=cyclic, l1_ratio=0.5, score=-0.004569, total=   0.1s
[CV] alpha=0.0001, selection=cyclic, l1_ratio=0.5 ....................
[CV]  alpha=0.0001, selection=cyclic, l1_ratio=0.5, score=-0.004961, total=   0.1s
[CV] alpha=0.0001, selection=cyclic, l1_ratio=0.5 ....................
[CV]  alpha=0.0001, selection=cyclic, l1_ratio=0.5, score=-0.004427, total=   0.1s
[CV] alpha=0.0001, selection=cyclic, l1_ratio=0.5 ....................
[CV]  alpha=0.0001, selection=cyclic, l1_ratio=0.5, score=-0.004264, total=   0.

[CV]  alpha=0.001, selection=random, l1_ratio=0.5, score=-0.003426, total=   0.0s
[CV] alpha=0.001, selection=random, l1_ratio=0.5 .....................
[CV]  alpha=0.001, selection=random, l1_ratio=0.5, score=-0.004037, total=   0.0s
[CV] alpha=0.001, selection=cyclic, l1_ratio=0.333333333333 ..........
[CV]  alpha=0.001, selection=cyclic, l1_ratio=0.333333333333, score=-0.005115, total=   0.0s
[CV] alpha=0.001, selection=cyclic, l1_ratio=0.333333333333 ..........
[CV]  alpha=0.001, selection=cyclic, l1_ratio=0.333333333333, score=-0.004749, total=   0.0s
[CV] alpha=0.001, selection=cyclic, l1_ratio=0.333333333333 ..........
[CV]  alpha=0.001, selection=cyclic, l1_ratio=0.333333333333, score=-0.005021, total=   0.0s
[CV] alpha=0.001, selection=cyclic, l1_ratio=0.333333333333 ..........
[CV]  alpha=0.001, selection=cyclic, l1_ratio=0.333333333333, score=-0.003419, total=   0.0s
[CV] alpha=0.001, selection=cyclic, l1_ratio=0.333333333333 ..........
[CV]  alpha=0.001, selection=cyclic, l

[CV]  alpha=0.01, selection=cyclic, l1_ratio=0.25, score=-0.005294, total=   0.0s
[CV] alpha=0.01, selection=cyclic, l1_ratio=0.25 .....................
[CV]  alpha=0.01, selection=cyclic, l1_ratio=0.25, score=-0.005099, total=   0.0s
[CV] alpha=0.01, selection=cyclic, l1_ratio=0.25 .....................
[CV]  alpha=0.01, selection=cyclic, l1_ratio=0.25, score=-0.005130, total=   0.0s
[CV] alpha=0.01, selection=cyclic, l1_ratio=0.25 .....................
[CV]  alpha=0.01, selection=cyclic, l1_ratio=0.25, score=-0.003482, total=   0.0s
[CV] alpha=0.01, selection=cyclic, l1_ratio=0.25 .....................
[CV]  alpha=0.01, selection=cyclic, l1_ratio=0.25, score=-0.004218, total=   0.0s
[CV] alpha=0.01, selection=random, l1_ratio=0.25 .....................
[CV]  alpha=0.01, selection=random, l1_ratio=0.25, score=-0.005294, total=   0.0s
[CV] alpha=0.01, selection=random, l1_ratio=0.25 .....................
[CV]  alpha=0.01, selection=random, l1_ratio=0.25, score=-0.005098, total=   0.0s


[CV]  alpha=0, selection=cyclic, l1_ratio=0.25, score=-0.004637, total=   6.1s
[CV] alpha=0, selection=random, l1_ratio=0.25 ........................
[CV]  alpha=0, selection=random, l1_ratio=0.25, score=-0.005372, total=   4.1s
[CV] alpha=0, selection=random, l1_ratio=0.25 ........................
[CV]  alpha=0, selection=random, l1_ratio=0.25, score=-0.004767, total=   5.0s
[CV] alpha=0, selection=random, l1_ratio=0.25 ........................
[CV]  alpha=0, selection=random, l1_ratio=0.25, score=-0.005285, total=   5.7s
[CV] alpha=0, selection=random, l1_ratio=0.25 ........................
[CV]  alpha=0, selection=random, l1_ratio=0.25, score=-0.005162, total=   5.6s
[CV] alpha=0, selection=random, l1_ratio=0.25 ........................
[CV]  alpha=0, selection=random, l1_ratio=0.25, score=-0.004676, total=   5.2s
[CV] alpha=0, selection=cyclic, l1_ratio=0.2 .........................
[CV]  alpha=0, selection=cyclic, l1_ratio=0.2, score=-0.005376, total=   4.0s
[CV] alpha=0, selectio

[CV]  alpha=1, selection=cyclic, l1_ratio=0.2, score=-0.005463, total=   0.0s
[CV] alpha=1, selection=cyclic, l1_ratio=0.2 .........................
[CV]  alpha=1, selection=cyclic, l1_ratio=0.2, score=-0.005285, total=   0.0s
[CV] alpha=1, selection=cyclic, l1_ratio=0.2 .........................
[CV]  alpha=1, selection=cyclic, l1_ratio=0.2, score=-0.003664, total=   0.0s
[CV] alpha=1, selection=cyclic, l1_ratio=0.2 .........................
[CV]  alpha=1, selection=cyclic, l1_ratio=0.2, score=-0.004268, total=   0.0s
[CV] alpha=1, selection=random, l1_ratio=0.2 .........................
[CV]  alpha=1, selection=random, l1_ratio=0.2, score=-0.005519, total=   0.0s
[CV] alpha=1, selection=random, l1_ratio=0.2 .........................
[CV]  alpha=1, selection=random, l1_ratio=0.2, score=-0.005463, total=   0.0s
[CV] alpha=1, selection=random, l1_ratio=0.2 .........................
[CV]  alpha=1, selection=random, l1_ratio=0.2, score=-0.005285, total=   0.0s
[CV] alpha=1, selection=rand

[CV]  alpha=10, selection=random, l1_ratio=1, score=-0.005519, total=   0.0s
[CV] alpha=10, selection=random, l1_ratio=1 ..........................
[CV]  alpha=10, selection=random, l1_ratio=1, score=-0.005463, total=   0.0s
[CV] alpha=10, selection=random, l1_ratio=1 ..........................
[CV]  alpha=10, selection=random, l1_ratio=1, score=-0.005285, total=   0.0s
[CV] alpha=10, selection=random, l1_ratio=1 ..........................
[CV]  alpha=10, selection=random, l1_ratio=1, score=-0.003664, total=   0.0s
[CV] alpha=10, selection=random, l1_ratio=1 ..........................
[CV]  alpha=10, selection=random, l1_ratio=1, score=-0.004268, total=   0.0s
[CV] alpha=10, selection=cyclic, l1_ratio=0.5 ........................
[CV]  alpha=10, selection=cyclic, l1_ratio=0.5, score=-0.005519, total=   0.0s
[CV] alpha=10, selection=cyclic, l1_ratio=0.5 ........................
[CV]  alpha=10, selection=cyclic, l1_ratio=0.5, score=-0.005463, total=   0.0s
[CV] alpha=10, selection=cyclic

[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:  4.6min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=ElasticNet(alpha=0.001, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=10000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': [1e-05, 0.0001, 0.001, 0.01, 0, 1, 5, 10], 'selection': ['cyclic', 'random'], 'l1_ratio': [1, 0.5, 0.3333333333333333, 0.25, 0.2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=5)

In [57]:
cvres = grid_search.cv_results_
for mean_score, params in zip(sorted(cvres["mean_test_score"],reverse=True), cvres["params"]):
    print(np.sqrt(-mean_score), params)

(0.06667432276156228, {'alpha': 1e-05, 'selection': 'cyclic', 'l1_ratio': 1})
(0.066674559842560432, {'alpha': 1e-05, 'selection': 'random', 'l1_ratio': 1})
(0.066684532520481191, {'alpha': 1e-05, 'selection': 'cyclic', 'l1_ratio': 0.5})
(0.066685195121775606, {'alpha': 1e-05, 'selection': 'random', 'l1_ratio': 0.5})
(0.066752183580841659, {'alpha': 1e-05, 'selection': 'cyclic', 'l1_ratio': 0.3333333333333333})
(0.066752199918334462, {'alpha': 1e-05, 'selection': 'random', 'l1_ratio': 0.3333333333333333})
(0.067288550808824205, {'alpha': 1e-05, 'selection': 'cyclic', 'l1_ratio': 0.25})
(0.0672890169473533, {'alpha': 1e-05, 'selection': 'random', 'l1_ratio': 0.25})
(0.067373190313279152, {'alpha': 1e-05, 'selection': 'cyclic', 'l1_ratio': 0.2})
(0.067373433821402373, {'alpha': 1e-05, 'selection': 'random', 'l1_ratio': 0.2})
(0.067827927937534477, {'alpha': 0.0001, 'selection': 'cyclic', 'l1_ratio': 1})
(0.067831595702809055, {'alpha': 0.0001, 'selection': 'random', 'l1_ratio': 1})
(0.06

Grace à ces recherches nous avons pu evaluer nos paramentres sans problèmes

effectuons une evaluation sur le test set

In [120]:
RMSE={}
for reg in droitModel.predictiveModels.keys():
    rmse=droitModel.evaluate(model=reg,on='test') #RMSE of each model
    RMSE[reg]=[rmse,rmse*100/droitModel.dataset_bin.CGPA.mean()]

('Predictions:\t', array([ 0.57183083,  0.58105636,  0.66754704,  0.58321565,  0.59821154]))
('Labels:\t\t', [0.5575, 0.54700000762899992, 0.41999999999999998, 0.55000000000000004, 0.61099998474099992])
('Predictions:\t', array([ 0.55444147,  0.54409263,  0.56064041,  0.55469773,  0.55533849]))
('Labels:\t\t', [0.5575, 0.54700000762899992, 0.41999999999999998, 0.55000000000000004, 0.61099998474099992])
('Predictions:\t', array([ 0.56192467,  0.5751141 ,  0.63554278,  0.56171098,  0.59237165]))
('Labels:\t\t', [0.5575, 0.54700000762899992, 0.41999999999999998, 0.55000000000000004, 0.61099998474099992])
('Predictions:\t', array([ 0.56940936,  0.57638618,  0.63724683,  0.57214953,  0.58017637]))
('Labels:\t\t', [0.5575, 0.54700000762899992, 0.41999999999999998, 0.55000000000000004, 0.61099998474099992])
('Predictions:\t', array([ 0.57183083,  0.58105636,  0.66754704,  0.58321565,  0.59821154]))
('Labels:\t\t', [0.5575, 0.54700000762899992, 0.41999999999999998, 0.55000000000000004, 0.61099

In [121]:
RMSE

{'ElasticNet': [0.074235002741814035, 12.794280159433873],
 'Lasso': [0.074235002741814035, 12.794280159433873],
 'LinearSVR': [0.074014355508153801, 12.756251973004467],
 'Ridge': [0.071466206449127365, 12.317082689714084],
 'SVR': [0.072145106955878877, 12.434090071177653]}

Essayons now ls combinaison de plusieurs methodes

In [131]:
finalPredict,rmseFinal=ensembelMethods(self=droitModel,predictedValues=predictedValues)

In [132]:
finalPredict.head(5)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue,finalPredict
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10709,0.56916,0.56916,0.553602,0.563864,0.554185,0.562,0.570502
4026,0.533646,0.533646,0.516206,0.543356,0.550057,0.597,0.53401
8307,0.548312,0.548312,0.545496,0.555175,0.569755,0.638,0.546321
7713,0.649854,0.649854,0.680102,0.625937,0.556364,0.61775,0.654201
8335,0.564797,0.564797,0.572836,0.560507,0.531529,0.6035,0.565973


In [133]:
rmseFinal

0.048415680580131328

In [134]:
rmseFinal*100/droitModel.dataset_bin.CGPA.mean()

8.3443626129611577

Nous venons de voir qu'on obtient  un score dans le 8.5% pres en combinant les differentes score par une regression lineaire

sur l'ensemble d'apprentissage gobale

essayons sur le test set enfin

In [135]:
predictionTest=droitModel.predictTest()

In [139]:
predictionTest.head(1)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue,finalPredict
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
7825,0.571831,0.571831,0.561925,0.569409,0.554441,0.5575,0.568794


In [136]:
finalPredictTes,rmseFinalTest=ensembelMethods(self=droitModel,predictedValues=predictionTest)

In [144]:
rmseFinalTest*100/droitModel.Y_train.mean()

11.769696439697203

Nous voici arriver à la fin de l'entrainement de nos modeles en faculté de droitm'

##### 3. Facuté de santé et devllopement Communautaire

In [150]:
sante=datasetCGPA.loc[datasetCGPA.FAC=='FSDC']

In [175]:
sante.head(5)

Unnamed: 0_level_0,DIPPERC,SCHOOL_RIGHT,OPTION_RIGHT,FAC,CGPA,DistinctionRatio,EchecRatio,Pass1stSessionRatio
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
7365,62.0,zanner,latin philo,FSDC,63.4,G,G,A
11712,54.0,zanner,latin philo,FSDC,60.0,G,G,A
11862,54.0,mululusake,sociale,FSDC,61.900002,G,G,G
7003,50.0,ibanda,bio-chimie,FSDC,62.700001,G,G,G
7354,62.0,ibanda,commmerciale et adm,FSDC,42.0,G,A,G


In [176]:
santeModel=PredictiveModelByilding(dataset=sante,encoderFunction=ConvertCat)

In [177]:
santeModel.scale(['DIPPERC','CGPA'])

In [178]:
trainDes,tesDes=santeModel.split()

In [179]:
trainDes

Unnamed: 0,DIPPERC,CGPA
count,606.0,606.0
mean,0.553841,0.589861
std,0.048872,0.066621
min,0.5,0.4
25%,0.52,0.573
50%,0.54,0.606
75%,0.58,0.631625
max,0.76,0.724333


In [180]:
tesDes

Unnamed: 0,DIPPERC,CGPA
count,152.0,152.0
mean,0.551102,0.586449
std,0.050482,0.059923
min,0.5,0.4
25%,0.51,0.56025
50%,0.54,0.600417
75%,0.58,0.628333
max,0.77,0.687333


In [181]:
predictedValues=santeModel.train()

[LibSVM]

In [182]:
predictedValues.head(5)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7417,0.633946,0.633946,0.712691,0.620563,0.556396,0.585333
8372,0.64182,0.64182,0.6406,0.640527,0.584777,0.645667
9322,0.619607,0.619607,0.587724,0.597814,0.549957,0.625667
11413,0.563453,0.563453,0.604019,0.56146,0.539789,0.574
6746,0.63094,0.63094,0.600488,0.60448,0.546782,0.637


In [183]:
RMSE={}
for reg in santeModel.predictiveModels.keys():
    rmse=santeModel.evaluate(model=reg,on='train') #RMSE of each model
    RMSE[reg]=[rmse,rmse*100/santeModel.dataset_bin.CGPA.mean()]

('Predictions:\t', array([ 0.63394575,  0.64181988,  0.61960702,  0.56345318,  0.63094036]))
('Labels:\t\t', [0.58533334096300005, 0.64566666921000004, 0.62566666921000003, 0.57400001525900002, 0.637000007629])
('Predictions:\t', array([ 0.55639553,  0.58477657,  0.54995735,  0.53978909,  0.54678191]))
('Labels:\t\t', [0.58533334096300005, 0.64566666921000004, 0.62566666921000003, 0.57400001525900002, 0.637000007629])
('Predictions:\t', array([ 0.71269094,  0.64060031,  0.5877242 ,  0.6040193 ,  0.60048801]))
('Labels:\t\t', [0.58533334096300005, 0.64566666921000004, 0.62566666921000003, 0.57400001525900002, 0.637000007629])
('Predictions:\t', array([ 0.6205627 ,  0.64052699,  0.59781433,  0.56145965,  0.60448013]))
('Labels:\t\t', [0.58533334096300005, 0.64566666921000004, 0.62566666921000003, 0.57400001525900002, 0.637000007629])
('Predictions:\t', array([ 0.63394575,  0.64181988,  0.61960702,  0.56345318,  0.63094036]))
('Labels:\t\t', [0.58533334096300005, 0.64566666921000004, 0.62

In [184]:
RMSE

{'ElasticNet': [0.048506383027202178, 8.2329050435506357],
 'Lasso': [0.048506383027202178, 8.2329050435506357],
 'LinearSVR': [0.053311148653203064, 9.0484096573083317],
 'Ridge': [0.051086773887118013, 8.6708703503643676],
 'SVR': [0.069900647273839767, 11.864116752748179]}

on remarque aisement que les valeurs predites par differents regressons disposent d'une bonne exacitude

In [185]:
CVScore={}
for reg in santeModel.predictiveModels.keys():
    scores,Sstd,Smean=santeModel.crossEvaluate(model=reg) #RMSE of each model
    CVScore[reg]=[Smean,Smean*100/santeModel.dataset_bin.CGPA.mean()]

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

In [186]:
CVScore

{'ElasticNet': [0.069636656330641075, 11.819310023570015],
 'Lasso': [0.069636656330641075, 11.819310023570015],
 'LinearSVR': [0.07373834926024557, 12.515483316645678],
 'Ridge': [0.065243038982058466, 11.073588900469739],
 'SVR': [0.071598311071310761, 12.152258312027383]}

Aussi on peut remarquer les methodes lineaires disposent des meilleurs resulats

In [187]:
RMSE={}
for reg in santeModel.predictiveModels.keys():
    rmse=santeModel.evaluate(model=reg,on='test') #RMSE of each model
    RMSE[reg]=[rmse,rmse*100/santeModel.dataset_bin.CGPA.mean()]

('Predictions:\t', array([ 0.6442474 ,  0.59141372,  0.59097368,  0.60862079,  0.57945642]))
('Labels:\t\t', [0.65099998474100007, 0.51899999618500003, 0.62200000762899998, 0.61400001525900005, 0.55000000000000004])
('Predictions:\t', array([ 0.54720662,  0.54770263,  0.55064107,  0.55161345,  0.54692344]))
('Labels:\t\t', [0.65099998474100007, 0.51899999618500003, 0.62200000762899998, 0.61400001525900005, 0.55000000000000004])
('Predictions:\t', array([ 0.63506558,  0.63154406,  0.59812308,  0.64795774,  0.54048096]))
('Labels:\t\t', [0.65099998474100007, 0.51899999618500003, 0.62200000762899998, 0.61400001525900005, 0.55000000000000004])
('Predictions:\t', array([ 0.61248734,  0.58607667,  0.58714316,  0.59857347,  0.57376314]))
('Labels:\t\t', [0.65099998474100007, 0.51899999618500003, 0.62200000762899998, 0.61400001525900005, 0.55000000000000004])
('Predictions:\t', array([ 0.6442474 ,  0.59141372,  0.59097368,  0.60862079,  0.57945642]))
('Labels:\t\t', [0.65099998474100007, 0.518

In [188]:
RMSE

{'ElasticNet': [0.065266413026714515, 11.077556136906004],
 'Lasso': [0.065266413026714515, 11.077556136906004],
 'LinearSVR': [0.069746792001455005, 11.838003161733097],
 'Ridge': [0.062882899840641562, 10.673006539382214],
 'SVR': [0.067167938056348866, 11.40029871282916]}

In [167]:
predictedValues.columns

Index([u'ElasticNet', u'Lasso', u'LinearSVR', u'Ridge', u'SVR', u'RealValue'], dtype='object')

In [189]:
finalPred,finalRMSE=santeModel.ensembelMethods(predictedValues=predictedValues[[u'ElasticNet', u'Lasso', u'Ridge',  u'RealValue']])

In [191]:
finalPred

Unnamed: 0_level_0,ElasticNet,Lasso,Ridge,RealValue,finalPredict
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7417,0.633946,0.633946,0.620563,0.585333,0.633389
8372,0.641820,0.641820,0.640527,0.645667,0.646837
9322,0.619607,0.619607,0.597814,0.625667,0.620244
11413,0.563453,0.563453,0.561460,0.574000,0.561708
6746,0.630940,0.630940,0.604480,0.637000,0.635142
11635,0.622717,0.622717,0.621845,0.637000,0.625436
3137,0.662494,0.662494,0.655785,0.652000,0.668611
10229,0.572009,0.572009,0.571881,0.617000,0.576197
9568,0.589936,0.589936,0.587578,0.420000,0.597923
5354,0.600245,0.600245,0.604427,0.470000,0.596086


In [193]:
finalRMSE*100/santeModel.dataset_bin.CGPA.mean()

8.2338052872006031

On remarque que notre ensemble methode sans modele avec SVM nous donne un score de 8% en valdiation croisé voyons combien il nous donnerasavec l'ensemble d'evaluation

In [194]:
predictionTest=santeModel.predictTest()

In [195]:
predictionTest.head(1)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4895,0.644247,0.644247,0.635066,0.612487,0.547207,0.651


In [196]:
finalPredictTes,rmseFinalTest=ensembelMethods(self=droitModel,predictedValues=predictionTest[[u'ElasticNet', u'Lasso', u'Ridge',  u'RealValue']])

In [197]:
rmseFinalTest*100/santeModel.Y_train.mean()

9.9625467598016613

Nous avons un resulat de 9.9 % sur notre ensemble d'evaluation wouhhhhh........

In [202]:
saveModels(self=santeModel,departement='Sante')

##### 3. Facuté de Pscologie

In [203]:
psyco=datasetCGPA.loc[datasetCGPA.FAC=='FPSE']

In [204]:
psyco.head(5)

Unnamed: 0_level_0,DIPPERC,SCHOOL_RIGHT,OPTION_RIGHT,FAC,CGPA,DistinctionRatio,EchecRatio,Pass1stSessionRatio
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
11347,55.0,mululusake,pedagogie,FPSE,46.0,G,A,G
7026,60.0,butembo,pedagogie,FPSE,48.950001,G,D,G
8487,57.0,butembo,pedagogie,FPSE,59.6,G,G,F
10434,59.0,butembo,pedagogie,FPSE,59.700001,G,G,A
7487,52.0,chemchem,coupe couture,FPSE,55.15,G,D,G


In [214]:
psyco.EchecRatio.value_counts()

G    180
A     30
D     12
C      5
Name: EchecRatio, dtype: int64

In [216]:
psycoModel=PredictiveModelByilding(dataset=psyco,encoderFunction=ConvertCat)

In [217]:
psycoModel.scale(['DIPPERC','CGPA'])

In [218]:
trainDes,tesDes=psycoModel.split()

In [219]:
trainDes

Unnamed: 0,DIPPERC,CGPA
count,181.0,181.0
mean,0.561264,0.597438
std,0.056302,0.079608
min,0.5,0.4
25%,0.52,0.5595
50%,0.55,0.609
75%,0.59,0.652667
max,0.77,0.78175


In [220]:
tesDes

Unnamed: 0,DIPPERC,CGPA
count,46.0,46.0
mean,0.565,0.588315
std,0.058224,0.066295
min,0.5,0.43
25%,0.51,0.5645
50%,0.56,0.601
75%,0.6,0.627
max,0.75,0.714


In [221]:
predictedValues=psycoModel.train()

[LibSVM]

In [182]:
predictedValues.head(5)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7417,0.633946,0.633946,0.712691,0.620563,0.556396,0.585333
8372,0.64182,0.64182,0.6406,0.640527,0.584777,0.645667
9322,0.619607,0.619607,0.587724,0.597814,0.549957,0.625667
11413,0.563453,0.563453,0.604019,0.56146,0.539789,0.574
6746,0.63094,0.63094,0.600488,0.60448,0.546782,0.637


In [222]:
RMSE={}
for reg in santeModel.predictiveModels.keys():
    rmse=santeModel.evaluate(model=reg,on='train') #RMSE of each model
    RMSE[reg]=[rmse,rmse*100/psycoModel.dataset_bin.CGPA.mean()]

('Predictions:\t', array([ 0.63394575,  0.64181988,  0.61960702,  0.56345318,  0.63094036]))
('Labels:\t\t', [0.58533334096300005, 0.64566666921000004, 0.62566666921000003, 0.57400001525900002, 0.637000007629])
('Predictions:\t', array([ 0.55639553,  0.58477657,  0.54995735,  0.53978909,  0.54678191]))
('Labels:\t\t', [0.58533334096300005, 0.64566666921000004, 0.62566666921000003, 0.57400001525900002, 0.637000007629])
('Predictions:\t', array([ 0.71269094,  0.64060031,  0.5877242 ,  0.6040193 ,  0.60048801]))
('Labels:\t\t', [0.58533334096300005, 0.64566666921000004, 0.62566666921000003, 0.57400001525900002, 0.637000007629])
('Predictions:\t', array([ 0.6205627 ,  0.64052699,  0.59781433,  0.56145965,  0.60448013]))
('Labels:\t\t', [0.58533334096300005, 0.64566666921000004, 0.62566666921000003, 0.57400001525900002, 0.637000007629])
('Predictions:\t', array([ 0.63394575,  0.64181988,  0.61960702,  0.56345318,  0.63094036]))
('Labels:\t\t', [0.58533334096300005, 0.64566666921000004, 0.62

In [223]:
RMSE

{'ElasticNet': [0.048506383027202178, 8.1442683014338755],
 'Lasso': [0.048506383027202178, 8.1442683014338755],
 'LinearSVR': [0.053311148653203064, 8.9509930650946377],
 'Ridge': [0.051086773887118013, 8.5775184053209639],
 'SVR': [0.069900647273839767, 11.73638581798169]}

on remarque aisement que les valeurs predites par differents regressons disposent d'une bonne exacitude

In [226]:
CVScore={}
for reg in santeModel.predictiveModels.keys():
    scores,Sstd,Smean=psycoModel.crossEvaluate(model=reg) #RMSE of each model
    CVScore[reg]=[Smean,Smean*100/psycoModel.dataset_bin.CGPA.mean()]

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

In [227]:
CVScore

{'ElasticNet': [0.078153280788430132, 13.122010911869584],
 'Lasso': [0.078153280788430132, 13.122010911869584],
 'LinearSVR': [0.082076009272530726, 13.780640792194324],
 'Ridge': [0.0735624886756783, 12.351212506608363],
 'SVR': [0.076979183941278617, 12.924878923493804]}

Aussi on peut remarquer les methodes lineaires disposent des meilleurs resulats

In [228]:
RMSE={}
for reg in santeModel.predictiveModels.keys():
    rmse=psycoModel.evaluate(model=reg,on='test') #RMSE of each model
    RMSE[reg]=[rmse,rmse*100/psycoModel.dataset_bin.CGPA.mean()]

('Predictions:\t', array([ 0.54627331,  0.62044522,  0.62659385,  0.60660261,  0.59181471]))
('Labels:\t\t', [0.46999999999999997, 0.62300000508599995, 0.45000000000000001, 0.58999999999999997, 0.59549999237100004])
('Predictions:\t', array([ 0.57635701,  0.57732127,  0.57809299,  0.57635701,  0.57635701]))
('Labels:\t\t', [0.46999999999999997, 0.62300000508599995, 0.45000000000000001, 0.58999999999999997, 0.59549999237100004])
('Predictions:\t', array([ 0.54199208,  0.62736156,  0.63131675,  0.59778709,  0.54941206]))
('Labels:\t\t', [0.46999999999999997, 0.62300000508599995, 0.45000000000000001, 0.58999999999999997, 0.59549999237100004])
('Predictions:\t', array([ 0.56774303,  0.61213123,  0.61719123,  0.60737644,  0.58738797]))
('Labels:\t\t', [0.46999999999999997, 0.62300000508599995, 0.45000000000000001, 0.58999999999999997, 0.59549999237100004])
('Predictions:\t', array([ 0.54627331,  0.62044522,  0.62659385,  0.60660261,  0.59181471]))
('Labels:\t\t', [0.46999999999999997, 0.623

In [229]:
RMSE

{'ElasticNet': [0.071372353655097823, 11.983486732992738],
 'Lasso': [0.071372353655097823, 11.983486732992738],
 'LinearSVR': [0.081538251216321092, 13.690350698031406],
 'Ridge': [0.066752480188447011, 11.207805540478766],
 'SVR': [0.06566283741951609, 11.024853472955733]}

In [167]:
predictedValues.columns

Index([u'ElasticNet', u'Lasso', u'LinearSVR', u'Ridge', u'SVR', u'RealValue'], dtype='object')

In [230]:
finalPred,finalRMSE=santeModel.ensembelMethods(predictedValues=predictedValues[[u'ElasticNet', u'Lasso', u'Ridge',  u'RealValue',u'SVR']])

In [231]:
finalPred

Unnamed: 0_level_0,ElasticNet,Lasso,Ridge,RealValue,SVR,finalPredict
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7044,0.614691,0.614691,0.601469,0.616500,0.576164,0.615258
8518,0.708552,0.708552,0.683582,0.781750,0.627838,0.709026
12160,0.636192,0.636192,0.616022,0.638000,0.577707,0.637065
9227,0.578178,0.578178,0.572074,0.588000,0.583145,0.578105
8997,0.592350,0.592350,0.587194,0.592333,0.584850,0.592344
5031,0.592998,0.592998,0.592967,0.648000,0.583145,0.592927
9826,0.651616,0.651616,0.635800,0.634500,0.584137,0.652386
8578,0.687691,0.687691,0.645101,0.689500,0.589443,0.689239
7004,0.581252,0.581252,0.583207,0.607000,0.576357,0.581189
9947,0.681999,0.681999,0.673437,0.693000,0.620269,0.682056


In [232]:
finalRMSE*100/psycoModel.dataset_bin.CGPA.mean()

6.6010010786396887

On remarque que notre ensemble methode sans modele avec SVM nous donne un score de 6% en valdiation croisé voyons combien il nous donnerasavec l'ensemble d'evaluation

In [233]:
predictionTest=psycoModel.predictTest()

In [234]:
predictionTest.head(1)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
9149,0.546273,0.546273,0.541992,0.567743,0.576357,0.47


In [235]:
finalPredictTes,rmseFinalTest=ensembelMethods(self=droitModel,predictedValues=predictionTest[[u'ElasticNet', u'Lasso', u'Ridge',  u'RealValue']])

In [236]:
rmseFinalTest*100/psycoModel.Y_train.mean()

10.564151678218369

Nous avons un resulat de 10.5 % sur notre ensemble d'evaluation wouhhhhh........

 10.56

Efin atterisson avec la afaculté de theologie

##### 3. Facuté de Pscologie

In [237]:
teologie=datasetCGPA.loc[datasetCGPA.FAC=='FT']

In [239]:
teologie.shape

(140, 8)

In [238]:
teologie.head(5)

Unnamed: 0_level_0,DIPPERC,SCHOOL_RIGHT,OPTION_RIGHT,FAC,CGPA,DistinctionRatio,EchecRatio,Pass1stSessionRatio
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
8692,52.0,mululusake,pedagogie,FT,59.933333,G,G,G
3651,58.0,ibanda,commmerciale et adm,FT,63.400002,G,G,G
11729,60.0,butembo,sociale,FT,64.400002,G,G,G
7512,52.0,ndosho,pedagogie,FT,66.875,F,G,B
10686,52.0,ndosho,pedagogie,FT,57.1,G,G,G


In [244]:
teologie.EchecRatio.value_counts()

G    119
A     14
E      7
Name: EchecRatio, dtype: int64

In [243]:
teologie.EchecRatio.loc[teologie.EchecRatio.isin(['E','D','F'])]='E'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [245]:
teologieModel=PredictiveModelByilding(dataset=teologie,encoderFunction=ConvertCat)

In [246]:
teologieModel.scale(['DIPPERC','CGPA'])

In [247]:
trainDes,tesDes=teologieModel.split()

In [248]:
trainDes

Unnamed: 0,DIPPERC,CGPA
count,112.0,112.0
mean,0.5375,0.618471
std,0.039421,0.062725
min,0.5,0.41
25%,0.51,0.5915
50%,0.52,0.6205
75%,0.5525,0.66
max,0.65,0.7325


In [249]:
tesDes

Unnamed: 0,DIPPERC,CGPA
count,28.0,28.0
mean,0.540714,0.626101
std,0.043879,0.061257
min,0.5,0.43
25%,0.51,0.6065
50%,0.52,0.6335
75%,0.57,0.65425
max,0.69,0.734


In [250]:
predictedValues=teologieModel.train()

[LibSVM]

In [251]:
predictedValues.head(5)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10903,0.59112,0.59112,0.59291,0.598864,0.57806,0.59
12230,0.49112,0.49112,0.497404,0.546722,0.590417,0.49
8319,0.622738,0.622738,0.598529,0.6188,0.577988,0.604667
8692,0.600453,0.600453,0.594256,0.603376,0.578024,0.599333
11528,0.677881,0.677881,0.65485,0.643442,0.579455,0.679


In [252]:
RMSE={}
for reg in santeModel.predictiveModels.keys():
    rmse=santeModel.evaluate(model=reg,on='train') #RMSE of each model
    RMSE[reg]=[rmse,rmse*100/psycoModel.dataset_bin.CGPA.mean()]

('Predictions:\t', array([ 0.63394575,  0.64181988,  0.61960702,  0.56345318,  0.63094036]))
('Labels:\t\t', [0.58533334096300005, 0.64566666921000004, 0.62566666921000003, 0.57400001525900002, 0.637000007629])
('Predictions:\t', array([ 0.55639553,  0.58477657,  0.54995735,  0.53978909,  0.54678191]))
('Labels:\t\t', [0.58533334096300005, 0.64566666921000004, 0.62566666921000003, 0.57400001525900002, 0.637000007629])
('Predictions:\t', array([ 0.71269094,  0.64060031,  0.5877242 ,  0.6040193 ,  0.60048801]))
('Labels:\t\t', [0.58533334096300005, 0.64566666921000004, 0.62566666921000003, 0.57400001525900002, 0.637000007629])
('Predictions:\t', array([ 0.6205627 ,  0.64052699,  0.59781433,  0.56145965,  0.60448013]))
('Labels:\t\t', [0.58533334096300005, 0.64566666921000004, 0.62566666921000003, 0.57400001525900002, 0.637000007629])
('Predictions:\t', array([ 0.63394575,  0.64181988,  0.61960702,  0.56345318,  0.63094036]))
('Labels:\t\t', [0.58533334096300005, 0.64566666921000004, 0.62

In [253]:
RMSE

{'ElasticNet': [0.048506383027202178, 8.1442683014338755],
 'Lasso': [0.048506383027202178, 8.1442683014338755],
 'LinearSVR': [0.053311148653203064, 8.9509930650946377],
 'Ridge': [0.051086773887118013, 8.5775184053209639],
 'SVR': [0.069900647273839767, 11.73638581798169]}

on remarque aisement que les valeurs predites par differents regressons disposent d'une bonne exacitude

In [254]:
CVScore={}
for reg in santeModel.predictiveModels.keys():
    scores,Sstd,Smean=psycoModel.crossEvaluate(model=reg) #RMSE of each model
    CVScore[reg]=[Smean,Smean*100/psycoModel.dataset_bin.CGPA.mean()]

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

In [255]:
CVScore

{'ElasticNet': [0.078153280788430132, 13.122010911869584],
 'Lasso': [0.078153280788430132, 13.122010911869584],
 'LinearSVR': [0.082076009272530726, 13.780640792194324],
 'Ridge': [0.0735624886756783, 12.351212506608363],
 'SVR': [0.076979183941278617, 12.924878923493804]}

Aussi on peut remarquer les methodes lineaires disposent des meilleurs resulats

In [256]:
RMSE={}
for reg in santeModel.predictiveModels.keys():
    rmse=psycoModel.evaluate(model=reg,on='test') #RMSE of each model
    RMSE[reg]=[rmse,rmse*100/psycoModel.dataset_bin.CGPA.mean()]

('Predictions:\t', array([ 0.54627331,  0.62044522,  0.62659385,  0.60660261,  0.59181471]))
('Labels:\t\t', [0.46999999999999997, 0.62300000508599995, 0.45000000000000001, 0.58999999999999997, 0.59549999237100004])
('Predictions:\t', array([ 0.57635701,  0.57732127,  0.57809299,  0.57635701,  0.57635701]))
('Labels:\t\t', [0.46999999999999997, 0.62300000508599995, 0.45000000000000001, 0.58999999999999997, 0.59549999237100004])
('Predictions:\t', array([ 0.54199208,  0.62736156,  0.63131675,  0.59778709,  0.54941206]))
('Labels:\t\t', [0.46999999999999997, 0.62300000508599995, 0.45000000000000001, 0.58999999999999997, 0.59549999237100004])
('Predictions:\t', array([ 0.56774303,  0.61213123,  0.61719123,  0.60737644,  0.58738797]))
('Labels:\t\t', [0.46999999999999997, 0.62300000508599995, 0.45000000000000001, 0.58999999999999997, 0.59549999237100004])
('Predictions:\t', array([ 0.54627331,  0.62044522,  0.62659385,  0.60660261,  0.59181471]))
('Labels:\t\t', [0.46999999999999997, 0.623

In [257]:
RMSE

{'ElasticNet': [0.071372353655097823, 11.983486732992738],
 'Lasso': [0.071372353655097823, 11.983486732992738],
 'LinearSVR': [0.081538251216321092, 13.690350698031406],
 'Ridge': [0.066752480188447011, 11.207805540478766],
 'SVR': [0.06566283741951609, 11.024853472955733]}

In [167]:
predictedValues.columns

Index([u'ElasticNet', u'Lasso', u'LinearSVR', u'Ridge', u'SVR', u'RealValue'], dtype='object')

In [258]:
finalPred,finalRMSE=teologieModel.ensembelMethods(predictedValues=predictedValues[[u'ElasticNet', u'Lasso', u'Ridge',  u'RealValue']])

In [259]:
finalPred.head(5)

Unnamed: 0_level_0,ElasticNet,Lasso,Ridge,RealValue,finalPredict
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10903,0.59112,0.59112,0.598864,0.59,0.590937
12230,0.49112,0.49112,0.546722,0.49,0.489354
8319,0.622738,0.622738,0.6188,0.604667,0.622894
8692,0.600453,0.600453,0.603376,0.599333,0.600434
11528,0.677881,0.677881,0.643442,0.679,0.679101


In [260]:
finalRMSE*100/teologieModel.dataset_bin.CGPA.mean()

5.3283072731747767

On remarque que notre ensemble methode sans modele avec SVM nous donne un score de R% en valdiation croisé voyons combien il nous donnerasavec l'ensemble d'evaluation

In [261]:
predictionTest=teologieModel.predictTest()

In [262]:
predictionTest.head(1)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6887,0.627435,0.627435,0.651313,0.626228,0.578113,0.7025


In [265]:
finalPredictTes,rmseFinalTest=ensembelMethods(self=teologieModel,predictedValues=predictionTest[[u'ElasticNet', u'Lasso', u'Ridge',  u'RealValue']])

In [267]:
rmseFinalTest*100/teologieModel.Y_test.mean()

9.9963180339825772

Nous avons un resulat de 10.11 % sur notre ensemble d'evaluation wouhhhhh........

 10.56