Dans ce notebook nous entrainerons nos données sur les restes dees facultées et ensuite continuerons après

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns #for beatiful visualizations
%matplotlib inline 
import scipy.stats as scs #for statistics
import operator
from scipy.stats import chi2_contingency
import matplotlib.ticker as ticker
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.preprocessing import *

In [3]:
from sklearn.linear_model import Ridge,Lasso,ElasticNet,LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.svm import LinearSVR,SVR
from sklearn.metrics import mean_squared_error
from sklearn.externals import joblib

In [6]:
# %load ../GradePredictorApp/codes/PredictiveModelBuilding.py
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge,Lasso,ElasticNet,LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.svm import LinearSVR,SVR
from sklearn.metrics import mean_squared_error
from sklearn.externals import joblib

class PredictiveModelBuilding(object):
    """

    docstring for PredictiveModelByilding
    this class will handle all pipeline for a preeictive modele building
    in the chapter of my machine learning project ,
    it will train differents modele, encode data,scale data, and so on

    """
    def __init__(self, dataset, encoderFunction):

        if isinstance(dataset, pd.DataFrame):
            self.dataset = dataset
            self.training_set = pd.DataFrame()
            self.test_set = pd.DataFrame()
            self.predictive_models = {}
            self.x_train = pd.DataFrame()
            self.x_test = pd.DataFrame()
            self.y_train = pd.Series()
            self.y_test = pd.Series()
            self.dataset_bin, self.encoders=encoderFunction(dataset, catCol=['SCHOOL_RIGHT', 'OPTION_RIGHT'], numCol=['DIPPERC', 'CGPA', 'EchecRatio'])
            self.dataset_bin.reset_index(inplace=True)
            ridge_reg = Ridge(alpha=1, solver="cholesky", fit_intercept=False)
            linSVM_reg = LinearSVR(dual=False, fit_intercept=False,loss='squared_epsilon_insensitive')
            rbfSVM_reg = SVR(verbose=True)
            lasso_reg = Lasso(alpha=1e-05, max_iter=10000, fit_intercept=False)
            elastic_reg = ElasticNet(alpha=1e-05, max_iter=10000, l1_ratio=0.5)
            self.predictive_models[ridge_reg.__class__.__name__] = ridge_reg
            self.predictive_models[linSVM_reg.__class__.__name__] = linSVM_reg
            self.predictive_models[rbfSVM_reg.__class__.__name__] = rbfSVM_reg
            self.predictive_models[lasso_reg.__class__.__name__] = lasso_reg
            self.predictive_models[elastic_reg.__class__.__name__] = elastic_reg
            self.stacker = LinearRegression(normalize=True)
        else:
            raise TypeError('need only a DataFrame')

    def scale(self, num_cols):

        """
        this function will scale the values of GPA and DIP percentage
        by divide them by 100

        """
        self.dataset_bin.loc[:, num_cols[0]] = self.dataset_bin[num_cols[0]]/100
        self.dataset_bin.loc[:, num_cols[1]] = self.dataset_bin[num_cols[1]]/100

    def split(self):

        """
        the function will split the dataset into a train and a test one"
         and return x_train and x_Test

        """
        split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=4)
        for train_index, test_index in split.split(self.dataset_bin, self.dataset_bin.EchecRatio):
            self.training_set = self.dataset_bin.loc[train_index]
            self.test_set = self.dataset_bin.loc[test_index]
        self.training_set.set_index(keys='ID', inplace=True)
        self.test_set.set_index(keys='ID', inplace=True)
        self.x_train = self.training_set.CGPA
        self.x_train = self.training_set.drop(labels=['CGPA', 'EchecRatio'], axis=1)
        self.y_test = self.test_set.CGPA
        self.x_test = self.test_set.drop(labels=['CGPA', 'EchecRatio'], axis=1)
        train_descrption = self.training_set.describe()[['DIPPERC', 'CGPA']]
        test_description = self.test_set.describe()[['DIPPERC', 'CGPA']]
        return train_descrption, test_description

    def train(self):
        """will train diverents models with x , y pass in parametes"""
        predictions = {}
        for clf in self.predictive_models.values():
            clf.fit(self.x_train, self.y_train)
            predictions[clf.__class__.__name__] = clf.predict(self.x_train)
        predicted_values = pd.DataFrame.from_dict(predictions, dtype=np.float)
        predicted_values.set_index(self.y_train.index, inplace=True)
        predicted_values.loc[:, 'RealValue'] = self.y_train
        return predicted_values

    def predict_test(self):
        """predict values from the test set"""

        predictions = {}
        for clf in self.predictive_models.values():
            predictions[clf.__class__.__name__] = clf.predict(self.x_test)
        predicted_values = pd.DataFrame.from_dict(predictions, dtype=np.float)
        predicted_values.set_index(self.y_test.index, inplace=True)
        predicted_values.loc[:, 'RealValue'] = self.y_test
        return predicted_values

    def predict_new(self, new_student_data):
        """
        this call will handle predictions for new values,
        but frirst it will endcode them nand then try to predict
        start first by handling categorical values

        """

        option_encoder = self.encoders['OPTION_RIGHT']
        school_encoder = self.encoders['SCHOOL_RIGHT']
        options = pd.DataFrame(
        data=dict(zip(option_encoder.classes_, option_encoder.transform(new_student_data[['OPTION_RIGHT']])[0])),
            index=new_student_data.index, columns=option_encoder.classes_)
        schools = pd.DataFrame(data=dict(zip(school_encoder.classes_, school_encoder.transform(new_student_data[['SCHOOL_RIGHT']])[0])),index=new_student_data.index, columns=school_encoder.classes_)
        schools.reset_index(inplace=True)
        options.reset_index(inplace=True)
        new_dataset = pd.merge(options, schools, on='index')
        new_dataset['DIPPERC'] = new_student_data['DIPPERC']
        new_dataset.set_index(keys=['index'], inplace=True)
        predictions = {}
        for clf in self.predictive_models.values():
            predictions[clf.__class__.__name__] = clf.predict(new_dataset)
        predicted_values = pd.DataFrame.from_dict(predictions, dtype=np.float)
        predicted_values.set_index(new_dataset.index, inplace=True)
        predicted_values.loc[:, 'finalOutput'] = self.stacker.predict(predicted_values)
        return predicted_values

    def evaluate(self, model, sur):
        """

        this function will first do a evaluation of a model and return
        the RMSE score of it and some data and their labels the function
        can evaluate on trainset and also on test_set

        """
        if sur == 'train':
            some_data = self.x_train.iloc[:5]
            some_labels = self.y_train.iloc[:5]
            print("Predictions:\t", self.predictive_models[model].predict(some_data))
            print("Labels:\t\t", list(some_labels))
            cgpa_predictions = self.predictive_models[model].predict(self.x_train)
            lin_mse = mean_squared_error(self.y_train, cgpa_predictions)
            lin_rmse = np.sqrt(lin_mse)
            return lin_rmse
        elif sur == 'test':
            some_data = self.x_test.iloc[:5]
            some_labels = self.y_test.iloc[:5]
            print("Predictions:\t", self.predictive_models[model].predict(some_data))
            print("Labels:\t\t", list(some_labels))
            cgpa_predictions = self.predictive_models[model].predict(self.x_test)
            lin_mse = mean_squared_error(self.y_test, cgpa_predictions)
            lin_rmse = np.sqrt(lin_mse)
            return lin_rmse

    def cross_evaluate(self, model):
        """this one will perfom a cross validation of the model"""
        scores = cross_val_score(self.predictive_models[model], self.x_train, self.y_train, scoring="neg_mean_squared_error", cv=10)
        rmse_scores = np.sqrt(-scores)
        return rmse_scores, rmse_scores.std(), rmse_scores.mean()

    def ensemble_methods(self, predicted_values):
        """
        this method will get a dataframe of predicted values by diffrents classifier and will return
        the value compute by  a linear regression between the 3 values and RMSE
        """
        x_new = predicted_values.drop(labels="RealValue", axis=1)
        y_new = predicted_values.RealValue
        self.stacker.fit(x_new, y_new)
        final_predict = self.stacker.predict(predicted_values.drop(labels="RealValue", axis=1))
        predicted_values.loc[:, 'finalPredict'] = final_predict
        rmse_ensemble = np.sqrt(mean_squared_error(predicted_values.RealValue, final_predict))
        return predicted_values, rmse_ensemble

    def save_models(self, departement):
        """

        after all job we will save the class with the models for
        deployement

        """
        joblib.dump(self, "../predictivesModels/Classes/"+departement+".pkl")


In [7]:
# %load ../codes/convertCat.py
def ConvertCat(dataset, catCol,numCol):
    """this function will binarize a dataset given in parametrer and return the dataset with categorical columns binarise by one-hot encoding"""
    encs={}
    X_train_1=dataset[catCol]
    X=dataset[numCol]
    catCol=X_train_1.columns
    for col in catCol:
        data=dataset[[col]]
        enc= LabelBinarizer()
        enc.fit(data)
        # Fitting One Hot Encoding on train data
        temp = enc.transform(dataset[[col]])
        # Changing the encoded features into a data frame with new column names
        temp=pd.DataFrame(temp,columns=enc.classes_)
        # In side by side concatenation index values should be same
        # Setting the index values similar to the X_train data frame
        temp=temp.set_index(dataset.index)
        # adding the new One Hot Encoded varibales to the train data frame
        
        X=pd.merge(temp,X,right_index=True,left_index=True)
        #saving the encoder into a dict for others operations
        encs[col]=enc
    return X,encs

In [30]:
dataset=pd.read_csv("../dataset/DatasetFinalV1.csv",index_col="Unnamed: 0")
dataset.set_index(keys='ID',inplace=True)
datasetCGPA=dataset[['DIPPERC', u'SCHOOL_RIGHT', u'OPTION_RIGHT', u'FAC' , u'CGPA','DistinctionRatio','EchecRatio','Pass1stSessionRatio']]

In [12]:
droit=datasetCGPA.loc[datasetCGPA.FAC=='FD']

In [13]:
droit.shape

(896, 8)

In [14]:
droitModel=PredictiveModelByilding(dataset=droit,encoderFunction=ConvertCat)

In [15]:
droitModel.scale(['CGPA','DIPPERC'])

In [16]:
describeTrain,describeTest=droitModel.split()

In [17]:
droitModel.dataset_bin.shape

(896, 300)

In [112]:
describeTest

Unnamed: 0,DIPPERC,CGPA
count,716.0,716.0
mean,0.560793,0.580058
std,0.05189,0.069535
min,0.5,0.364
25%,0.52,0.55
50%,0.55,0.591
75%,0.59,0.6215
max,0.78,0.781


In [113]:
describeTrain

Unnamed: 0,DIPPERC,CGPA
count,716.0,716.0
mean,0.560793,0.580058
std,0.05189,0.069535
min,0.5,0.364
25%,0.52,0.55
50%,0.55,0.591
75%,0.59,0.6215
max,0.78,0.781


In [18]:
predictedValues=droitModel.train()

[LibSVM]

In [19]:
droitModel.predictiveModels.keys()

['ElasticNet', 'SVR', 'LinearSVR', 'Ridge', 'Lasso']

In [20]:
predictedValues.head(10)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10709,0.565598,0.56916,0.553602,0.563864,0.554185,0.562
4026,0.532851,0.533646,0.516206,0.543356,0.550057,0.597
8307,0.547631,0.548312,0.545496,0.555175,0.569755,0.638
7713,0.651608,0.649854,0.680102,0.625937,0.556364,0.61775
8335,0.56466,0.564797,0.572836,0.560507,0.531529,0.6035
12196,0.561325,0.561756,0.560619,0.56063,0.548114,0.42
9004,0.548454,0.54959,0.546518,0.552067,0.54499,0.48
10192,0.587355,0.585291,0.582199,0.576354,0.544221,0.59
12000,0.494826,0.496352,0.482219,0.513912,0.537509,0.42
11373,0.561499,0.561155,0.579594,0.552852,0.549012,0.606


In [30]:
RMSE={}
for reg in droitModel.predictiveModels.keys():
    rmse=droitModel.evaluate(model=reg,on='train') #RMSE of each model
    RMSE[reg]=[rmse,rmse*100/droitModel.dataset_bin.CGPA.mean()]

('Predictions:\t', array([ 0.56559796,  0.53285129,  0.54763106,  0.65160816,  0.56465962]))
('Labels:\t\t', [0.56199998855600009, 0.59700000762899996, 0.63799999237100002, 0.61774999618500004, 0.60350000381499991])
('Predictions:\t', array([ 0.55418524,  0.55005711,  0.56975511,  0.55636392,  0.53152906]))
('Labels:\t\t', [0.56199998855600009, 0.59700000762899996, 0.63799999237100002, 0.61774999618500004, 0.60350000381499991])
('Predictions:\t', array([ 0.5536023 ,  0.51620636,  0.54549554,  0.68010249,  0.57283626]))
('Labels:\t\t', [0.56199998855600009, 0.59700000762899996, 0.63799999237100002, 0.61774999618500004, 0.60350000381499991])
('Predictions:\t', array([ 0.56386402,  0.54335583,  0.55517475,  0.62593662,  0.56050673]))
('Labels:\t\t', [0.56199998855600009, 0.59700000762899996, 0.63799999237100002, 0.61774999618500004, 0.60350000381499991])
('Predictions:\t', array([ 0.56915998,  0.53364647,  0.54831246,  0.64985428,  0.56479681]))
('Labels:\t\t', [0.56199998855600009, 0.597

In [31]:
RMSE

{'ElasticNet': [0.04841208301119692, 8.3437425778949539],
 'Lasso': [0.048511380800375606, 8.3608563879177407],
 'LinearSVR': [0.051960828817335478, 8.9553630585496737],
 'Ridge': [0.052126970689201822, 8.9839973358630818],
 'SVR': [0.066878676691107516, 11.526429510000717]}

Let try cross validation

In [32]:
CVScore={}
for reg in droitModel.predictiveModels.keys():
    scores,Sstd,Smean=droitModel.crossEvaluate(model=reg) #RMSE of each model
    CVScore[reg]=[Smean,Smean*100/droitModel.dataset_bin.CGPA.mean(),Sstd]

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

In [33]:
CVScore

{'ElasticNet': [0.070306297300685189,
  12.117174263007708,
  0.0035678321563961833],
 'Lasso': [0.070013030408387078, 12.066630198308225, 0.0036226582256641067],
 'LinearSVR': [0.075690657044111043,
  13.045159775127384,
  0.0063763240227957419],
 'Ridge': [0.066934950459060144, 11.536128200998506, 0.0045464736648769917],
 'SVR': [0.068529975364412049, 11.811028110025546, 0.0064242800865891006]}

Appres les scrores sur la validation croisée essayons de combinner les methodes par une rgression lineaire

In [54]:
from sklearn.model_selection import GridSearchCV
param_grid ={'alpha':[1e-5,1e-4, 1e-3,1e-2,0, 1, 5, 10],'selection':['cyclic','random']}
grid_search = GridSearchCV(droitModel.predictiveModels['Lasso'], param_grid, cv=5,scoring='neg_mean_squared_error',verbose=5)
grid_search.fit(droitModel.X_train,droitModel.Y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] alpha=1e-05, selection=cyclic ...................................
[CV] ... alpha=1e-05, selection=cyclic, score=-0.005322, total=   0.3s
[CV] alpha=1e-05, selection=cyclic ...................................
[CV] ... alpha=1e-05, selection=cyclic, score=-0.004703, total=   0.1s
[CV] alpha=1e-05, selection=cyclic ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s


[CV] ... alpha=1e-05, selection=cyclic, score=-0.005221, total=   0.3s
[CV] alpha=1e-05, selection=cyclic ...................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.7s remaining:    0.0s


[CV] ... alpha=1e-05, selection=cyclic, score=-0.005055, total=   0.2s
[CV] alpha=1e-05, selection=cyclic ...................................
[CV] ... alpha=1e-05, selection=cyclic, score=-0.004535, total=   0.1s
[CV] alpha=1e-05, selection=random ...................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.0s remaining:    0.0s


[CV] ... alpha=1e-05, selection=random, score=-0.005322, total=   0.4s
[CV] alpha=1e-05, selection=random ...................................
[CV] ... alpha=1e-05, selection=random, score=-0.004703, total=   0.3s
[CV] alpha=1e-05, selection=random ...................................
[CV] ... alpha=1e-05, selection=random, score=-0.005229, total=   0.2s
[CV] alpha=1e-05, selection=random ...................................
[CV] ... alpha=1e-05, selection=random, score=-0.005055, total=   0.1s
[CV] alpha=1e-05, selection=random ...................................
[CV] ... alpha=1e-05, selection=random, score=-0.004535, total=   0.2s
[CV] alpha=0.0001, selection=cyclic ..................................
[CV] .. alpha=0.0001, selection=cyclic, score=-0.004921, total=   0.0s
[CV] alpha=0.0001, selection=cyclic ..................................
[CV] .. alpha=0.0001, selection=cyclic, score=-0.004540, total=   0.1s
[CV] alpha=0.0001, selection=cyclic ..................................
[CV] .

  estimator.fit(X_train, y_train, **fit_params)
  positive)


[CV] ....... alpha=0, selection=cyclic, score=-0.005376, total=   4.1s
[CV] alpha=0, selection=cyclic .......................................
[CV] ....... alpha=0, selection=cyclic, score=-0.004736, total=   5.0s
[CV] alpha=0, selection=cyclic .......................................
[CV] ....... alpha=0, selection=cyclic, score=-0.005348, total=   5.0s
[CV] alpha=0, selection=cyclic .......................................
[CV] ....... alpha=0, selection=cyclic, score=-0.005173, total=   4.8s
[CV] alpha=0, selection=cyclic .......................................
[CV] ....... alpha=0, selection=cyclic, score=-0.004637, total=   4.7s
[CV] alpha=0, selection=random .......................................
[CV] ....... alpha=0, selection=random, score=-0.005372, total=   3.8s
[CV] alpha=0, selection=random .......................................
[CV] ....... alpha=0, selection=random, score=-0.004732, total=   4.9s
[CV] alpha=0, selection=random .......................................
[CV] .

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:   51.7s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=10000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': [1e-05, 0.0001, 0.001, 0.01, 0, 1, 5, 10], 'selection': ['cyclic', 'random']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=5)

In [55]:
cvres = grid_search.cv_results_
for mean_score, params in zip(sorted(cvres["mean_test_score"],reverse=True), cvres["params"]):
    print(np.sqrt(-mean_score), params)

(0.0672890169473533, {'alpha': 1e-05, 'selection': 'cyclic'})
(0.067290284332038222, {'alpha': 1e-05, 'selection': 'random'})
(0.067831595702809055, {'alpha': 0.0001, 'selection': 'cyclic'})
(0.067831710365960446, {'alpha': 0.0001, 'selection': 'random'})
(0.06957552507642617, {'alpha': 0.001, 'selection': 'cyclic'})
(0.06957552507642617, {'alpha': 0.001, 'selection': 'random'})
(0.06957552507642617, {'alpha': 0.01, 'selection': 'cyclic'})
(0.06957552507642617, {'alpha': 0.01, 'selection': 'random'})
(0.06957552507642617, {'alpha': 0, 'selection': 'cyclic'})
(0.06957552507642617, {'alpha': 0, 'selection': 'random'})
(0.06957552507642617, {'alpha': 1, 'selection': 'cyclic'})
(0.06957552507642617, {'alpha': 1, 'selection': 'random'})
(0.070481873651514551, {'alpha': 5, 'selection': 'cyclic'})
(0.070493598520100439, {'alpha': 5, 'selection': 'random'})
(0.071009492129496574, {'alpha': 10, 'selection': 'cyclic'})
(0.071094610279614737, {'alpha': 10, 'selection': 'random'})


Nous venons de voir qu'avec un alpha de 1e-05 notre modele dispose d'un bon score

In [56]:
from sklearn.model_selection import GridSearchCV
param_grid ={'alpha':[1e-5,1e-4, 1e-3,1e-2,0, 1, 5, 10],'selection':['cyclic','random'],'l1_ratio':[1,0.5,1.0/3.0,0.25,0.2]}
grid_search = GridSearchCV(droitModel.predictiveModels['ElasticNet'], param_grid, cv=5,scoring='neg_mean_squared_error',verbose=5)
grid_search.fit(droitModel.X_train,droitModel.Y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV] alpha=1e-05, selection=cyclic, l1_ratio=1 .......................
[CV]  alpha=1e-05, selection=cyclic, l1_ratio=1, score=-0.005322, total=   0.4s
[CV] alpha=1e-05, selection=cyclic, l1_ratio=1 .......................
[CV]  alpha=1e-05, selection=cyclic, l1_ratio=1, score=-0.004703, total=   0.1s
[CV] alpha=1e-05, selection=cyclic, l1_ratio=1 .......................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


[CV]  alpha=1e-05, selection=cyclic, l1_ratio=1, score=-0.005221, total=   0.3s
[CV] alpha=1e-05, selection=cyclic, l1_ratio=1 .......................
[CV]  alpha=1e-05, selection=cyclic, l1_ratio=1, score=-0.005055, total=   0.1s
[CV] alpha=1e-05, selection=cyclic, l1_ratio=1 .......................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.9s remaining:    0.0s


[CV]  alpha=1e-05, selection=cyclic, l1_ratio=1, score=-0.004535, total=   0.2s
[CV] alpha=1e-05, selection=random, l1_ratio=1 .......................
[CV]  alpha=1e-05, selection=random, l1_ratio=1, score=-0.005322, total=   0.4s
[CV] alpha=1e-05, selection=random, l1_ratio=1 .......................
[CV]  alpha=1e-05, selection=random, l1_ratio=1, score=-0.004703, total=   0.2s
[CV] alpha=1e-05, selection=random, l1_ratio=1 .......................
[CV]  alpha=1e-05, selection=random, l1_ratio=1, score=-0.005173, total=   0.1s
[CV] alpha=1e-05, selection=random, l1_ratio=1 .......................
[CV]  alpha=1e-05, selection=random, l1_ratio=1, score=-0.005055, total=   0.2s
[CV] alpha=1e-05, selection=random, l1_ratio=1 .......................
[CV]  alpha=1e-05, selection=random, l1_ratio=1, score=-0.004530, total=   0.2s
[CV] alpha=1e-05, selection=cyclic, l1_ratio=0.5 .....................
[CV]  alpha=1e-05, selection=cyclic, l1_ratio=0.5, score=-0.005359, total=   0.3s
[CV] alpha=1

[CV]  alpha=0.0001, selection=random, l1_ratio=1, score=-0.004118, total=   0.1s
[CV] alpha=0.0001, selection=random, l1_ratio=1 ......................
[CV]  alpha=0.0001, selection=random, l1_ratio=1, score=-0.004132, total=   0.0s
[CV] alpha=0.0001, selection=cyclic, l1_ratio=0.5 ....................
[CV]  alpha=0.0001, selection=cyclic, l1_ratio=0.5, score=-0.005064, total=   0.0s
[CV] alpha=0.0001, selection=cyclic, l1_ratio=0.5 ....................
[CV]  alpha=0.0001, selection=cyclic, l1_ratio=0.5, score=-0.004569, total=   0.1s
[CV] alpha=0.0001, selection=cyclic, l1_ratio=0.5 ....................
[CV]  alpha=0.0001, selection=cyclic, l1_ratio=0.5, score=-0.004961, total=   0.1s
[CV] alpha=0.0001, selection=cyclic, l1_ratio=0.5 ....................
[CV]  alpha=0.0001, selection=cyclic, l1_ratio=0.5, score=-0.004427, total=   0.1s
[CV] alpha=0.0001, selection=cyclic, l1_ratio=0.5 ....................
[CV]  alpha=0.0001, selection=cyclic, l1_ratio=0.5, score=-0.004264, total=   0.

[CV]  alpha=0.001, selection=random, l1_ratio=0.5, score=-0.003426, total=   0.0s
[CV] alpha=0.001, selection=random, l1_ratio=0.5 .....................
[CV]  alpha=0.001, selection=random, l1_ratio=0.5, score=-0.004037, total=   0.0s
[CV] alpha=0.001, selection=cyclic, l1_ratio=0.333333333333 ..........
[CV]  alpha=0.001, selection=cyclic, l1_ratio=0.333333333333, score=-0.005115, total=   0.0s
[CV] alpha=0.001, selection=cyclic, l1_ratio=0.333333333333 ..........
[CV]  alpha=0.001, selection=cyclic, l1_ratio=0.333333333333, score=-0.004749, total=   0.0s
[CV] alpha=0.001, selection=cyclic, l1_ratio=0.333333333333 ..........
[CV]  alpha=0.001, selection=cyclic, l1_ratio=0.333333333333, score=-0.005021, total=   0.0s
[CV] alpha=0.001, selection=cyclic, l1_ratio=0.333333333333 ..........
[CV]  alpha=0.001, selection=cyclic, l1_ratio=0.333333333333, score=-0.003419, total=   0.0s
[CV] alpha=0.001, selection=cyclic, l1_ratio=0.333333333333 ..........
[CV]  alpha=0.001, selection=cyclic, l

[CV]  alpha=0.01, selection=cyclic, l1_ratio=0.25, score=-0.005294, total=   0.0s
[CV] alpha=0.01, selection=cyclic, l1_ratio=0.25 .....................
[CV]  alpha=0.01, selection=cyclic, l1_ratio=0.25, score=-0.005099, total=   0.0s
[CV] alpha=0.01, selection=cyclic, l1_ratio=0.25 .....................
[CV]  alpha=0.01, selection=cyclic, l1_ratio=0.25, score=-0.005130, total=   0.0s
[CV] alpha=0.01, selection=cyclic, l1_ratio=0.25 .....................
[CV]  alpha=0.01, selection=cyclic, l1_ratio=0.25, score=-0.003482, total=   0.0s
[CV] alpha=0.01, selection=cyclic, l1_ratio=0.25 .....................
[CV]  alpha=0.01, selection=cyclic, l1_ratio=0.25, score=-0.004218, total=   0.0s
[CV] alpha=0.01, selection=random, l1_ratio=0.25 .....................
[CV]  alpha=0.01, selection=random, l1_ratio=0.25, score=-0.005294, total=   0.0s
[CV] alpha=0.01, selection=random, l1_ratio=0.25 .....................
[CV]  alpha=0.01, selection=random, l1_ratio=0.25, score=-0.005098, total=   0.0s


[CV]  alpha=0, selection=cyclic, l1_ratio=0.25, score=-0.004637, total=   6.1s
[CV] alpha=0, selection=random, l1_ratio=0.25 ........................
[CV]  alpha=0, selection=random, l1_ratio=0.25, score=-0.005372, total=   4.1s
[CV] alpha=0, selection=random, l1_ratio=0.25 ........................
[CV]  alpha=0, selection=random, l1_ratio=0.25, score=-0.004767, total=   5.0s
[CV] alpha=0, selection=random, l1_ratio=0.25 ........................
[CV]  alpha=0, selection=random, l1_ratio=0.25, score=-0.005285, total=   5.7s
[CV] alpha=0, selection=random, l1_ratio=0.25 ........................
[CV]  alpha=0, selection=random, l1_ratio=0.25, score=-0.005162, total=   5.6s
[CV] alpha=0, selection=random, l1_ratio=0.25 ........................
[CV]  alpha=0, selection=random, l1_ratio=0.25, score=-0.004676, total=   5.2s
[CV] alpha=0, selection=cyclic, l1_ratio=0.2 .........................
[CV]  alpha=0, selection=cyclic, l1_ratio=0.2, score=-0.005376, total=   4.0s
[CV] alpha=0, selectio

[CV]  alpha=1, selection=cyclic, l1_ratio=0.2, score=-0.005463, total=   0.0s
[CV] alpha=1, selection=cyclic, l1_ratio=0.2 .........................
[CV]  alpha=1, selection=cyclic, l1_ratio=0.2, score=-0.005285, total=   0.0s
[CV] alpha=1, selection=cyclic, l1_ratio=0.2 .........................
[CV]  alpha=1, selection=cyclic, l1_ratio=0.2, score=-0.003664, total=   0.0s
[CV] alpha=1, selection=cyclic, l1_ratio=0.2 .........................
[CV]  alpha=1, selection=cyclic, l1_ratio=0.2, score=-0.004268, total=   0.0s
[CV] alpha=1, selection=random, l1_ratio=0.2 .........................
[CV]  alpha=1, selection=random, l1_ratio=0.2, score=-0.005519, total=   0.0s
[CV] alpha=1, selection=random, l1_ratio=0.2 .........................
[CV]  alpha=1, selection=random, l1_ratio=0.2, score=-0.005463, total=   0.0s
[CV] alpha=1, selection=random, l1_ratio=0.2 .........................
[CV]  alpha=1, selection=random, l1_ratio=0.2, score=-0.005285, total=   0.0s
[CV] alpha=1, selection=rand

[CV]  alpha=10, selection=random, l1_ratio=1, score=-0.005519, total=   0.0s
[CV] alpha=10, selection=random, l1_ratio=1 ..........................
[CV]  alpha=10, selection=random, l1_ratio=1, score=-0.005463, total=   0.0s
[CV] alpha=10, selection=random, l1_ratio=1 ..........................
[CV]  alpha=10, selection=random, l1_ratio=1, score=-0.005285, total=   0.0s
[CV] alpha=10, selection=random, l1_ratio=1 ..........................
[CV]  alpha=10, selection=random, l1_ratio=1, score=-0.003664, total=   0.0s
[CV] alpha=10, selection=random, l1_ratio=1 ..........................
[CV]  alpha=10, selection=random, l1_ratio=1, score=-0.004268, total=   0.0s
[CV] alpha=10, selection=cyclic, l1_ratio=0.5 ........................
[CV]  alpha=10, selection=cyclic, l1_ratio=0.5, score=-0.005519, total=   0.0s
[CV] alpha=10, selection=cyclic, l1_ratio=0.5 ........................
[CV]  alpha=10, selection=cyclic, l1_ratio=0.5, score=-0.005463, total=   0.0s
[CV] alpha=10, selection=cyclic

[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:  4.6min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=ElasticNet(alpha=0.001, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=10000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': [1e-05, 0.0001, 0.001, 0.01, 0, 1, 5, 10], 'selection': ['cyclic', 'random'], 'l1_ratio': [1, 0.5, 0.3333333333333333, 0.25, 0.2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=5)

In [57]:
cvres = grid_search.cv_results_
for mean_score, params in zip(sorted(cvres["mean_test_score"],reverse=True), cvres["params"]):
    print(np.sqrt(-mean_score), params)

(0.06667432276156228, {'alpha': 1e-05, 'selection': 'cyclic', 'l1_ratio': 1})
(0.066674559842560432, {'alpha': 1e-05, 'selection': 'random', 'l1_ratio': 1})
(0.066684532520481191, {'alpha': 1e-05, 'selection': 'cyclic', 'l1_ratio': 0.5})
(0.066685195121775606, {'alpha': 1e-05, 'selection': 'random', 'l1_ratio': 0.5})
(0.066752183580841659, {'alpha': 1e-05, 'selection': 'cyclic', 'l1_ratio': 0.3333333333333333})
(0.066752199918334462, {'alpha': 1e-05, 'selection': 'random', 'l1_ratio': 0.3333333333333333})
(0.067288550808824205, {'alpha': 1e-05, 'selection': 'cyclic', 'l1_ratio': 0.25})
(0.0672890169473533, {'alpha': 1e-05, 'selection': 'random', 'l1_ratio': 0.25})
(0.067373190313279152, {'alpha': 1e-05, 'selection': 'cyclic', 'l1_ratio': 0.2})
(0.067373433821402373, {'alpha': 1e-05, 'selection': 'random', 'l1_ratio': 0.2})
(0.067827927937534477, {'alpha': 0.0001, 'selection': 'cyclic', 'l1_ratio': 1})
(0.067831595702809055, {'alpha': 0.0001, 'selection': 'random', 'l1_ratio': 1})
(0.06

Grace à ces recherches nous avons pu evaluer nos paramentres sans problèmes

effectuons une evaluation sur le test set

In [120]:
RMSE={}
for reg in droitModel.predictiveModels.keys():
    rmse=droitModel.evaluate(model=reg,on='test') #RMSE of each model
    RMSE[reg]=[rmse,rmse*100/droitModel.dataset_bin.CGPA.mean()]

('Predictions:\t', array([ 0.57183083,  0.58105636,  0.66754704,  0.58321565,  0.59821154]))
('Labels:\t\t', [0.5575, 0.54700000762899992, 0.41999999999999998, 0.55000000000000004, 0.61099998474099992])
('Predictions:\t', array([ 0.55444147,  0.54409263,  0.56064041,  0.55469773,  0.55533849]))
('Labels:\t\t', [0.5575, 0.54700000762899992, 0.41999999999999998, 0.55000000000000004, 0.61099998474099992])
('Predictions:\t', array([ 0.56192467,  0.5751141 ,  0.63554278,  0.56171098,  0.59237165]))
('Labels:\t\t', [0.5575, 0.54700000762899992, 0.41999999999999998, 0.55000000000000004, 0.61099998474099992])
('Predictions:\t', array([ 0.56940936,  0.57638618,  0.63724683,  0.57214953,  0.58017637]))
('Labels:\t\t', [0.5575, 0.54700000762899992, 0.41999999999999998, 0.55000000000000004, 0.61099998474099992])
('Predictions:\t', array([ 0.57183083,  0.58105636,  0.66754704,  0.58321565,  0.59821154]))
('Labels:\t\t', [0.5575, 0.54700000762899992, 0.41999999999999998, 0.55000000000000004, 0.61099

In [121]:
RMSE

{'ElasticNet': [0.074235002741814035, 12.794280159433873],
 'Lasso': [0.074235002741814035, 12.794280159433873],
 'LinearSVR': [0.074014355508153801, 12.756251973004467],
 'Ridge': [0.071466206449127365, 12.317082689714084],
 'SVR': [0.072145106955878877, 12.434090071177653]}

Essayons now ls combinaison de plusieurs methodes

In [131]:
finalPredict,rmseFinal=ensembelMethods(self=droitModel,predictedValues=predictedValues)

In [132]:
finalPredict.head(5)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue,finalPredict
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10709,0.56916,0.56916,0.553602,0.563864,0.554185,0.562,0.570502
4026,0.533646,0.533646,0.516206,0.543356,0.550057,0.597,0.53401
8307,0.548312,0.548312,0.545496,0.555175,0.569755,0.638,0.546321
7713,0.649854,0.649854,0.680102,0.625937,0.556364,0.61775,0.654201
8335,0.564797,0.564797,0.572836,0.560507,0.531529,0.6035,0.565973


In [133]:
rmseFinal

0.048415680580131328

In [134]:
rmseFinal*100/droitModel.dataset_bin.CGPA.mean()

8.3443626129611577

Nous venons de voir qu'on obtient  un score dans le 8.5% pres en combinant les differentes score par une regression lineaire

sur l'ensemble d'apprentissage gobale

essayons sur le test set enfin

In [135]:
predictionTest=droitModel.predictTest()

In [139]:
predictionTest.head(1)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue,finalPredict
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
7825,0.571831,0.571831,0.561925,0.569409,0.554441,0.5575,0.568794


In [136]:
finalPredictTes,rmseFinalTest=ensembelMethods(self=droitModel,predictedValues=predictionTest)

In [144]:
rmseFinalTest*100/droitModel.Y_train.mean()

11.769696439697203

Nous voici arriver à la fin de l'entrainement de nos modeles en faculté de droitm'

##### 3. Facuté de santé et devllopement Communautaire

In [36]:
sante=datasetCGPA.loc[datasetCGPA.FAC=='FSDC']

In [175]:
sante.head(5)

Unnamed: 0_level_0,DIPPERC,SCHOOL_RIGHT,OPTION_RIGHT,FAC,CGPA,DistinctionRatio,EchecRatio,Pass1stSessionRatio
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
7365,62.0,zanner,latin philo,FSDC,63.4,G,G,A
11712,54.0,zanner,latin philo,FSDC,60.0,G,G,A
11862,54.0,mululusake,sociale,FSDC,61.900002,G,G,G
7003,50.0,ibanda,bio-chimie,FSDC,62.700001,G,G,G
7354,62.0,ibanda,commmerciale et adm,FSDC,42.0,G,A,G


In [37]:
santeModel=PredictiveModelByilding(dataset=sante,encoderFunction=ConvertCat)

In [38]:
santeModel.scale(['DIPPERC','CGPA'])

In [40]:
santeModel.dataset_bin.shape

(758, 257)

In [39]:
trainDes,tesDes=santeModel.split()

In [179]:
trainDes

Unnamed: 0,DIPPERC,CGPA
count,606.0,606.0
mean,0.553841,0.589861
std,0.048872,0.066621
min,0.5,0.4
25%,0.52,0.573
50%,0.54,0.606
75%,0.58,0.631625
max,0.76,0.724333


In [180]:
tesDes

Unnamed: 0,DIPPERC,CGPA
count,152.0,152.0
mean,0.551102,0.586449
std,0.050482,0.059923
min,0.5,0.4
25%,0.51,0.56025
50%,0.54,0.600417
75%,0.58,0.628333
max,0.77,0.687333


In [48]:
predictedValues=santeModel.train()

[LibSVM]

In [49]:
predictedValues.head(5)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7417,0.634252,0.633946,0.712691,0.620563,0.556396,0.585333
8372,0.642737,0.64182,0.6406,0.640527,0.584777,0.645667
9322,0.622458,0.619607,0.587724,0.597814,0.549957,0.625667
11413,0.562835,0.563453,0.604019,0.56146,0.539789,0.574
6746,0.633794,0.63094,0.600488,0.60448,0.546782,0.637


In [50]:
RMSE={}
for reg in santeModel.predictiveModels.keys():
    rmse=santeModel.evaluate(model=reg,on='train') #RMSE of each model
    RMSE[reg]=[rmse,rmse*100/santeModel.dataset_bin.CGPA.mean()]

('Predictions:\t', array([ 0.63425226,  0.64273674,  0.62245792,  0.56283476,  0.63379432]))
('Labels:\t\t', [0.58533334096300005, 0.64566666921000004, 0.62566666921000003, 0.57400001525900002, 0.637000007629])
('Predictions:\t', array([ 0.55639553,  0.58477657,  0.54995735,  0.53978909,  0.54678191]))
('Labels:\t\t', [0.58533334096300005, 0.64566666921000004, 0.62566666921000003, 0.57400001525900002, 0.637000007629])
('Predictions:\t', array([ 0.71269094,  0.64060031,  0.5877242 ,  0.6040193 ,  0.60048801]))
('Labels:\t\t', [0.58533334096300005, 0.64566666921000004, 0.62566666921000003, 0.57400001525900002, 0.637000007629])
('Predictions:\t', array([ 0.6205627 ,  0.64052699,  0.59781433,  0.56145965,  0.60448013]))
('Labels:\t\t', [0.58533334096300005, 0.64566666921000004, 0.62566666921000003, 0.57400001525900002, 0.637000007629])
('Predictions:\t', array([ 0.63394575,  0.64181988,  0.61960702,  0.56345318,  0.63094036]))
('Labels:\t\t', [0.58533334096300005, 0.64566666921000004, 0.62

In [51]:
RMSE

{'ElasticNet': [0.048439757351696131, 8.2215967821287084],
 'Lasso': [0.048506383027202178, 8.2329050435506357],
 'LinearSVR': [0.053311148653203064, 9.0484096573083317],
 'Ridge': [0.051086773887118013, 8.6708703503643676],
 'SVR': [0.069900647273839767, 11.864116752748179]}

on remarque aisement que les valeurs predites par differents regressons disposent d'une bonne exacitude

In [52]:
CVScore={}
for reg in santeModel.predictiveModels.keys():
    scores,Sstd,Smean=santeModel.crossEvaluate(model=reg) #RMSE of each model
    CVScore[reg]=[Smean,Smean*100/santeModel.dataset_bin.CGPA.mean(),Sstd]

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

In [53]:
CVScore

{'ElasticNet': [0.069957893084195713,
  11.873832985775435,
  0.0064955155045944872],
 'Lasso': [0.069636656330641075, 11.819310023570015, 0.0065339691534177947],
 'LinearSVR': [0.07373834926024557, 12.515483316645678, 0.0097308291592782675],
 'Ridge': [0.065243038982058466, 11.073588900469739, 0.0071383006728380988],
 'SVR': [0.071598311071310761, 12.152258312027383, 0.0051984998620767837]}

Aussi on peut remarquer les methodes lineaires disposent des meilleurs resulats

In [54]:
RMSE={}
for reg in santeModel.predictiveModels.keys():
    rmse=santeModel.evaluate(model=reg,on='test') #RMSE of each model
    RMSE[reg]=[rmse,rmse*100/santeModel.dataset_bin.CGPA.mean()]

('Predictions:\t', array([ 0.64710444,  0.59031446,  0.59147763,  0.60930773,  0.57896438]))
('Labels:\t\t', [0.65099998474100007, 0.51899999618500003, 0.62200000762899998, 0.61400001525900005, 0.55000000000000004])
('Predictions:\t', array([ 0.54720662,  0.54770263,  0.55064107,  0.55161345,  0.54692344]))
('Labels:\t\t', [0.65099998474100007, 0.51899999618500003, 0.62200000762899998, 0.61400001525900005, 0.55000000000000004])
('Predictions:\t', array([ 0.63506558,  0.63154406,  0.59812308,  0.64795774,  0.54048096]))
('Labels:\t\t', [0.65099998474100007, 0.51899999618500003, 0.62200000762899998, 0.61400001525900005, 0.55000000000000004])
('Predictions:\t', array([ 0.61248734,  0.58607667,  0.58714316,  0.59857347,  0.57376314]))
('Labels:\t\t', [0.65099998474100007, 0.51899999618500003, 0.62200000762899998, 0.61400001525900005, 0.55000000000000004])
('Predictions:\t', array([ 0.6442474 ,  0.59141372,  0.59097368,  0.60862079,  0.57945642]))
('Labels:\t\t', [0.65099998474100007, 0.518

In [55]:
RMSE

{'ElasticNet': [0.065538648817479186, 11.12376224989457],
 'Lasso': [0.065266413026714515, 11.077556136906004],
 'LinearSVR': [0.069746792001455005, 11.838003161733097],
 'Ridge': [0.062882899840641562, 10.673006539382214],
 'SVR': [0.067167938056348866, 11.40029871282916]}

In [167]:
predictedValues.columns

Index([u'ElasticNet', u'Lasso', u'LinearSVR', u'Ridge', u'SVR', u'RealValue'], dtype='object')

In [56]:
finalPred,finalRMSE=santeModel.ensembelMethods(predictedValues=predictedValues[[u'ElasticNet', u'Lasso', u'Ridge',  u'RealValue']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [57]:
finalPred.head(5)

Unnamed: 0_level_0,ElasticNet,Lasso,Ridge,RealValue,finalPredict
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7417,0.634252,0.633946,0.620563,0.585333,0.634718
8372,0.642737,0.64182,0.640527,0.645667,0.643669
9322,0.622458,0.619607,0.597814,0.625667,0.625672
11413,0.562835,0.563453,0.56146,0.574,0.562238
6746,0.633794,0.63094,0.60448,0.637,0.637066


In [58]:
finalRMSE*100/santeModel.dataset_bin.CGPA.mean()

8.2172609209070302

On remarque que notre ensemble methode sans modele avec SVM nous donne un score de 8% en valdiation croisé voyons combien il nous donnerasavec l'ensemble d'evaluation

In [194]:
predictionTest=santeModel.predictTest()

In [195]:
predictionTest.head(1)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4895,0.644247,0.644247,0.635066,0.612487,0.547207,0.651


In [196]:
finalPredictTes,rmseFinalTest=ensembelMethods(self=droitModel,predictedValues=predictionTest[[u'ElasticNet', u'Lasso', u'Ridge',  u'RealValue']])

In [197]:
rmseFinalTest*100/santeModel.Y_train.mean()

9.9625467598016613

Nous avons un resulat de 9.9 % sur notre ensemble d'evaluation wouhhhhh........

In [59]:
saveModels(self=santeModel,departement='Sante')

NameError: name 'saveModels' is not defined

##### 3. Facuté de Psycologie

In [63]:
psyco=datasetCGPA.loc[datasetCGPA.FAC=='FPSE']

In [64]:
psyco.head(5)

Unnamed: 0_level_0,DIPPERC,SCHOOL_RIGHT,OPTION_RIGHT,FAC,CGPA,DistinctionRatio,EchecRatio,Pass1stSessionRatio
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
11347,55.0,mululusake,pedagogie,FPSE,46.0,G,A,G
7026,60.0,butembo,pedagogie,FPSE,48.950001,G,D,G
8487,57.0,butembo,pedagogie,FPSE,59.6,G,G,F
10434,59.0,butembo,pedagogie,FPSE,59.700001,G,G,A
7487,52.0,chemchem,coupe couture,FPSE,55.15,G,D,G


In [68]:
psyco.EchecRatio.value_counts()

G    180
A     30
D     12
E      5
Name: EchecRatio, dtype: int64

In [67]:
psyco.EchecRatio.loc[psyco.EchecRatio.isin(['E','F','C'])]='E'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [77]:
psycoModel=PredictiveModelByilding(dataset=psyco,encoderFunction=ConvertCat)

In [78]:
psycoModel.scale(['DIPPERC','CGPA'])

In [79]:
trainDes,tesDes=psycoModel.split()

In [219]:
trainDes

Unnamed: 0,DIPPERC,CGPA
count,181.0,181.0
mean,0.561264,0.597438
std,0.056302,0.079608
min,0.5,0.4
25%,0.52,0.5595
50%,0.55,0.609
75%,0.59,0.652667
max,0.77,0.78175


In [220]:
tesDes

Unnamed: 0,DIPPERC,CGPA
count,46.0,46.0
mean,0.565,0.588315
std,0.058224,0.066295
min,0.5,0.43
25%,0.51,0.5645
50%,0.56,0.601
75%,0.6,0.627
max,0.75,0.714


In [80]:
predictedValues=psycoModel.train()

[LibSVM]

In [81]:
predictedValues.head(5)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7479,0.625734,0.624857,0.6024,0.607064,0.575828,0.626667
8518,0.705881,0.705573,0.717894,0.682846,0.627241,0.78175
12160,0.637071,0.63619,0.621704,0.615343,0.576922,0.638
9227,0.57967,0.579921,0.589627,0.572593,0.582669,0.588
8997,0.590432,0.588716,0.518295,0.573732,0.584345,0.592333


In [85]:
RMSE={}
for reg in santeModel.predictiveModels.keys():
    rmse=santeModel.evaluate(model=reg,on='train') #RMSE of each model
    RMSE[reg]=[rmse,rmse*100/psycoModel.dataset_bin.CGPA.mean()]

('Predictions:\t', array([ 0.63425226,  0.64273674,  0.62245792,  0.56283476,  0.63379432]))
('Labels:\t\t', [0.58533334096300005, 0.64566666921000004, 0.62566666921000003, 0.57400001525900002, 0.637000007629])
('Predictions:\t', array([ 0.55639553,  0.58477657,  0.54995735,  0.53978909,  0.54678191]))
('Labels:\t\t', [0.58533334096300005, 0.64566666921000004, 0.62566666921000003, 0.57400001525900002, 0.637000007629])
('Predictions:\t', array([ 0.71269094,  0.64060031,  0.5877242 ,  0.6040193 ,  0.60048801]))
('Labels:\t\t', [0.58533334096300005, 0.64566666921000004, 0.62566666921000003, 0.57400001525900002, 0.637000007629])
('Predictions:\t', array([ 0.6205627 ,  0.64052699,  0.59781433,  0.56145965,  0.60448013]))
('Labels:\t\t', [0.58533334096300005, 0.64566666921000004, 0.62566666921000003, 0.57400001525900002, 0.637000007629])
('Predictions:\t', array([ 0.63394575,  0.64181988,  0.61960702,  0.56345318,  0.63094036]))
('Labels:\t\t', [0.58533334096300005, 0.64566666921000004, 0.62

In [84]:
psycoModel.dataset_bin.shape

(227, 119)

In [86]:
RMSE

{'ElasticNet': [0.048439757351696131, 8.1330817865213696],
 'Lasso': [0.048506383027202178, 8.1442683014338755],
 'LinearSVR': [0.053311148653203064, 8.9509930650946377],
 'Ridge': [0.051086773887118013, 8.5775184053209639],
 'SVR': [0.069900647273839767, 11.73638581798169]}

on remarque aisement que les valeurs predites par differents regressons disposent d'une bonne exacitude

In [89]:
CVScore={}
for reg in santeModel.predictiveModels.keys():
    scores,Sstd,Smean=psycoModel.crossEvaluate(model=reg) #RMSE of each model
    CVScore[reg]=[Smean,Smean*100/psycoModel.dataset_bin.CGPA.mean(),Sstd]

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

In [90]:
CVScore

{'ElasticNet': [0.078204129700068264, 13.13054849297483, 0.02092902610277643],
 'Lasso': [0.078232031526185034, 13.135233236379991, 0.02078979931596725],
 'LinearSVR': [0.082756950438959476, 13.894971468092733, 0.019705822213195171],
 'Ridge': [0.073661872239828508, 12.367899102488707, 0.018291111019811956],
 'SVR': [0.076697968122337909, 12.877662517381312, 0.014078229175069429]}

Aussi on peut remarquer les methodes lineaires disposent des meilleurs resulats

In [91]:
RMSE={}
for reg in santeModel.predictiveModels.keys():
    rmse=psycoModel.evaluate(model=reg,on='test') #RMSE of each model
    RMSE[reg]=[rmse,rmse*100/psycoModel.dataset_bin.CGPA.mean()]

('Predictions:\t', array([ 0.54583656,  0.68797328,  0.62628417,  0.62787676,  0.66826284]))
('Labels:\t\t', [0.46999999999999997, 0.62700000762899999, 0.45000000000000001, 0.68950000762899999, 0.678999977112])
('Predictions:\t', array([ 0.57564531,  0.60677534,  0.57728656,  0.5781989 ,  0.58468178]))
('Labels:\t\t', [0.46999999999999997, 0.62700000762899999, 0.45000000000000001, 0.68950000762899999, 0.678999977112])
('Predictions:\t', array([ 0.54150096,  0.74445484,  0.63106179,  0.63036044,  0.71429797]))
('Labels:\t\t', [0.46999999999999997, 0.62700000762899999, 0.45000000000000001, 0.68950000762899999, 0.678999977112])
('Predictions:\t', array([ 0.56742404,  0.65557334,  0.61672768,  0.59878354,  0.64105178]))
('Labels:\t\t', [0.46999999999999997, 0.62700000762899999, 0.45000000000000001, 0.68950000762899999, 0.678999977112])
('Predictions:\t', array([ 0.54672994,  0.6876375 ,  0.62602265,  0.62765028,  0.66789843]))
('Labels:\t\t', [0.46999999999999997, 0.62700000762899999, 0.45

In [92]:
RMSE

{'ElasticNet': [0.071366149384552643, 11.982445030008899],
 'Lasso': [0.071280682322226147, 11.968095028151781],
 'LinearSVR': [0.076990256247138913, 12.926737974282878],
 'Ridge': [0.0666627522405914, 11.192740131848661],
 'SVR': [0.066724727805604289, 11.20314588874286]}

In [167]:
predictedValues.columns

Index([u'ElasticNet', u'Lasso', u'LinearSVR', u'Ridge', u'SVR', u'RealValue'], dtype='object')

In [230]:
finalPred,finalRMSE=santeModel.ensembelMethods(predictedValues=predictedValues[[u'ElasticNet', u'Lasso', u'Ridge',  u'RealValue',u'SVR']])

In [231]:
finalPred

Unnamed: 0_level_0,ElasticNet,Lasso,Ridge,RealValue,SVR,finalPredict
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7044,0.614691,0.614691,0.601469,0.616500,0.576164,0.615258
8518,0.708552,0.708552,0.683582,0.781750,0.627838,0.709026
12160,0.636192,0.636192,0.616022,0.638000,0.577707,0.637065
9227,0.578178,0.578178,0.572074,0.588000,0.583145,0.578105
8997,0.592350,0.592350,0.587194,0.592333,0.584850,0.592344
5031,0.592998,0.592998,0.592967,0.648000,0.583145,0.592927
9826,0.651616,0.651616,0.635800,0.634500,0.584137,0.652386
8578,0.687691,0.687691,0.645101,0.689500,0.589443,0.689239
7004,0.581252,0.581252,0.583207,0.607000,0.576357,0.581189
9947,0.681999,0.681999,0.673437,0.693000,0.620269,0.682056


In [232]:
finalRMSE*100/psycoModel.dataset_bin.CGPA.mean()

6.6010010786396887

On remarque que notre ensemble methode sans modele avec SVM nous donne un score de 6% en valdiation croisé voyons combien il nous donnerasavec l'ensemble d'evaluation

In [233]:
predictionTest=psycoModel.predictTest()

In [234]:
predictionTest.head(1)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
9149,0.546273,0.546273,0.541992,0.567743,0.576357,0.47


In [235]:
finalPredictTes,rmseFinalTest=ensembelMethods(self=droitModel,predictedValues=predictionTest[[u'ElasticNet', u'Lasso', u'Ridge',  u'RealValue']])

In [236]:
rmseFinalTest*100/psycoModel.Y_train.mean()

10.564151678218369

Nous avons un resulat de 10.5 % sur notre ensemble d'evaluation wouhhhhh........

 10.56

Efin atterisson avec la afaculté de theologie

##### 3. Facuté de Theologie

In [93]:
teologie=datasetCGPA.loc[datasetCGPA.FAC=='FT']

In [239]:
teologie.shape

(140, 8)

In [238]:
teologie.head(5)

Unnamed: 0_level_0,DIPPERC,SCHOOL_RIGHT,OPTION_RIGHT,FAC,CGPA,DistinctionRatio,EchecRatio,Pass1stSessionRatio
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
8692,52.0,mululusake,pedagogie,FT,59.933333,G,G,G
3651,58.0,ibanda,commmerciale et adm,FT,63.400002,G,G,G
11729,60.0,butembo,sociale,FT,64.400002,G,G,G
7512,52.0,ndosho,pedagogie,FT,66.875,F,G,B
10686,52.0,ndosho,pedagogie,FT,57.1,G,G,G


In [95]:
teologie.EchecRatio.value_counts()

G    119
A     14
E      7
Name: EchecRatio, dtype: int64

In [94]:
teologie.EchecRatio.loc[teologie.EchecRatio.isin(['E','D','F'])]='E'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [96]:
teologieModel=PredictiveModelByilding(dataset=teologie,encoderFunction=ConvertCat)

In [97]:
teologieModel.scale(['DIPPERC','CGPA'])

In [98]:
trainDes,tesDes=teologieModel.split()

In [248]:
trainDes

Unnamed: 0,DIPPERC,CGPA
count,112.0,112.0
mean,0.5375,0.618471
std,0.039421,0.062725
min,0.5,0.41
25%,0.51,0.5915
50%,0.52,0.6205
75%,0.5525,0.66
max,0.65,0.7325


In [249]:
tesDes

Unnamed: 0,DIPPERC,CGPA
count,28.0,28.0
mean,0.540714,0.626101
std,0.043879,0.061257
min,0.5,0.43
25%,0.51,0.6065
50%,0.52,0.6335
75%,0.57,0.65425
max,0.69,0.734


In [99]:
predictedValues=teologieModel.train()

[LibSVM]

In [100]:
predictedValues.head(5)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10903,0.590577,0.59112,0.59291,0.598864,0.57806,0.59
12230,0.490617,0.49112,0.497404,0.546722,0.590417,0.49
8319,0.622924,0.622738,0.598529,0.6188,0.577988,0.604667
8692,0.599905,0.600453,0.594256,0.603376,0.578024,0.599333
11528,0.678408,0.677881,0.65485,0.643442,0.579455,0.679


In [103]:
RMSE={}
for reg in teologieModel.predictiveModels.keys():
    rmse=teologieModel.evaluate(model=reg,on='train') #RMSE of each model
    RMSE[reg]=[rmse,rmse*100/teologieModel.dataset_bin.CGPA.mean()]

('Predictions:\t', array([ 0.59057674,  0.49061699,  0.62292402,  0.59990485,  0.67840783]))
('Labels:\t\t', [0.58999999999999997, 0.48999999999999999, 0.60466667175299993, 0.59933333078999995, 0.679000015259])
('Predictions:\t', array([ 0.57805965,  0.59041739,  0.57798846,  0.57802397,  0.57945513]))
('Labels:\t\t', [0.58999999999999997, 0.48999999999999999, 0.60466667175299993, 0.59933333078999995, 0.679000015259])
('Predictions:\t', array([ 0.59291044,  0.4974042 ,  0.59852884,  0.59425624,  0.65484953]))
('Labels:\t\t', [0.58999999999999997, 0.48999999999999999, 0.60466667175299993, 0.59933333078999995, 0.679000015259])
('Predictions:\t', array([ 0.59886449,  0.54672224,  0.61879951,  0.6033758 ,  0.64344218]))
('Labels:\t\t', [0.58999999999999997, 0.48999999999999999, 0.60466667175299993, 0.59933333078999995, 0.679000015259])
('Predictions:\t', array([ 0.59112026,  0.49112036,  0.62273807,  0.60045348,  0.67788066]))
('Labels:\t\t', [0.58999999999999997, 0.48999999999999999, 0.60

In [104]:
RMSE

{'ElasticNet': [0.033030139039615068, 5.3274673347465784],
 'Lasso': [0.033044219538058778, 5.3297383937762071],
 'LinearSVR': [0.041773813098895503, 6.7377441089563179],
 'Ridge': [0.040647315585278128, 6.5560500900703547],
 'SVR': [0.064972754711870487, 10.479526833382224]}

on remarque aisement que les valeurs predites par differents regressons disposent d'une bonne exacitude

In [105]:
CVScore={}
for reg in teologieModel.predictiveModels.keys():
    scores,Sstd,Smean=teologieModel.crossEvaluate(model=reg) #RMSE of each model
    CVScore[reg]=[Smean,Smean*100/teologieModel.dataset_bin.CGPA.mean(),Sstd]

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

In [106]:
CVScore

{'ElasticNet': [0.077063249532144065,
  12.429615997060198,
  0.021473799986856463],
 'Lasso': [0.07673273875100918, 12.376307550850939, 0.021654585959023746],
 'LinearSVR': [0.085889507994066028, 13.853212900104536, 0.015968731483475579],
 'Ridge': [0.064720986752121498, 10.438918903155535, 0.019820937006498097],
 'SVR': [0.067846434412392451, 10.943025782529654, 0.013916433505527667]}

Aussi on peut remarquer les methodes lineaires disposent des meilleurs resulats

In [117]:
RMSE={}
for reg in teologieModel.predictiveModels.keys():
    rmse=teologieModel.evaluate(model=reg,on='test') #RMSE of each model
    RMSE[reg]=[rmse,rmse*100/teologieModel.dataset_bin.CGPA.mean()]

('Predictions:\t', array([ 0.62755466,  0.62039939,  0.62039939,  0.62039939,  0.58957758]))
('Labels:\t\t', [0.70250000000000001, 0.59499999999999997, 0.62099998474099993, 0.64533331553099993, 0.58999999999999997])
('Predictions:\t', array([ 0.57811348,  0.57813151,  0.57805965,  0.57805965,  0.57800619]))
('Labels:\t\t', [0.70250000000000001, 0.59499999999999997, 0.62099998474099993, 0.64533331553099993, 0.58999999999999997])
('Predictions:\t', array([ 0.6513134 ,  0.62833404,  0.59875626,  0.59875626,  0.56511077]))
('Labels:\t\t', [0.70250000000000001, 0.59499999999999997, 0.62099998474099993, 0.64533331553099993, 0.58999999999999997])
('Predictions:\t', array([ 0.62622815,  0.60835042,  0.60772898,  0.60772898,  0.59782074]))
('Labels:\t\t', [0.70250000000000001, 0.59499999999999997, 0.62099998474099993, 0.64533331553099993, 0.58999999999999997])
('Predictions:\t', array([ 0.62743484,  0.62079283,  0.62079283,  0.62079283,  0.59012064]))
('Labels:\t\t', [0.70250000000000001, 0.594

In [118]:
RMSE

{'ElasticNet': [0.068947746628582329, 11.120657637184339],
 'Lasso': [0.0689052663800028, 11.113805951322286],
 'LinearSVR': [0.086873511363432174, 14.01192388225606],
 'Ridge': [0.062732357638262903, 10.118170733361238],
 'SVR': [0.074887757113489439, 12.078728439987826]}

In [167]:
predictedValues.columns

Index([u'ElasticNet', u'Lasso', u'LinearSVR', u'Ridge', u'SVR', u'RealValue'], dtype='object')

In [109]:
finalPred,finalRMSE=teologieModel.ensembelMethods(predictedValues=predictedValues[[u'ElasticNet', u'Lasso', u'Ridge',  u'RealValue']])

In [110]:
finalPred.head(5)

Unnamed: 0_level_0,ElasticNet,Lasso,Ridge,RealValue,finalPredict
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10903,0.590577,0.59112,0.598864,0.59,0.589999
12230,0.490617,0.49112,0.546722,0.49,0.489983
8319,0.622924,0.622738,0.6188,0.604667,0.623132
8692,0.599905,0.600453,0.603376,0.599333,0.599332
11528,0.678408,0.677881,0.643442,0.679,0.679047


In [111]:
finalRMSE*100/teologieModel.dataset_bin.CGPA.mean()

5.3265910406616088

On remarque que notre ensemble methode sans modele avec SVM nous donne un score de R% en valdiation croisé voyons combien il nous donnerasavec l'ensemble d'evaluation

In [112]:
predictionTest=teologieModel.predictTest()

In [113]:
predictionTest.head(1)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6887,0.627555,0.627435,0.651313,0.626228,0.578113,0.7025


In [115]:
finalPredictTes,rmseFinalTest=teologieModel.ensembelMethods(predictedValues=predictionTest[[u'ElasticNet', u'Lasso', u'Ridge',  u'RealValue']])

In [116]:
rmseFinalTest*100/teologieModel.Y_test.mean()

9.0930863089146161

Nous avons un resulat de 10.11 % sur notre ensemble d'evaluation wouhhhhh........

 10.56

In [121]:
teologieModel.saveModels(departement='Theologie')
#droitModel.saveModels(departement='Droit')
#santeModel.saveModels(departement='Sante')
psycoModel.saveModels(departement='Psycologie')

Passons encore à la Faculté de De Medecine

##### Faculté de Medecine

In [262]:
medecine=datasetCGPA.loc[datasetCGPA.FAC=='FM']

In [7]:
medecine.shape

(242, 8)

In [8]:
medecine.head(5)

Unnamed: 0_level_0,DIPPERC,SCHOOL_RIGHT,OPTION_RIGHT,FAC,CGPA,DistinctionRatio,EchecRatio,Pass1stSessionRatio
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10384,59.0,zanner,latin philo,FM,59.15,G,G,G
11610,52.0,zanner,latin philo,FM,44.0,G,A,G
10159,56.876522,iti gombe,vétérinaire,FM,58.049999,G,G,G
11776,51.0,masisi,latin philo,FM,42.0,G,A,G
10411,57.0,ndosho,pedagogie,FM,49.0,G,A,G


In [125]:
medecine.EchecRatio.value_counts()

G    172
A     49
D     21
Name: EchecRatio, dtype: int64

In [263]:
medecineModel=PredictiveModelByilding(dataset=medecine,encoderFunction=ConvertCat)

In [264]:
medecineModel.scale(['DIPPERC','CGPA'])

In [265]:
trainDes,tesDes=medecineModel.split()

In [266]:
trainDes

Unnamed: 0,DIPPERC,CGPA
count,193.0,193.0
mean,0.594774,0.577718
std,0.072775,0.072102
min,0.5,0.4
25%,0.53,0.547
50%,0.58,0.5925
75%,0.63,0.622
max,0.83,0.74


In [21]:
tesDes

Unnamed: 0,DIPPERC,CGPA
count,49.0,49.0
mean,0.592653,0.572949
std,0.075105,0.059417
min,0.5,0.4
25%,0.54,0.551
50%,0.58,0.5845
75%,0.62,0.61
max,0.84,0.6985


In [267]:
predictedValues=medecineModel.train()

[LibSVM]

In [23]:
predictedValues.head(5)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10395,0.618516,0.61757,0.600756,0.599447,0.56428,0.6195
10437,0.632568,0.632735,0.633875,0.630198,0.58689,0.6075
10415,0.609239,0.608485,0.604709,0.601404,0.545157,0.5985
10519,0.597,0.596068,0.581114,0.579572,0.544659,0.598
10906,0.539998,0.540929,0.533855,0.554949,0.562773,0.539


In [268]:
RMSE={}
for reg in medecineModel.predictiveModels.keys():
    rmse=medecineModel.evaluate(model=reg,on='train') #RMSE of each model
    RMSE[reg]=[rmse,rmse*100/medecineModel.dataset_bin.CGPA.mean()]

('Predictions:\t', array([ 0.61851645,  0.63256827,  0.60923878,  0.59700048,  0.53999797]))
('Labels:\t\t', [0.61950000762899993, 0.60750000000000004, 0.59849998474099997, 0.59800001144399995, 0.53899999618500005])
('Predictions:\t', array([ 0.56427976,  0.58688952,  0.54515706,  0.54465932,  0.5627733 ]))
('Labels:\t\t', [0.61950000762899993, 0.60750000000000004, 0.59849998474099997, 0.59800001144399995, 0.53899999618500005])
('Predictions:\t', array([ 0.60075631,  0.6338749 ,  0.60470872,  0.58111407,  0.53385469]))
('Labels:\t\t', [0.61950000762899993, 0.60750000000000004, 0.59849998474099997, 0.59800001144399995, 0.53899999618500005])
('Predictions:\t', array([ 0.58867873,  0.63617338,  0.60059779,  0.57354025,  0.52833967]))
('Labels:\t\t', [0.61950000762899993, 0.60750000000000004, 0.59849998474099997, 0.59800001144399995, 0.53899999618500005])
('Predictions:\t', array([ 0.61758798,  0.63248212,  0.6096308 ,  0.5960883 ,  0.54094611]))
('Labels:\t\t', [0.61950000762899993, 0.607

In [269]:
RMSE

{'ElasticNet': [0.040642792877699799, 7.0468395828451929],
 'Lasso': [0.040670039857082181, 7.0515637929505974],
 'LinearSVR': [0.044471700859937138, 7.7107137513722899],
 'Ridge': [0.047916025938842581, 8.3079071178629764],
 'SVR': [0.067880784974444108, 11.769491430176245]}

on remarque aisement que les valeurs predites par differents regressons disposent d'une bonne exacitude

In [25]:
CVScore={}
for reg in medecineModel.predictiveModels.keys():
    scores,Sstd,Smean=medecineModel.crossEvaluate(model=reg) #RMSE of each model
    CVScore[reg]=[Smean,Smean*100/medecineModel.dataset_bin.CGPA.mean(),Sstd]

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

In [26]:
CVScore

{'ElasticNet': [0.064110091766669194,
  11.11571081447714,
  0.0089773130582696571],
 'Lasso': [0.064084098058802538, 11.111203902512431, 0.0089165004807136242],
 'LinearSVR': [0.066225937273730509, 11.482566111927079, 0.011329086979526115],
 'Ridge': [0.063693369762372692, 11.043457583166099, 0.010033005792933954],
 'SVR': [0.071259822458049252, 12.355364924722714, 0.0084853917420444416]}

Aussi on peut remarquer les methodes lineaires disposent des meilleurs resulats

In [27]:
RMSE={}
for reg in medecineModel.predictiveModels.keys():
    rmse=medecineModel.evaluate(model=reg,on='test') #RMSE of each model
    RMSE[reg]=[rmse,rmse*100/medecineModel.dataset_bin.CGPA.mean()]

('Predictions:\t', array([ 0.55173163,  0.5640818 ,  0.53376791,  0.57022842,  0.56122689]))
('Labels:\t\t', [0.54299999237100005, 0.58950000762900001, 0.5264999961850001, 0.58599998474100001, 0.48899999618500006])
('Predictions:\t', array([ 0.56352663,  0.56427976,  0.5429176 ,  0.53439531,  0.56428356]))
('Labels:\t\t', [0.54299999237100005, 0.58950000762900001, 0.5264999961850001, 0.58599998474100001, 0.48899999618500006])
('Predictions:\t', array([ 0.5516498 ,  0.5715292 ,  0.50074944,  0.58134997,  0.56582115]))
('Labels:\t\t', [0.54299999237100005, 0.58950000762900001, 0.5264999961850001, 0.58599998474100001, 0.48899999618500006])
('Predictions:\t', array([ 0.56023939,  0.56448792,  0.55123042,  0.55473708,  0.59448046]))
('Labels:\t\t', [0.54299999237100005, 0.58950000762900001, 0.5264999961850001, 0.58599998474100001, 0.48899999618500006])
('Predictions:\t', array([ 0.55263311,  0.56493969,  0.53482569,  0.57038552,  0.56228165]))
('Labels:\t\t', [0.54299999237100005, 0.5895000

In [138]:
RMSE

{'ElasticNet': [0.069053310809881224, 11.972789503070258],
 'Lasso': [0.06885336607263759, 11.93812213921488],
 'LinearSVR': [0.078396453860313006, 13.59274781828961],
 'Ridge': [0.066020981080969224, 11.447029838220555],
 'SVR': [0.061979763423994388, 10.746344414515617]}

In [167]:
predictedValues.columns

Index([u'ElasticNet', u'Lasso', u'LinearSVR', u'Ridge', u'SVR', u'RealValue'], dtype='object')

In [270]:
finalPred,finalRMSE=medecineModel.ensembelMethods(predictedValues=predictedValues[[u'ElasticNet', u'Lasso', u'Ridge',  u'RealValue']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [271]:
finalPred.head(5)

Unnamed: 0_level_0,ElasticNet,Lasso,Ridge,RealValue,finalPredict
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10395,0.618516,0.617588,0.588679,0.6195,0.618991
10437,0.632568,0.632482,0.636173,0.6075,0.632605
10415,0.609239,0.609631,0.600598,0.5985,0.609496
10519,0.597,0.596088,0.57354,0.598,0.597258
10906,0.539998,0.540946,0.52834,0.539,0.540105


In [31]:
finalRMSE*100/medecineModel.dataset_bin.CGPA.mean()

7.0457982743938787

On remarque que notre ensemble methode sans modele avec SVM nous donne un score de R% en valdiation croisé voyons combien il nous donnerasavec l'ensemble d'evaluation

In [32]:
predictionTest=medecineModel.predictTest()

In [33]:
predictionTest.head(1)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10825,0.551732,0.552633,0.55165,0.560239,0.563527,0.543


In [34]:
finalPredictTes,rmseFinalTest=medecineModel.ensembelMethods(predictedValues=predictionTest[[u'ElasticNet', u'Lasso', u'Ridge',  u'RealValue']])

In [35]:
rmseFinalTest*100/medecineModel.Y_test.mean()

9.9510545697527917

Nous avons un resulat de % sur notre ensemble d'evaluation wouhhhhh........

 10.56

esayons en fin d'anlayser les coefficients de nos modèles de regressions lineares

In [37]:
ridgeRgressorMed=medecineModel.predictiveModels['Ridge']

In [46]:
sorted(zip(np.abs(ridgeRgressorMed.coef_), medecineModel.dataset_bin.columns), reverse=True)

[(0.14161788542203865, 'SCHOOL_RIGHT-isoko'),
 (0.091877489723899877, 'SCHOOL_RIGHT-kalimba'),
 (0.078453396863903554, 'SCHOOL_RIGHT-idap isp rutshuru'),
 (0.073460928661847252, 'SCHOOL_RIGHT-mabula'),
 (0.068542825522317247, 'SCHOOL_RIGHT-icl'),
 (0.066529782475518093, 'SCHOOL_RIGHT-maendeleo'),
 (0.06446105410930969, 'SCHOOL_RIGHT-bimenya'),
 (0.064199033529377683, 'SCHOOL_RIGHT-matanda'),
 (0.061628696400868835, 'SCHOOL_RIGHT-ngoma'),
 (0.057336785827979041, 'SCHOOL_RIGHT-bsangani'),
 (0.05268263861021346, 'SCHOOL_RIGHT-saint michel'),
 (0.050169143536419838, 'SCHOOL_RIGHT-sainte ursule'),
 (0.048531387423318535, 'SCHOOL_RIGHT-Mwangaza'),
 (0.044387699232079497, 'SCHOOL_RIGHT-totoro'),
 (0.043107499172760903, 'SCHOOL_RIGHT-anuarite'),
 (0.041154827746480514, 'SCHOOL_RIGHT-fadhili'),
 (0.041094449063746014, 'OPTION_RIGHT-secretariat'),
 (0.041011975142910176, 'SCHOOL_RIGHT-r\xc3\xa9v\xc3\xa9rend samba'),
 (0.040615212383598033, 'SCHOOL_RIGHT-mwanga'),
 (0.039348101671197164, 'SCHOOL_

In [47]:
medecine.loc[medecine.SCHOOL_RIGHT=='sainte ursule']

Unnamed: 0_level_0,DIPPERC,SCHOOL_RIGHT,OPTION_RIGHT,FAC,CGPA,DistinctionRatio,EchecRatio,Pass1stSessionRatio
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9879,62.0,sainte ursule,sociale,FM,61.549999,G,G,D
9891,50.0,sainte ursule,sociale,FM,55.9,G,G,G
10250,55.0,sainte ursule,nutr,FM,57.6,G,G,G
10290,66.0,sainte ursule,nutr,FM,60.15,G,G,A
10324,58.0,sainte ursule,nutr,FM,55.25,G,D,G
10481,57.0,sainte ursule,sociale,FM,59.15,G,G,G
10535,58.0,sainte ursule,sociale,FM,61.550001,G,G,D
10538,53.0,sainte ursule,nutr,FM,61.1,G,G,G
10550,54.0,sainte ursule,sociale,FM,47.0,G,A,G
10897,60.0,sainte ursule,nutr,FM,50.85,G,D,G


In [48]:
predictedValues.loc[medecine.loc[medecine.SCHOOL_RIGHT=='sainte ursule'].index] + 

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
9879,0.617962,0.617356,0.625193,0.586703,0.530568,0.6155
9891,0.568561,0.56813,0.545675,0.569709,0.527591,0.559
10250,,,,,,
10290,,,,,,
10324,0.541302,0.54191,0.53579,0.567813,0.545578,0.5525
10481,0.597378,0.596845,0.59206,0.579622,0.529327,0.5915
10535,0.601495,0.600947,0.598687,0.581038,0.529575,0.6155
10538,,,,,,
10550,,,,,,
10897,0.549536,0.550114,0.549043,0.570646,0.546074,0.5085


##### Faculté de Technologie

In [200]:
technologie=datasetCGPA.loc[datasetCGPA.FAC=='FSTA']

In [130]:
technologie.corr()

Unnamed: 0,DIPPERC,CGPA
DIPPERC,1.0,0.346685
CGPA,0.346685,1.0


In [33]:
technologie.shape

(903, 8)

In [51]:
technologie.head(5)

Unnamed: 0_level_0,DIPPERC,SCHOOL_RIGHT,OPTION_RIGHT,FAC,CGPA,DistinctionRatio,EchecRatio,Pass1stSessionRatio
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10124,50.0,zanner,commmerciale et adm,FSTA,47.0,G,A,G
10560,51.0,zanner,latin philo,FSTA,42.0,G,A,G
11697,52.0,zanner,commmerciale et adm,FSTA,45.099998,G,A,G
3289,61.0,edap/isp,math-physique,FSTA,47.0,G,A,G
3691,59.0,edap/isp,commmerciale et adm,FSTA,64.300001,G,G,G


In [34]:
technologie.EchecRatio.value_counts()

G    459
A    289
D     66
E     52
C     34
F      3
Name: EchecRatio, dtype: int64

In [212]:
techModel=PredictiveModelByilding(dataset=technologie,encoderFunction=ConvertCat)

In [213]:
type(techModel.encoders)

dict

In [214]:
techModel.scale(['DIPPERC','CGPA'])

In [215]:
trainDes,tesDes=techModel.split()

In [248]:
trainDes

Unnamed: 0,DIPPERC,CGPA
count,112.0,112.0
mean,0.5375,0.618471
std,0.039421,0.062725
min,0.5,0.41
25%,0.51,0.5915
50%,0.52,0.6205
75%,0.5525,0.66
max,0.65,0.7325


In [249]:
tesDes

Unnamed: 0,DIPPERC,CGPA
count,28.0,28.0
mean,0.540714,0.626101
std,0.043879,0.061257
min,0.5,0.43
25%,0.51,0.6065
50%,0.52,0.6335
75%,0.57,0.65425
max,0.69,0.734


In [216]:
predictedValues=techModel.train()

[LibSVM]

In [217]:
predictedValues.head(5)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
9486,0.634815,0.636905,0.647048,0.649259,0.577358,0.644333
11485,0.59969,0.598085,0.589958,0.587566,0.572318,0.629
12161,0.569621,0.568737,0.565688,0.567193,0.556174,0.566
7530,0.609765,0.612045,0.622319,0.62417,0.562753,0.517
9959,0.688854,0.687146,0.656663,0.636091,0.563066,0.663


In [218]:
RMSE={}
for reg in techModel.predictiveModels.keys():
    rmse=techModel.evaluate(model=reg,on='train') #RMSE of each model
    RMSE[reg]=[rmse,rmse*100/techModel.dataset_bin.CGPA.mean()]

('Predictions:\t', array([ 0.63481512,  0.59968978,  0.56962066,  0.60976482,  0.68885419]))
('Labels:\t\t', [0.64433333078999988, 0.62900001525900007, 0.565999984741, 0.51699999809300001, 0.66300003051800005])
('Predictions:\t', array([ 0.57735789,  0.57231847,  0.55617352,  0.56275306,  0.56306622]))
('Labels:\t\t', [0.64433333078999988, 0.62900001525900007, 0.565999984741, 0.51699999809300001, 0.66300003051800005])
('Predictions:\t', array([ 0.64704777,  0.58995805,  0.56568758,  0.62231925,  0.65666288]))
('Labels:\t\t', [0.64433333078999988, 0.62900001525900007, 0.565999984741, 0.51699999809300001, 0.66300003051800005])
('Predictions:\t', array([ 0.6492594 ,  0.58756614,  0.56719251,  0.62417026,  0.63609105]))
('Labels:\t\t', [0.64433333078999988, 0.62900001525900007, 0.565999984741, 0.51699999809300001, 0.66300003051800005])
('Predictions:\t', array([ 0.63690525,  0.59808495,  0.56873684,  0.61204493,  0.6871458 ]))
('Labels:\t\t', [0.64433333078999988, 0.62900001525900007, 0.56

In [180]:
techModel.dataset_bin.shape

(903, 240)

In [219]:
RMSE

{'ElasticNet': [0.061952060117206845, 11.178710379171463],
 'Lasso': [0.062044077906170544, 11.19531418880444],
 'LinearSVR': [0.064208474475398264, 11.585860723452269],
 'Ridge': [0.065947807694384991, 11.899708273818463],
 'SVR': [0.077191883495771912, 13.928603949399779]}

on remarque aisement que les valeurs predites par differents regressons disposent d'une bonne exacitude

In [159]:
CVScore={}
for reg in techModel.predictiveModels.keys():
    scores,Sstd,Smean=techModel.crossEvaluate(model=reg) #RMSE of each model
    CVScore[reg]=[Smean,Smean*100/techModel.dataset_bin.CGPA.mean(),Sstd]

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

In [160]:
CVScore

{'ElasticNet': [0.080614181083205991,
  14.546127781357663,
  0.006798679897684492],
 'Lasso': [0.080337423403767885, 14.496189265385846, 0.0067501931079835724],
 'LinearSVR': [0.081768695928976246,
  14.754449943121173,
  0.0060599790765970161],
 'Ridge': [0.07732886997795145, 13.953321968578633, 0.0066876799718420064],
 'SVR': [0.079899404867756249, 14.41715263051819, 0.0053812025523476371]}

Aussi on peut remarquer les methodes lineaires disposent des meilleurs resulats

In [161]:
RMSE={}
for reg in techModel.predictiveModels.keys():
    rmse=techModel.evaluate(model=reg,on='test') #RMSE of each model
    RMSE[reg]=[rmse,rmse*100/techModel.dataset_bin.CGPA.mean()]

('Predictions:\t', array([ 0.39638595,  0.53117382,  0.68525706,  0.59829498,  0.57548309]))
('Labels:\t\t', [0.46999999999999997, 0.40999999999999998, 0.67150001525899994, 0.64800003051800004, 0.48999999999999999])
('Predictions:\t', array([ 0.51200495,  0.54524049,  0.62750567,  0.57136649,  0.57215634]))
('Labels:\t\t', [0.46999999999999997, 0.40999999999999998, 0.67150001525899994, 0.64800003051800004, 0.48999999999999999])
('Predictions:\t', array([ 0.40266541,  0.52672515,  0.68320047,  0.59397141,  0.56617559]))
('Labels:\t\t', [0.46999999999999997, 0.40999999999999998, 0.67150001525899994, 0.64800003051800004, 0.48999999999999999])
('Predictions:\t', array([ 0.43734936,  0.55229027,  0.68046839,  0.59775067,  0.58092574]))
('Labels:\t\t', [0.46999999999999997, 0.40999999999999998, 0.67150001525899994, 0.64800003051800004, 0.48999999999999999])
('Predictions:\t', array([ 0.39972731,  0.53472928,  0.6848059 ,  0.59802918,  0.57537606]))
('Labels:\t\t', [0.46999999999999997, 0.409

In [162]:
RMSE

{'ElasticNet': [0.07878055492562179, 14.215265890396694],
 'Lasso': [0.078458853278539215, 14.157217626393287],
 'LinearSVR': [0.07745361711389151, 13.975831504188374],
 'Ridge': [0.07667625412410424, 13.835563114309517],
 'SVR': [0.077260716153879569, 13.941024203837779]}

In [58]:
predictedValues.columns

Index([u'ElasticNet', u'Lasso', u'LinearSVR', u'Ridge', u'SVR', u'RealValue'], dtype='object')

In [237]:
finalPred,finalRMSE=medecineModel.ensembelMethods(predictedValues=predictedValues)

NameError: name 'medecineModel' is not defined

In [60]:
finalPred.head(5)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue,finalPredict
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9486,0.634815,0.634622,0.647048,0.626841,0.577358,0.644333,0.63503
11485,0.59969,0.599323,0.589958,0.599942,0.572318,0.629,0.600011
12161,0.569621,0.57293,0.565688,0.565653,0.556174,0.566,0.566193
7530,0.609765,0.609463,0.622319,0.600205,0.562753,0.517,0.610128
9959,0.688854,0.687458,0.656663,0.645131,0.563066,0.663,0.690692


In [167]:
finalRMSE*100/techModel.dataset_bin.CGPA.mean()

11.174584576648359

On remarque que notre ensemble methode sans modele avec SVM nous donne un score de R% en valdiation croisé voyons combien il nous donnerasavec l'ensemble d'evaluation

In [171]:
predictionTest=techModel.predictTest()

In [169]:
predictionTest.head(1)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10825,0.551732,0.552633,0.55165,0.560239,0.563527,0.543


In [236]:
finalPredictTes,rmseFinalTest=techModel.ensembelMethods(predictedValues=predictionTest)

NameError: name 'predictionTest' is not defined

In [173]:
rmseFinalTest*100/techModel.Y_test.mean()

12.863755030838947

Nous avons un resulat de % sur notre ensemble d'evaluation wouhhhhh........

#### Faculté d'économie

In [185]:
economie = datasetCGPA.loc[datasetCGPA.FAC=='FSEG']

In [186]:
economie.head(5)

Unnamed: 0_level_0,DIPPERC,SCHOOL_RIGHT,OPTION_RIGHT,FAC,CGPA,DistinctionRatio,EchecRatio,Pass1stSessionRatio
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3895,52.0,zanner,commmerciale et adm,FSEG,59.400002,G,G,G
4048,53.0,zanner,commmerciale et adm,FSEG,40.0,G,A,G
4217,54.0,zanner,commmerciale et adm,FSEG,61.299999,G,G,A
4347,53.0,zanner,commmerciale et adm,FSEG,60.599998,G,G,G
4409,58.0,zanner,commmerciale et adm,FSEG,48.599998,G,A,G


In [187]:
economieModel=PredictiveModelByilding(dataset=economie,encoderFunction=ConvertCat)

In [188]:
economieModel.scale(['DIPPERC','CGPA'])

In [189]:
trainDes,tesDes=economieModel.split()

In [248]:
trainDes

Unnamed: 0,DIPPERC,CGPA
count,112.0,112.0
mean,0.5375,0.618471
std,0.039421,0.062725
min,0.5,0.41
25%,0.51,0.5915
50%,0.52,0.6205
75%,0.5525,0.66
max,0.65,0.7325


In [249]:
tesDes

Unnamed: 0,DIPPERC,CGPA
count,28.0,28.0
mean,0.540714,0.626101
std,0.043879,0.061257
min,0.5,0.43
25%,0.51,0.6065
50%,0.52,0.6335
75%,0.57,0.65425
max,0.69,0.734


In [190]:
predictedValues=economieModel.train()

[LibSVM]

In [191]:
predictedValues.head(5)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3907,0.555112,0.554611,0.544067,0.557017,0.550573,0.486
9700,0.602081,0.60006,0.599212,0.594155,0.545582,0.619
9098,0.61179,0.605943,0.585577,0.584145,0.544026,0.618333
11606,0.582595,0.582465,0.576753,0.583322,0.544791,0.596
7145,0.604056,0.603877,0.607617,0.603269,0.553744,0.623333


In [192]:
RMSE={}
for reg in economieModel.predictiveModels.keys():
    rmse=economieModel.evaluate(model=reg,on='train') #RMSE of each model
    RMSE[reg]=[rmse,rmse*100/economieModel.dataset_bin.CGPA.mean()]

('Predictions:\t', array([ 0.55511222,  0.60208057,  0.61179008,  0.58259483,  0.60405612]))
('Labels:\t\t', [0.48599998474099998, 0.61899999618500001, 0.618333333333, 0.59599998474100002, 0.62333333333300001])
('Predictions:\t', array([ 0.55057317,  0.54558226,  0.54402584,  0.54479125,  0.5537442 ]))
('Labels:\t\t', [0.48599998474099998, 0.61899999618500001, 0.618333333333, 0.59599998474100002, 0.62333333333300001])
('Predictions:\t', array([ 0.54406702,  0.59921186,  0.58557652,  0.57675313,  0.60761677]))
('Labels:\t\t', [0.48599998474099998, 0.61899999618500001, 0.618333333333, 0.59599998474100002, 0.62333333333300001])
('Predictions:\t', array([ 0.55701682,  0.59415512,  0.58414456,  0.58332202,  0.60326932]))
('Labels:\t\t', [0.48599998474099998, 0.61899999618500001, 0.618333333333, 0.59599998474100002, 0.62333333333300001])
('Predictions:\t', array([ 0.55461081,  0.60006044,  0.60594333,  0.58246539,  0.60387692]))
('Labels:\t\t', [0.48599998474099998, 0.61899999618500001, 0.61

In [193]:
RMSE

{'ElasticNet': [0.047821194298310048, 8.47201039968194],
 'Lasso': [0.047958756376958421, 8.4963809194485727],
 'LinearSVR': [0.050122559048785743, 8.8797205454777028],
 'Ridge': [0.049256183881533661, 8.7262333828359875],
 'SVR': [0.061859836390664738, 10.959098469083097]}

on remarque aisement que les valeurs predites par differents regressons disposent d'une bonne exacitude

In [194]:
CVScore={}
for reg in economieModel.predictiveModels.keys():
    scores,Sstd,Smean=economieModel.crossEvaluate(model=reg) #RMSE of each model
    CVScore[reg]=[Smean,Smean*100/economieModel.dataset_bin.CGPA.mean(),Sstd]

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

In [195]:
CVScore

{'ElasticNet': [0.059152334465975953,
  10.479436996807134,
  0.0046933196576423915],
 'Lasso': [0.0588256959678925, 10.421569668453998, 0.0047359710391706054],
 'LinearSVR': [0.063013253823785004,
  11.163438085271116,
  0.0055082003177155834],
 'Ridge': [0.057376527784788064, 10.16483480228645, 0.0047699422372373427],
 'SVR': [0.063132869888177567, 11.184629286293433, 0.0036022950044768487]}

Aussi on peut remarquer les methodes lineaires disposent des meilleurs resulats

In [198]:
RMSE={}
for reg in economieModel.predictiveModels.keys():
    rmse=economieModel.evaluate(model=reg,on='test') #RMSE of each model
    RMSE[reg]=[rmse,rmse*100/economieModel.dataset_bin.CGPA.mean()]

('Predictions:\t', array([ 0.5640067 ,  0.50060843,  0.53204586,  0.48522947,  0.57370035]))
('Labels:\t\t', [0.64300003051800003, 0.60199998855600012, 0.45000000000000001, 0.48999999999999999, 0.62133332570400002])
('Predictions:\t', array([ 0.55124051,  0.55330167,  0.54054641,  0.52349951,  0.54412378]))
('Labels:\t\t', [0.64300003051800003, 0.60199998855600012, 0.45000000000000001, 0.48999999999999999, 0.62133332570400002])
('Predictions:\t', array([ 0.56048537,  0.50626083,  0.53966336,  0.48181885,  0.56033478]))
('Labels:\t\t', [0.64300003051800003, 0.60199998855600012, 0.45000000000000001, 0.48999999999999999, 0.62133332570400002])
('Predictions:\t', array([ 0.56368283,  0.51892243,  0.53153356,  0.49487057,  0.57665601]))
('Labels:\t\t', [0.64300003051800003, 0.60199998855600012, 0.45000000000000001, 0.48999999999999999, 0.62133332570400002])
('Predictions:\t', array([ 0.56346775,  0.5026404 ,  0.53242426,  0.48705274,  0.57360845]))
('Labels:\t\t', [0.64300003051800003, 0.601

In [199]:
RMSE

{'ElasticNet': [0.059088345046754753, 10.468100621104558],
 'Lasso': [0.058981573592012816, 10.449184973174058],
 'LinearSVR': [0.064710654812651758, 11.464149914850363],
 'Ridge': [0.058797427439363725, 10.416561611436736],
 'SVR': [0.065406921403279125, 11.587500614959387]}

In [167]:
predictedValues.columns

Index([u'ElasticNet', u'Lasso', u'LinearSVR', u'Ridge', u'SVR', u'RealValue'], dtype='object')

In [163]:
finalPred,finalRMSE=medecineModel.ensembelMethods(predictedValues=predictedValues)

In [164]:
finalPred.head(5)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue,finalPredict
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9486,0.634815,0.634622,0.647048,0.626841,0.577358,0.644333,0.63503
11485,0.59969,0.599323,0.589958,0.599942,0.572318,0.629,0.600011
12161,0.569621,0.57293,0.565688,0.565653,0.556174,0.566,0.566193
7530,0.609765,0.609463,0.622319,0.600205,0.562753,0.517,0.610128
9959,0.688854,0.687458,0.656663,0.645131,0.563066,0.663,0.690692


In [167]:
finalRMSE*100/techModel.dataset_bin.CGPA.mean()

11.174584576648359

On remarque que notre ensemble methode sans modele avec SVM nous donne un score de R% en valdiation croisé voyons combien il nous donnerasavec l'ensemble d'evaluation

In [238]:
predictionTest=techModel.predictTest()

In [169]:
predictionTest.head(1)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10825,0.551732,0.552633,0.55165,0.560239,0.563527,0.543


In [239]:
finalPredictTes,rmseFinalTest=techModel.ensembelMethods(predictedValues=predictionTest)

In [173]:
rmseFinalTest*100/techModel.Y_test.mean()

12.863755030838947

#### Analyse des resultats

In [220]:
ridgeRgressorTech=techModel.predictiveModels['Ridge']

voyons dans notre modèle quel sont les attribues le plus importans

In [221]:
max(ridgeRgressorTech.coef_)

0.67297224081182327

In [222]:
ridgeRgressorTech.intercept_

0.0

In [223]:
sorted(zip(ridgeRgressorTech.coef_[:23], techModel.X_train.columns[:23]),reverse=True)

[(0.17512184109204287, 'elec indust'),
 (0.15760814161994055, 'math-physique'),
 (0.15114224221640107, 'bio-chimie'),
 (0.14345473841554349, 'elec'),
 (0.13998072786108823, 'mec gene'),
 (0.13411357728084811, 'commerciale informatique'),
 (0.12648384725490872, 'batiment'),
 (0.11107029237384893, 'commmerciale et adm'),
 (0.10536913411255536, 'electronique g\xc3\xa9n\xc3\xa9rale'),
 (0.10450837417962013, 'sociale'),
 (0.099806768061098408, 'inconnu'),
 (0.095646080946421463, 'pedagogie'),
 (0.09214226541338158, 'latin philo'),
 (0.086655212043990806, 'agrecole'),
 (0.082101481016180769, 'construction'),
 (0.077255216150454956, 'machine outil'),
 (0.063799516866856698, 'nutr'),
 (0.060447588998714184, 'industrielle'),
 (0.056351145746039132, 'mecanique machines outils'),
 (0.054723187053262082, 'agronomie'),
 (0.010831382676751862, 'imprimerie'),
 (0.0, 'secretariat'),
 (0.0, "hotesse d'acceuil")]

Nous remrqouns dans ces resulat que concernant les valeurs de coefficient de notre modèle de regression les options avec un bon score sont les options batiment , elec insustrielle , elec math physique et les option avec un moindre score en technologie soont le soptions imprimerie , latin philo,pedagogie,..

Essayons de verifier les écoles :

In [224]:
sorted(zip(ridgeRgressorTech.coef_[24:236], techModel.X_train.columns[24:236]),reverse=True)[:23]

[(0.12221946644343301, 'itfm/bukavu'),
 (0.11544826021849611, 'vungi'),
 (0.10601692142687072, 'nikisi'),
 (0.10135463091815416, 'Lwanga'),
 (0.093502748892734133, 'it bugabo'),
 (0.090555922000722253, 'cirezi'),
 (0.088121403276466337, 'itk mahamba'),
 (0.079465114538439616, 'katana'),
 (0.079151243770063037, 'technique ind. de mahamba'),
 (0.07774206929450149, 'du lac'),
 (0.076335377719177949, 'mgr guido'),
 (0.075944583258854809, 'avenir'),
 (0.075030717919957607, 'saint michel'),
 (0.072977785314089133, 'de beni'),
 (0.066975528149117669, 'action kusaidiya'),
 (0.061269110455925049, 'namurera'),
 (0.060447588998714233, 'it salama'),
 (0.058463266084065074, 'kambali'),
 (0.058443609291880839, 'itig'),
 (0.057776867646034659, 'iti gombe'),
 (0.054440832383465088, 'uenezaji'),
 (0.052474455779097019, 'humule'),
 (0.05038194941607773, 'malikia wa bingu')]

In [227]:
print (ridgeRgressorTech.coef_[236], techModel.X_train.columns[236])

(0.67297224081182327, 'DIPPERC')


In [228]:
technologie.loc[operator.and_(technologie.SCHOOL_RIGHT=='itfm/bukavu',technologie.CGPA>68)]

Unnamed: 0_level_0,DIPPERC,SCHOOL_RIGHT,OPTION_RIGHT,FAC,CGPA,DistinctionRatio,EchecRatio,Pass1stSessionRatio
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2457,56.0,itfm/bukavu,elec indust,FSTA,76.400002,A,G,A
4071,63.0,itfm/bukavu,machine outil,FSTA,71.199997,A,G,G
4282,54.0,itfm/bukavu,elec indust,FSTA,70.75,D,G,G
4313,51.0,itfm/bukavu,elec,FSTA,68.600002,D,G,G
9958,59.0,itfm/bukavu,mecanique machines outils,FSTA,71.699997,D,G,A
10043,58.0,itfm/bukavu,elec indust,FSTA,79.099998,A,G,A
10051,60.0,itfm/bukavu,elec indust,FSTA,72.5,A,G,D
10142,56.0,itfm/bukavu,elec indust,FSTA,69.650002,D,G,A
10199,65.0,itfm/bukavu,mecanique machines outils,FSTA,83.549999,A,G,D
10200,57.0,itfm/bukavu,mecanique machines outils,FSTA,71.049999,D,G,D


In [229]:
pred,error=techModel.ensembelMethods(predictedValues.loc[technologie.loc[operator.and_(technologie.SCHOOL_RIGHT=='itig',technologie.CGPA>65)].index].dropna())

In [230]:
pred

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue,finalPredict
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3627,0.5749,0.572014,0.558456,0.5551,0.569956,0.67,0.646194
4074,0.605507,0.60569,0.606418,0.605682,0.580733,0.654,0.662018
4106,0.5749,0.572014,0.558456,0.5551,0.569956,0.672,0.646194
7644,0.631069,0.635201,0.652961,0.656045,0.575412,0.7395,0.71965
8460,0.663395,0.666978,0.681573,0.684457,0.611192,0.691,0.695236
8663,0.627324,0.630988,0.646661,0.649316,0.575049,0.723,0.701324
9036,0.601763,0.601477,0.600118,0.598952,0.580368,0.719333,0.665863
9078,0.639209,0.643602,0.663121,0.666249,0.584008,0.807667,0.745544
9295,0.648416,0.650128,0.656371,0.657538,0.609739,0.678,0.688095
9376,0.620486,0.622539,0.63162,0.632601,0.582189,0.662,0.704422


##### Traitement des nouvelles valeurs

Dans cette partie nous allons essayer de predire la valeur du CGPA pour des nouvelles variables qui ne sont pas dans notre ensemble d'apprentissage'

In [86]:
technologie.columns

Index([u'DIPPERC', u'SCHOOL_RIGHT', u'OPTION_RIGHT', u'FAC', u'CGPA',
       u'DistinctionRatio', u'EchecRatio', u'Pass1stSessionRatio'],
      dtype='object')

In [118]:
newStudent={'DIPPERC':0.60,'SCHOOL_RIGHT':'itfm/bukavu','OPTION_RIGHT':'elec indust','CGPA':0}

In [232]:
newStData=pd.DataFrame(newStudent,columns=newStudent.keys(),index=range(1))

In [120]:
newStData

Unnamed: 0,DIPPERC,CGPA,OPTION_RIGHT,SCHOOL_RIGHT
0,0.6,0,elec indust,itfm/bukavu


In [231]:
newStData[['OPTION_RIGHT','SCHOOL_RIGHT']]

Unnamed: 0,OPTION_RIGHT,SCHOOL_RIGHT
0,elec indust,itfm/bukavu


In [64]:
optionEnc=techModel.encoders['OPTION_RIGHT']

In [65]:
schoolEnc=techModel.encoders['SCHOOL_RIGHT']

In [157]:
Options=pd.DataFrame(data=dict(zip(optionEnc.classes_,optionEnc.transform(newStData[['OPTION_RIGHT']])[0])),
             index=newStData.index, columns=optionEnc.classes_)

In [158]:
Schools=pd.DataFrame(data=dict(zip(schoolEnc.classes_,schoolEnc.transform(newStData[['SCHOOL_RIGHT']])[0])),
             index=newStData.index, columns=schoolEnc.classes_)

In [94]:
Schools

Unnamed: 0,54,61,Bungulu Beni,INST DE KATWA,Institut MWANDA,LWANGA,Lwanga,MWANDU,Mwanda,action kusaidiya,...,uhuru,uzima,visoke,vungi,wai wai,wapole,weza,wima,zanner,zawadi ya raisi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [96]:
newStData['DIPPERC']

0    0.60
Name: DIPPERC, dtype: object

In [159]:
Schools.reset_index(inplace=True)
Options.reset_index(inplace=True)

In [97]:
techModel.X_train.shape

(722, 237)

In [162]:
New_X=pd.merge(Options,Schools,on='index')

In [163]:
New_X['DIPPERC']=newStData['DIPPERC']

In [164]:
New_X.set_index(keys=['index'],inplace=True)

In [113]:
New_X.shape

(1, 237)

In [109]:
len(ridgeRgressorTech.coef_)

237

In [165]:
ridgeRgressorTech.predict(New_X)

array([ 0.68790015])

In [160]:
techModel.X_train.head(1)

Unnamed: 0_level_0,agrecole,agronomie,batiment,bio-chimie,commerciale informatique,commmerciale et adm,construction,elec,elec indust,electronique générale,...,uzima,visoke,vungi,wai wai,wapole,weza,wima,zanner,zawadi ya raisi,DIPPERC
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9486,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.74


In [161]:
New_X.head(1)

Unnamed: 0_level_0,agrecole,agronomie,batiment,bio-chimie,commerciale informatique,commmerciale et adm,construction,elec,elec indust,electronique générale,...,uzima,visoke,vungi,wai wai,wapole,weza,wima,zanner,zawadi ya raisi,DIPPERC
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0.6


Essayons de prédire la nouvelles valeurs avec la nouvelle methode

In [233]:
techModel.predictNew(newStData)

AttributeError: 'PredictiveModelByilding' object has no attribute 'X'

In [234]:
def predictNew(self,newStData):
    """this call will handle predictions for new values,but frirst it will endcode them nand then try to predict"""
    #start first by handling categorical values
    optionEnc=self.encoders['OPTION_RIGHT']
    schoolEnc=self.encoders['SCHOOL_RIGHT']
    Options=pd.DataFrame(data=dict(zip(optionEnc.classes_,optionEnc.transform(newStData[['OPTION_RIGHT']])[0])),
             index=newStData.index, columns=optionEnc.classes_)
    Schools=pd.DataFrame(data=dict(zip(schoolEnc.classes_,schoolEnc.transform(newStData[['SCHOOL_RIGHT']])[0])),
             index=newStData.index, columns=schoolEnc.classes_)
    Schools.reset_index(inplace=True)
    Options.reset_index(inplace=True)
    X=pd.merge(Options,Schools,on='index')
    X['DIPPERC']=newStData['DIPPERC']
    X.set_index(keys=['index'],inplace=True)
    predictions={}
    for clf in self.predictiveModels.values():
      predictions[clf.__class__.__name__]= clf.predict(X)
    predictedVal=pd.DataFrame.from_dict(predictions,dtype=np.float)
    predictedVal.set_index(X.index,inplace=True)
    predictedVal.loc[:,'finalOutput']=self.stacker.predict(predictedVal)
    return predictedVal

In [240]:
predictNew(self=techModel,newStData=newStData)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,finalOutput
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.696491,0.698198,0.702101,0.701125,0.628597,0.680711


In [259]:
newStudent={'DIPPERC':0.7,'SCHOOL_RIGHT':'itfm/bukavu','OPTION_RIGHT':'elec'}

In [260]:
newStData2=pd.DataFrame(newStudent,columns=newStudent.keys(),index=range(1))

In [261]:
predictNew(self=techModel,newStData=newStData2)

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,finalOutput
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.702262,0.708521,0.734052,0.736755,0.603966,0.684074


essayons en faculté de Médecine

In [274]:
LassoMed=medecineModel.predictiveModels['Lasso']

In [280]:
len(medecine.OPTION_RIGHT.value_counts())

13

In [281]:
sorted(zip(LassoMed.coef_[:13], medecineModel.X_train.columns[:13]),reverse=True)

[(0.35757368827182023, 'commerciale informatique'),
 (0.34979805279513343, 'commmerciale et adm'),
 (0.34775736681271191, 'math-physique'),
 (0.33985943323756168, 'latin philo'),
 (0.33015491684910009, 'bio-chimie'),
 (0.32823217678920358, 'v\xc3\xa9t\xc3\xa9rinaire'),
 (0.29493952634047493, 'sociale'),
 (0.28920522583779829, 'pedagogie'),
 (0.2819322147819795, 'elec indust'),
 (0.23277919234663122, 'nutr'),
 (0.21194946979189991, 'agrecole'),
 (0.0, 'secretariat'),
 (0.0, 'agronomie')]

Essayons de verifier les écoles :

In [288]:
len(medecine.SCHOOL_RIGHT.value_counts()) #nombre des colones

94

In [291]:
medecineModel.X_train.columns[12]

'v\xc3\xa9t\xc3\xa9rinaire'

In [287]:
sorted(zip(LassoMed.coef_[13:13+94], medecineModel.X_train.columns[13:13+94]),reverse=True)[:23]

[(-0.1441318839144477, 'masisi'),
 (-0.13962352198707761, 'majengo'),
 (-0.12141658508131777, 'gs kigali'),
 (-0.11251800352776459, 'hekima'),
 (-0.10720237938597295, 'ngoma'),
 (-0.095335844000001099, 'de r\xc3\xa9cup\xc3\xa9ration de la gombe'),
 (-0.094776634847224872, 'amen'),
 (-0.089211977017360722, 'neema kwetu'),
 (-0.089042334344549348, 'de bukavu'),
 (-0.086953345697900128, 'Mwangaza'),
 (-0.071324022409169338, 'lukuga'),
 (-0.069783695396088016, 'kalungu'),
 (-0.069305925587554135, 'zanner'),
 (-0.06744266888518366, 'bimenya'),
 (-0.052480726765823014, 'communautaire du lac'),
 (-0.050089729535492775, 'ndosho'),
 (-0.047913018684819217, 'Intitut ALLELUYA'),
 (-0.034522842170380694, 'bethanie'),
 (-0.033870787425376019, 'himbi'),
 (-0.03241010488706185, 'la fontaine'),
 (-0.030346701263338913, 'amani'),
 (-0.028094236685508724, 'matanda'),
 (-0.021924696440913535, 'Lwanga'),
 (-0.020798051866254483, 'icl'),
 (-0.019862713511119887, 'milima'),
 (-0.018174782561455084, 'mont ca

In [296]:
medecineModel.X_train.shape

(193, 108)

In [298]:
print (LassoMed.coef_[107], medecineModel.X_train.columns[107])

(0.44356521067840698, 'DIPPERC')


In [303]:
medecine.loc[operator.and_(medecine.SCHOOL_RIGHT=='amani',True)]

Unnamed: 0_level_0,DIPPERC,SCHOOL_RIGHT,OPTION_RIGHT,FAC,CGPA,DistinctionRatio,EchecRatio,Pass1stSessionRatio
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10116,77.0,amani,bio-chimie,FM,62.950001,G,G,G
10118,72.0,amani,bio-chimie,FM,66.5,G,G,A
10208,76.0,amani,bio-chimie,FM,58.349998,G,G,G
10289,73.0,amani,bio-chimie,FM,40.0,G,A,G
10377,74.0,amani,bio-chimie,FM,58.949999,G,G,D
10437,75.0,amani,bio-chimie,FM,60.75,G,G,A
11269,81.0,amani,bio-chimie,FM,74.0,A,G,A
11781,64.0,amani,bio-chimie,FM,61.0,G,G,A


In [304]:
medecineModel.ensembelMethods()

TypeError: drop() got an unexpected keyword argument 'axis'

In [305]:
pred,error=techModel.ensembelMethods(predictedValues.loc[medecine.loc[operator.and_(medecine.SCHOOL_RIGHT=='amani',True)].index].dropna())

In [306]:
pred

Unnamed: 0_level_0,ElasticNet,Lasso,LinearSVR,Ridge,SVR,RealValue,finalPredict
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10116,0.640802,0.641353,0.647128,0.649566,0.58739,0.6295,0.625
10118,0.620218,0.619175,0.613996,0.616084,0.586137,0.665,0.71875
10208,0.636685,0.636918,0.640501,0.64287,0.58714,0.5835,0.5625
10377,0.628452,0.628046,0.627248,0.629477,0.586639,0.5895,0.59375
10437,0.632568,0.632482,0.633875,0.636173,0.58689,0.6075,0.625
11269,0.657269,0.659096,0.673634,0.676351,0.588391,0.74,0.78125


In [307]:
error

0.029933779762049806

In [308]:
predictNew(self=medecineModel,newStData=newStData2)

ValueError: shapes (1,5) and (3,) not aligned: 5 (dim 1) != 3 (dim 0)