In [1]:
import pandas as pd

In [2]:
# Load the admissions dataset from the CSV file (path is relative to the
# notebook's working directory) and preview its first rows.
dataframe = pd.read_csv('data/Admission_Predict.csv')
dataframe.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [3]:
# Show the frame's dimensions as a (rows, columns) tuple.
dataframe.shape

(400, 9)

In [4]:
# Show the dtype pandas inferred for each column.
dataframe.dtypes

Serial No.             int64
GRE Score              int64
TOEFL Score            int64
University Rating      int64
SOP                  float64
LOR                  float64
CGPA                 float64
Research               int64
Chance of Admit      float64
dtype: object

In [5]:
# Count missing values per column.
dataframe.isnull().sum()

Serial No.           0
GRE Score            0
TOEFL Score          0
University Rating    0
SOP                  0
LOR                  0
CGPA                 0
Research             0
Chance of Admit      0
dtype: int64

In [6]:
# List the column labels. Note from the recorded output: 'LOR ' and
# 'Chance of Admit ' carry a trailing space, so lookups must include it.
dataframe.columns

Index(['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating', 'SOP',
       'LOR ', 'CGPA', 'Research', 'Chance of Admit '],
      dtype='object')

In [7]:
# Drop the 'Serial No.' column before modelling.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the column is already gone no
# longer raises KeyError.
dataframe = dataframe.drop(columns='Serial No.', errors='ignore')
dataframe.tail()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
395,324,110,3,3.5,3.5,9.04,1,0.82
396,325,107,3,3.0,3.5,9.11,1,0.84
397,330,116,4,5.0,4.5,9.45,1,0.91
398,312,103,3,3.5,4.0,8.78,0,0.67
399,333,117,4,5.0,4.0,9.66,1,0.95


In [8]:
# Split the frame into predictors (x) and target (y). The target label
# 'Chance of Admit ' ends with a space in this dataset's header, so the
# literal below must keep it.
x = dataframe.drop(columns=['Chance of Admit '])
y = dataframe.loc[:, 'Chance of Admit ']

In [9]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import KFold, cross_val_score
from joblib import Parallel, delayed

In [10]:
# Train and evaluate a single model.
def train_and_score(model, x, y):
    """Return the mean 10-fold cross-validation score of ``model``.

    Args:
        model: Estimator passed straight to ``cross_val_score``.
        x: Feature data passed to ``cross_val_score``.
        y: Target data passed to ``cross_val_score``.

    Returns:
        The mean of the per-fold scores produced by ``cross_val_score``
        (which uses the estimator's default scorer, since no ``scoring``
        argument is given).
    """
    # KFold is created without shuffling, so the fold boundaries follow the
    # row order of the input.
    fold_scores = cross_val_score(model, x, y, cv=KFold(n_splits=10))
    return fold_scores.mean()

In [11]:
# Build and evaluate the regression models.
def regression_models(x, y, ridge_params=None, lasso_params=None, elastic_params=None):
    """Cross-validate four linear regressors and report the best one.

    Builds ``LinearRegression``, ``Ridge``, ``Lasso`` and ``ElasticNet``
    estimators, scores each one with ``train_and_score`` in parallel joblib
    workers, prints the name and score of the best-scoring model, and
    returns all scores.

    Args:
        x: Feature data forwarded unchanged to ``train_and_score``.
        y: Target data forwarded unchanged to ``train_and_score``.
        ridge_params: Optional dict of keyword arguments for ``Ridge``.
            ``None`` (the default) or an empty dict means no extra arguments.
        lasso_params: Optional dict of keyword arguments for ``Lasso``.
        elastic_params: Optional dict of keyword arguments for ``ElasticNet``.

    Returns:
        dict mapping each model name to the score returned by
        ``train_and_score`` for that model.
    """
    # None defaults replace the original mutable ``{}`` defaults, which are
    # shared across calls; ``or {}`` keeps explicit empty dicts working too.
    models = {
        'LinearRegression': LinearRegression(),
        'Ridge': Ridge(**(ridge_params or {})),
        'Lasso': Lasso(**(lasso_params or {})),
        'ElasticNet': ElasticNet(**(elastic_params or {})),
    }

    # Evaluate the models in parallel; n_jobs=-1 asks joblib for all cores.
    scores = Parallel(n_jobs=-1)(
        delayed(train_and_score)(model, x, y) for model in models.values()
    )

    # Pair each model name with its score (dict order matches the job order).
    results = dict(zip(models, scores))

    # Look up the best model once instead of twice inside the f-string.
    best_model = max(results, key=results.get)
    print(f'O melhor modelo foi o: {best_model} com o valor de: {results[best_model]}')

    return results

In [12]:
# Compare the four regressors with an explicit regularisation strength for
# the penalised models; the returned dict of scores is the cell's output.
regression_models(
    x,
    y,
    ridge_params=dict(alpha=1.0),
    lasso_params=dict(alpha=1.0),
    elastic_params=dict(alpha=1.0, l1_ratio=0.5),
)

O melhor modelo foi o: Ridge com o valor de: 0.77057254829287


{'LinearRegression': 0.7703825020879943,
 'Ridge': 0.77057254829287,
 'Lasso': 0.1827727619979103,
 'ElasticNet': 0.4962376743697495}