# Estimator CV

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

## Dataset

In [None]:
# Creating a sample dataset
data = make_classification(n_samples= 5000,
                         n_features= 9,
                         n_classes=2,
                         random_state=42)

X = pd.DataFrame(data[0], columns=['V' + str(i) for i in range(1,10)])
y= data[1]

In [None]:
X.shape

(5000, 9)

In [None]:
y.shape

(5000,)

## Train Test Split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Model

In [None]:
# Logistic Regression
model = LogisticRegression()
# Fit
model.fit(X_train, y_train)
#score
model.score(X_test, y_test)

0.856

In [None]:
# Logistic Regression with CV
model_cv = LogisticRegressionCV(cv=10, random_state=42)
# Fit
model_cv.fit(X_train, y_train)
#score
model_cv.score(X_test, y_test)

0.856

In [None]:
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)

In [None]:
clf = LogisticRegressionCV(cv=10, random_state=0).fit(X, y)

In [None]:
clf.score(X, y)

0.961335676625659

In [None]:
clf2 = LogisticRegression().fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
clf2.score(X,y)

0.9472759226713533

# Pipeline

In [1]:
# Imports
import pandas as pd
import numpy as np
import seaborn as sns

In [327]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression

In [17]:
# Create dataframe
amt1 = np.random.randint(100,999, 1000)
amt2 = np.random.choice(np.random.randint(1500,200000), 1000)
b= np.random.randn(1000)
x = 0.195
x2 = 0.377

df = pd.DataFrame({
    'group': np.random.choice([1, 2, 3], 1000),
    'amt1': amt1,
    'amt2': amt2,
    'type': np.random.choice([11, 12, 21, 22, 3], 1000),
    'result': ((amt1*x) + (amt2*x2) + b)
})

In [24]:
df.head(3)

Unnamed: 0,group,amt1,amt2,type,result
0,3,200,86113,21,32503.72554
1,1,635,81879,3,30991.859992
2,2,909,94682,3,35871.472004


In [18]:
# Split X and y
X= df.drop('result', axis=1)
y=df.result

In [20]:
# Create the steps to be performed
steps = [('scale', StandardScaler()),
         ('LR', LinearRegression()) ]
pipe = Pipeline(steps)

# Fit the pipeline
pipe.fit(X,y)

Pipeline(steps=[('scale', StandardScaler()), ('LR', LinearRegression())])

In [21]:
pipe.score(X, y)

0.9999999932104651

In [131]:
pipe.predict(X.iloc[[0]])

array([15935.34117995])

In [132]:
df.iloc[[0]]

Unnamed: 0,group,amt1,amt2,type,result
0,2,889,41809,3,15935.510978


### Numerical and Categorical

In [43]:
from sklearn.tree import DecisionTreeRegressor

In [25]:
# Tips
df = sns.load_dataset('tips')

In [42]:
df.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3


### Split X, Y

In [59]:
X = df.drop('tip', axis=1)
y = df.tip 

In [60]:
# Train test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

## Creating and gathering pipelines

In [353]:
cat_steps = [('OHE', OneHotEncoder(handle_unknown='ignore'))]
pipe_cat = Pipeline(cat_steps)

In [354]:
num_steps = [('scale', StandardScaler()) ]
pipe_num = Pipeline(num_steps)

In [370]:
from sklearn.compose import ColumnTransformer

# Extracting the names of the numerical variables
numerical_vars = X.select_dtypes('number').columns.tolist()
# Extracting the names of the categorical variables
categorical_vars = X.select_dtypes('category').columns.tolist()

# Creating the multilayer pipe
one_pipe = ColumnTransformer(transformers=[
        ('categories', pipe_cat, categorical_vars),
        ('numbers', pipe_num, numerical_vars),
        
])

In [582]:
# Final Pipeline
modeling = Pipeline([('preprocess', one_pipe),
                     ('feature_selection', SelectKBest(score_func=f_regression, k=5)),
                    ('model', DecisionTreeRegressor(criterion='absolute_error')) ])

In [609]:
# Fit the modeling pipeline
modeling.fit(X_train, y_train)

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('categories',
                                                  Pipeline(steps=[('OHE',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['sex', 'smoker', 'day',
                                                   'time']),
                                                 ('numbers',
                                                  Pipeline(steps=[('scale',
                                                                   StandardScaler())]),
                                                  ['total_bill', 'size'])])),
                ('feature_selection',
                 SelectKBest(k=5,
                             score_func=<function f_regression at 0x7f2b8949ccb0>)),
                ('model', DecisionTreeRegressor(criterion='absolute_error'))])

In [610]:
modeling.score(X_train, y_train)

0.9899085278604604

In [611]:
modeling.score(X_test, y_test)

0.2946235650833341

In [617]:
# Comparison Predictions vs actual
preds = modeling.predict(X_test)
pd.DataFrame({'y_true': y_test, 'prediction': preds})[15:25]

Unnamed: 0,y_true,prediction
63,3.76,3.0
94,3.25,2.0
7,3.12,4.34
159,2.0,3.5
120,2.31,1.63
184,3.0,3.55
200,4.0,1.36
83,5.0,2.56
202,2.0,2.0
21,2.75,3.21


In [613]:
# Measure MEA
mean_absolute_error(y_test, preds)

0.8504081632653061

In [614]:
np.mean(y_test)

3.111836734693878

In [619]:
# Variance of our error
mean_absolute_error(y_test, preds) / np.mean(y_test) 

0.2732817418677859