### Logistic Regression Hyperparameters

In [39]:
from sklearn.linear_model import LogisticRegression

In [40]:
# List the attribute and method names exposed by the LogisticRegression class.
# get_params / set_params in the output are the accessors for its hyperparameters.
dir(LogisticRegression)

['__class__',
 '__delattr__',
 '__dict__',
 '__doc__',
 '__format__',
 '__getattribute__',
 '__getstate__',
 '__hash__',
 '__init__',
 '__module__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_estimator_type',
 '_get_param_names',
 '_predict_proba_lr',
 'decision_function',
 'densify',
 'fit',
 'get_params',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'score',
 'set_params',
 'sparsify']

#### The C hyperparameter is a regularization parameter that controls how closely the model fits the training data
* C = 1 / lambda, where lambda is the regularization parameter.

* When lambda approaches zero, C tends to infinity (Low Regularization -> High complexity -> Overfit)
* When lambda is high, C is small (High Regularization -> Low complexity -> Underfit)

* k-fold cross-validation takes your dataset and splits it into k subsets.
* It iterates over the k subsets k times.
* On each loop it fits (trains) a model on k-1 subsets and tests it on the remaining subset.
* It generates performance metrics for each loop.


In [41]:
import joblib # pickle the model and save it
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV # use for hyper parmeter tuning
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [42]:
train_features = pd.read_csv('train_features.csv')

# header=None tells pandas the labels file has no header row; without it the
# first record would be consumed as the column name instead of as a label.
train_labels = pd.read_csv('train_labels.csv', header=None)

# Fail fast if the two files are misaligned (e.g. a header handled the wrong
# way) instead of surfacing it later as a shape error inside the grid search.
assert len(train_features) == len(train_labels), (
    'train_features has {} rows but train_labels has {}'.format(
        len(train_features), len(train_labels)))

In [43]:
# print the results
def print_results(results):
    """Print a summary of a fitted hyperparameter search.

    The first line shows ``results.best_params_``. It is followed by one line
    per candidate in ``results.cv_results_``: the mean test score rounded to
    three decimals, twice the standard deviation of the test score (also
    rounded to three decimals), and the parameter dict that produced them.

    Parameters
    ----------
    results : object
        Anything exposing ``best_params_`` and a ``cv_results_`` mapping with
        the keys ``'mean_test_score'``, ``'std_test_score'`` and ``'params'``
        (for example a fitted ``GridSearchCV``).
    """
    print("Best params is {}".format(results.best_params_))

    cv_summary = results.cv_results_
    rows = zip(
        cv_summary['mean_test_score'],
        cv_summary['std_test_score'],
        cv_summary['params'],
    )
    for avg_score, score_std, candidate in rows:
        # The reported spread is two standard deviations around the mean.
        spread = round(score_std * 2, 3)
        print('{} (+/-){} for {}'.format(round(avg_score, 3), spread, candidate))
          
          

In [35]:
# Sanity check: read_csv returned the labels as a DataFrame rather than a 1-D
# array, which is why they are flattened before being passed to fit().
# NOTE(review): this cell's execution count (35) is out of sequence with its
# neighbours -- restart the kernel and run all cells before sharing.
type(train_labels)

pandas.core.frame.DataFrame

In [44]:
logistic_regression = LogisticRegression()
# Candidate values for C (the inverse of the regularization strength), one per decade.
parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
# cv=5 requests 5-fold cross-validation: the training data is split into 5 subsets.
cv = GridSearchCV(logistic_regression, parameters, cv=5)

# train_labels is a DataFrame (2-D); .values.ravel() flattens it into the 1-D
# array passed as y (assumes a single label column -- TODO confirm).
cv.fit(train_features, train_labels.values.ravel())
print_results(cv)

# How the search works: each value of C in turn (starting with 0.001) is used to
# configure a LogisticRegression. For every value, 5-fold cross-validation loops
# five times -- training on 4 folds and testing on the remaining one -- and the
# test scores of those loops are averaged. print_results shows that mean together
# with twice the standard deviation for each candidate.

Best params is {'C': 10}
0.674 (+/-)0.083 for {'C': 0.001}
0.706 (+/-)0.104 for {'C': 0.01}
0.8 (+/-)0.115 for {'C': 0.1}
0.801 (+/-)0.126 for {'C': 1}
0.803 (+/-)0.118 for {'C': 10}
0.8 (+/-)0.113 for {'C': 100}
0.8 (+/-)0.113 for {'C': 1000}


#### The above results show that when C is very low (which means high regularization), the models perform poorly because of underfitting.

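#### Similarly, when C is high (which means low regularization), the score stops improving and dips slightly (0.800 for C = 100 and C = 1000 versus 0.803 for C = 10), which is consistent with mild overfitting, although the gap is much smaller than the spread across folds.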

In [45]:
# The estimator configured with the best-scoring parameters from the search
# (C=10 in the run above, which had the highest mean cross-validation score).
cv.best_estimator_

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

### Write the pickled model

In [46]:
# Persist the best estimator found by the grid search to LR_model.pkl so it can
# be reloaded later (joblib.load) without repeating the search.
joblib.dump(cv.best_estimator_, 'LR_model.pkl')

['LR_model.pkl']