In [1]:
# import display libs
from IPython.display import Image
%matplotlib inline
from IPython.display import Latex

In [2]:
# import libs
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt


# setup figure params
figureparams = {'axes.labelsize': 24,
           'axes.titlesize': 20,
           'axes.linewidth': 1.3,
           'font.size': 20,
           'legend.fontsize': 18,
           'figure.figsize': (10,7),
           'font.family': 'serif',
           'font.serif': 'Computer Modern Roman',
           'xtick.labelsize': 18,
           'xtick.major.size': 5.5,
           'xtick.major.width': 1.3,
           'ytick.labelsize': 18,
           'ytick.major.size': 5.5,
           'ytick.major.width': 1.3,
           'text.usetex': True,
           'figure.autolayout': True}
plt.rcParams.update(figureparams)
matplotlib.rcParams['text.usetex']=False
matplotlib.rcParams['text.latex.unicode']=True
matplotlib.get_configdir()

The text.latex.unicode rcparam was deprecated in Matplotlib 3.0 and will be removed in 3.2.


'C:\\Users\\EdwardJansenADC\\.matplotlib'

# Binary Classification

In [4]:
# Import additional libs
#import warnings
#import itertools
#import statsmodels.api as sm

### Load Data

In [7]:
# Load the data
df = pd.read_csv('avocado.csv')
cols = df.columns
print(cols)

Index(['Unnamed: 0', 'Date', 'AveragePrice', 'Total Volume', '4046', '4225',
       '4770', 'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags', 'type',
       'year', 'region'],
      dtype='object')


In [6]:
idx_nums = np.where(df.dtypes == 'float64')
idx_cats = np.where(df.dtypes != 'float64')
numvars = cols[idx_nums]
cats = cols[idx_cats]

### Variable selection

In [14]:
keep_cols = ['AveragePrice',
             'Total Volume',
             '4046',
             '4225',
             '4770',
             'Total Bags',
             'Small Bags',
             'Large Bags',
             'XLarge Bags',
             'type' ]

# convert target variable to 0,1
df.loc[df.type == 'conventional', 'type'] = 0
df.loc[df.type == 'organic', 'type'] = 1

df = df[keep_cols]

In [15]:
df

Unnamed: 0,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type
0,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,0
1,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,0
2,0.93,118220.22,794.70,109149.67,130.50,8145.35,8042.21,103.14,0.0,0
3,1.08,78992.15,1132.00,71976.41,72.58,5811.16,5677.40,133.76,0.0,0
4,1.28,51039.60,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,0
...,...,...,...,...,...,...,...,...,...,...
18244,1.63,17074.83,2046.96,1529.20,0.00,13498.67,13066.82,431.85,0.0,1
18245,1.71,13888.04,1191.70,3431.50,0.00,9264.84,8940.04,324.80,0.0,1
18246,1.87,13766.76,1191.92,2452.79,727.94,9394.11,9351.80,42.31,0.0,1
18247,1.93,16205.22,1527.63,2981.04,727.01,10969.54,10919.54,50.00,0.0,1


### Train test split

In [17]:
def split(df, target, test_size = 0.25):
    """
    Function that splits a dataset into train and test set.
    ---------
    Input:
        df          dataframe; to be splitted data   
        target:     string; name of dependent variable        
    Returns:
        X_train     dataframe; train data
        X_test      dataframe; test data
        y_train     array; train labels
        y_test      array; test labels
    
    """
    from sklearn.model_selection import train_test_split
    
    # split in train and test
    X_train, X_test, y_train, y_test = train_test_split(df.drop([target], axis=1), 
                                                        df[target],
                                                        test_size = test_size,
                                                        random_state=101)
    
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = split(df, 'type')

### Variable preparation

In [28]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

### Model: Logistic Regression

In [37]:
def GS_clf(penalty = ['l1'], C = np.logspace(-7, 7, 29), cv=5):
    """
    Function that sets up the GridSearch model with crossvalidation.
    ---------
    Input:

    Returns:
        clf         object; to be fitted classifier
    
    """
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import LogisticRegression
    
    # set up logistic regression
    lr = LogisticRegression(C=1)
    
    # define grid of hyper parameters
    param_grid = [
            {'penalty' : ['l1', 'l2'],
            'C' : np.logspace(-7, 7, 29),
            'solver' : ['liblinear']},
        ]    
    
    # initialize classifier
    clf = GridSearchCV(lr, param_grid, scoring = 'roc_auc', n_jobs=-1, iid =True, refit = True, 
                           cv=cv, verbose=True, error_score='raise')
    
    return clf

clf = GS_clf(cv=2)  # this shld be higher when fitting for real
clf.fit(X_train, y_train)
pred_proba = clf.predict_proba(X_test)[:, 1]

# extract coefficients
coef = clf.best_estimator_.coef_[0]
intercept = clf.best_estimator_.intercept_[0]

Fitting 2 folds for each of 58 candidates, totalling 116 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 116 out of 116 | elapsed:    7.3s finished


### Scoring

In [42]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

print("Our Gini is: " + str(2*roc_auc_score(y_test, pred_proba)-1))
print("\n")
print("The confusion matrix is: ")
confusion_matrix(y_test, np.round(pred_proba))

Our Gini is: 0.9759437271719194


The confusion matrix is: 


array([[2064,  212],
       [  50, 2237]], dtype=int64)

### Model interpretation

In [36]:
clf.best_estimate_ ###?

AttributeError: 'GridSearchCV' object has no attribute 'best_estimate_'

look at the first coefficient!

In [43]:
coef

array([ 2.41818174e+00, -4.53670781e-06, -2.44824400e-05, -1.90442522e-05,
       -2.27642963e-04,  1.01752942e-05,  8.79686756e-06,  1.93681081e-05,
       -2.92130076e-02])