In [1]:
# import display libs
from IPython.display import Image
%matplotlib inline
from IPython.display import Latex

In [2]:
# import libs
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt


# setup figure params
figureparams = {'axes.labelsize': 24,
           'axes.titlesize': 20,
           'axes.linewidth': 1.3,
           'font.size': 20,
           'legend.fontsize': 18,
           'figure.figsize': (10,7),
           'font.family': 'serif',
           'font.serif': 'Computer Modern Roman',
           'xtick.labelsize': 18,
           'xtick.major.size': 5.5,
           'xtick.major.width': 1.3,
           'ytick.labelsize': 18,
           'ytick.major.size': 5.5,
           'ytick.major.width': 1.3,
           'text.usetex': True,
           'figure.autolayout': True}
plt.rcParams.update(figureparams)
matplotlib.rcParams['text.usetex']=False
matplotlib.rcParams['text.latex.unicode']=True
matplotlib.get_configdir()

The text.latex.unicode rcparam was deprecated in Matplotlib 3.0 and will be removed in 3.2.


'C:\\Users\\RJANSEN\\.matplotlib'

# Binary Classification

In [3]:
# Import additional libs
#import warnings
#import itertools
#import statsmodels.api as sm

### Load Data

In [4]:
# Load the data
df = pd.read_csv('avocado.csv')
cols = df.columns
print(cols)

Index(['Unnamed: 0', 'Date', 'AveragePrice', 'Total Volume', '4046', '4225',
       '4770', 'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags', 'type',
       'year', 'region'],
      dtype='object')


In [5]:
idx_nums = np.where(df.dtypes == 'float64')
idx_cats = np.where(df.dtypes != 'float64')
numvars = cols[idx_nums]
cats = cols[idx_cats]

### Variable selection

In [6]:
keep_cols = ['Total Volume',
             '4046',
             '4225',
             '4770',
             'Total Bags',
             'Small Bags',
             'Large Bags',
             'XLarge Bags',
             'region',
             'type' ]

# convert target variable to 0,1
df.loc[df.type == 'conventional', 'type'] = 0
df.loc[df.type == 'organic', 'type'] = 1


df = df[keep_cols]

In [7]:
df

Unnamed: 0,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,region,type
0,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,Albany,0
1,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,Albany,0
2,118220.22,794.70,109149.67,130.50,8145.35,8042.21,103.14,0.0,Albany,0
3,78992.15,1132.00,71976.41,72.58,5811.16,5677.40,133.76,0.0,Albany,0
4,51039.60,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,Albany,0
...,...,...,...,...,...,...,...,...,...,...
18244,17074.83,2046.96,1529.20,0.00,13498.67,13066.82,431.85,0.0,WestTexNewMexico,1
18245,13888.04,1191.70,3431.50,0.00,9264.84,8940.04,324.80,0.0,WestTexNewMexico,1
18246,13766.76,1191.92,2452.79,727.94,9394.11,9351.80,42.31,0.0,WestTexNewMexico,1
18247,16205.22,1527.63,2981.04,727.01,10969.54,10919.54,50.00,0.0,WestTexNewMexico,1


### Train test split

In [29]:
def split(df, target, test_size = 0.25):
    """
    Function that splits a dataset into train and test set.
    ---------
    Input:
        df          dataframe; to be splitted data   
        target:     string; name of dependent variable        
    Returns:
        X_train     dataframe; train data
        X_test      dataframe; test data
        y_train     array; train labels
        y_test      array; test labels
    
    """
    from sklearn.model_selection import train_test_split
    
    # split in train and test
    X_train, X_test, y_train, y_test = train_test_split(df.drop([target], axis=1), 
                                                        df[target],
                                                        test_size = test_size,
                                                        random_state=101)
    
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = split(df, 'type')

In [30]:
### label encode the categorical values and convert them to numbers 
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(X_train['region'].astype(str))
X_train['region'] = le.transform(X_train['region'].astype(str))
X_test['region'] = le.transform(X_test['region'].astype(str))


In [31]:
X_train

Unnamed: 0,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,region
9247,12626.20,2524.44,5164.04,715.02,4222.70,4219.94,2.76,0.00,2
378,161443.98,25922.66,46906.06,41261.83,47353.43,38940.45,8412.98,0.00,7
11872,330353.76,119664.32,178258.33,176.53,32254.58,14940.27,17314.31,0.00,52
16335,18891.80,0.00,390.00,0.00,18501.80,18273.13,228.67,0.00,30
5718,755472.09,320378.00,135920.57,179.13,298994.39,177899.15,121095.24,0.00,1
...,...,...,...,...,...,...,...,...,...
5695,528297.88,248095.78,56275.22,109.99,223816.89,136027.87,77103.34,10685.68,1
8006,4362824.00,2377638.00,713646.00,14770.00,1256771.00,870931.00,385677.00,162.00,45
17745,24825.50,6516.40,685.92,44.09,17579.09,16514.58,1064.51,0.00,12
17931,7801.52,183.18,130.50,0.00,7487.84,7163.33,324.51,0.00,27


In [32]:
X_train = pd.concat([X_train, pd.get_dummies(X_train['region'])], axis=1)
X_train = X_train.drop(['region'], axis=1)

In [33]:
X_test = pd.concat([X_test, pd.get_dummies(X_test['region'])], axis=1)
X_test = X_test.drop(['region'], axis=1)

### Variable preparation

In [34]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

### Model: Logistic Regression

In [35]:
def GS_clf(penalty = ['l1'], C = np.logspace(-7, 7, 29), cv=5):
    """
    Function that sets up the GridSearch model with crossvalidation.
    ---------
    Input:

    Returns:
        clf         object; to be fitted classifier
    
    """
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import LogisticRegression
    
    # set up logistic regression
    lr = LogisticRegression(C=1)
    
    # define grid of hyper parameters
    param_grid = [
            {'penalty' : ['l1', 'l2'],
            'C' : np.logspace(-7, 7, 29),
            'solver' : ['liblinear']},
        ]    
    
    # initialize classifier
    clf = GridSearchCV(lr, param_grid, scoring = 'roc_auc', n_jobs=-1, iid =True, refit = True, 
                           cv=cv, verbose=True, error_score='raise')
    
    return clf

clf = GS_clf(cv=2)  # this shld be higher when fitting for real
clf.fit(X_train, y_train)
pred_proba = clf.predict_proba(X_test)[:, 1]

# extract coefficients
coef = clf.best_estimator_.coef_[0]
intercept = clf.best_estimator_.intercept_[0]

Fitting 2 folds for each of 58 candidates, totalling 116 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 116 out of 116 | elapsed:   18.6s finished


### Scoring

In [36]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

print("Our Gini is: " + str(2*roc_auc_score(y_test, pred_proba)-1))
print("\n")
print("The confusion matrix is: ")
confusion_matrix(y_test, np.round(pred_proba))

Our Gini is: 0.9993971427100374


The confusion matrix is: 


array([[2276,    0],
       [   2, 2285]], dtype=int64)

### Model interpretation

In [16]:
clf.best_estimate_ ###?

AttributeError: 'GridSearchCV' object has no attribute 'best_estimate_'

look at the first coefficient!

In [37]:
coef

array([-2.73683286e-05, -1.09651605e-04, -1.41465046e-04, -2.43365958e-03,
       -1.37796163e-05, -2.13367134e-05, -3.69459973e-05, -2.17386099e-02,
        6.11410301e-01,  1.03204283e+01,  1.75603633e+01,  1.50619288e+00,
        1.08860234e+01,  2.18747649e+00,  4.60969821e+01,  1.61652443e+01,
        1.64744495e+01,  9.03592954e+00,  5.35123488e+00,  1.63657529e+01,
        2.66940912e+01,  1.09079825e+01,  7.38592232e+00,  3.71581643e+01,
        8.24032574e+00,  1.03471230e+01,  1.46240586e+01,  5.79144994e+00,
        3.96886054e+00,  9.23320198e+00,  2.07080014e+01,  1.05885251e+00,
        9.96729155e+00,  6.72981651e+01,  6.42775656e+00,  9.31487499e+00,
        7.77200144e+01,  4.87388204e+01,  1.01676530e+01,  8.31127288e+00,
        1.41977056e+01,  1.37896952e+01,  2.66379189e+00,  3.36039298e+01,
        1.64142636e+01,  1.17116930e+01,  9.14740888e+00,  3.18532314e+00,
        1.10506301e+01,  1.21361733e+01,  1.42624922e+01,  2.45655653e+01,
        1.05899335e+01,  