In [5]:
# Similar to the library() statement in R, we use import statements to bring in 3rd party modules
import sklearn  # This imports the scikit-learn module (python's de-facto machine learning library)

# To invoke functions or attributes of the module, we simply append the name of the function or attribute after the 
# import name with a dot (e.g. sklearn.datasets allows us to access the datasets in scikit-learn)

# We can also import pandas as it is, but because we reference the module so much, we can shorten the import name
# to anything we want using "as"
# In the following statement, we import pandas as pd (thus we can call a dataframe using pd.DataFrame, instead of
# pandas.DataFrame)
import pandas as pd  # This imports the pandas module as pd
import numpy as np  # This imports the numpy module as np

In [6]:
# Loading external data files with pandas
# Pandas is able to read multiple data formats into dataframes
# Many datasets that we obtain are in the csv format
wine = pd.read_csv('wine.csv')

In [7]:
# Check the dataframe
# Notice that this is a dataset about wine and the various chemical compositions in it
# Intuitively, we can use this dataset to identify the type of wine (class label 1, 2 or 3) by using 
# statistical and machine learning models
wine

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.640000,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.380000,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.680000,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.800000,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.320000,1.04,2.93,735
5,1,14.20,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.750000,1.05,2.85,1450
6,1,14.39,1.87,2.45,14.6,96,2.50,2.52,0.30,1.98,5.250000,1.02,3.58,1290
7,1,14.06,2.15,2.61,17.6,121,2.60,2.51,0.31,1.25,5.050000,1.06,3.58,1295
8,1,14.83,1.64,2.17,14.0,97,2.80,2.98,0.29,1.98,5.200000,1.08,2.85,1045
9,1,13.86,1.35,2.27,16.0,98,2.98,3.15,0.22,1.85,7.220000,1.01,3.55,1045


In [8]:
# describe() allows us to inspect the descriptive statistics of the dataset
wine.describe()

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,1.938202,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258
std,0.775035,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474
min,1.0,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,1.0,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
50%,2.0,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,3.0,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
max,3.0,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


In [13]:
# Subsetting data from dataframes
# In pandas, we often use the following 2 techniques to subset data:

# 1. Select entire row
# This can be accomplished by simply calling the column name in one the following form (both achieve the same result):
wine.Alcohol
wine['Alcohol']

0      14.23
1      13.20
2      13.16
3      14.37
4      13.24
5      14.20
6      14.39
7      14.06
8      14.83
9      13.86
10     14.10
11     14.12
12     13.75
13     14.75
14     14.38
15     13.63
16     14.30
17     13.83
18     14.19
19     13.64
20     14.06
21     12.93
22     13.71
23     12.85
24     13.50
25     13.05
26     13.39
27     13.30
28     13.87
29     14.02
       ...  
148    13.32
149    13.08
150    13.50
151    12.79
152    13.11
153    13.23
154    12.58
155    13.17
156    13.84
157    12.45
158    14.34
159    13.48
160    12.36
161    13.69
162    12.85
163    12.96
164    13.78
165    13.73
166    13.45
167    12.82
168    13.58
169    13.40
170    12.20
171    12.77
172    14.16
173    13.71
174    13.40
175    13.27
176    13.17
177    14.13
Name: Alcohol, Length: 178, dtype: float64

In [16]:
# To subset dataframe from a larger dataframe, we specify the required column names with a double square bracket.
wine[['Class label', 'Alcohol', 'Malic acid']]

Unnamed: 0,Class label,Alcohol,Malic acid
0,1,14.23,1.71
1,1,13.20,1.78
2,1,13.16,2.36
3,1,14.37,1.95
4,1,13.24,2.59
5,1,14.20,1.76
6,1,14.39,1.87
7,1,14.06,2.15
8,1,14.83,1.64
9,1,13.86,1.35


In [18]:
# To subset rows and columns, we can use .loc or .iloc
# The format is .loc or .iloc[row slice, column slice]
# Note that slicing follows native python methods (see Lesson 2)
# The difference between loc and iloc is that loc takes the column name, while iloc takes the positional value
wine.iloc[1:10, 3:5]

Unnamed: 0,Ash,Alcalinity of ash
1,2.14,11.2
2,2.67,18.6
3,2.5,16.8
4,2.87,21.0
5,2.45,15.2
6,2.45,14.6
7,2.61,17.6
8,2.17,14.0
9,2.27,16.0


In [21]:
# loc takes the column name
wine.loc[1:10, ['Ash', 'Alcalinity of ash']]

Unnamed: 0,Ash,Alcalinity of ash
1,2.14,11.2
2,2.67,18.6
3,2.5,16.8
4,2.87,21.0
5,2.45,15.2
6,2.45,14.6
7,2.61,17.6
8,2.17,14.0
9,2.27,16.0
10,2.3,18.0


In [25]:
# When used as a numeric slice, the methods returns rows
wine.iloc[:3]  # returns rows 0 to 9

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185


In [31]:
# Adding or changing variables in one swoop
# We can create new variables in pandas dataframe by giving it a column name and value
wine['checked'] = 1  # Creates a new column called checked with the value of 1
wine['today'] = 'Yes'  # Creates a new column called 'Yes'
wine

# We can also use the same technique to replace the values of an existing column with another value

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline,checked,today
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.640000,1.04,3.92,1065,1,Yes
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.380000,1.05,3.40,1050,1,Yes
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.680000,1.03,3.17,1185,1,Yes
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.800000,0.86,3.45,1480,1,Yes
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.320000,1.04,2.93,735,1,Yes
5,1,14.20,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.750000,1.05,2.85,1450,1,Yes
6,1,14.39,1.87,2.45,14.6,96,2.50,2.52,0.30,1.98,5.250000,1.02,3.58,1290,1,Yes
7,1,14.06,2.15,2.61,17.6,121,2.60,2.51,0.31,1.25,5.050000,1.06,3.58,1295,1,Yes
8,1,14.83,1.64,2.17,14.0,97,2.80,2.98,0.29,1.98,5.200000,1.08,2.85,1045,1,Yes
9,1,13.86,1.35,2.27,16.0,98,2.98,3.15,0.22,1.85,7.220000,1.01,3.55,1045,1,Yes


In [32]:
# To drop a column, we use the function .drop([columns], axis=1)
# axis = 0 implies rows and axis = 1 implies columns
wine.drop(['checked', 'today'], axis=1, inplace=True)
wine

# Note the inplace=True. Pandas is conservative in a way that most operations (other than assignment in the above example)
# creates a copy of the new dataframe, leaving the original dataframe unchanged.
# To change the original dataframe, we specify inplace=True

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.640000,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.380000,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.680000,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.800000,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.320000,1.04,2.93,735
5,1,14.20,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.750000,1.05,2.85,1450
6,1,14.39,1.87,2.45,14.6,96,2.50,2.52,0.30,1.98,5.250000,1.02,3.58,1290
7,1,14.06,2.15,2.61,17.6,121,2.60,2.51,0.31,1.25,5.050000,1.06,3.58,1295
8,1,14.83,1.64,2.17,14.0,97,2.80,2.98,0.29,1.98,5.200000,1.08,2.85,1045
9,1,13.86,1.35,2.27,16.0,98,2.98,3.15,0.22,1.85,7.220000,1.01,3.55,1045


In [40]:
# Categorical data in Pandas
# The code below constructs a pandas dataframe from a list of list. The general form is pd.DataFrame(list)
# In this example, the elements in the list are lists, with each list taking the form [color, size, price, label]
# When expanded, it looks like [[color1, size1, price1, label1], [color2, size2, price2, label2], ...]
# with each list representing a row of observation
df = pd.DataFrame([
           ['green', 'M', 10.1, 'class1'],
           ['red', 'L', 13.5, 'class2'],
           ['blue', 'XL', 15.3, 'class1']])

# We can name the columns (name of the variables) of the dataframe using df.columns, and specifying a list of
# names we want (note that number of names must be equal to the number of variables)
df.columns = ['color', 'size', 'price', 'classlabel']

# Show the dataframe
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [41]:
# From the lessons in R, we know that size and classlabel are categorical variables.
# size is categorical but ordinal variable (XL > L > M) while classlabel is not (class1 and class2 cannot be ranked)
# In R, we can conveniently convert variables to categorical form, but this convenience is not available in Python.
# Data that goes into any scikit-learn models must be numerical. This means that:
# XL, L, M must be mapped to a numeric format. In this case, since they are ordinal, we can give a higher number to XL
# and a lower number for L, with the lowest number for M

# I show 1 method that we often use to accomplish this.
# First, we create a map of size to ranks (values). This is done by creating a dictionary.
size_mapping = {
    'XL': 3,
    'L': 2,
    "M": 1
}
# We can change the values of a variable in one swoop by applying a map function to the variable column.
# The map function goes by the form .map(function)
# To define a function on the fly, we can use lambda x: do something with x. This is the same as
# def some_function(x):
#    do something with x

df['size'] = df['size'].map(lambda x: size_mapping[x])

# Recall we did assign 1 value to an entire variable. The above method allows us to assign different values
# contingent on the values in another (or the same) column.
# if we use the same column, then this column will be overwritten by the new values.

# Then we use mapping to update the Pandas dataframe
df

# We can achieve the same without specifying the lambda function, as python recognizes that
# we want to apply a dictionary mapping to each element in the variable
# same outcome: df['size'] = df['size'].map(size_mapping)

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [44]:
# Next, we need to handle the color and classlabel categorical variable.
# This is slightly problematic since we cannot rank color and classlabel.
# Recall in the previous lessons, that R as.factor is an abstraction of convenience.
# Under the hood, what happens is that such categorical variables are expanded, each classlabel as a variable and
# value of 0 to indicate 'does not belong to this class', and 1 to indicate 'belongs to this class'

# pandas has a get_dummies function that helps us do this transformation in one statement.
df = pd.get_dummies(df)

# Note that the get_dummies is invoked by passing the dataframe into the pandas' get_dummies function.
# We need to assign this result to a variable. By specifying df, we are overwriting the original df with the new result
df

Unnamed: 0,size,price,color_blue,color_green,color_red,classlabel_class1,classlabel_class2
0,1,10.1,0,1,0,1,0
1,2,13.5,0,0,1,0,1
2,3,15.3,1,0,0,1,0


In [46]:
# Coming back to the wine dataset, let's do some machine learning on it.
wine.head()  # returns top 5 rows to refresh our memory on how the dataset looks like

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [48]:
# Recall in our previous lesson on in-sample bias, we need to split the dataset into
# training and testing
# We then train our model on the training dataset, get the fitted model, and use it to predict
# on the testing dataset
# Following that, we investigate the accuracy of our model

# Spliting the dataset is easy in python using the scikit-learn train_test_split function
from sklearn.model_selection import train_test_split

# First, we need to split up the dataset into the outcome variable, y
# and the dataframe of explanatory variables X
# We achieve this by using iloc (outcome variable is Class label, which is column 0)
# The dataframe of explanatory variables is columns 1 to the end
# In python slicing, omitting the numbers before and after : implies grab everything before and/or after
X, y = wine.iloc[:, 1:].values, wine.iloc[:, 0].values

# Applying the train_test_split function returns 4 datasets:
# training set for X, testing set for X, training set for y, testing set for y
# we can specify the proportion of test set with the test_size argument (0.3 is 30%)
# random_state is an argument that forces the algorithm to return the same result if the same data is used
# without specifying the same number for random_state, we may get different results even with the same dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [49]:
# Feature scaling, which we will learn in the next section, is an important 
# step in ML algorithms especially gradient descent and KNN
# Feature scaling transforms the data into a more well-behaved distribution
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()  # We use MinMaxScaler() in this example, there are many other scaler available in scikit-learn

# We then apply the scaling to scale our train and test datasets
# We can either overwrite the datasets, but in this case, we created new copies with the transformed values
X_train_norm = mms.fit_transform(X_train)
X_test_norm = mms.transform(X_test)  # Use this on any new data

In [51]:
# In this example, we use a scaling feature that most people know - standardization (or normalization)
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()  # Call the standard scaler class

# Apply the scaling to normalize our train and test datasets
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)  # Use this on any new data

In [52]:
# With the above done, let's run a logistic regression on the data
from sklearn.linear_model import LogisticRegression

# We applied a L1 regularization to this model (penalty). 
# As we will learn in the slides, this will help our predictive accuracy.
lr = LogisticRegression(penalty='l1', C=0.1)  # Call the logistic regression model

# Fit the data
lr.fit(X_train_std, y_train)

# The lr model also comes inbuilt accuracy scoring, invoke it using .score(X, y)
print('Training accuracy:', lr.score(X_train_std, y_train))
print('Test accuracy:', lr.score(X_test_std, y_test))

# The close results between the training test accuracies do not indicate overfitting issues

Training accuracy: 0.9838709677419355
Test accuracy: 0.9814814814814815


In [55]:
# For some machine learning models, it comes equipped with the ability to rank the importance of
# our explanatory variables. Note that in machine learning speak, explanatory variables are
# known as features.

# Let's try a powerful famous ML algorithm known as random forest (I will introduce
# this model in the slides)
from sklearn.ensemble import RandomForestClassifier
feat_labels = wine.columns[1:]  # Extract the names of the columns so that I can merge the importance value with the names later

# Call the Random Forest model
forest = RandomForestClassifier(n_estimators=100,
                               random_state=0,
                               n_jobs=-1)  # n_jobs is a useful attribute in scikit-learn, it tells python how many CPUs to use. -1 means use all CPUs. More CPUs = faster completion.

forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [57]:
# The importance values are captured in .feature_importances_
importances = forest.feature_importances_
importances

array([0.0887468 , 0.02978162, 0.01663817, 0.03918994, 0.02296177,
       0.06157894, 0.14522827, 0.01425818, 0.02310746, 0.19481937,
       0.08026645, 0.11907506, 0.16434795])

In [58]:
# sort and return the positions, not the value of the elements, in descending order
indices = np.argsort(importances)[::-1]

# To facilitate our print statement that shows the importance of each feature in predicting wine
for f in range(X_train.shape[1]):
    print('%2d) %-*s %f' % (f + 1, 30, feat_labels[f], importances[indices[f]]))

 1) Alcohol                        0.194819
 2) Malic acid                     0.164348
 3) Ash                            0.145228
 4) Alcalinity of ash              0.119075
 5) Magnesium                      0.088747
 6) Total phenols                  0.080266
 7) Flavanoids                     0.061579
 8) Nonflavanoid phenols           0.039190
 9) Proanthocyanins                0.029782
10) Color intensity                0.023107
11) Hue                            0.022962
12) OD280/OD315 of diluted wines   0.016638
13) Proline                        0.014258
