## Building Predictive Models (part 2)
Supervised learning with classification and regression

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, make_scorer, recall_score, precision_score

In [None]:
# from matplotlib.colors import ListedColormap

def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Plot the decision regions of a fitted two-feature classifier.

    The classifier is evaluated on a regular grid that spans the range of
    the first two columns of ``X`` (padded by 1 on each side).  The
    predictions are drawn as filled contours and the samples are scattered
    on top, one marker/colour pair per class.

    Parameters
    ----------
    X : numpy.ndarray
        Sample matrix; only columns 0 and 1 are read.
    y : numpy.ndarray
        Class label for each row of ``X``.
    classifier : object
        Fitted estimator exposing ``predict`` for an ``(n, 2)`` array.
    test_idx : sequence of int, optional
        Row positions in ``X`` to circle as the test set.  ``None`` or an
        empty sequence draws no highlight.
    resolution : float, default 0.02
        Step between neighbouring grid points.
    """
    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    for idx, cl in enumerate(classes):
        in_class = y == cl
        # `color=` takes one colour for the whole series; passing a single
        # RGBA tuple through `c=` is ambiguous with a value-per-point array.
        plt.scatter(x=X[in_class, 0], y=X[in_class, 1],
                    alpha=0.8, color=colors[idx],
                    marker=markers[idx], label=cl)

    # highlight test samples
    # `is not None` instead of truthiness: an ndarray of indices has no
    # unambiguous truth value and would raise in a bare `if test_idx:`.
    if test_idx is not None:
        test_rows = np.asarray(test_idx)
        if test_rows.size:
            X_test = X[test_rows, :]
            # Hollow circles: an empty face colour is spelled 'none'.
            plt.scatter(X_test[:, 0],
                        X_test[:, 1],
                        facecolors='none',
                        edgecolors='black',
                        alpha=1.0,
                        linewidths=1,
                        marker='o',
                        s=55, label='test set')

### Decision tree

In [None]:
# Read the dataset stored in nonlinear_data.csv.
nonlin_df = pd.read_csv('Data/nonlinear_data.csv')
nonlin_df.head()

# Features (columns x0 and x1) and target (column y) as NumPy arrays.
X = nonlin_df[['x0', 'x1']].to_numpy()
y = nonlin_df['y'].to_numpy()

In [None]:
# Entropy-based decision tree limited to depth 3; fit() returns the estimator.
tree = DecisionTreeClassifier(max_depth=3, criterion='entropy').fit(X, y)

# Draw the learned regions, then decorate the figure.
plot_decision_regions(X, y, classifier=tree)
plt.title('Decision Tree classifier')
plt.xlabel('X0')
plt.ylabel('X1')
plt.legend()

plt.show()

### Random Forest (RF)
An ensemble of decision trees. Ensemble learning: combining weak learners to build a strong learner.

In [None]:
# from sklearn.ensemble import RandomForestClassifier

# Random forest of 10 entropy-based trees, fitted with n_jobs=2 parallel jobs.
forest = RandomForestClassifier(criterion='entropy',
                                n_estimators=10,  # number of trees in forest
                                n_jobs=2)

forest.fit(X, y)

plot_decision_regions(X, y, classifier=forest)
plt.legend()
plt.xlabel('X0')
plt.ylabel('X1')
# The plotted model is the forest, so the title must name it.
plt.title('Random Forest Classifier')

plt.show()

### One-hot encoding

In [None]:
# Small example table: one categorical column (Color), one numeric (Price).
shoe_df = pd.DataFrame(
    [['Green', 50.50], ['Red', 93.50], ['Blue', 35.30]],
    columns=['Color', 'Price'],
)
shoe_df

In [None]:
# from sklearn.preprocessing import LabelEncoder

# Work on a NumPy copy of the table so column 0 can be overwritten.
X = shoe_df.to_numpy()

# Replace the colour names in column 0 with their integer codes.
color_le = LabelEncoder()
color_codes = color_le.fit_transform(X[:, 0])
X[:, 0] = color_codes
X

In [None]:
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.compose import ColumnTransformer

# One-hot encode column 0 and pass every other column through unchanged.
ct = ColumnTransformer(
    transformers=[('Color', OneHotEncoder(), [0])],
    remainder='passthrough',
)

ct.fit_transform(X)

### Feature engineering (cancer data)

In [None]:
# from sklearn.ensemble import RandomForestClassifier

# Read the cancer dataset and preview its first five rows.
cancer_df = pd.read_csv('Data/cancer_data.csv')
cancer_df.head(n=5)

In [None]:
# LabelEncoder is imported from sklearn.preprocessing at the top of the notebook.
# Features are every column from position 2 onward; the class label is column 1.
X = cancer_df.iloc[:, 2:].to_numpy()

# Transform the class labels into integer codes.
le = LabelEncoder()
y = le.fit_transform(cancer_df.iloc[:, 1].to_numpy())

# Show which code each of the two class names received.
print('[M B] labels ->', le.transform(['M', 'B']))


In [None]:
# Fit a 1000-tree random forest and list the five most important features.
forest = RandomForestClassifier(n_estimators=1000,  # number of decision trees
                                n_jobs=5)

forest.fit(X, y)

# X was sliced from column 2 onward, so the labels must start there too;
# starting at column 1 shifts every feature name by one position.
feat_labels = cancer_df.columns[2:]
importances = forest.feature_importances_
# Sort ascending, then reverse the *result* to get descending importance.
# Reversing the input before argsort returns positions into the reversed
# array, which do not index `importances` or `feat_labels` correctly.
indices = np.argsort(importances)[::-1]

# Guard against tables with fewer than five feature columns.
for f in range(min(5, len(indices))):
    print('%2d) %-*s %f' % (f + 1,
                            30,
                            feat_labels[indices[f]],
                            importances[indices[f]]))



In [None]:
# Plotting the importance of the features

# Bar chart of all feature importances in the order given by `indices`.
n_features = X.shape[1]
bar_positions = range(n_features)

plt.bar(bar_positions, importances[indices], color='blue', align='center')
plt.xticks(bar_positions, feat_labels[indices], rotation=90)
plt.xlim([-1, n_features])
plt.title('Feature importance')
plt.show()

#### Quiz

In [None]:
# Read the diabetes dataset and preview its first five rows.
diabete_df = pd.read_csv('Data/diabetes_dataset.csv')
diabete_df.head(n=5)

In [None]:
# Features: all columns except the last two.  Target: the last column.
# NOTE(review): ':-2' drops the second-to-last column as well as the label
# column - confirm that excluding it from the features is intended.
X = diabete_df.iloc[:, :-2].to_numpy()

# Encode the string class labels as integers.
le = LabelEncoder()
y = le.fit_transform(diabete_df.iloc[:, -1].to_numpy())

print('[tested_positive tested_negative] --> labels', le.transform(['tested_positive', 'tested_negative']))

In [None]:
# Build a RandomForest model and print the top 5 most important features
# Fit a random forest with default settings and list the top five features.
forest = RandomForestClassifier()
forest.fit(X, y)

# X holds the leading columns of diabete_df, so its labels are the first
# X.shape[1] column names; slicing from column 1 would misname every feature.
feat_labels = diabete_df.columns[:X.shape[1]]
importances = forest.feature_importances_
# Positions of the features ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Guard against tables with fewer than five feature columns.
for f in range(min(5, len(indices))):
    print('%2d) %-*s %f' % (f + 1,
                            30,
                            feat_labels[indices[f]],
                            importances[indices[f]]))

In [None]:
# Plotting the importance of the features

# Bar chart of all feature importances in the order given by `indices`.
n_features = X.shape[1]
bar_positions = range(n_features)

plt.bar(bar_positions, importances[indices], color='blue', align='center')
plt.xticks(bar_positions, feat_labels[indices], rotation=90)
plt.xlim([-1, n_features])
plt.title('Feature importance')
plt.show()

### Regression 
Supervised learning - predicting a continuous variable 

In [None]:
# Read the housing dataset and preview its first five rows.
df_house = pd.read_csv('Data/df_house_ma.csv')
df_house.head(n=5)

#### Scatter plot matrix

In [None]:
# Columns compared pairwise in the scatter-plot matrix.
col_to_plot = ['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV']

pd.plotting.scatter_matrix(df_house[col_to_plot], figsize=(10, 10))
plt.show()

#### Correlation

In [None]:
# One row per selected column, as np.corrcoef expects variables in rows.
data = df_house.loc[:, col_to_plot].to_numpy().T

# Pairwise correlation coefficients, rounded to two decimals.
cm = np.corrcoef(data).round(2)
print(cm)

#### Heatmap

In [None]:
# Draw the correlation matrix as a blue-white-red heatmap with a colour bar.
fig = plt.figure(figsize=(8, 8))
ax = plt.imshow(cm, cmap='bwr')  # `ax` holds the image returned by imshow

# One tick per plotted column, derived from col_to_plot rather than a
# hard-coded 5, so the labels stay aligned if the column list changes.
tick_positions = np.arange(len(col_to_plot))
plt.xticks(tick_positions, col_to_plot)
plt.yticks(tick_positions, col_to_plot)

cbar = fig.colorbar(ax)
plt.show()

#### Standardization

In [None]:
# Choose the columns: RM as a one-column feature matrix, MEDV as the target.
X = df_house[['RM']].to_numpy()
y = df_house['MEDV'].to_numpy()

# StandardScaler is imported from sklearn.preprocessing at the top of the notebook.
# Separate scalers so features and target can each be transformed on their own.
sc_x, sc_y = StandardScaler(), StandardScaler()

# Standardize the values; the scaler needs 2-D input, so y becomes a column.
X_std = sc_x.fit_transform(X)
y_std = sc_y.fit_transform(y.reshape(-1, 1))


#### Linear Regression

In [None]:
# from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the standardized data and predict on it.
slr = LinearRegression()
slr.fit(X_std, y_std)
y_pred = slr.predict(X_std)

# Samples in blue, fitted line in red.
plt.scatter(X_std, y_std, c='blue')
plt.plot(X_std, y_pred, c='red')
# Both axes show standardized values, so the labels say so.
plt.xlabel('# rooms [RM] (standardized)')
plt.ylabel('price [MEDV] (standardized)')
plt.title('Linear Regression')
# Render explicitly, as every other plotting cell in this notebook does.
plt.show()

In [None]:
# Intercept and slope of the fitted line, rounded to two decimals.
w0 = np.round(slr.intercept_[0], 2)
w1 = np.round(slr.coef_[0, 0], 2)
print(f'y = {w0} + {w1} * X')

#### Quiz
The company asks us to analyze the dataset and:
* compute the correlation between features (correlation analysis)
* build a linear regression model
* predict the revenue of these upcoming films in [$]

In [None]:
# Read the movie dataset and preview its first five rows.
movie_df = pd.read_csv('Data/movie-data-clean.csv')
movie_df.head(n=5)

In [None]:
# Columns to compare.  A list, not a tuple: pandas treats a tuple used as an
# indexer as a single key (e.g. movie_df[columns_] raises KeyError), whereas
# a list always selects several columns.
columns_ = ['budget', 'vote_count', 'revenue']
pd.plotting.scatter_matrix(movie_df[columns_],
                           figsize=(8, 8))

plt.show()

In [None]:
# Correlation matrix of the selected movie columns.  Without this cell the
# heatmap below would reuse the `cm` computed earlier for the housing data.
# list(...) makes the selection work whether columns_ is a tuple or a list.
data = movie_df[list(columns_)].values.T
cm = np.corrcoef(data)
cm = np.around(cm, decimals=2)
print(cm)


In [None]:
# Heatmap of `cm` with one tick per entry of columns_ and a colour bar.
len_ = len(columns_)
tick_positions = np.arange(len_)

fig = plt.figure(figsize=(8, 8))
ax = plt.imshow(cm, cmap='bwr')
plt.xticks(tick_positions, columns_)
plt.yticks(tick_positions, columns_)
cbar = fig.colorbar(ax)
plt.show()

In [None]:
## See more in slides

### Evaluation 

In [None]:
# Read the moon dataset and preview its first five rows.
moon_df = pd.read_csv('Data/moon_data.csv')
moon_df.head(n=5)

In [None]:
# Features (columns x0 and x1) and target (column y) as NumPy arrays.
X = moon_df[['x0', 'x1']].to_numpy()
y = moon_df['y'].to_numpy()

#### Visualizing the data

In [None]:
# One scatter series per class value: (value in y, colour, marker, legend text).
class_styles = (
    (1, 'b', 'x', 'class label 1'),
    (-1, 'r', 's', 'class label 2'),
)
for class_value, colour, marker_style, legend_text in class_styles:
    in_class = y == class_value
    plt.scatter(X[in_class, 0],
                X[in_class, 1],
                c=colour, marker=marker_style,
                label=legend_text)

plt.title('Moon dataset')
plt.xlabel('X0')
plt.ylabel('X1')
plt.legend()

plt.show()

In [None]:
### Decision Tree Classifier
# from sklearn.tree import DecisionTreeClassifier

# Entropy-based decision tree limited to depth 3; fit() returns the estimator.
tree = DecisionTreeClassifier(max_depth=3, criterion='entropy').fit(X, y)

# Draw the learned regions, then decorate the figure.
plot_decision_regions(X, y, classifier=tree)
plt.title('Decision Tree')
plt.xlabel('X0')
plt.ylabel('X1')
plt.legend()

plt.show()

### Random forest

In [None]:
# from sklearn.ensemble import RandomForestClassifier

# Random forest of 10 entropy-based trees, fitted with n_jobs=2 parallel jobs.
forest = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    n_jobs=2,
)
forest.fit(X, y)

# Draw the learned regions, then decorate the figure.
plot_decision_regions(X, y, classifier=forest)
plt.title('Random Forest Classifier')
plt.xlabel('X0')
plt.ylabel('X1')
plt.legend()

plt.show()

#### Evaluating Random Forest Classifier

In [None]:
# from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing (the split is random on each run).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# 10000-tree forest trained on the training part only; fit() returns the model.
forest = RandomForestClassifier(n_estimators=10000, n_jobs=5).fit(X_train, y_train)

# Predictions for the held-out samples, used by the metrics below.
y_pred = forest.predict(X_test)

In [None]:
# from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the forest on the training data, run as one job.
scores = cross_val_score(forest, X_train, y_train, cv=5, n_jobs=1)

In [None]:
# from sklearn.metrics import f1_score, make_scorer, recall_score, precision_score

# Hold-out metrics, each computed lazily just before its line is printed so
# the evaluation order matches one print statement per metric.
metric_rows = (
    ('Accuracy: %.3f', lambda: forest.score(X_test, y_test)),
    ('Precision: %.3f', lambda: precision_score(y_test, y_pred)),
    ('Recall: %.3f', lambda: recall_score(y_test, y_pred)),
    ('F1: %.3f', lambda: f1_score(y_test, y_pred)),
)
for template, compute in metric_rows:
    print(template % compute())

In [None]:
# Per-fold scores, then their mean and standard deviation.
print('CV accuracy scores: %s' % scores)
cv_mean, cv_std = np.mean(scores), np.std(scores)
print('CV accuracy: %.3f +/- %.3f' % (cv_mean, cv_std))