## Python General

List Comprehension

In [None]:
# Template: build a list by applying f to every element x of iterable
[f(x) for x in iterable]

In [1]:
import time

import numpy as np
import pandas as pd

## Sklearn

### General Syntax (Sklearn)

In [None]:
from sklearn.family import Model ## `family` / `Model` are placeholders for a real sklearn submodule and estimator class
model = Model() # Create model instance
model.fit(X, y) # Fit the instance to X and y

# Supervised
model.predict(X_test) # Generate either number (for regression) or classes (for classification)
model.predict_proba(X_test) # Generate probability estimates for classification
model.score() # Arguments are omitted in this template

# Unsupervised
model.predict() # Arguments are omitted in this template
model.transform()
model.fit_transform()

### Preprocessing

In [None]:
from sklearn.preprocessing import StandardScaler, FunctionTransformer
# Treat it like an unsupervised model - fit and transform

# For custom / log transforms use FunctionTransformer
transformer = FunctionTransformer(np.log1p, validate=True)  # wraps np.log1p in a transformer object
transformer.transform(X)  # called here without a preceding fit

### Model Selection

CV and Train-Test-Split

In [4]:
from sklearn.model_selection import train_test_split,LeaveOneOut, KFold, cross_val_score

# test_size must be passed by keyword: a third positional argument would be
# treated as another array to split. 0.3 is only an example value.
train_test_split(X, y, test_size=0.3)

# cv takes a splitter object: LeaveOneOut() or KFold(n_splits=<number of splits>).
# Optional keywords: scoring=<metric name or callable>; extra arguments for
# fit() go in fit_params.
# NOTE(review): newer scikit-learn releases rename fit_params to params - confirm for your version.
cross_val_score(model, X, y, cv=KFold(n_splits=5))

### Model Selection: Hyperparameter Optimisation

GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.svm import SVC

# Example for SVCs

# Dictionary: keys are estimator parameter names, values are the candidates to try
param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']}
# cv=KFold(n_splits=...) sets the number of folds; 5 is only an example value
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3, cv=KFold(n_splits=5))
grid.fit(X_train,y_train)
grid.best_params_
grid.best_estimator_
pred = grid.predict(X_test)

### Model Selection: Feature Selection

Subset Selection

In [None]:
##### Best Subset Selection
import itertools 

def processSubset(feature_set):
    """Fit an OLS model on the given columns and report its residual sum of squares.

    Reads the notebook-level names ``X`` and ``y`` and the module alias ``sm``.
    NOTE(review): ``sm`` is not imported in the visible cells - presumably
    ``statsmodels.api``; confirm.

    Args:
        feature_set: iterable of column labels to select from ``X``.

    Returns:
        dict with the fitted results object under "model" and the in-sample
        residual sum of squares under "RSS".
    """
    columns = list(feature_set)
    design = X[columns]
    fitted = sm.OLS(y, design).fit()
    residuals = fitted.predict(design) - y
    return {"model": fitted, "RSS": (residuals ** 2).sum()}

def getBest(k):
    """Fit every k-column subset of ``X.columns`` and return the one with the lowest RSS.

    Reads the notebook-level name ``X`` and calls ``processSubset`` once per
    combination. Prints how many models were fitted and the elapsed time.

    Args:
        k: number of predictors in each candidate subset.

    Returns:
        The row (pandas Series with "model" and "RSS" entries) of the
        candidate whose RSS is smallest.
    """
    tic = time.time()

    results = [processSubset(combo) for combo in itertools.combinations(X.columns, k)]

    # Wrap everything up in a nice dataframe
    models = pd.DataFrame(results)

    # Choose the model with the lowest RSS.
    # idxmin() returns the index label, which is what label-based .loc expects
    # (argmin() returns a position and only matches on a default RangeIndex).
    best_model = models.loc[models['RSS'].idxmin()]

    toc = time.time()
    print("Processed ", models.shape[0], "models on", k, "predictors in", (toc-tic), "seconds.")

    # Return the best model, along with some other useful information about the model
    return best_model

# One row per subset size k, filled with whatever getBest(k) returns
models = pd.DataFrame(columns=["RSS", "model"])

tic = time.time()
# Subset sizes 1 through 7
for i in range(1,8):
    models.loc[i] = getBest(i)

toc = time.time()
print("Total elapsed time:", (toc-tic), "seconds.")

models # DF with best models for each k parameter (not the cell's last expression, so Jupyter will not display it)
print(models.loc[2, "model"].summary()) # model summary

In [1]:
##### Forward Selection

def forward(predictors):
    """Run one forward-selection step: try adding each unused column and keep the best.

    Reads the notebook-level name ``X`` and calls ``processSubset`` once per
    candidate. Prints how many models were fitted and the elapsed time.

    Args:
        predictors: list of column labels already selected. It must be a
            list, because candidates are built with ``predictors + [p]``.

    Returns:
        The row (pandas Series with "model" and "RSS" entries) of the
        candidate whose RSS is smallest.
    """
    # Pull out predictors we still need to process
    remaining_predictors = [p for p in X.columns if p not in predictors]

    tic = time.time()

    results = [processSubset(predictors + [p]) for p in remaining_predictors]

    # Wrap everything up in a nice dataframe
    models = pd.DataFrame(results)

    # Choose the model with the lowest RSS.
    # idxmin() returns the index label, which is what label-based .loc expects
    # (argmin() returns a position and only matches on a default RangeIndex).
    best_model = models.loc[models['RSS'].idxmin()]

    toc = time.time()
    print("Processed ", models.shape[0], "models on", len(predictors)+1, "predictors in", (toc-tic), "seconds.")

    # Return the best model, along with some other useful information about the model
    return best_model

# One row per model size, filled with whatever forward() returns
models2 = pd.DataFrame(columns=["RSS", "model"])

tic = time.time()
predictors = []  # start with no predictors selected

for i in range(1,len(X.columns)+1):    
    models2.loc[i] = forward(predictors)
    # Feed the chosen model's exog_names into the next round
    # (presumably the column labels it was fitted on - confirm)
    predictors = models2.loc[i]["model"].model.exog_names

toc = time.time()
print("Total elapsed time:", (toc-tic), "seconds.")

In [None]:
###### Backward Selection


def backward(predictors):
    """Run one backward-selection step: try dropping each predictor and keep the best.

    Calls ``processSubset`` on every combination of ``len(predictors) - 1``
    of the given predictors. Prints how many models were fitted and the
    elapsed time.

    Args:
        predictors: sized iterable of the column labels currently selected.

    Returns:
        The row (pandas Series with "model" and "RSS" entries) of the
        candidate whose RSS is smallest.
    """
    tic = time.time()

    results = [
        processSubset(combo)
        for combo in itertools.combinations(predictors, len(predictors)-1)
    ]

    # Wrap everything up in a nice dataframe
    models = pd.DataFrame(results)

    # Choose the model with the lowest RSS.
    # idxmin() returns the index label, which is what label-based .loc expects
    # (argmin() returns a position and only matches on a default RangeIndex).
    best_model = models.loc[models['RSS'].idxmin()]

    toc = time.time()
    print("Processed ", models.shape[0], "models on", len(predictors)-1, "predictors in", (toc-tic), "seconds.")

    # Return the best model, along with some other useful information about the model
    return best_model

# Pre-indexed by model size: 1 .. len(X.columns)-1
models3 = pd.DataFrame(columns=["RSS", "model"], index = range(1,len(X.columns)))

tic = time.time()
predictors = X.columns  # start with every column selected

# Each pass stores the best model with one predictor fewer, until one remains
while(len(predictors) > 1):  
    models3.loc[len(predictors)-1] = backward(predictors)
    # Feed the chosen model's exog_names into the next round
    # (presumably the column labels it was fitted on - confirm)
    predictors = models3.loc[len(predictors)-1]["model"].model.exog_names

toc = time.time()
print("Total elapsed time:", (toc-tic), "seconds.")

In [2]:
##### Recursive Feature Elimination
# https://scikit-learn.org/stable/modules/feature_selection.html

### Prediction Evaluation

In [None]:
from sklearn import metrics  # needed for the metrics.* calls below
from sklearn.metrics import r2_score, classification_report, confusion_matrix

# Regression: goodness of fit and error metrics, all against the same held-out targets
r2_score(y_test, predictions)
MAE = metrics.mean_absolute_error(y_test, predictions)
MSE = metrics.mean_squared_error(y_test, predictions)
RMSE = np.sqrt(MSE)  # reuse MSE instead of computing it a second time

# Regression
# Residual plot
# NOTE(review): `plt` is not imported in the visible cells - presumably matplotlib.pyplot; confirm
plt.scatter(X_test, y_test - predictions) # or y_train for training residuals

# Classification
classification_report(y_test, predictions)
confusion_matrix(y_test, predictions)

# F1 score is the harmonic mean of precision and recall; it reaches its best value at 1 and its worst at 0.
# F1 = 2 * (precision * recall) / (precision + recall)

![title](pred.png)

### Models

##### Linear Regression & Logistic Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Model coefficients
# NOTE(review): `lm` is not created in the visible cells - presumably a fitted LinearRegression; confirm
lm.coef_ 
lm.intercept_

##### Discriminant Analysis

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

model.priors_      # P(Y = k): probability that a randomly selected observation comes from the kth class
model.means_       # u_k: mean of the Gaussian model / observations for the kth class
model.coef_        # Coefficients of the linear combinations of predictors

##### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

##### Decision Trees and Random Forests

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Display Decision Trees

from io import StringIO  # sklearn.externals.six was removed from scikit-learn; the stdlib class provides the same text buffer

from IPython.display import Image
from sklearn.tree import export_graphviz
import pydot

features = list(df.columns[1:]) # without target class

# export_graphviz writes the DOT description of the tree into this in-memory buffer
dot_data = StringIO()
export_graphviz(dtree, out_file=dot_data, feature_names=features, filled=True, rounded=True)

# The first element of the parsed result is rendered to PNG for inline display
graph = pydot.graph_from_dot_data(dot_data.getvalue())
Image(graph[0].create_png())

##### SVCs and SVMs

In [None]:
from sklearn.svm import SVC

##### K-Means-Clustering

In [None]:
from sklearn.cluster import KMeans

# NOTE(review): `kmeans` is not created in the visible cells - presumably a fitted KMeans; confirm
kmeans.cluster_centers_
kmeans.labels_

##### PCA

In [None]:
from sklearn.decomposition import PCA

x_pca = pca.transform(scaled_data) # Principal Components
pca.components_ # The components correspond to combinations of the original features; the components themselves
# are stored as an attribute of the fitted PCA object.
# Use a heatmap to display the components (convert to a DataFrame first)