<a href="https://colab.research.google.com/github/ekapti/CheatSheet/blob/master/Python_Cheatsheet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Jupyter Notebook Presentations**

View -> Cell Toolbar -> Slideshow

In a terminal, go to the directory that contains the notebook and run the following command:
jupyter nbconvert *.ipynb --to slides --post serve  

Replace `*` with the name of your notebook.

You can open the generated HTML file directly, or browse to port 8000 (http://localhost:8000), where `--post serve` serves the slides.

**Python CheatSheet**



In [0]:
# Move the 'outcome' column to the front so it appears at the top of the corr plot.
# `DataFrame.ix` was removed in pandas 1.0; `.loc` performs the same label-based
# column selection.
cols = list(df)
cols.insert(0, cols.pop(cols.index('outcome')))
df = df.loc[:, cols]

In [0]:
# Keep only the columns whose label is NOT listed in `names`.
df.loc[:, df.columns[~df.columns.isin(names)]]

**Categorical Feature Finder**

In [0]:
# This code finds categorical features of a dataset so that we can dummy encode them. 

def categorical_selector(df, min_levels=3, max_levels=10):
    """Return the names of the columns that look like multi-level categoricals.

    A column qualifies when its number of distinct non-null values lies in
    ``[min_levels, max_levels]``. The defaults reproduce the original rule:
    more than 2 and at most 10 distinct values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    min_levels : int, default 3
        Smallest number of distinct values for a column to be selected.
    max_levels : int, default 10
        Largest number of distinct values for a column to be selected.

    Returns
    -------
    list
        Selected column names, in the frame's column order.
    """
    # Count distinct values once per column instead of rebuilding the column
    # list and calling nunique() twice on every iteration.
    level_counts = df.nunique()
    return [
        name
        for name, count in level_counts.items()
        if min_levels <= count <= max_levels
    ]

                
# Collect the multi-level categorical columns of `df` (candidates for dummy encoding).
categorical_vars = categorical_selector(df) 

# This is the binary version of it

def binary_var_finder(dataframe):
    """Return the names of the columns that hold exactly two distinct values.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    list
        Names of the binary columns, in the frame's column order. Missing
        values are not counted as a distinct value.
    """
    # Inspect the frame that was passed in; the previous version read the
    # global `df` and ignored its own argument.
    level_counts = dataframe.nunique()
    return [name for name, count in level_counts.items() if count == 2]

**Random Forest Hyperparameter Tuning**

In [0]:
# Random Forest Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV
# `np` and `pprint` are used below; import them here so the cell runs on a fresh kernel.
from pprint import pprint

import numpy as np

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
# NOTE(review): 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3;
# on current versions replace it with 'sqrt', 'log2' or None before fitting.
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree (None lets the trees grow without a depth limit)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid: one candidate list per hyperparameter name
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)
{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

**CAP CURVE BUILDER**

In [0]:

#The ‘capcurve’ function that builds and shows the CAP curve is defined as follows :

import matplotlib.pyplot as plt
from scipy import integrate
# NOTE(review): 'agg' is a non-interactive backend; inside a notebook this may
# keep figures from rendering inline — confirm the switch is intended.
plt.switch_backend('agg')
from scipy import integrate  # duplicate of the import two lines above

def capcurve(y_values, y_preds_proba, percent=0.5):
    """Build and show the Cumulative Accuracy Profile (CAP) curve of a model.

    Observations are ranked by decreasing predicted score and the cumulative
    share of positives captured is plotted against the share of observations
    examined, next to the perfect and the random model. The accuracy ratio
    shown in the title is
    ``(area_model - area_random) / (area_perfect - area_random)``.

    Parameters
    ----------
    y_values : array-like
        True labels; the number of positives is taken as their sum, so they
        are presumably encoded as 0/1.
    y_preds_proba : array-like
        Predicted score of the positive class, one per observation.
    percent : float, default 0.5
        Fraction of the data (between 0 and 1) at which the captured share
        of positives is highlighted with dashed guide lines.

    Returns
    -------
    None
        The 20 top-ranked rows are printed and the figure is shown.
    """
    # int(): the count is used as an array index below, which fails for floats.
    num_pos_obs = int(np.sum(y_values))
    num_count = len(y_values)
    rate_pos_obs = float(num_pos_obs) / float(num_count)
    ideal = pd.DataFrame({'x':[0,rate_pos_obs,1],'y':[0,1,1]})
    xx = np.arange(num_count) / float(num_count - 1)

    # Column 0 = true label, column 1 = score; rank rows by decreasing score.
    y_cap = np.c_[y_values,y_preds_proba]
    y_cap_df_s = pd.DataFrame(data=y_cap)
    y_cap_df_s = y_cap_df_s.sort_values([1], ascending=False).reset_index(drop=True)

    print(y_cap_df_s.head(20))

    yy = np.cumsum(y_cap_df_s[0]) / float(num_pos_obs)
    yy = np.append([0], yy[0:num_count-1]) #add the first curve point (0,0) : for xx=0 we have yy=0

    # Share of positives captured at `percent` of the data, read off the curve
    # by linear interpolation. (The previous hand-rolled version weighted with
    # (x2 - percent) instead of (percent - x1) and used a bracket that did not
    # necessarily contain `percent`.)
    val = float(np.interp(percent, xx, yy))

    # Clamp so that an all-positive sample does not index past the end of xx.
    ideal_end = min(num_pos_obs, num_count - 1)
    sigma_ideal = 1 * xx[num_pos_obs - 1 ] / 2 + (xx[num_count - 1] - xx[ideal_end]) * 1
    # `simps` was renamed `simpson` and later removed from SciPy; prefer the
    # new name and fall back to the old one on legacy installations.
    simpson = getattr(integrate, 'simpson', None) or integrate.simps
    sigma_model = simpson(yy, x=xx)
    sigma_random = simpson(xx, x=xx)

    ar_value = (sigma_model - sigma_random) / (sigma_ideal - sigma_random)

    fig, ax = plt.subplots(nrows = 1, ncols = 1)
    ax.plot(ideal['x'],ideal['y'], color='grey', label='Perfect Model')
    ax.plot(xx,yy, color='red', label='User Model')
    ax.plot(xx,xx, color='blue', label='Random Model')
    ax.plot([percent, percent], [0.0, val], color='green', linestyle='--', linewidth=1)
    ax.plot([0, percent], [val, val], color='green', linestyle='--', linewidth=1, label=str(val*100)+'% of positive obs at '+str(percent*100)+'%')

    plt.xlim(0, 1.02)
    plt.ylim(0, 1.25)
    plt.title("CAP Curve - a_r value ="+str(ar_value))
    plt.xlabel('% of the data')
    plt.ylabel('% of positive obs')
    plt.legend()
    plt.show()
    
# Usage: score the test set with the fitted model `rc`, then draw the CAP
# curve from the second probability column.
from matplotlib import cm
y_pred_proba = rc.predict_proba(X_test)
capcurve(y_test, y_pred_proba[:, 1])

**Create Cut Points for Buckets**

In [0]:
import numpy as np
import pandas as pd

# Demo frame: 20 random integers in [1, 79], each labelled with its bucket.
# Bin edges are right-inclusive: (0, 5], (5, 31], (31, 51], (51, 80].
df = pd.DataFrame({'value': np.random.randint(low=1, high=80, size=20)})
df['group'] = pd.cut(
    x=df['value'],
    bins=[0, 5, 31, 51, 80],
    labels=["very short", "short", "long", "very long"],
)


**Confusion Matrix Plot**

In [0]:
1.
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current axes.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix; the y axis is labelled 'True label' and the x axis
        'Predicted label'.
    classes : sequence
        Tick labels, used for both axes.
    normalize : bool, default False
        When True every row is divided by its sum before plotting, so both
        the colours and the annotations show row-wise rates.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.
    """
    cm = np.asarray(cm)
    # Normalise BEFORE drawing: the image was previously rendered from the raw
    # counts, so colours and annotated values disagreed when normalize=True.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Two decimals for normalised rates; raw values keep their plain str() form.
    cell_format = '.2f' if normalize else ''
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_format),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so it accounts for the axis labels.
    plt.tight_layout()


In [0]:
2.
# `metrics` is used below; import it here so the cell runs on a fresh kernel.
from sklearn import metrics

# Nine cut-offs fill the 3x3 subplot grid exactly. A tenth value (1.0) made
# plt.subplot(3, 3, 10) raise, and with the strict ">" comparison it would
# predict no positives at all.
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or nan when the denominator is 0."""
    return float(numerator) / float(denominator) if denominator else float('nan')


plt.figure(figsize=(10,10))
np.set_printoptions(precision=2)
class_names = [0, 1]

for position, threshold in enumerate(thresholds, start=1):
    # Predict the positive class when its score exceeds the cut-off.
    y_test_predictions_high_recall = y_pred_proba[:, 1] > threshold

    plt.subplot(3, 3, position)

    # Compute confusion matrix (rows = true class, columns = predicted class)
    cnf_matrix = metrics.confusion_matrix(y_test, y_test_predictions_high_recall)
    tn, fp = cnf_matrix[0,0], cnf_matrix[0,1]
    fn, tp = cnf_matrix[1,0], cnf_matrix[1,1]

    print('Accuracy of the testing data:', _safe_ratio(tn + tp, fp + fn + tn + tp))
    print("Recall metric in the testing dataset @: ",threshold,'::', _safe_ratio(tp, fn + tp))
    print("Precision metric in the testing dataset: ", _safe_ratio(tp, tp + fp))
    print('___________________________________________________')
    print('Recall metric on 0 values:', _safe_ratio(tn, tn + fp))
    print('Precision metric on 0 values:', _safe_ratio(tn, tn + fn))
    # Plot non-normalized confusion matrix; the title matches the strict comparison above
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Threshold > %s'%threshold)


**Learning Curves Plot**

In [0]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : estimator object
        Model handed to ``learning_curve``.
    title : str
        Title of the figure.
    X, y : array-like
        Training data and target handed to ``learning_curve``.
    ylim : tuple (ymin, ymax), optional
        Limits of the y axis; left to matplotlib when None.
    cv, n_jobs, train_sizes
        Passed through unchanged to ``learning_curve``.

    Returns
    -------
    module
        ``matplotlib.pyplot``, so the caller can keep customising or show
        the figure.
    """
    # Local import: `learning_curve` is not imported anywhere else, so the
    # function used to fail with NameError on a fresh kernel.
    from sklearn.model_selection import learning_curve

    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    # Scores have one column per CV fold; summarise across folds.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    # Shaded bands: mean +/- one standard deviation.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt

# Learning curves of the `logit` model on the training set, 25 CV folds.
g = plot_learning_curve(
    estimator=logit,
    title="Logit learning curves",
    X=X_train,
    y=y_train,
    cv=25,
)

**Precision Recall Curve**

In [0]:
from sklearn.metrics import precision_recall_curve

# NOTE(review): precision_recall_curve expects scores/probabilities for the
# positive class — confirm `y_predict` holds scores rather than hard labels.
precisions, recalls, thresholds = precision_recall_curve(y_train, y_predict)


def plot_precision_recall_vs_thresholds(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold.

    The last element of `precisions` and `recalls` is dropped before plotting
    so that both align with `thresholds`.
    """
    plt.plot(thresholds, precisions[:-1], "b--", label = 'Precisions')
    plt.plot(thresholds, recalls[:-1], "g-", label = 'Recalls')
    plt.xlabel('Threshold')
    plt.legend(loc = 'upper left')
    plt.ylim([0,1])


plot_precision_recall_vs_thresholds(precisions, recalls, thresholds)
plt.show()

**Outlier Detector**

In [0]:
def detect_outliers(df,n,features):
    """
    Return the index labels of the rows of `df` that are outliers in more
    than `n` of the given `features`, according to the Tukey method.

    A value is an outlier for a feature when it lies more than 1.5 * IQR
    below the first quartile or above the third quartile of that feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect.
    n : int
        A row is reported when it is an outlier in strictly more than `n`
        features.
    features : iterable
        Column names to examine.

    Returns
    -------
    list
        Index labels of the rows with more than `n` outlying values.
    """
    # Local import: `Counter` is not imported anywhere else, so the function
    # used to fail with NameError on a fresh kernel.
    from collections import Counter

    outlier_indices = []

    # iterate over features(columns)
    for col in features:
        # 1st quartile (25%) and 3rd quartile (75%)
        Q1, Q3 = np.percentile(df[col], [25, 75])
        # Interquartile range (IQR)
        IQR = Q3 - Q1

        # outlier step
        outlier_step = 1.5 * IQR

        # Determine a list of indices of outliers for feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index

        # append the found outlier indices for col to the list of outlier indices
        outlier_indices.extend(outlier_list_col)

    # select observations that are outliers in more than n features
    outlier_counts = Counter(outlier_indices)
    multiple_outliers = [index for index, count in outlier_counts.items() if count > n]

    return multiple_outliers

**Recursive Feature Elimination**

In [0]:
# Pick the most important feature names
# Pair every RFE rank with a column name and sort by rank (best first).
ranked_features = sorted(zip((round(rank, 4) for rank in rfe.ranking_), X_train.columns[1:]))
# Lazy view of the rank-1 (selected) features.
filtered_features = filter(lambda pair: pair[0] == 1, ranked_features)
# Keep the names ranked below 11; this cut-off determines how many top features are kept.
feature_columns = [name for rank, name in ranked_features if rank < 11]

**Permutation Importance ELI5**

In [0]:
import eli5
from eli5.sklearn import PermutationImportance

# Fit permutation importance for `logit` on the training data, then show the
# weight of every feature (top=None disables truncation of the table).
perm = PermutationImportance(logit, random_state=1)
perm.fit(X_train, y_train)
eli5.show_weights(perm, feature_names=X_train.columns.tolist(), top=None)