In [8]:
from __future__ import absolute_import, division, print_function

%matplotlib inline
import matplotlib.pyplot as plt

In [9]:
plt.rcParams['figure.figsize'] = 6, 5  # plotsize 
plt.rcParams['font.size'] = 14
plt.rcParams['image.cmap'] = 'viridis'

In [10]:
import numpy as np
import pandas as pd
from pandas.tools.plotting import scatter_matrix
from pandas.tools.plotting import parallel_coordinates

In [11]:
import sklearn.metrics
from sklearn.cluster import AgglomerativeClustering, MeanShift, KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [12]:
import sqlalchemy

# Dataframes / input

In [None]:
# make new empty df
df = pd.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)


Reading in data

In [50]:
df = pd.read_csv('breast-cancer-wisconsin.data', names = ['B','C','D','E','F','G','H','I','J','K'],index_col = 0)
# can also use read_sql, read_json, read_html, read_excel, read_table
# can include parse_dates = [cols] and infer_datetime_format = True to read in dates
# can specify which columns with usecols = []
# can flag comments using comment = '#' for comments starting with #
# can specify encoding, encoding = ''
print(df.shape)
df.head(1)

(699, 10)


Unnamed: 0,B,C,D,E,F,G,H,I,J,K
1000025,5,1,1,1,2,1,3,1,1,2


In [48]:
# StringIO lives in a different module on Python 2 and Python 3
try:
    from StringIO import StringIO  # Python 2
except ImportError:
    from io import StringIO  # Python 3
Data = 'a,b,c\n1,2,3\n4,5,6\n7,8,9'
# dtype=object keeps every value as a string; usecols lists the columns to keep
df = pd.read_csv(StringIO(Data), dtype=object, usecols=['a', 'b', 'c'])
df


Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


### SQL in pandas

In [None]:
# create_engine is reached through the sqlalchemy module imported at the top of the notebook
engine = sqlalchemy.create_engine('sqlite:///:memory:', echo=False) # set echo to True to see output
con = engine.connect()
# `query` is a placeholder for the SQL statement to run
result = con.execute(query)

df = pd.read_sql(sql, con)
# sql is string SQL query or database table name
# con is SQLAlchemy connectable

# Data cleaning

In [None]:
# print column names, # non-nulls, and dtypes
df.info()

# return number of non-NaN entries
non_nans = df.count()

# remove rows or columns with NaN
df.dropna(axis=0 or 1, how='any' or 'all', thresh=None or int, subset=None (rows or cols in non-axis dim), inplace=False)

# return the number of counts for each of the values in a column
df['B'].value_counts()

# filling nans
df.fillna('replacement')
df.fillna(method = 'pad'/'bfill')
df.fillna(df.mean())
df.interpolate(method = 'time'/'values') #best for time series

# return the indices for "True" values from the statement within the brackets, can apply any function, lambda function
A = df.index[df['Bare_Nuclei'].apply(np.isnan)]

# replace values in a column with specified values
df['B'] = df['B'].replace('?',np.nan)

# can also create a map (dict) to replace multiple values with new inputs
map_dict = {2: 0, 4: 1}
Y = df['K'].map(map_dict)

# change the data type of a column
df['B'] = df['B'] .astype('float')

# drop a column (axis=1 refers to column labels)
X1 = df.drop('C',axis=1)
# drop a row by its index label (axis=0 refers to row labels)
X2= df.drop(1,axis=0)

# get column names as a list
df.columns

# choose n random samples
df.sample(n)

# remove outliers
df[np.abs(df.Data-df.Data.mean())<=(3*df.Data.std())] 

### Joining tables

In [None]:
# concatenate
pd.concat([df1,df2], axis=0, join='outer'/'inner', join_axes=[df1.index/df2.index], ignore_index=False, 
          keys=[hierarchical indexing])
df1.append(df2, ignore_index=False/True) # adds rows of df2 to the bottom of df1

# merge
pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,
         left_index=False, right_index=False, sort=True,
         suffixes=('_x', '_y'), copy=True, indicator=False)
# how - inner, outer, left, right
# on - cols to join on, must be in both dfs
# left_on / right_on - cols from df_left / df_right to use as keys
# left_index / right_index - if True, use as keys
# sort - sort by join keys (set to False to speed up)
pd.merge_ordered(left, right, fill_method='ffill', left_by='s')

# join - on index as keys
result = left.join(right,how='inner'/'outer', on='key')

### Geolocations

In [None]:
# To geolocate a query to an address and coordinate
from geopy.geocoders import Nominatim
geolocator = Nominatim()
location = geolocator.geocode('27240 Altamont, Los Altos Hills')
location.address
location.latitude, location.longitude

# to get an address from coordinates
location = geolocator.reverse("52.509669, 13.376294")

# measuring distances
from geopy.distance import vincenty
location1 = (41.49008, -71.312796)
location2 = (41.456, -72.3542)
distance = vincenty(location1, location2).meters

### Dates and timestamps

In [None]:
# convert to DatetimeIndex:
pd.to_datetime(*args,**kwargs)
    # can assemble from multiple columns, or from a series of strings, or tuples
    # errors = 'coerce' to set invalid values as NaT
    # format = '%d/%m/%Y'
    # infer_datetime_format = True to speed up

# make index with date ranges starting at start date and ending at end date or with P periods and frequency F
index = pd.date_range('2000-1-1', periods=1000, freq='M')
# for business days
index = pd.bdate_range('2012-1-1', periods=250)

pd.Period('2012-01-10', freq = 'D')
# time span. specify the frequency / interval time

pd.period_range()

# indexing with datetime
ts['10/31/2011':'12/31/2011'] # returns all rows between these dates
# can also pass in the year or month as strings:
ts['2011']
ts['2011-6']
dft[datetime(2013, 1, 1):datetime(2013,2,28)]

# convert timestamps to period
ts.to_period()

# Data exploration

In [None]:
# output a table of basic stats for each numeric column
df.describe()

# plot histogram of each column - if performed on grouped df, will automatically separate groups
df[col].plot.hist(alpha = 0) # useful for comparing multiple groups on same axes
df.hist() # plots hist for every column

# plot bar chart of value counts
df['col_name'].value_counts()[:10].plot(kind = 'bar')

# plot cross-plots
from pandas.tools.plotting import scatter_matrix
scatter_matrix(df[], alpha = 1, diagonal = 'kde'/'hist')

# parallel coordinates plot
from pandas.tools.plotting import parallel_coordinates
parallel_coordinates(df[],'K')

# hexbin plot for dense data
plt.hexbin(a,b, gridsize = 50, cmap = 'viridis', C = df['col_name'], vmin = 1750, vmax = 1800,
            reduce_C_function = np.min)
plt.colorbar()

# stacked area plot
df.plot.area();

# boxplot
bp = df_box.boxplot(by='g') # grouped by col g
    
# get index of maximum/minimum value
df.idxmax(axis=0)
df.idxmin(axis=0)

In [None]:
# function for plotting correlation heatmap
def plot_corr_heatmap(corrs, labels, cmap='viridis'):
    """Draw a correlation matrix as a colour-coded grid with labelled axes.

    corrs  : matrix of correlation values handed to ``plt.matshow``
    labels : sequence of tick labels, applied to both axes
    cmap   : name of the matplotlib colormap to use
    """
    tick_positions = range(len(labels))
    plt.matshow(corrs, cmap=cmap, interpolation='nearest')
    # x labels are rotated so long column names do not overlap
    plt.xticks(tick_positions, labels, rotation=90)
    plt.yticks(tick_positions, labels)
    plt.colorbar(label='Correlation', fraction=0.046, shrink=1.0)
    # switch grid lines off so they do not cut across the cells
    plt.grid(0)

In [None]:
corrs = df.corr(method='pearson')
plot_corr_heatmap(corrs, df.columns.values, cmap='viridis')

### Group-by in pandas

In [None]:
df.groupby('A') # splits on rows
df.groupby(['A','B'])
# can group by a user-defined function
grouped = df.groupby(function,axis=1)
# based on hierarchical indexing
df.groupby(level = 0)

# aggregate functions on grouped data
grouped.first(), grouped.last(), grouped.sum()

grouped.groups # returns a dictionary of groups

# iterate through groups
for name, group in df.groupby('A'):
    print(name)
    print(group)
    
# select a group
grouped.get_group(label)

grouped.size()
grouped.describe()
grouped.count()

# apply a function to each group
zscores = grouped.transform(lambda x: (x-x.mean())/x.std())
imputed = grouped.transform(lambda x: x.fillna(x.mean()))

# filter data based on group properties
df.groupby('A').filter(lambda x: x.sum() > 2) # must return True or False when applied to group

# Modeling

## One-hot encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
enc.fit_transform(X).toarray()

## Test / train split

In [51]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## Dimensionality reduction

In [None]:
from sklearn.decomposition import PCA
PCA(n_components=None, whiten=False, svd_solver='auto', tol=0.0)

## Feature normalization

In [None]:
# set to zero mean and unit variance
from sklearn.preprocessing import StandardScaler
scale = StandardScaler(copy=True, with_mean=True, with_std=True)
trainX = scale.fit_transform(trainX)
testX = scale.transform(testX)
# other methods include inverse_transform(X), partial_fit(X), get_params()

# can also use RobustScaler, which is robust to outliers

## Machine Learning algorithms

In [None]:
# general scikit-learn methods
model = Model(*args)
model.fit(X)
model.fit_predict(X)
model.get_params()
model.predict(X)
model.score(X)

### Clustering

In [None]:
# k-means
from sklearn.cluster import KMeans
KMeans(n_clusters = 8, tol=0.0001)

# mean shift
from sklearn.cluster import MeanShift
MeanShift(bandwidth = None, seeds = None, cluster_all = True)

# agglomerative clustering
from sklearn.cluster import AgglomerativeClustering
AgglomerativeClustering(n_clusters = 2, affinity = 'euclidean'/'l1'/'l2'/'cosine', 
                        linkage = 'ward'/'average')

# DBSCAN
from sklearn.cluster import DBSCAN
DBSCAN(eps=0.5, min_samples = 5)

### Classification

In [None]:
# kNN
from sklearn.neighbors import KNeighborsClassifier
kNN = KNeighborsClassifier(n_neighbors = 5, weights='uniform'/'distance')

# SVM
from sklearn.svm import SVC
svm = SVC(kernel = 'rbf'/'linear'/'poly',probability=True, class_weight = 'balanced')

# Logistic Regression
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(penalty = 'l1'/'l2', multi_class='ovr'/'multinomial')

# Random Forest
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=10, max_depth=None,min_samples_split=2, oob_score=True)

# Boosted Trees
from sklearn.ensemble import GradientBoostingClassifier
BT = GradientBoostingClassifier(loss = 'deviance'/'exponential', learning_rate = 0.1,
                                n_estimators = 100,max_depth = 3, min_samples_split = 2)
# deviance - logistic regression, exponential - adaboost

### Regression

In [None]:
# options separated by '/' are alternatives - pick one; n_folds is a placeholder
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
LinearRegression()
LassoCV()
RidgeCV()
ElasticNetCV(l1_ratio = 0.5, cv = n_folds, normalize = True/False)

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
RandomForestRegressor(n_estimators=10, max_depth=None,min_samples_split=2, oob_score=True)
GradientBoostingRegressor(loss='ls'/'lad'/'huber',learning_rate = 0.1, n_estimators=100,min_samples_split=2,max_depth=3)
# base_estimator takes an estimator instance, not the class itself
AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), n_estimators=50, learning_rate=1.0,
                 loss='linear'/'square'/'exponential')

## Model selection and evaluation

### Hyper-parameter optimizers

In [None]:
from sklearn.model_selection import GridSearchCV
# param_grid is a dict with param names as keys and lists of param settings as values
gsearch = GridSearchCV(estimator, param_grid, cv = n_folds)
gsearch.cv_results_
gsearch.best_estimator_
gsearch.best_score_
gsearch.best_params_

### Metrics

In [None]:
# classification
sklearn.metrics.accuracy_score(y_true,y_pred)
sklearn.metrics.auc(x,y) # computes area under any curve
sklearn.metrics.confusion_matrix(y_true,y_pred)
sklearn.metrics.precision_score(y_true,y_pred)
sklearn.metrics.recall_score(y_true,y_pred)

# regression
sklearn.metrics.explained_variance_score(y_true, y_pred)
sklearn.metrics.mean_absolute_error(y_true, y_pred)
sklearn.metrics.mean_squared_error(y_true, y_pred)
sklearn.metrics.r2_score(y_true, y_pred)

In [None]:
def classification_stats(model, X_test, y_test):
    """Print evaluation statistics for a fitted classifier.

    For each class (only the second class when there are exactly two
    probability columns) prints one-vs-rest accuracy and ROC AUC, then the
    confusion matrix and the classification report for the whole test set.

    Parameters
    ----------
    model : fitted classifier exposing ``predict`` and ``predict_proba``
    X_test : features to evaluate on
    y_test : iterable of true labels; each label must be convertible with ``int``

    Returns
    -------
    None - every result is printed.
    """
    predicted = model.predict(X_test)
    probabilities = model.predict_proba(X_test)
    n_columns = probabilities.shape[1]

    # Column i of predict_proba belongs to model.classes_[i]. Labels derived
    # from set(y_test) have no guaranteed order and miss any class that is
    # absent from the test set, so prefer the model's own class list.
    model_classes = getattr(model, 'classes_', None)
    if model_classes is None:
        # fallback for models that do not expose classes_
        model_classes = sorted({int(v) for v in y_test})
    classes = [int(v) for v in model_classes]

    # Get Accuracy and ROC AUC results for each class individually.
    # With two columns only the second (positive) class is reported.
    start = 1 if n_columns == 2 else 0
    for i in range(start, n_columns):
        probs = probabilities[:, i]
        current_class = classes[i]
        # one-vs-rest encoding of truth and prediction for this class
        y_test_i = [1 if current_class == int(v) else 0 for v in y_test]
        predicted_i = [1 if current_class == int(v) else 0 for v in predicted]
        print('Class {}'.format(current_class))
        print('Accuracy: {:0.2f}'.format(sklearn.metrics.accuracy_score(y_test_i, predicted_i)))
        print('ROC AUC Score: {:0.2f}'.format(sklearn.metrics.roc_auc_score(y_test_i, probs)))
        print()

    print('Confusion Matrix')

    # Print out confusion matrix legend  if only 2 classes
    if len(classes) == 2:
        print('True Negative (Guess 0, Actual 0)  | False Positive (Guess 1, Actual 0)')
        print('-----------------------------------------------------------------------')
        print('False Negative (Guess 0, Actual 1) |  True Positive (Guess 1, Actual 1)')
        print()

    print(sklearn.metrics.confusion_matrix(y_test, predicted))
    print()
    print('Classification Report')
    print(sklearn.metrics.classification_report(y_test, predicted))

In [None]:
def plot_roc_auc(model, X_test, y_test):
    """Plot one-vs-rest ROC curves for a fitted classifier.

    One curve is drawn per probability column (only the second column when
    there are exactly two), labelled with the class and its AUC.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``
    X_test : features to evaluate on
    y_test : iterable of true labels; each label must be convertible with ``int``

    Returns
    -------
    The ``matplotlib.pyplot`` module, with the figure drawn on the current axes.
    """
    probabilities = model.predict_proba(X_test)
    n_columns = probabilities.shape[1]

    # Column i of predict_proba belongs to model.classes_[i]. Labels derived
    # from set(y_test) have no guaranteed order and miss any class that is
    # absent from the test set, so prefer the model's own class list.
    model_classes = getattr(model, 'classes_', None)
    if model_classes is None:
        # fallback for models that do not expose classes_
        model_classes = sorted(set(y_test))
    classes = list(model_classes)

    # Get ROC curve for each class individually.
    # With two columns only the second (positive) class is plotted.
    start = 1 if n_columns == 2 else 0
    for i in range(start, n_columns):
        probs = probabilities[:, i]
        current_class = classes[i]
        # one-vs-rest encoding of the true labels for this class
        y_test_i = [1 if current_class == int(v) else 0 for v in y_test]

        false_positive_rate, true_positive_rate, thresholds = sklearn.metrics.roc_curve(y_test_i, probs)
        roc_auc = sklearn.metrics.auc(false_positive_rate, true_positive_rate)

        plt.plot(false_positive_rate, true_positive_rate,
            label='{} - AUC = {:0.2f}'.format(current_class, roc_auc))

    plt.title('Receiver Operating Characteristic')
    plt.legend(loc='lower right')
    # dashed diagonal is the line TPR == FPR
    plt.plot([0,1],[0,1],'r--')
    plt.xlim([0,1])
    plt.ylim([0,1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    return plt

In [None]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5),
                        scoring=None):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : integer, cross-validation generator, optional
        If an integer is passed, it is the number of folds. If None,
        scikit-learn's default cross-validation is used.
        Specific cross-validation objects can be passed, see
        sklearn.model_selection module for the list of possible objects

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).

    train_sizes : array-like, optional
        Training-set sizes at which the curve is evaluated, forwarded to
        ``learning_curve`` (default: 5 evenly spaced fractions from 0.1 to 1.0).

    scoring : string or callable, optional
        Scoring method forwarded to ``learning_curve``, e.g.
        'neg_mean_squared_error'. None (default) uses the estimator's
        own ``score`` method.

    Returns
    -------
    The ``matplotlib.pyplot`` module, with the learning curve drawn on a new
    figure.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator,
        X, y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
        scoring=scoring,
    )
    # scores have one row per training size and one column per CV fold;
    # aggregate over the folds
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    # shaded bands show +/- one standard deviation around each mean curve
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt