### Quick Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Data Inspection

In [None]:
# Quick-look checks on `data`. This cell never creates `data`, so it has to
# exist before the cell runs.
# NOTE: a notebook cell only renders its last bare expression, so run these
# lines one at a time (or wrap them in display()) to see each result.

# first 10 rows and last 10 rows for inspection
data.head(10)
data.tail(10)

# column names
data.columns

# column data types
data.dtypes

# type casting if necessary 
# ('col' looks like a placeholder column name and the three casts are
# presumably alternatives -- as written they run one after another)
data['col'] = data['col'].astype(str)
data['col'] = data['col'].astype(int)
data['col'] = data['col'].astype(float)

# num rows, num cols, num null and non-null, variable names
data.info()

# distribution of continuous
# (the extra percentiles expose the tails, not just the quartiles)
data.describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])

# data cardinality
# (number of distinct values per column)
data.nunique()

# feature value counts
data['col'].value_counts()

# check null
data.isnull().sum()  # null count per column
data.isnull().mean()  # null fraction per column

# check or drop duplicates
# (.shape[0] of the result is the number of rows flagged as duplicates)
data[data.duplicated()].shape

### EDA Graphing

In [None]:
# Each snippet below is a stand-alone plotting recipe for the DataFrame `data`;
# 'col1', 'col2', 'col' and 'class' are placeholder column names.

# pairwise correlation plots
# (numeric columns only: non-numeric columns cannot be correlated and make
# DataFrame.corr raise on pandas >= 2.0)
sns.heatmap(data.corr(numeric_only=True), square=True, annot=True)

# visualize nulls on a sample of at most 1000 rows
# (asking for more rows than the frame holds raises a ValueError)
sns.heatmap(data.sample(min(len(data), 1000)).isnull())

# pairwise comparison (and histogram when row = col)
pd.plotting.scatter_matrix(data, figsize=(15, 10))

# scatterplot with semi transparency for many dense data points
sns.scatterplot(x='col1', y='col2', data=data, alpha=0.3)

# hexplots
# (data= is required so the column-name strings can be resolved)
sns.jointplot(x='col1', y='col2', data=data, kind='hex')

# boxplots
sns.boxplot(x='col1', y='col2', data=data)

# countplots
# (countplot accepts only one of x / y; use hue to split by a second column)
sns.countplot(x='col1', hue='col2', data=data)

# histogram of features by class in classification
sns.histplot(data=data, x='col', hue='class')

# density plot (similar to histogram but smoother)
data.plot.density()


### Data Cleaning 

In [None]:
# imputation
# mean imputation: only numeric columns have a mean, so restrict to them
data = data.fillna(data.mean(numeric_only=True))
# mode imputation: DataFrame.mode() returns a DataFrame (one row per tied
# mode); passing it to fillna aligns on the row index and fills almost
# nothing, so take the first row to get one fill value per column
data = data.fillna(data.mode().iloc[0])
data.loc[(data.col > 100), 'col'] = 100 # generic replace

# dropping duplicate rows and rows that contain nulls
data = data.drop_duplicates()
data = data.dropna()


# outlier clipping: clamp values to a pair of percentile bounds
def percentile_clip(x, low_bound, high_bound):
    """Clamp ``x`` to its own ``low_bound`` and ``high_bound`` quantiles.

    Parameters
    ----------
    x : object exposing ``.quantile`` and ``.clip`` (e.g. a pandas Series)
    low_bound : quantile passed to ``x.quantile`` for the lower limit
    high_bound : quantile passed to ``x.quantile`` for the upper limit

    Returns
    -------
    The result of ``x.clip`` with those two limits.
    """
    floor, ceiling = x.quantile(low_bound), x.quantile(high_bound)
    return x.clip(floor, ceiling)


# outlier clipping: clamp to mean +/- 3 standard deviations
def three_std_clip(x):
    """Clamp ``x`` to the band of three standard deviations around its mean.

    The mean and standard deviation come from ``np.mean(x)`` and ``np.std(x)``;
    the limits are then handed to ``x.clip``, whose result is returned.
    """
    center = np.mean(x)
    spread = 3 * np.std(x)
    return x.clip(center - spread, center + spread)
  
    
# outlier clipping: clamp to 1.5 interquartile ranges beyond the quartiles
def IQR_clip(x):
    """Clamp ``x`` to ``[q1 - 1.5 * IQR, q3 + 1.5 * IQR]``.

    ``q1`` and ``q3`` are ``x.quantile(.25)`` and ``x.quantile(.75)`` and
    ``IQR`` is their difference. Returns the result of ``x.clip``.
    """
    lower_q, upper_q = x.quantile(.25), x.quantile(.75)
    fence = 1.5 * (upper_q - lower_q)
    return x.clip(lower_q - fence, upper_q + fence)

### Data Transformation

In [None]:
from sklearn.preprocessing import PolynomialFeatures, Binarizer, StandardScaler, MinMaxScaler, OneHotEncoder

# Each snippet below is a stand-alone recipe applied to the DataFrame `data`.
# Transformers whose output has a different number of columns than their input
# (polynomial expansion, one-hot encoding) are stored in new frames, because
# the result cannot be assigned back onto the original columns.

# feature indexing based on types
continuous = data.select_dtypes(include=[np.number]).columns # all continuous columns
categorical = data.select_dtypes(exclude=[np.number]).columns # all categorical columns

# generate up to n degree of polynomial terms and interaction terms
poly = PolynomialFeatures(degree=3, include_bias=False)
poly_features = pd.DataFrame(
    poly.fit_transform(data[continuous]),
    columns=poly.get_feature_names_out(continuous),
    index=data.index,
)

# standardization and normalization
data[continuous] = StandardScaler().fit_transform(data[continuous])
data[continuous] = MinMaxScaler().fit_transform(data[continuous])

# binarization (threshold is what to use to binarize)
data[continuous] = Binarizer(threshold=1).fit_transform(data[continuous])

# one hot encoding
# (the encoder returns a sparse matrix by default, hence .toarray())
encoder = OneHotEncoder(drop='first')
encoded = pd.DataFrame(
    encoder.fit_transform(data[categorical]).toarray(),
    columns=encoder.get_feature_names_out(categorical),
    index=data.index,
)

# normalization, standardization, clipping -- pandas-only equivalents,
# restricted to the numeric columns (arithmetic on string columns raises)
data[continuous] = (data[continuous] - data[continuous].mean()) / data[continuous].std()
data[continuous] = (data[continuous] - data[continuous].min()) / (data[continuous].max() - data[continuous].min())
lower_limit, upper_limit = 0, 1  # placeholder limits; replace with real bounds
data[continuous] = data[continuous].clip(lower_limit, upper_limit)

# alternate one hot encoding with drop_first (# likely want to include "drop_first=True" https://towardsdatascience.com/beware-of-the-dummy-variable-trap-in-pandas-727e8e6b8bde)
pd.get_dummies(data, drop_first=True)

# binarization ('label' and 'category' are placeholders for the target
# column and the value that should map to 1.0)
data['label'] = (data['label'] == 'category') * 1.0

# sampling:
data.sample(1000) # down-sample (needs at least 1000 rows when replace=False)
data.sample(20000, replace=True) # up-sample


### Model Fitting 

In [None]:
# Imports 
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, LogisticRegression
from xgboost import XGBRegressor, XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier


# Instantiate
# One ready-made instance per model family; the hyperparameters are starting
# points to tune, not recommendations.
linreg = LinearRegression()
logreg = LogisticRegression()
lasso = Lasso(alpha=0.5) # alpha=0 is OLS
ridge = Ridge(alpha=0.5) # alpha=0 is OLS
en = ElasticNet(alpha=0.5, l1_ratio=1) # alpha=0 is OLS, l1_ratio=0 is Ridge, l1_ratio=1 is Lasso
xgbr = XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)
# (a stray double comma here used to make the whole cell a SyntaxError)
xgbc = XGBClassifier(n_estimators=100, max_depth=10, eta=0.1, reg_alpha=0, reg_lambda=1, colsample_bylevel=1, colsample_bytree=1, gamma=0)
dtc = DecisionTreeClassifier(criterion='gini', max_depth=5, min_samples_split=2, min_samples_leaf=1)
rfr = RandomForestRegressor(n_estimators=100, max_depth=10)
rfc = RandomForestClassifier(n_estimators=100, max_depth=10)
# the row-subsampling argument is spelled `subsample` (no underscore)
gbc = GradientBoostingClassifier(loss='log_loss', learning_rate=.1, n_estimators=100, max_depth=10, subsample=.8)
kn = KNeighborsClassifier(n_neighbors=10, weights='uniform', algorithm='auto')


In [None]:
# simple grid search
from sklearn.model_selection import GridSearchCV

# every combination of these values is cross-validated (3 x 3 = 9 candidates)
param_grid = {
    'n_estimators': [200, 300, 400],
    'max_depth': [3, 5, 7],
}

# use the imported class directly: the bare name `sklearn` is never imported
grid_search = GridSearchCV(estimator=rfc,
                           param_grid=param_grid,
                           cv=5,
                           verbose=2)
grid_search.fit(X, y)

# display best parameters and best score
print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# model with the best parameters: with the default refit=True the search has
# already retrained the winning configuration on all of (X, y), so reuse it
# instead of fitting a second time (this also keeps rfc's other settings)
best_rfc = grid_search.best_estimator_

In [None]:
from sklearn.model_selection import train_test_split
# hold out 25% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 2)

# training and predictions
# NOTE(review): `model`, `X` and `y` are not created in this cell -- `model`
# presumably stands for one of the estimators instantiated earlier; confirm
# and assign it before running
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# mean_absolute_error is called below, so it must be imported;
# RocCurveDisplay replaces plot_roc_curve, which was removed from scikit-learn
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, RocCurveDisplay

# regression
r2_score(y_test, y_pred)
mean_squared_error(y_test, y_pred)
mean_absolute_error(y_test, y_pred)

# classification
accuracy_score(y_test, y_pred)
precision_score(y_test, y_pred)
recall_score(y_test, y_pred)
# ROC AUC ranks samples by score, so give it the positive-class probability
# rather than hard 0/1 predictions
# (assumes a binary classifier that implements predict_proba -- TODO confirm)
y_score = model.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_score)
f1_score(y_test, y_pred)
RocCurveDisplay.from_estimator(model, X_test, y_test)

In [None]:
# linear / logistic regression pull coefficients
import statsmodels.api as sm

# statsmodels does not add an intercept by itself. Add the constant column to
# the matrices that are actually used for fitting and predicting, under new
# names so the cell can be re-run without stacking extra constant columns.
# has_constant='add' forces the column even if a feature happens to be
# constant inside one of the splits, keeping train and test shapes aligned.
X_train_const = sm.add_constant(X_train, has_constant='add')
X_test_const = sm.add_constant(X_test, has_constant='add')

# the two fits are alternatives: the second assignment overwrites the first
model = sm.OLS(y_train, X_train_const).fit()     # linear regression
model = sm.Logit(y_train, X_train_const).fit()   # logistic regression
model.predict(X_test_const)

print(model.summary())               # show summary
p_vals = pd.DataFrame(model.pvalues) # pull p-values
coefs = pd.DataFrame(model.params)   # pull coefficients

In [None]:
# feature importances from a fitted random forest
from sklearn.ensemble import RandomForestClassifier

# random_state is fixed so repeated runs build the same forest
rf = RandomForestClassifier(random_state=0).fit(X, y)

# one entry per column of X: its name next to its importance score
feat_importance_dict = dict(
    feature=X.columns,
    importance=rf.feature_importances_,
)

pd.DataFrame(feat_importance_dict)

### Unsupervised Learning

In [None]:
from sklearn.decomposition import PCA

# PCA: project X onto its first two principal components.
# fit_transform fits and projects in one pass, so a separate .fit(X) beforehand
# only repeats the same work.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# PCA plot
plt.figure(figsize=(20,15))
plt.scatter(X_pca[:,0], X_pca[:,1])
plt.xlabel('principal component 1')
plt.ylabel('principal component 2')
plt.show()

# PCA plot with label (points coloured by the target y)
plt.figure(figsize=(20,15))
plt.scatter(X_pca[:,0], X_pca[:,1], c=y)
plt.xlabel('principal component 1')
plt.ylabel('principal component 2')
plt.show()

In [None]:
from sklearn.cluster import KMeans

# Elbow Method: fit K-Means for k = 1..num_clusters_to_try and plot the
# inertia of each fit against k; the bend in the curve suggests a cluster count.
num_clusters_to_try = 20
cluster_counts = range(1, num_clusters_to_try + 1)  # upper bound is inclusive
inertias = []
for clusters in cluster_counts:
    # fixed seed so the curve is the same on every run
    kmeans = KMeans(n_clusters=clusters, random_state=1).fit(X_pca)
    inertias.append(kmeans.inertia_)

plt.plot(list(cluster_counts), inertias)
plt.xlabel('number of clusters')
plt.ylabel('inertia')

In [None]:
# K-Means: cluster the 2-D PCA projection into four groups
kmeans = KMeans(n_clusters=4, random_state=1)
kmeans.fit(X_pca)
labels = kmeans.predict(X_pca)  # cluster index for every row of X_pca

# scatter of the two components, coloured by assigned cluster
first_component, second_component = X_pca[:, 0], X_pca[:, 1]
plt.scatter(first_component, second_component, c=labels)