In [1]:
# Standard Data Science Toolkit
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')
import category_encoders as ce

# Machine Learning Preprocessing and Scoring Metrics
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures, OneHotEncoder, LabelEncoder, FunctionTransformer
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score, RepeatedKFold, KFold
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score, mean_squared_error, r2_score, explained_variance_score, roc_curve, auc
from sklearn.feature_selection import SelectFromModel
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
import time
import psutil
import os

# Machine Learning Algorithms
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from xgboost import XGBClassifier, XGBRegressor
from sklearn.svm import SVC, SVR
from tpot import TPOTClassifier

# Natural Language Processing
import string
from nltk.tokenize import word_tokenize
import nltk
import re



In [None]:
# Label Encoder - maps the target column's values onto integers in the range [0, n)
# Learn the label -> integer mapping from the training target only
encoder = LabelEncoder().fit(y_train)
# Apply that single learned mapping to both splits
y_train_encoded = encoder.transform(y_train)
y_test_encoded = encoder.transform(y_test)

In [None]:
# Column Transformer
# Each entry is a (name, transformer, columns) tuple; columns that no entry
# claims are forwarded unchanged because remainder='passthrough'.
col_transformer = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        # One-hot encode the 'category' column
        ('ohe', OneHotEncoder(handle_unknown='ignore', categories='auto'), ['category']),
        # Template slot: `transformer` and 'columns_to_apply_to' are placeholders to fill in
        ('name_2', transformer(), ['columns_to_apply_to']),
    ],
)


In [None]:
# Example of a BaggingClassifier
from sklearn.ensemble import BaggingClassifier

# An ensemble of 20 bagged decision trees, each using the gini criterion and
# limited to depth 5
bagged_tree = BaggingClassifier(
    DecisionTreeClassifier(max_depth=5, criterion='gini'),
    n_estimators=20,
)

In [None]:
# Feature Importances + Visualization
# Placeholder: replace '' with a fitted estimator that exposes `feature_importances_`.
# As written `model` is a str, which has no such attribute, so the `features`
# line below raises AttributeError until the placeholder is filled in.
model = ''

# Importance scores read from the model (presumably one per training feature -- confirm for your estimator)
features = model.feature_importances_

def plot_feature_importances(model, feature_names=None):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model : object
        Anything exposing a ``feature_importances_`` sequence, one value per
        feature.
    feature_names : sequence, optional
        Y-axis labels, one per importance. When omitted, the labels fall back
        to the notebook-level ``X_train.columns`` (the original behaviour), so
        ``X_train`` must then exist and be the frame the model was fitted on.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of importances.
    """
    importances = model.feature_importances_
    if feature_names is None:
        # Backward-compatible default: label bars with the training columns.
        feature_names = X_train.columns.values

    n_features = len(importances)
    if len(feature_names) != n_features:
        # Fail early with a readable message instead of a shape error from matplotlib.
        raise ValueError(
            f"Got {len(feature_names)} feature names for {n_features} feature importances"
        )

    plt.figure(figsize=(8, 8))
    plt.barh(range(n_features), importances, align='center')
    plt.yticks(np.arange(n_features), feature_names)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')

# Pass the estimator itself: the helper reads `.feature_importances_` from its
# argument, so handing it the already-extracted `features` array would fail.
plot_feature_importances(model)

In [None]:
# Pipelines
# Chain the preprocessing steps and the final estimator into one object.
# `step_1_action`, `step_2_action` and `clf` are template placeholders to fill in.
pipe = Pipeline(steps=[
    ('preprocessing_step_1', step_1_action),
    ('preprocessing_step_2', step_2_action),
    ('clf', clf()),
])

# Fit every step on the training split
pipe.fit(X_train, y_train)

# Score on the held-out split (the final estimator's default metric);
# as the last expression of the cell it is displayed automatically
pipe.score(X_test, y_test)

In [None]:
# More complex pipeline

def preprocess_data_with_pipeline(X):
    """Encode, feature-engineer and scale ``X`` with one scikit-learn pipeline.

    The pipeline:
      1. one-hot encodes the "category" column and passes every other column
         through unchanged,
      2. derives an odd/even indicator from the "number" column,
      3. concatenates (1) and (2) side by side with a FeatureUnion,
      4. standardizes all resulting columns with StandardScaler.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain "category" and "number" columns. The relabelling step at
        the end assumes "number" is the only column besides "category"; with
        extra columns the hard-coded label list will not match the data.

    Returns
    -------
    tuple
        ``(transformed, pipe)``: a DataFrame of the scaled features indexed
        like ``X``, and the fitted Pipeline.
    """

    ### Encoding categorical data ###
    # sparse_threshold=0 forces a dense result: the one-hot step can otherwise
    # yield a sparse matrix for higher-cardinality data, which StandardScaler
    # (centering enabled by default) refuses to process.
    original_features_encoded = ColumnTransformer(transformers=[
        ("ohe", OneHotEncoder(categories="auto", handle_unknown="ignore"), ["category"])
    ], remainder="passthrough", sparse_threshold=0)

    ### Feature engineering ###
    def is_odd(data):
        """
        Helper function that returns 1 if odd, 0 if even
        """
        return data % 2

    # Only the engineered column is kept here; the raw columns already arrive
    # through the encoding branch above.
    feature_eng = ColumnTransformer(transformers=[
        ("add_number_odd", FunctionTransformer(is_odd), ["number"])
    ], remainder="drop")

    ### Combine encoded and engineered features ###
    feature_union = FeatureUnion(transformer_list=[
        ("encoded_features", original_features_encoded),
        ("engineered_features", feature_eng)
    ])

    ### Pipeline (including scaling) ###
    pipe = Pipeline(steps=[
        ("feature_union", feature_union),
        ("scale", StandardScaler())
    ])

    transformed_data = pipe.fit_transform(X)

    ### Re-apply labels (optional step for readability) ###
    # Column order mirrors the union: one-hot categories, the passthrough
    # "number" column, then the engineered indicator.
    encoder = original_features_encoded.named_transformers_["ohe"]
    category_labels = encoder.categories_[0]
    all_cols = list(category_labels) + ["number", "number_odd"]
    return pd.DataFrame(transformed_data, columns=all_cols, index=X.index), pipe
    
# Rebuild example_X from the source frame: every column except the target
example_X = example_data.drop(columns="target")
# Run the preprocessing helper, keeping both the transformed frame and the fitted pipeline
result, pipe = preprocess_data_with_pipeline(example_X)
result

In [None]:
# Cross Validation
# Placeholder: replace '' with the estimator to evaluate
clf = ''
# Average the per-fold scores into a single number
mean_clf_cv = np.mean(cross_val_score(clf, X_train, y_train))
print(f"Mean Cross Validation Score for Random Forest Classifier: {mean_clf_cv :.2%}")

In [None]:
# GridSearchCV
# Set hyperparameter grid (keys and values below are placeholders to fill in)
param_grid = {
    'hyperparameter_1': ['name_1', 'name_2'],
    'hyperparameter_2': [None, 1, 2, 3, 4, 5],
}

# Search the grid with 3-fold cross validation, recording train scores as well
gs = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, return_train_score=True)
gs.fit(X_train, y_train)

# Summarize the search: each score is averaged over every parameter combination.
# Fit/score timings are also available in gs.cv_results_.
gs_train_score = np.mean(gs.cv_results_['mean_train_score'])
gs_test_score = np.mean(gs.cv_results_['mean_test_score'])
gs_best_params = gs.best_params_

print(f"Mean Training Score: {gs_train_score :.2%}")
print(f"Mean Test Score: {gs_test_score :.2%}")
print(f"Best Parameter Combination Found During Grid Search: {gs_best_params}")

In [None]:
# Recommender Systems
# Imports (the API resembles sklearn's, but the workflow differs)
from surprise import Reader, Dataset
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
# Imported under an alias so it does not rebind `GridSearchCV`, which the imports
# cell at the top of the notebook binds to scikit-learn's class. Without the alias,
# re-running the sklearn grid-search cell after this one silently uses Surprise's.
from surprise.model_selection import GridSearchCV as SurpriseGridSearchCV

# Read in values as Surprise dataset
# `df` is a placeholder for the ratings frame -- presumably user, item and rating
# columns in that order; confirm against the Surprise documentation.
reader = Reader(rating_scale=(0, 5))    # Customize to rating range
data = Dataset.load_from_df(df, reader)



In [None]:
# Natural Language Processing

def clean_string(list):
    """Flatten text fragments into one lowercase, punctuation-free string.

    The fragments are concatenated with no separator, every newline becomes a
    single space, each character in ``string.punctuation`` is removed, and the
    result is lowercased.

    Note: the parameter name shadows the built-in ``list`` inside this
    function; it is kept so existing callers keep working.
    """
    flattened = ' '.join(''.join(list).split('\n'))
    # Mapping a code point to None makes str.translate delete that character
    punctuation_free = flattened.translate(dict.fromkeys(map(ord, string.punctuation)))
    return punctuation_free.lower()

# `___` is a fill-in slot: replace it with the text (or list of text fragments) to clean.
# NOTE(review): in IPython `___` also names the third-most-recent cell output, so a
# forgotten placeholder may run without raising -- confirm it has been replaced.
cleaned_string = clean_string(___)
# Bare expression: only rendered when it is the final expression of its cell
cleaned_string

# Tokenize the cleaned text with NLTK's word_tokenize
tokenized_string = word_tokenize(cleaned_string)

# Vectorize
def count_vectorize(tokenized_song):
    """Count how many times each token occurs.

    Returns a plain dict mapping token -> occurrence count, with keys in
    order of first appearance in ``tokenized_song``.
    """
    counts = {}
    for token in tokenized_song:
        # Unseen tokens start from zero, then every occurrence adds one
        counts[token] = counts.get(token, 0) + 1
    return counts