In [None]:
import os
import category_encoders as ce
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import StackingRegressor
from config import AGE_DICT, SIZE_DICT, TARGET_COLS, BINARY_COLS


import matplotlib.pyplot as plt
import seaborn as sns

# Data preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Clustering Models
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, accuracy_score, classification_report, confusion_matrix, roc_curve, roc_auc_score, auc
from sklearn.manifold import TSNE

# Classification models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Database connection URL, read from the environment so that no credentials
# are stored in the notebook.
HEROKU_URL = os.getenv('HEROKU_POSTGRESQL_AMBER_URL')

# Fail fast with an actionable message; without this check an unset variable
# surfaces as "'NoneType' object has no attribute 'startswith'" below.
if not HEROKU_URL:
    raise RuntimeError(
        "Environment variable HEROKU_POSTGRESQL_AMBER_URL is not set; "
        "it must contain the PostgreSQL connection URL."
    )

# Rewrite a leading "postgres://" scheme to "postgresql://" (first occurrence
# only). `uri` is what read_data() hands to pd.read_sql.
# NOTE(review): presumably needed because SQLAlchemy 1.4+ no longer accepts
# the "postgres://" scheme name - confirm against the installed version.
uri = HEROKU_URL
if uri.startswith("postgres://"):
    uri = uri.replace("postgres://", "postgresql://", 1)

In [None]:
# reading data
def read_data():
    """Load the ``petfinder_with_dates`` table into a DataFrame.

    The module-level ``uri`` is passed to ``pd.read_sql`` as the connection.

    Returns:
        The object returned by ``pd.read_sql`` for that table.
    """
    return pd.read_sql('petfinder_with_dates', uri)

In [None]:
def _map_to_int(series, mapping, column):
    """Map ``series`` through ``mapping`` and cast the result to int.

    Raises:
        ValueError: if any value of ``series`` has no entry in ``mapping``
            (this includes missing values). The offending values are listed
            in the message rather than failing later on ``int('nan')``.
    """
    mapped = series.map(mapping)
    unmapped = series[mapped.isna()].unique().tolist()
    if unmapped:
        raise ValueError(
            f"Column '{column}' contains values with no mapping: {unmapped}"
        )
    return mapped.astype(str).astype(int)


def preprocess_data(df):
    """Clean the raw frame and encode its columns.

    Steps, in order:
      1. drop the columns that are not used,
      2. map ``age`` / ``size`` through ``AGE_DICT`` / ``SIZE_DICT`` and cast
         them to int,
      3. keep only rows whose gender is not 'Unknown' and whose ``los`` is at
         least 1,
      4. cast ``BINARY_COLS`` to 0/1,
      5. target-encode ``TARGET_COLS`` against ``los``.

    Args:
        df: raw DataFrame. It is not modified; all work happens on new frames.

    Returns:
        A new, independent DataFrame with the transformations applied.

    Raises:
        ValueError: if ``age`` or ``size`` holds a value that is missing from
            its mapping dictionary.
    """
    # dropping irrelevant columns (returns a new frame, the input is untouched)
    df = df.drop(columns=["id", "name", "organization_id", "published_at", "status_changed_at", "attribute_declawed", "color_tertiary", "good_with_cats", "good_with_children", "good_with_dogs", "breed_secondary", "color_secondary"])

    # transform "age" and "size" columns into integer codes
    df['age'] = _map_to_int(df['age'], AGE_DICT, 'age')
    df['size'] = _map_to_int(df['size'], SIZE_DICT, 'size')

    # Drop unknown genders and keep only los 1+. A boolean mask filters by row
    # position (not by index label), and .copy() gives an independent frame so
    # the column assignments below do not write into a slice.
    keep = (df['gender'] != 'Unknown') & (df['los'] >= 1)
    df = df[keep].copy()

    # Convert binary columns to binary (0/1) data type.
    # NOTE(review): astype(bool) turns NaN and any non-empty string into True;
    # confirm BINARY_COLS hold real booleans without missing values.
    df[BINARY_COLS] = df[BINARY_COLS].astype(bool).astype(int)

    # NOTE(review): 'gender' is not converted to a number here; the
    # replacement below is disabled. Confirm it is covered by TARGET_COLS if a
    # numeric frame is required downstream.
    # df['gender'] = df['gender'].replace({"Male": 0, "Female": 1}).astype(int)

    # Target encoding on larger categorical features. The encoder is fitted on
    # the whole filtered frame with `los` as target, so the encoded columns
    # carry information derived from `los`.
    te = ce.TargetEncoder(cols=TARGET_COLS)
    df[TARGET_COLS] = te.fit_transform(df[TARGET_COLS], df["los"])

    return df


In [None]:
def fill_nan_mode(df, reference_column, feature):
    """Fill missing ``feature`` values with the mode of their group.

    Rows are grouped by ``reference_column``; every missing ``feature`` value
    is replaced by the most frequent non-null ``feature`` value found in the
    same group. Values stay missing when the whole group is null or when the
    row's ``reference_column`` value is itself missing.

    Args:
        df: DataFrame to fill. It is modified in place.
        reference_column: name of the column that defines the groups.
        feature: name of the column whose missing values are filled.

    Returns:
        The same ``df`` object, with ``feature`` updated.
    """
    # Nothing to group or fill in an empty frame.
    if df.empty:
        return df

    def _first_mode(values):
        # mode() ignores missing values, so it is empty only for all-null groups.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else np.nan

    # One mode per group, then broadcast it back onto the rows through the
    # reference column. This replaces a Python-level callback per row.
    mode_dict = dict(df.groupby(reference_column)[feature].apply(_first_mode))
    group_mode = df[reference_column].map(mode_dict)

    # Only missing entries are replaced; existing values are left untouched.
    df[feature] = df[feature].fillna(group_mode)

    return df

In [None]:
# Dropping rows with null coat and color primary
def drop_null_rows(df, feature):
    """Remove, in place, every row of ``df`` whose ``feature`` value is null.

    Args:
        df: DataFrame to prune; it is mutated.
        feature: name of the column that must not be null.

    Returns:
        The same ``df`` object after the rows were removed.
    """
    df.dropna(axis='index', subset=[feature], inplace=True)
    return df

In [None]:
# remove outliers
def remove_outliers(df, columns, zscore_threshold=3):
    """Drop rows whose z-score exceeds ``zscore_threshold`` in any column.

    Columns are processed one after another: the mean and standard deviation
    of each column are computed on the rows that survived the previous
    columns. Rows whose value in a processed column is missing are dropped as
    well, because a NaN z-score never satisfies ``z <= threshold``.

    A column whose standard deviation is zero or undefined (constant column,
    or fewer than two rows) cannot contain outliers and is skipped. Without
    that guard every z-score would be NaN and all rows would be removed.

    Args:
        df: input DataFrame; it is not modified.
        columns: iterable of numeric column names to check.
        zscore_threshold: largest absolute z-score that is kept.

    Returns:
        ``df`` itself when nothing was filtered, otherwise an independent copy
        holding the surviving rows.
    """
    filtered = df
    for col in columns:
        mean = filtered[col].mean()
        std = filtered[col].std()
        # No spread means no outliers; also avoids dividing by zero.
        if pd.isna(std) or std == 0:
            continue
        z_scores = ((filtered[col] - mean) / std).abs()
        filtered = filtered[z_scores <= zscore_threshold]
    # Copy so callers can assign into the result without touching a slice.
    return filtered if filtered is df else filtered.copy()

In [None]:
# # split data into training and testing data
# def split_data(X, y, test_size = .33, random_state=312):
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
#     return X_train, X_test, y_train, y_test

In [None]:
# # Perform Randomized Search
# def perform_randomized_search(model, param_distributions, X_train, y_train, scoring='r2', cv=5):
#     random_search = RandomizedSearchCV(model, param_distributions=param_distributions, scoring=scoring, cv=cv, n_jobs=-1, random_state=0)
#     random_search.fit(X_train, y_train)
#     # best_model = random_search.best_estimator_
    
#     return random_search

In [None]:
# # train and eval models

# def train_eval_models(X_train, X_test, y_train, y_test, model):
#     # Create a pipeline to scale the features and initialize the model
#     pipeline = Pipeline([
#         ('scaler', StandardScaler()),
#         ('model', model)
#     ])

#     # Perform cross-validation with additional metrics
#     scores_r2 = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')
#     scores_mae = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
#     scores_mse = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

#     mean_score_r2 = scores_r2.mean()
#     mean_score_mae = -scores_mae.mean()
#     mean_score_mse = -scores_mse.mean()

#     # Model name and store results with each model
#     name = model.__class__.__name__
#     print('{} done. Mean R-squared (CV): {:.2f}, Mean MAE (CV): {:.2f}, Mean MSE (CV): {:.2f}'.format(
#         name, mean_score_r2, mean_score_mae, mean_score_mse))

#     # Train the best model on the entire training set and evaluate on the test set
#     pipeline.fit(X_train, y_train)
#     y_test_pred = pipeline.predict(X_test)

#     print('R-squared (test set): {:.2f}'.format(r2_score(y_test, y_test_pred)))
#     print('Mean squared error (test set): {:.2f}'.format(mean_squared_error(y_test, y_test_pred)))
#     print('Mean absolute error (test set): {:.2f}'.format(mean_absolute_error(y_test, y_test_pred)))




In [None]:
# Read data from the database: read_data() loads the 'petfinder_with_dates'
# table; df_raw is the starting point for the preprocessing cells below.
df_raw = read_data()

In [None]:
# Preprocess data: drop unused columns, encode age/size/binary/target columns
# and filter rows (see preprocess_data). df_raw itself is left unchanged
# because preprocess_data starts from a new frame.
df_preprocessed = preprocess_data(df_raw)

In [None]:
# Remove outliers: drop rows lying more than 3 standard deviations (the
# default zscore_threshold) from the mean in any of the listed columns.
# NOTE(review): 'organization_name' and 'breed_primary' must be numeric at
# this point - presumably they are target-encoded via TARGET_COLS; confirm.
outlier_columns = ['organization_name', 'los', 'breed_primary']
df_no_outliers = remove_outliers(df_preprocessed, outlier_columns)

In [None]:
# Fill missing 'coat' and 'color_primary' values with the most frequent value
# of the same 'breed_primary' group.
# fill_nan_mode writes into the frame it receives, so start from a copy to
# leave df_no_outliers untouched, and feed the result of the first call into
# the second instead of relying on that in-place side effect.
df = fill_nan_mode(df_no_outliers.copy(), 'breed_primary', 'coat')
df = fill_nan_mode(df, 'breed_primary', 'color_primary')

In [None]:
# Discard the rows that still have no 'coat' or no 'color_primary' value
for column in ('coat', 'color_primary'):
    df = drop_null_rows(df, column)

In [None]:
# # Split data into training and testing sets
# X = df.drop('los', axis = 1)
# y = df['los']
# X_train, X_test, y_train, y_test = split_data(X, y)

In [None]:
# models_params = {
#     RandomForestRegressor: {'n_estimators': [10, 50, 100, 200], 'max_depth': [None, 10, 20, 30], 'random_state': [0]},
#     GradientBoostingRegressor: {'n_estimators': [10, 50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0], 'random_state': [0]},
#     XGBRegressor: {'n_estimators': [10, 50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0], 'random_state': [0]},
#     LGBMRegressor: {'n_estimators': [10, 50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]}
# }

In [None]:
# # saves the best estimator found by randomized search for each model with key being the model name
# best_models = {}
# for model, params in models_params.items():
#     randomized_search = perform_randomized_search(model(), params, X_train, y_train)
#     best_model = randomized_search.best_estimator_
#     best_models[model.__name__] = best_model
#     train_eval_models(X_train, X_test, y_train, y_test, best_model)


In [None]:
# # Perform Stacking: defining the estimators it pulls models from the best_models dictionary
# estimators = [
#     ('ridge', Ridge()),
#     ('rf', best_models['RandomForestRegressor']),
#     ('gb', best_models['GradientBoostingRegressor']),
#     ('xgb', best_models['XGBRegressor']),
#     ('lgbm', best_models['LGBMRegressor'])
# ]

In [None]:
# # ridge is used to help with overfitting
# final_model = StackingRegressor(estimators=estimators, final_estimator=Ridge(), cv=5)
# _ = train_eval_models(X_train, X_test, y_train, y_test, final_model)

In [None]:
# # model training and validation on entire dataset
# final_model.fit(X, y)


In [None]:
# Rescale data using Standard Scaler for better clustering results.
# A later cell adds a 'Cluster' column to df in place; it is excluded here so
# that re-running this cell never feeds previous cluster labels back into the
# features (errors='ignore' keeps the first run unchanged).
# NOTE(review): StandardScaler needs every remaining column to be numeric -
# confirm that 'gender', 'coat' and 'color_primary' are encoded upstream.
scaler = StandardScaler()
full_data = scaler.fit_transform(df.drop(columns='Cluster', errors='ignore'))

In [None]:
# Elbow method: fit KMeans for k = 1..10 and record each fit's inertia
# (sum of squared distances of the samples to their closest centre)
k_values = range(1, 11)
ssd = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    ssd.append(kmeans.fit(full_data).inertia_)

# tabulate k against its sum of squared distances
df_ssd = pd.DataFrame({'k': k_values, 'ssd': ssd})

# line plot of inertia versus k
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df_ssd['k'], df_ssd['ssd'], marker='o')
ax.set_title('Elbow Method')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Sum of Squared Distances')
ax.grid(True)
plt.show()

In [None]:
# PCA and matplotlib are already imported in the first cell of the notebook,
# so they are not imported again here.

# Fit PCA without limiting the number of components to see all explained variances
pca = PCA()
pca.fit(full_data)

# Plot the running total of the explained-variance ratios against the number
# of components kept
explained_var = pca.explained_variance_ratio_.cumsum()
component_counts = range(1, len(explained_var) + 1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(component_counts, explained_var, marker='o', linestyle='--')
ax.set_title("Cumulative Explained Variance")
ax.set_xlabel("Number of Components")
ax.set_ylabel("Cumulative Explained Variance Ratio")
ax.grid(True)
plt.show()

In [None]:
# Project the scaled data onto a fixed number of principal components.
# NOTE(review): 14 was presumably read off the cumulative-variance plot;
# it must not exceed the number of columns in full_data.
N_PCA_COMPONENTS = 14
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=42)
df_pca = pca.fit_transform(full_data)

In [None]:
# Cluster the PCA-reduced data into 6 groups, then assign every sample to its
# nearest centre. fit() returns the estimator itself, so the calls can be chained.
N_CLUSTERS = 6
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42).fit(df_pca)
pred = kmeans.predict(df_pca)

In [None]:
# Cluster labels that actually occur in the predictions
unique_clusters = np.unique(pred)

# Generate a colormap and pick one colour per cluster
colors = plt.cm.jet(np.linspace(0, 1, len(unique_clusters)))

# Scatter the samples on the first two PCA components, one colour per cluster.
# Iterating over the real label values (rather than 0..n-1 positions) keeps
# the plot correct if a label is missing. The legend shows the same ids that
# are stored in `pred`, so it matches any table keyed by cluster id.
fig, ax = plt.subplots()
for cluster_id, color in zip(unique_clusters, colors):
    members = pred == cluster_id
    ax.scatter(df_pca[members, 0], df_pca[members, 1], s=50, c=[color], label=f'Cluster {cluster_id}')

# Cluster centres live in the same PCA space as df_pca
ax.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=200, marker='+', c='black', label='Centers')
ax.set_title('KMeans clusters on the first two principal components')
ax.set_xlabel('Principal component 1')
ax.set_ylabel('Principal component 2')
ax.legend()
plt.show()


In [None]:
# Append cluster assignments to the dataframe (adds/overwrites the 'Cluster'
# column of df in place; pred is assigned positionally, one label per row)
df['Cluster'] = pred

# Calculate mean and median values for each feature by cluster.
# numeric_only=True restricts the summary to numeric columns; without it
# pandas >= 2.0 raises if any non-numeric column is still present in df.
cluster_means = df.groupby('Cluster').mean(numeric_only=True)
cluster_medians = df.groupby('Cluster').median(numeric_only=True)

print(cluster_means)
print(cluster_medians)


In [None]:
# Train a classifier that predicts the KMeans cluster label from the
# (unscaled) feature columns of df.

# Target is the cluster label, features are every other column
y = df['Cluster']
X = df.drop(columns='Cluster')

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and fit the random forest in one step (fit returns the estimator)
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out rows
y_pred = clf.predict(X_test)

# Report overall accuracy and the per-class metrics
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")
print(classification_report(y_test, y_pred))