In [None]:
import math
import sys
from datetime import date
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, accuracy_score, precision_score, precision_recall_curve, recall_score, f1_score, auc, roc_auc_score, plot_roc_curve
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample
from tabulate import tabulate
from xgboost import XGBClassifier
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer, InterclusterDistance, silhouette_visualizer

# The project root must be on sys.path before the local `utils` package is imported.
# NOTE(review): this is a machine-specific absolute path; consider making it configurable.
sys.path.append('C:\\Users\\chery\\OneDrive\\Documents\\UW Bothell MSEE\\Predictive Learning\\BSEE_520_FP')
from utils.baseML import BaseML

In [None]:
# Load the manually cleaned SPIR data through the project's BaseML wrapper.
# Later cells read the loaded table via `spir_clust.df`.
# NOTE(review): BaseML lives in utils/baseML.py (not shown here); presumably it reads
# the CSV into a pandas DataFrame -- confirm against that module.
spir_clust = BaseML("SPIR_manual_clean.csv")

In [None]:
# 3-D overview of the raw data: event vs. census tract vs. time.

# Round census values down to whole numbers before plotting.
spir_clust.df["census"] = spir_clust.df["census"].apply(math.floor)

fig = plt.figure()
# Create the 3-D axes through the figure so they are actually attached to it.
# `Axes3D(fig)` no longer adds itself to the figure on current Matplotlib
# releases, which leaves an empty canvas.
ax = fig.add_subplot(projection="3d")
ax.scatter(spir_clust.df["event"], spir_clust.df["census"], spir_clust.df["time"])
ax.set_xlabel("event")
ax.set_ylabel("census")
ax.set_zlabel("time")

# Save before showing so the file is written from the fully drawn figure, and
# use plt.show(): fig.show() only warns on the non-GUI inline notebook backend.
fig.savefig("initial_data.png")
plt.show()

In [None]:
# Persist the frame (with the floored "census" column from the previous cell) to disk.
# NOTE(review): assuming `df` is a pandas DataFrame, the row index is written as the
# first CSV column because `index=False` is not passed -- confirm that is intended.
spir_clust.df.to_csv("SPIR_census_floor.csv")

In [None]:
# Load the extended SPIR table, then clean it in place:
#   1. discard the "edu25" column entirely,
#   2. discard every row that still has at least one missing value.
# Note that the surviving rows keep their original index labels (no reset).
spir_add = BaseML("SPIR_add.csv")
spir_add.df.drop(columns=["edu25"], inplace=True)
spir_add.df.dropna(axis="index", how="any", inplace=True)

In [None]:
# Standardise the "income" column on its own (zero mean, unit variance).
#
# StandardScaler requires 2-D input, so the column is selected with a list to
# keep it as a one-column DataFrame; a bare Series is 1-D and has no `.columns`.
income_2d = spir_add.df[["income"]]
scaler = preprocessing.StandardScaler().fit(income_2d)
df_scaled = scaler.transform(income_2d)

# The result is kept as floats (z-scores are fractional, an int64 cast would
# truncate them) and carries the original row index. It is stored under its own
# name: there is no `self` in a notebook cell, and `spir_add.df` is deliberately
# left untouched so the later `spir_add.normalize_features()` call still
# operates on the un-scaled data.
income_scaled = pd.DataFrame(df_scaled, columns=income_2d.columns, index=income_2d.index)

In [None]:
# Normalise the feature columns using the project's BaseML helper.
# NOTE(review): normalize_features() is defined in utils/baseML.py (not shown);
# presumably it rescales `spir_add.df` in place, since the following cells read
# `spir_add.df` directly -- confirm against that module.
spir_add.normalize_features()

In [None]:
# Snapshot the frame to disk after the normalisation step above.
# NOTE(review): assuming `df` is a pandas DataFrame, the row index is written as the
# first CSV column because `index=False` is not passed.
spir_add.df.to_csv("SPIR_normalized.csv")

In [None]:
from sklearn.decomposition import PCA

# Project the data onto its first two principal components.
# fit_transform() fits the model and returns the projection in a single pass;
# a separate fit() beforehand would only repeat the same decomposition.
pca = PCA(n_components=2)
x_pca = pca.fit_transform(spir_add.df)

# Share of total variance captured by each component, and the singular values.
print(pca.explained_variance_ratio_)
print(pca.singular_values_)

In [None]:
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer, InterclusterDistance, silhouette_visualizer

# Elbow analysis on the 2-D PCA projection to pick a cluster count.
# The visualizer refits the K-Means estimator for each candidate k described by
# the (4, 12) tuple and plots the resulting score curve.
elbow_k_range = (4, 12)
clf = KMeans()
visualizer = KElbowVisualizer(clf, k=elbow_k_range)

# Fit on the projected data, then finalize and render the figure.
visualizer.fit(x_pca)
visualizer.show()

In [None]:
# Cluster the PCA projection into 6 groups.
# K-Means starts from random centroids, so the seed is pinned (42, the same value
# the silhouette cell uses) to make the labels reproducible across
# Restart & Run All.
clf = KMeans(n_clusters=6, random_state=42)
y_pred = clf.fit_predict(x_pca)

In [None]:
# Scatter of the two principal components, one colour per K-Means cluster.
# Transposing the first two columns yields (pc1, pc2), which unpack directly
# into scatter's x and y arguments.
plt.scatter(*x_pca[:, :2].T, c=y_pred)
plt.show()

In [None]:
from scipy import stats

# Outlier removal: keep only the rows in which every column lies strictly
# within 3 standard deviations of that column's mean.
abs_z_scores = np.abs(stats.zscore(spir_add.df))
rows_within_3_sigma = (abs_z_scores < 3).all(axis=1)
spir_add_no_out = spir_add.df[rows_within_3_sigma]

In [None]:
from sklearn.decomposition import PCA

# Recompute the 2-component PCA on the outlier-free rows.
# fit_transform() fits the model and returns the projection in a single pass;
# a separate fit() beforehand would only repeat the same decomposition.
pca = PCA(n_components=2)
x_pca_no_out = pca.fit_transform(spir_add_no_out)

# Share of total variance captured by each component, and the singular values.
print(pca.explained_variance_ratio_)
print(pca.singular_values_)

In [None]:
# Elbow analysis on the outlier-free PCA projection.
# A fresh, unfitted estimator is created for the visualizer and actually passed
# to it; handing over the already-fitted `clf` from the earlier clustering cell
# would leave `clf_v` unused and reuse a stale model.
clf_v = KMeans()

visualizer = KElbowVisualizer(clf_v, k=(4,12))

visualizer.fit(x_pca_no_out)        # Fit the data to the visualizer
visualizer.show()                   # Finalize and render the figure

In [None]:
# Cluster the outlier-free PCA projection into 7 groups.
# The seed matches the KMeans(7, random_state=42) model used for the silhouette
# plot, so the silhouette describes the same clustering whose labels are used
# downstream, and the labels are reproducible across Restart & Run All.
clf = KMeans(n_clusters=7, random_state=42)
y_pred_no_out = clf.fit_predict(x_pca_no_out)

In [None]:
# Scatter of the outlier-free principal components, coloured by K-Means cluster.
# Transposing the first two columns yields (pc1, pc2), which unpack directly
# into scatter's x and y arguments.
plt.scatter(*x_pca_no_out[:, :2].T, c=y_pred_no_out)
plt.show()

In [None]:
from sklearn.cluster import SpectralClustering

# Alternative clustering of the same projection: spectral clustering, 7 groups.
# Fitting and then reading `labels_` yields the same assignments fit_predict returns.
clf_a = SpectralClustering(n_clusters=7)
clf_a.fit(x_pca_no_out)
y_pred_agg = clf_a.labels_

In [None]:
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer, InterclusterDistance, silhouette_visualizer

# Silhouette plot for a seeded 7-cluster K-Means on the outlier-free projection.
# The quick method fits the freshly constructed estimator itself.
silhouette_visualizer(
    KMeans(n_clusters=7, random_state=42),
    x_pca_no_out,
    colors='yellowbrick',
)

In [None]:
# Persist the outlier-free rows. The row index (original labels, with gaps where
# rows were filtered out) is written as the first CSV column because
# `index=False` is not passed.
spir_add_no_out.to_csv("SPIR_no_outliers.csv")

In [None]:
# Cluster assignment of every outlier-free row, in row order.
cluster_labels = clf.labels_

# Reuse the index of `spir_add_no_out` so each label stays attached to its row.
# That frame keeps its original (gapped) index after the NaN and outlier
# filtering; a default 0..n-1 index here would misalign on any index-based join.
cl_label_df = pd.DataFrame(
    cluster_labels,
    columns=["cluster_label"],
    index=spir_add_no_out.index,
)

In [None]:

# Attach the cluster label to each outlier-free row as a new "cluster_label" column.
# The labels are assigned positionally (as a plain array), so the result does not
# depend on how the two frames are indexed, and every original column name is
# kept. A column-wise concat with ignore_index=True would replace all column
# names with 0..n-1 and align rows by index label instead of by position.
spir_with_labels = spir_add_no_out.assign(
    cluster_label=cl_label_df["cluster_label"].to_numpy()
)

In [None]:
# Quick look at three randomly chosen outlier-free rows.
# No random_state is given, so a different sample is shown on every run.
spir_add_no_out.sample(3)

In [None]:
from bokeh.plotting import figure, output_file, show, ColumnDataSource
from bokeh.models import HoverTool

In [None]:
# Number of rows remaining after outlier removal (one plotted point per row below).
spir_add_no_out.shape[0]

In [None]:
# Output file for the interactive Bokeh plot.
filename = "plot_without_outliers.html"

# Named colours, indexed by cluster label (cluster k is drawn in palette[k]).
# 'fuchsia' was previously misspelled, which is not a valid colour name and
# would fail as soon as a cluster label reached that slot.
palette = ['aqua', 'aquamarine', 'azure', 'black', 'blue',
           'brown', 'chartreuse', 'coral', 'crimson', 'cyan',
           'darkblue', 'darkgreen', 'fuchsia', 'gold', 'indigo',
           'lavender', 'lime', 'magenta', 'olive', 'orange',
           'orangered', 'orchid', 'pink', 'red', 'salmon']

# One colour per outlier-free row, in the same order as the rows/labels.
colors = [palette[label] for label in cluster_labels]

In [None]:
#cannot use seaborn palette for bokeh

# Interactive Bokeh scatter of the outlier-free PCA projection, written to `filename`.
output_file(filename)

# Every column has one entry per outlier-free row: the two principal components
# for the point position, the original attributes for the hover tooltip, and the
# per-row cluster colour.
source = ColumnDataSource(
        data=dict(x=x_pca_no_out[:,0],y=x_pca_no_out[:,1],
            event = spir_add_no_out['event'],
            time = spir_add_no_out['time'],
            census_tract = spir_add_no_out["census"],
            year = spir_add_no_out["year"],
            percent_white = spir_add_no_out["white"],
            income = spir_add_no_out["income"],
            colors=colors), )

# Tooltip rows: "@name" refers to the data-source column of that name.
hover = HoverTool(tooltips=[
            ("event", "@event"),
            ("time", "@time"),
            ("census_tract", "@census_tract"),
            ("year", "@year"),
            ("percent_white", "@percent_white"),
            ("income", "@income")])

# `width`/`height` replace `plot_width`/`plot_height`, which were removed in Bokeh 3.
p = figure(width=1000, height=1000, tools=[hover],
            title="spir clustering")

# scatter() draws circle markers by default; circle(size=...) is deprecated in
# recent Bokeh releases.
p.scatter(x='x', y='y', size=10, source=source, fill_color="colors")

show(p)

In [None]:
# Write the outlier-free rows together with their cluster label.
# Writing `spir_add_no_out` as-is would just duplicate SPIR_no_outliers.csv,
# because that frame carries no label column. The labels are attached
# positionally (row order of the K-Means input) under "cluster_label".
spir_add_no_out.assign(cluster_label=cluster_labels).to_csv("SPIR_cluster_labels.csv")