In [6]:
import pandas as pd
import os
import datetime
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MultiLabelBinarizer, Normalizer, RobustScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from kmodes.kprototypes import KPrototypes
from sklearn.metrics import *
import math
from sklearn.feature_extraction import DictVectorizer
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn
import matplotlib.pyplot as plt

In [7]:
class correlation_matrix():
    """Helper that renders an annotated correlation heatmap."""

    def plot_correlation_heatmap(self, data_to_be_correlated):
        """Draw the correlation matrix of ``data_to_be_correlated`` as a heatmap.

        Parameters
        ----------
        data_to_be_correlated : object exposing ``.corr()``
            Typically a pandas DataFrame; its ``.corr()`` result is what gets
            plotted.

        Returns
        -------
        None
            The figure is displayed via ``plt.show()``.

        Notes
        -----
        Sets ``plt.rcParams['font.size']`` to 10 as a global side effect.
        """
        _, ax = plt.subplots(1, 1, figsize=(15, 10))
        plt.rcParams['font.size'] = 10

        # The heatmap is drawn on `ax`. No additional plt.figure() call is made
        # here: a second figure would stay empty and be shown as a blank plot.
        seaborn.heatmap(data_to_be_correlated.corr(), ax=ax, linecolor='white', cmap="coolwarm", annot=True, fmt='.2f',
                        annot_kws={"size": 6}, linewidths=.02, cbar_kws={"orientation": "horizontal"})
        plt.show()

In [8]:
class kmeans_optimum_clusters_num():
    """Evaluate KMeans over a range of cluster counts."""

    def kmeans_compute_metrics_for_every_cluster_number(self, clusters_range_lower_bound, clusters_range_upper_bound,
                                                        dataset,
                                                        distance_algorithm, print_optimum_metrics = True):
        """Fit KMeans for every cluster count in the range and collect metrics.

        Parameters
        ----------
        clusters_range_lower_bound : int
            First cluster count to try (inclusive).
        clusters_range_upper_bound : int
            End of the range (exclusive, as in ``range``).
        dataset : array-like
            Data handed to ``KMeans.fit_predict`` and ``silhouette_score``.
        distance_algorithm : str or callable
            Forwarded to ``silhouette_score`` as its ``metric`` argument.
        print_optimum_metrics : bool, default True
            When truthy, print one summary line per cluster count.

        Returns
        -------
        list of dict
            One entry per cluster count with keys ``'clusters'``,
            ``'silhouette'``, ``'error'`` (the fitted ``inertia_``) and
            ``'num_jobs'`` (the fitted ``n_iter_``; key name kept as-is).
        """
        kmeans_list_metrics = []

        for num_of_clusters in range(clusters_range_lower_bound, clusters_range_upper_bound):

            kmeans = KMeans(n_clusters=int(num_of_clusters), init='k-means++', n_init=10, random_state=42, verbose=0)
            predictions = kmeans.fit_predict(dataset)
            wcss = kmeans.inertia_
            num_jobs = kmeans.n_iter_
            # `metric` is keyword-only in current scikit-learn; passing it
            # positionally raises TypeError.
            silhouette = silhouette_score(dataset, predictions, metric=distance_algorithm)

            kmeans_list_metrics.append({'clusters': num_of_clusters, 'silhouette': silhouette,
                                        'error': wcss, 'num_jobs': num_jobs})

            if print_optimum_metrics:
                print("For n_clusters = {}, silhouette score is {}, cluster_errors is {}, "
                      "n_jobs {})".format(num_of_clusters, silhouette, wcss, num_jobs))

        # Hand the collected metrics back so callers can compare cluster counts.
        return kmeans_list_metrics

In [9]:
class elbow_method_for_optimum_num_of_clusters():
    """Plot the elbow curve (inertia vs. number of clusters) for KMeans."""

    def elbow_method(self, clusters_range_lower_bound, clusters_range_upper_bound, dataset,
                     clustering_algorithm='KMeans'):
        """Fit KMeans for each cluster count and plot the resulting inertia.

        Parameters
        ----------
        clusters_range_lower_bound : int
            First cluster count to try (inclusive).
        clusters_range_upper_bound : int
            End of the range (exclusive, as in ``range``).
        dataset : array-like
            Data handed to ``KMeans.fit``.
        clustering_algorithm : object, default 'KMeans'
            Label inserted into the plot title via ``str()``. It does not
            change the estimator: the loop always fits ``KMeans``. It is a
            parameter so the method no longer depends on a global variable
            of the same name being defined elsewhere in the notebook.

        Returns
        -------
        None
            The plot is displayed via ``plt.show()``.
        """
        cluster_counts = range(clusters_range_lower_bound, clusters_range_upper_bound)
        wcss = []

        for num_clusters in cluster_counts:
            kmeans = KMeans(n_clusters=num_clusters, init='k-means++', n_init=10, verbose=0, random_state=42)
            kmeans.fit(dataset)
            wcss.append(kmeans.inertia_)

        plt.plot(cluster_counts, wcss)
        plt.title('Elbow Method for '+ str(clustering_algorithm) +' Clustering')
        plt.xlabel('No. of Cluster')
        plt.ylabel('wcss: sum of distances or clustering cost of sample to their closest cluster center')
        plt.show()

In [10]:
class pca():
    """PCA helper: pick the number of components by explained variance.

    Attributes set in ``__init__``:
        dataset: the data passed in by the caller (stored unchanged).
        final_variance_ratio: cumulative explained-variance threshold used by
            ``pca_analysis`` (converted with ``float()`` there).
    """

    def __init__(self, dataset, final_variance_ratio, pca_plot_show=False):
        """Store the inputs and optionally draw the explained-variance plot.

        Parameters
        ----------
        dataset : array-like
            Data to analyse. ``pca_plot`` converts it with ``np.asarray``;
            ``pca_analysis`` passes it to ``StandardScaler.fit_transform``.
        final_variance_ratio : float
            Cumulative explained-variance ratio at which ``pca_analysis``
            stops adding components.
        pca_plot_show : bool, default False
            When truthy, ``pca_plot`` is called immediately.
        """
        self.dataset = dataset
        self.final_variance_ratio = final_variance_ratio

        if pca_plot_show:
            self.pca_plot()

    def pca_analysis(self):
        """Standardize the dataset and project it onto the leading components.

        The number of components is the smallest count whose cumulative
        ``explained_variance_ratio_`` reaches ``final_variance_ratio`` (all
        components if the threshold is never reached).

        Returns
        -------
        array
            Result of ``PCA(n_components=k).fit_transform`` on the full
            standardized dataset; also stored as
            ``self.pca_optimum_num_features_array``.
        """
        self.standard_scaler = StandardScaler()
        self.array_of_dataset = self.standard_scaler.fit_transform(self.dataset)

        # First pass: full PCA, only to read the explained-variance spectrum.
        self.pca = PCA()
        self.pca.fit(self.array_of_dataset)

        target_ratio = float(self.final_variance_ratio)
        self.sum_of_variance_ratio = []
        running_total = 0
        for feature_ratio_variance in self.pca.explained_variance_ratio_:
            self.sum_of_variance_ratio.append(feature_ratio_variance)
            running_total += feature_ratio_variance
            if running_total >= target_ratio:
                break

        self.optimum_num_pca_components = len(self.sum_of_variance_ratio)

        # Second pass: fit on the FULL standardized matrix. Slicing it to its
        # first k columns would discard the remaining features before the
        # projection, so the result would not be the top-k components.
        self.pca_optimum = PCA(n_components=self.optimum_num_pca_components)
        self.pca_optimum_num_features_array = self.pca_optimum.fit_transform(self.array_of_dataset)

        return self.pca_optimum_num_features_array

    def pca_plot(self):
        """Plot individual and cumulative explained variance (in percent).

        Eigenvalues come from the covariance matrix of ``self.dataset`` as
        given (rows are treated as observations, columns as variables).

        NOTE(review): ``pca_analysis`` standardizes the data first while this
        plot uses the raw dataset, so the two can disagree when features have
        different scales -- confirm which one is intended.
        """
        self.independent_variables = self.dataset
        # atleast_2d keeps the single-feature case (0-d covariance) working.
        self.covariance_matrix = np.atleast_2d(np.cov(np.asarray(self.independent_variables).T))
        # The covariance matrix is symmetric, so eigh applies and guarantees
        # real eigenvalues (it returns them in ascending order).
        self.eigenvalue_tuples, self.eigenvector_tuples = np.linalg.eigh(self.covariance_matrix)
        self.eig_pairs = [(np.abs(self.eigenvalue_tuples[index_number]), self.eigenvector_tuples[:, index_number]) for
                          index_number in range(len(self.eigenvalue_tuples))]

        self.eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

        self.sum_eigenvalues = sum(self.eigenvalue_tuples)
        self.individual_explained_variance = [(eigenvalue / self.sum_eigenvalues) * 100 for eigenvalue in
                                              sorted(self.eigenvalue_tuples, reverse=True)]
        self.cumulative_explained_variance = np.cumsum(self.individual_explained_variance)

        plt.figure(figsize=(7, 5))
        plt.bar(range(len(self.individual_explained_variance)), self.individual_explained_variance, alpha=0.3333,
                align='center', label='individual explained variance', color='g')
        plt.step(range(len(self.cumulative_explained_variance)), self.cumulative_explained_variance, where='mid',
                 label='cumulative explained variance')
        plt.ylabel('Explained variance ratio')
        plt.xlabel('Principal components')
        plt.legend(loc='best')
        plt.show()