In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
import statistics 
import pylab as pl
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.ensemble import IsolationForest
from pyod.models.hbos import HBOS
import warnings
warnings.filterwarnings('ignore')
import logging
logging.getLogger().setLevel(logging.CRITICAL)
import sklearn
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.cluster import AgglomerativeClustering

ModuleNotFoundError: No module named 'pyod'

In [None]:
# Load the dataset from a path relative to the notebook's working directory
# (an absolute, machine-specific path was used before and is not portable).
raw_data= pd.read_csv(r"PD data.csv")
# Show (rows, columns) as a quick sanity check on the load.
print(raw_data.shape)

In [None]:
# Preview the first rows to check that the columns were parsed as expected.
raw_data.head()

# Detecting outliers with different methods

In [None]:
# z-score method
def outliers(raw_data, column, threshold=3):
    """Return the values of ``column`` whose absolute z-score exceeds ``threshold``.

    The z-score is computed with the column mean and the population standard
    deviation (``ddof=0``); missing values are ignored for both statistics and
    are never reported as outliers. As a side effect, the normal density fitted
    to the column is plotted.

    Args:
        raw_data: DataFrame (or mapping of column name to values) with the data.
        column: Name of the numeric column to analyse.
        threshold: Absolute z-score above which a value is flagged (default 3).

    Returns:
        Sorted list of the flagged values. An empty list is returned, and
        nothing is plotted, when the column is empty or has zero variance,
        because the z-score is undefined in that case.
    """
    values = pd.Series(raw_data[column])
    Average = values.mean()
    Std_dev = values.std(ddof=0)

    # Guard against division by zero / NaN statistics.
    if not np.isfinite(Std_dev) or Std_dev == 0:
        return []

    z_scores = (values - Average) / Std_dev
    outliers_value = values[z_scores.abs() > threshold].tolist()

    # Sort the x values first: plotting the density against unsorted data
    # connects the points in file order and produces a zig-zag line.
    ordered = values.dropna().sort_values()
    distribution_plot = stats.norm.pdf(ordered, Average, Std_dev)
    pl.plot(ordered, distribution_plot, color='coral')
    pl.show()
    return sorted(outliers_value)

In [None]:
# Run the z-score detector on every measurement column and print what it flags.
for zscore_column in ('PD Count', 'PD Average', 'Temperature', 'Humidity', 'Loading'):
    print(f"{zscore_column} outlier values: ", outliers(raw_data, zscore_column))

In [None]:
# Visual outlier check: horizontal box plot of the 'PD Count' column.

sns.boxplot(x=raw_data['PD Count'])


In [None]:
# Visual outlier check: horizontal box plot of the 'PD Average' column.
sns.boxplot(x=raw_data['PD Average'])


In [None]:
# Visual outlier check: horizontal box plot of the 'Temperature' column.
sns.boxplot(x=raw_data['Temperature'])


In [None]:
# Visual outlier check: horizontal box plot of the 'Humidity' column.
# The Series is passed as x= like the other box plots; a positional argument
# is interpreted as `data` by newer seaborn releases and changes the orientation.
sns.boxplot(x=raw_data['Humidity'])


In [None]:
# Visual outlier check: horizontal box plot of the 'Loading' column.
sns.boxplot(x=raw_data['Loading'])

In [None]:
def outliers_by_iqr(raw_data, column, whisker=1.5):
    """Return the values of ``column`` that fall outside the IQR fences.

    A value is flagged when it is below ``Q1 - whisker * IQR`` or above
    ``Q3 + whisker * IQR``, where Q1/Q3 are the 25th/75th percentiles and
    ``IQR = Q3 - Q1``.

    Args:
        raw_data: DataFrame (or mapping of column name to values) with the data.
        column: Name of the numeric column to analyse.
        whisker: Multiplier applied to the IQR to build the fences (default 1.5).

    Returns:
        Sorted list of the flagged values; empty when the column has no
        non-missing values.
    """
    # Drop missing values first: a single NaN would turn both percentiles into
    # NaN and silently suppress every outlier.
    values = pd.Series(raw_data[column]).dropna()
    if values.empty:
        return []

    Q1, Q3 = np.percentile(values, [25, 75])
    IQR = Q3 - Q1  # computed once; it does not depend on the value being tested
    lower_fence = Q1 - whisker * IQR
    upper_fence = Q3 + whisker * IQR

    flagged = values[(values < lower_fence) | (values > upper_fence)]
    return sorted(flagged.tolist())

In [None]:
# Run the IQR detector on every measurement column and print what it flags.
for iqr_column in ('PD Average', 'PD Count', 'Temperature', 'Humidity', 'Loading'):
    print(f"{iqr_column} outlier values by IQR: ", outliers_by_iqr(raw_data, iqr_column))


# RELATIONSHIP BETWEEN FEATURES

In [None]:
# Pairwise scatter plots of the columns to inspect relationships between features.
# NOTE(review): `svm` holds the grid object returned by sns.pairplot, not an SVM model.
svm=sns.pairplot(raw_data, kind="scatter")
# Save the grid to 'image2.png' in the working directory at 500 dpi.
svm.savefig('image2.png', dpi=500)

In [None]:
def find_pearson_heat_map(raw_data):
    """Draw an annotated heat map of the pairwise Pearson correlations.

    Args:
        raw_data: DataFrame (or anything ``pd.DataFrame`` accepts) whose
            columns are correlated with each other.

    Returns:
        None. The heat map is drawn on a new 5x5 inch matplotlib figure.
    """
    correlations = pd.DataFrame(raw_data).corr(method='pearson')
    tick_labels = correlations.columns

    plt.figure(figsize=(5, 5))
    heatmap_options = dict(
        xticklabels=tick_labels,
        yticklabels=tick_labels,
        cmap='RdBu_r',
        annot=True,  # print the coefficient inside every cell
        linewidth=1,
    )
    sns.heatmap(correlations, **heatmap_options)


find_pearson_heat_map(raw_data)

In [None]:
# Kendall's tau rank correlation between Humidity and Temperature.
x1= raw_data['Humidity']
x2= raw_data['Temperature']
# kendalltau returns the correlation coefficient and its p-value;
# only the coefficient is printed here.
kendal_ration,p_value= stats.kendalltau(x1,x2)
print(kendal_ration)

In [None]:
# One-way ANOVA of a measurement across equipment
from scipy.stats import f_oneway

def get_anova(raw_data, column):
    """Run a one-way ANOVA of ``column`` grouped by the 'Equipment ID' column.

    Every distinct equipment ID forms one sample made of that equipment's
    ``column`` values; the samples are then compared with
    ``scipy.stats.f_oneway``.

    Args:
        raw_data: DataFrame containing 'Equipment ID' and ``column``.
        column: Name of the numeric column to test.

    Returns:
        The result object of ``f_oneway`` (F statistic and p-value).
    """
    base_col = 'Equipment ID'
    # groupby keys each sample by the ID value itself. The previous hand-rolled
    # dict tested membership with the raw value but stored under int(value),
    # so non-numeric IDs never matched and each group kept only its last row.
    grouped = raw_data.groupby(base_col, sort=False)[column]
    samples = [group.to_numpy() for _, group in grouped]
    return f_oneway(*samples)
    
# Report the ANOVA result for every measurement column.
# Each pair is (label used in the printed message, DataFrame column name).
for anova_label, anova_column in [
    ('PD-Average', 'PD Average'),
    ('PD-Count', 'PD Count'),
    ('Temperature', 'Temperature'),
    ('Humidity', 'Humidity'),
    ('Loading', 'Loading'),  # label was previously misspelled as 'Loding'
]:
    print('score of ' + anova_label + ' is: ' + str(get_anova(raw_data, anova_column)))

# Anomaly Detection

In [None]:
def get_scaled_value(raw_data):
    """Standardize the five measurement columns with ``StandardScaler``.

    Args:
        raw_data: DataFrame containing the columns 'PD Average', 'Humidity',
            'Temperature', 'Loading' and 'PD Count'.

    Returns:
        The output of ``StandardScaler().fit_transform`` for those columns,
        in the order listed above.
    """
    feature_names = ['PD Average', 'Humidity', 'Temperature', 'Loading', 'PD Count']
    scaler = StandardScaler()
    return scaler.fit_transform(raw_data[feature_names])

def plot_cluster(data, pred_clusters, column1, column2, title):
    """Scatter-plot ``column1`` against ``column2`` with one series per cluster label.

    Args:
        data: DataFrame holding the columns to plot. Its 'clusters' column is
            (over)written with ``pred_clusters`` as a side effect.
        pred_clusters: One label per row of ``data``.
        column1: Column drawn on the x axis.
        column2: Column drawn on the y axis.
        title: Figure title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    data['clusters'] = pred_clusters

    # dict.fromkeys keeps first-appearance order, so series (and therefore
    # colours) are drawn in the same order as the labels occur in the data.
    for cluster_label in dict.fromkeys(pred_clusters):
        members = data[data.clusters == cluster_label]
        # Label every series; without labels plt.legend() has nothing to show.
        plt.scatter(members[column1], members[column2], label=f'cluster {cluster_label}')

    plt.xlabel(column1)
    plt.ylabel(column2)
    plt.title(title)
    plt.legend()
    plt.show()

    
def start_plotting(data, y_train):
    """Draw one cluster scatter plot for every pair of measurement columns.

    Args:
        data: DataFrame with the measurement columns; forwarded to ``plot_cluster``.
        y_train: One cluster/anomaly label per row of ``data``.

    Returns:
        None. Ten figures are produced, each titled '<x column> with <y column>'.
    """
    column_pairs = [
        ('PD Average', 'PD Count'),
        ('PD Average', 'Temperature'),
        ('PD Average', 'Humidity'),
        ('PD Average', 'Loading'),
        ('PD Count', 'Temperature'),
        ('PD Count', 'Humidity'),
        ('PD Count', 'Loading'),
        ('Temperature', 'Loading'),
        ('Temperature', 'Humidity'),
        ('Loading', 'Humidity'),
    ]
    for x_column, y_column in column_pairs:
        plot_cluster(data, y_train, x_column, y_column, f'{x_column} with {y_column}')
    
def k_means():
    """Cluster the standardized measurements into three groups with K-Means and plot them.

    Reads the module-level ``raw_data`` frame. The random seed is fixed
    (``random_state=10``) so repeated runs use the same initialisation.
    """
    model = KMeans(n_clusters=3, random_state=10, max_iter=1000)
    features = get_scaled_value(raw_data)
    cluster_labels = model.fit_predict(features)
    start_plotting(raw_data, cluster_labels)
    
# def db_scan():
#     from sklearn.cluster import DBSCAN
#     X = get_scaled_value(raw_data)
#     clustering = DBSCAN().fit(X)
#     start_plotting(raw_data, clustering.labels_)
    
def isolation_forest():
    """Flag anomalies in the standardized measurements with an Isolation Forest and plot them.

    Reads the module-level ``raw_data`` frame. The forest is fitted and then
    queried on the same feature matrix; per the scikit-learn API, ``predict``
    returns +1 for inliers and -1 for outliers, and those values are used as
    the plotted groups.
    """
    # Scale once and reuse the matrix for both fit and predict; the features
    # were previously scaled a second time, yielding the identical matrix.
    features = get_scaled_value(raw_data)
    detector = IsolationForest(random_state=24).fit(features)
    y_train = detector.predict(features)
    start_plotting(raw_data, y_train)
    
def hbos():
    """Flag anomalies in the standardized measurements with pyod's HBOS detector and plot them.

    Reads the module-level ``raw_data`` frame. ``contamination=0.05`` is
    passed to the detector; the labels returned by ``predict`` are used as
    the plotted groups.
    """
    detector = HBOS(contamination=0.05)
    features = get_scaled_value(raw_data)
    fitted = detector.fit(features)
    start_plotting(raw_data, fitted.predict(features))

def agglomerative():
    """Cluster the standardized measurements into three groups with Ward agglomerative clustering and plot them.

    Reads the module-level ``raw_data`` frame.
    """
    features = get_scaled_value(raw_data)

    # Ward linkage works on Euclidean distances, which is the estimator's
    # default, so the distance keyword is omitted: the old `affinity=` name
    # is rejected by recent scikit-learn releases.
    model = AgglomerativeClustering(n_clusters=3, linkage='ward')
    y_train = model.fit_predict(features)

    # Plot against raw_data: the previous code referenced an undefined name
    # `data`, which raised NameError on a fresh kernel.
    start_plotting(raw_data, y_train)

In [None]:
# Run K-Means clustering and draw the scatter plots for every column pair.
k_means()  


In [None]:
# Run Isolation Forest anomaly detection and draw the scatter plots for every column pair.
isolation_forest()

In [None]:
# Run agglomerative clustering and draw the scatter plots for every column pair.
agglomerative()