# Clustering Baselines using Original Features

In [1]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import os
os.chdir('../')
from clustering_evaluation import ClusterPurity
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Load the Dataset (AI4BD)

In [73]:
# -- load the input dataset--#
input_df=pd.read_csv('/home/daikiri/DAIKIRI/src/Hamada/merged.csv', low_memory=False)
input_df.index = 'Event_' + input_df.index.astype(str)

num_rows, num_cols = input_df.shape  # at max num_rows times num_cols columns.
column_names = input_df.columns
    
print ('Input data loaded with shape', input_df.shape) 
    

Input data loaded with shape (2974716, 41)


In [3]:
column_names

Index(['customer_id', 'customer_name', 'site_id', 'customer_number',
       'op_group_id', 'logistic_type', 'location_uuid', 'assortment_uuid',
       'customer_item_number', 'supplier_id', 'supplier_item_number',
       'box_number_in_site', 'replenished_at', 'old_stock', 'new_stock',
       'type', 'order_number', 'site_number', 'order_pos', 'ordering_at',
       'order_priority', 'ordered_qty', 'delivery_qty', 'requesting_at',
       'shipping_at', 'confirmed_at', 'confirming_at', 'physical_address',
       'no_of_boxes', 'reorder_quantity', 'reorder_point', 'deliver_mon',
       'deliver_tue', 'deliver_wed', 'deliver_thu', 'deliver_fri',
       'start_week', 'every_week', 'week_of_month', 'positive_stock_change',
       'relative_class'],
      dtype='object')

In [74]:
feature_columns=['op_group_id', 'logistic_type',
                 'customer_item_number','supplier_id', 'supplier_item_number', 'box_number_in_site', 'replenished_at', 'old_stock', 'new_stock', 'type']

sub_inputDF=input_df[feature_columns]
sub_inputDF=sub_inputDF.dropna()

lb=LabelBinarizer()
event_labels=lb.fit_transform(sub_inputDF['type'].tolist())
lb.classes_

sub_inputDF.shape

(2881076, 10)

In [76]:
sub_inputDF=sub_inputDF[feature_columns[:-1]] # remove labels from features

In [77]:
train_as_dicts = [dict(r.iteritems()) for _, r in sub_inputDF.iterrows()]

In [78]:
from sklearn.feature_extraction import DictVectorizer

vectorizer = DictVectorizer(sparse=True)
vectorized_sparse = vectorizer.fit_transform(train_as_dicts)

In [82]:
event_labels.shape

(2881076, 1)

In [80]:

# split the dataset into train-test split (80%-20%)
X_train, X_test, y_train, y_test = train_test_split( vectorized_sparse, event_labels, test_size=0.33, random_state=42)

# Baseline_1: Logistic Regression

In [84]:
logistic_clf = LogisticRegression(random_state=42).fit(X_train.toarray(), y_train)
y_predicted = logistic_clf.predict(X_test)

#----------- Evaluation based on Precision, Recall, Accuracy and F1-score: -------#
accuracy = accuracy_score(y_test, y_predicted)
print('Accuracy: %f' % accuracy)

precision = precision_score(y_test, y_predicted)
print('Precision: %f' % precision)

recall = recall_score(y_test, y_predicted)
print('Recall: %f' % recall)

f1 = f1_score(y_test, y_predicted)
print('F1 score: %f' % f1)

#------------- Evaluation based on cluster_purity metric: -----------------#
evaluator=ClusterPurity()
purity_score=evaluator.purity_score(y_true=y_test, y_pred=y_predicted)

print('Clustering Purity Score: ', purity_score)

MemoryError: Unable to allocate 1.86 TiB for an array with shape (1930320, 132335) and data type float64

In [83]:
X_test

<950756x132335 sparse matrix of type '<class 'numpy.float64'>'
	with 8556804 stored elements in Compressed Sparse Row format>