1. K-Means
2. Fuzzy C-means or EM
4. DBScan
5. Spectral

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import math
import numpy as np

# Mount Google Drive inside the Colab runtime so the shared project folder is reachable.
from google.colab import drive
drive.mount('/content/gdrive')

# Change into the shared project folder and list its contents; later cells read
# their input files via paths relative to this folder (e.g. 'data/...').
# NOTE(review): the path is relative, so re-running this cell after the directory
# change has already happened will presumably fail — confirm / consider an absolute path.
%cd gdrive/Shareddrives/CSCI\ 5523\ -\ Data\ Mining\ Final\ Project
!ls


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/Shareddrives/CSCI 5523 - Data Mining Final Project
 Balance_deprecated.ipynb   classification.ipynb   data
'Bot Detection.gslides'     cluster_2.ipynb	   data.ipynb
 botDetection.mp4	    cluster.ipynb	   Proposal.gdoc


In [2]:
# Training split of the "more human" dataset (the `_u` suffix is used below for the unbalanced data).
train_u = pd.read_json('data/more_human_train.json')

In [3]:
# Test split of the "more human" dataset (the `_u` suffix is used below for the unbalanced data).
test_u = pd.read_json('data/more_human_test.json')

In [4]:
# Training split of the balanced dataset (the `_b` suffix is used below for the balanced data).
train_b = pd.read_json('data/bal_train.json')

In [5]:
# Test split of the balanced dataset (the `_b` suffix is used below for the balanced data).
test_b = pd.read_json('data/bal_test.json')

In [6]:
dataframes = [train_u, test_u, train_b, test_b]

# Convert the 'bot' column from string to int (1 for bot, 0 for human), in place.
for dataframe in dataframes:
  # Guard against re-running this cell: once the column is numeric, comparing it
  # with the string 'bot' would be False everywhere and silently turn every
  # label into 0. Only encode while the column still holds the raw values.
  if not pd.api.types.is_numeric_dtype(dataframe['bot']):
    dataframe['bot'] = (dataframe['bot'] == 'bot').astype('int64')

In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def print_analysis(test, predictions, label):
  """Print accuracy, precision, recall and F1 for a set of predictions.

  Args:
    test: ground-truth labels, passed as the first argument to each metric.
    predictions: predicted labels, passed as the second argument to each metric.
    label: text prefixed to every printed score line.
  """
  # (line template, metric function) pairs, reported in this order.
  report_lines = (
      ('{} Accuracy score: {}', accuracy_score),
      ('{} Precision score: {}', precision_score),
      ('{} Recall score: {}', recall_score),
      ('{} F1 score: {}', f1_score),
  )

  print("-----------------------------------")
  for template, metric in report_lines:
    print(template.format(label, metric(test, predictions)))
  print("-----------------------------------\n")

In [8]:
from sklearn.preprocessing import StandardScaler


def _split_and_scale(train_df, test_df):
  """Split a train/test pair into features and labels and standardize the features.

  The class label is taken from the LAST column of each frame; every other
  column is treated as a feature. The scaler is fit on the training features
  only and then applied to the test features.

  Args:
    train_df: training DataFrame whose last column is the class label.
    test_df: test DataFrame with the same column layout.

  Returns:
    Tuple (x_train, x_train_scaled, y_train, x_test, x_test_scaled, y_test,
    fitted_scaler).
  """
  train_values = train_df.to_numpy()
  test_values = test_df.to_numpy()

  # NOTE(review): astype(int) truncates any fractional feature values —
  # confirm that every feature column is integral.
  x_train = train_values[:, :-1].astype(int)   # all cols except for class label
  y_train = train_values[:, -1].astype(int)    # only class label (last) column
  x_test = test_values[:, :-1].astype(int)
  y_test = test_values[:, -1].astype(int)

  # A dedicated scaler per dataset, so fitting one dataset never overwrites
  # the statistics learned from another.
  fitted_scaler = StandardScaler()
  x_train_scaled = fitted_scaler.fit_transform(x_train)
  x_test_scaled = fitted_scaler.transform(x_test)

  return x_train, x_train_scaled, y_train, x_test, x_test_scaled, y_test, fitted_scaler


# UNBALANCED
(x_train_u, x_train_u_scaled, y_train_u,
 x_test_u, x_test_u_scaled, y_test_u, scaler_u) = _split_and_scale(train_u, test_u)

# BALANCED
(x_train_b, x_train_b_scaled, y_train_b,
 x_test_b, x_test_b_scaled, y_test_b, scaler_b) = _split_and_scale(train_b, test_b)

# Backward compatibility: `scaler` used to be a single shared instance that was
# last fit on the balanced training data; keep that binding. Use `scaler_u` /
# `scaler_b` explicitly in new code.
scaler = scaler_b

##Density Based Clustering: DBScan

Unlike K-Means, DBSCAN does not have a `predict` function: it has no centroids to assign new points to, so it is not intended for classifying unseen data. We can still learn things from its cluster sizes and from entropy-based scores (e.g. homogeneity), however.

###DBScan on unbalanced data

In [9]:
from sklearn import cluster
from sklearn.cluster import DBSCAN

# Standardized unbalanced training features, with the original feature column names.
data = pd.DataFrame(data=x_train_u_scaled, columns=train_u.columns[:-1])

# Density-based clustering of the unbalanced training set.
db = DBSCAN(eps=0.005, min_samples=50)
db.fit(data)

# Boolean mask over the samples: True where DBSCAN reported a core sample.
core_samples_mask = np.full(db.labels_.shape, False, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# One cluster id per training sample (scikit-learn labels noise points as -1).
labels = pd.DataFrame({'Cluster ID': db.labels_})
labels[['Cluster ID']]


Unnamed: 0,Cluster ID
0,-1
1,-1
2,-1
3,-1
4,-1
...,...
7794,-1
7795,-1
7796,-1
7797,-1


###Analysis

Compare cluster sizes with class sizes

In [10]:
# Distinct cluster ids and how many samples fall in each (-1 is DBSCAN's noise label).
np.unique(labels['Cluster ID'].to_numpy(), return_counts=True)

(array([-1,  0]), array([6499, 1300]))

In [11]:
# Ground-truth class sizes in the unbalanced training set (0 = human, 1 = bot),
# for comparison with the cluster sizes above.
np.unique(y_train_u, return_counts=True)

(array([0, 1]), array([6469, 1330]))

Compute the homogeneity, completeness, and V-measure scores (entropy-based measures of how well the clusters agree with the class labels). A score of 1 means the clustering matches the labels perfectly; 0 is the worst case.

In [12]:
from sklearn import metrics
# Evaluate on training data: compare the cluster ids currently held in `labels`
# with the ground-truth classes of the unbalanced training set.
# Returns the tuple (homogeneity, completeness, v_measure).
# NOTE: DBSCAN's noise label (-1) is scored like any other cluster id here.
metrics.homogeneity_completeness_v_measure(y_train_u, labels['Cluster ID'].to_numpy())

(0.5755017987637634, 0.5833405123464755, 0.579394643996971)

###DBScan on balanced data

In [13]:
from sklearn.cluster import DBSCAN

# Standardized balanced training features, with the original feature column names.
data = pd.DataFrame(data=x_train_b_scaled, columns=train_b.columns[:-1])

# Density-based clustering of the balanced training set.
db = DBSCAN(eps=0.05, min_samples=50)
db.fit(data)

# Boolean mask over the samples: True where DBSCAN reported a core sample.
core_samples_mask = np.full(db.labels_.shape, False, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# One cluster id per training sample (scikit-learn labels noise points as -1).
labels = pd.DataFrame({'Cluster ID': db.labels_})
labels[['Cluster ID']]




Unnamed: 0,Cluster ID
0,-1
1,0
2,0
3,0
4,-1
...,...
12942,2
12943,0
12944,0
12945,0


###Analysis

Compare cluster sizes with class sizes

In [14]:
# Distinct cluster ids and how many samples fall in each (-1 is DBSCAN's noise label).
np.unique(labels['Cluster ID'].to_numpy(), return_counts=True)

(array([-1,  0,  1,  2,  3,  4,  5]),
 array([4991, 6824,  407,  275,  179,  135,  136]))

In [15]:
# Ground-truth class sizes in the balanced training set (0 = human, 1 = bot),
# for comparison with the cluster sizes above.
np.unique(y_train_b, return_counts=True)

(array([0, 1]), array([6509, 6438]))

Compute the homogeneity, completeness, and V-measure scores (entropy-based measures of how well the clusters agree with the class labels). A score of 1 means the clustering matches the labels perfectly; 0 is the worst case.

In [16]:
from sklearn import metrics
# Evaluate on training data: compare the cluster ids currently held in `labels`
# with the ground-truth classes of the balanced training set.
# Returns the tuple (homogeneity, completeness, v_measure).
# NOTE: DBSCAN's noise label (-1) is scored like any other cluster id here.
metrics.homogeneity_completeness_v_measure(y_train_b, labels['Cluster ID'].to_numpy())


(0.6661943006476313, 0.43968063614706737, 0.5297393478421498)

##Spectral Clustering

###Spectral Clustering on unbalanced data
####Takes a very long time

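Remove outlier (currently disabled: the code in the next cell is commented out, so the full unbalanced training set is clustered)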

In [17]:
# train_u_new = train_u.drop(train_u.index[17007])
# x_train_u_new = train_u_new.to_numpy()[:,:-1].astype(int)
# x_train_u_new_scaled = scaler.fit_transform(x_train_u_new)
# y_train_u_new = train_u_new.to_numpy()[:,-1].astype(int)

# x_test_u_new_scaled = scaler.transform(x_test_u)

In [18]:
# Standardized unbalanced training features, with the original feature column names.
data = pd.DataFrame(x_train_u_scaled, columns=train_u.columns[:-1])

# Spectral clustering on a nearest-neighbors affinity graph; random_state is
# fixed so repeated runs give the same clusters.
# `gamma` is deliberately not passed: scikit-learn ignores it when
# affinity='nearest_neighbors', so setting it only suggested tuning that never happened.
spectral = cluster.SpectralClustering(n_clusters=8, random_state=1, affinity='nearest_neighbors')
spectral.fit(data)

# One cluster id per training sample.
labels = pd.DataFrame(spectral.labels_, columns=['Cluster ID'])




###Analysis

Compare cluster sizes with class sizes

In [19]:
# Distinct cluster ids and how many samples fall in each.
np.unique(labels['Cluster ID'].to_numpy(), return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7], dtype=int32),
 array([2310, 3882,   11, 1480,   27,   20,   22,   47]))

In [20]:
# Ground-truth class sizes in the unbalanced training set (0 = human, 1 = bot),
# for comparison with the cluster sizes above.
np.unique(y_train_u, return_counts=True)

(array([0, 1]), array([6469, 1330]))

Compute the homogeneity, completeness, and V-measure scores (entropy-based measures of how well the clusters agree with the class labels). A score of 1 means the clustering matches the labels perfectly; 0 is the worst case.

In [21]:
from sklearn import metrics
# Evaluate on training data: compare the cluster ids currently held in `labels`
# with the ground-truth classes of the unbalanced training set.
# Returns the tuple (homogeneity, completeness, v_measure).
metrics.homogeneity_completeness_v_measure(y_train_u, labels['Cluster ID'].to_numpy())

(0.4439873823588762, 0.181938761215739, 0.25810877264359916)

###Spectral Clustering on balanced data
####Takes a very long time

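Remove outlier (currently disabled: the code in the next cell is commented out, so the full balanced training set is clustered)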

In [22]:
# train_b_new = train_b.drop(train_b.index[3798])
# x_train_b_new = train_b_new.to_numpy()[:,:-1].astype(int)
# x_train_b_new_scaled = scaler.fit_transform(x_train_b_new)
# y_train_b_new = train_b_new.to_numpy()[:,-1].astype(int)

# x_test_b_new_scaled = scaler.transform(x_test_b)

In [23]:
# Standardized balanced training features, with the original feature column names.
data = pd.DataFrame(x_train_b_scaled, columns=train_b.columns[:-1])

# Spectral clustering on a nearest-neighbors affinity graph; random_state is
# fixed so repeated runs give the same clusters.
# `gamma` is deliberately not passed: scikit-learn ignores it when
# affinity='nearest_neighbors', so setting it only suggested tuning that never happened.
spectral = cluster.SpectralClustering(n_clusters=8, random_state=1, affinity='nearest_neighbors')
spectral.fit(data)

# One cluster id per training sample.
labels = pd.DataFrame(spectral.labels_, columns=['Cluster ID'])




###Analysis

Compare cluster sizes with class sizes

In [24]:
# Distinct cluster ids and how many samples fall in each.
np.unique(labels['Cluster ID'].to_numpy(), return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7], dtype=int32),
 array([   47, 12282,    16,    43,   185,   104,    76,   194]))

In [25]:
# Ground-truth class sizes in the balanced training set (0 = human, 1 = bot),
# for comparison with the cluster sizes above.
np.unique(y_train_b, return_counts=True)

(array([0, 1]), array([6509, 6438]))

Compute the homogeneity, completeness, and V-measure scores (entropy-based measures of how well the clusters agree with the class labels). A score of 1 means the clustering matches the labels perfectly; 0 is the worst case.

In [26]:
from sklearn import metrics
# Evaluate on training data: compare the cluster ids currently held in `labels`
# with the ground-truth classes of the balanced training set.
# Returns the tuple (homogeneity, completeness, v_measure).
metrics.homogeneity_completeness_v_measure(y_train_b, labels['Cluster ID'].to_numpy())

(0.053801518079819755, 0.12849950095181228, 0.07584673152602785)