In [153]:
#load toy set data
# from sklearn import datasets
# iris = datasets.load_iris()

In [156]:
#load data in dataframes
import pandas as pd
doc_path = "../feature_extraction/word2vec_LDA_sentiment.csv"
df=pd.read_csv(doc_path,index_col='docno')
pos_docs = df[df["class"] == 1]
neg_docs = df[df["class"] == -1]
neu_docs = df[df["class"] == 0]
print('neutral docs-->',neu_docs.shape)
print('positive docs-->',pos_docs.shape)
print('negative docs-->',neg_docs.shape)
print(pos_docs)

neutral docs--> (32594, 121)
positive docs--> (14283, 121)
negative docs--> (10637, 121)
                      0_x       1_x       2_x       3_x       4_x       5_x  \
docno                                                                         
 LA122989-0011   0.625603  0.386697  0.479026  0.412594  0.391509  0.458001   
 LA122989-0012   0.646113  0.436381  0.513598  0.389201  0.457487  0.522667   
 LA122989-0018   0.606507  0.552505  0.513218  0.526657  0.270074  0.364127   
 LA122989-0019   0.560057  0.674291  0.583621  0.480622  0.283424  0.271703   
 LA122989-0021   0.466378  0.637155  0.603542  0.431422  0.253742  0.312299   
...                   ...       ...       ...       ...       ...       ...   
FT911-3655       0.399649  0.504588  0.290757  0.565279  0.336671  0.493825   
FT911-3656       0.436370  0.505907  0.408461  0.497220  0.362383  0.384115   
FT911-3669       0.381024  0.454289  0.428082  0.466558  0.367593  0.462460   
FT911-3679       0.378951  0.463879  0.370

In [155]:
import math
import numpy as np
from scipy.spatial import distance
from sklearn.utils import shuffle

def k_means(num_clusters, df, init_centres, pos_docs, neg_docs, neu_docs, must_link_penalty, cannot_link_penalty):

    #returns a list of k initial centre points for cluster initialization
    def choose_initial_centres(num_clusters, df):
        num_docs = len(df)
        every_x_item = num_docs/num_clusters
        df_centres = df[::math.ceil(every_x_item)]
        return df_centres.iloc[:num_clusters]

    #one time initialization steps    
    #add new columns in dataframe that would contain distance values from the point to the centre
    # df1 = pd.DataFrame(columns=list(map(lambda x: "dist_c"+str(x), range(num_clusters))))
    # df = df.join(df1, how="outer")
    # print(df)

    #choose centre points from data if not already given
    if(len(init_centres)!=num_clusters):
        num_centres = num_clusters/3
        rem_centres = num_clusters%3
        pos_centers = choose_initial_centres(num_centres+rem_centres, pos_docs)
        neg_centers = choose_initial_centres(num_centres, neg_docs)
        neu_centres = choose_initial_centres(num_centres, neu_docs)
        centres = centres.append(pos_centers, neg_centers, neu_centres)
        print(centres)
        
    #determine distance of points from the centres and assign clusters
    # df = shuffle(df)
    data = df.to_numpy()
    centroids = centres.to_numpy()
    data = np.hstack([data, np.zeros((len(data),1)), np.ones((len(data),1))])
    #compute the centroids till the cluster assignment remains the same
    fit(data, centroids, num_clusters, pos_docs, neg_docs, must_link_penalty, cannot_link_penalty)

#takes in a dataframe and a center vector and outputs a series with distance values of all points from the vector
def fit(data, centroids, num_clusters, pos_docs, neg_docs, must_link_penalty, cannot_link_penalty):
    iter = 0
    print(centroids)
    while(not np.array_equiv(data[:,5],data[:,4])):
        if(iter!=0):
            data[:,4] = data[:,5]
            centroids = update_centroids(num_clusters, data)
        iter+=1
        dist = []
        for point in data:
            dist_val = []
            for index, center in centroids:
                eucledian_dist = distance.euclidean(point[0:4], center)
                penalty_dist = penalize(point, index, pos_docs, neg_docs, must_link_penalty, cannot_link_penalty)
                dist_val.append(eucledian_dist+penalty_dist)
            cluster_val = dist_val.index(min(dist_val))
            dist.append(cluster_val)
        data[:,5] = dist
    print(data)
    print('iterations--->',iter)


# penalize point for not being assigned to must link peers and being assigned to cannot link peers:
def penalize(point, assumed_pt_cluster, pos_docs, neg_docs, must_link_penalty, cannot_link_penalty):
    penalty = 0.0

    #return zero penalty for neutral sentiment documents
    if point['class'] == 0:
        return penalty
        
    elif point['class'] == 1:
        must_link_set = pos_docs
        cannot_link_set = neg_docs
    
    else:
        must_link_set = neg_docs
        cannot_link_set = pos_docs
    
    #return negative penalty for neutral sentiment documents
    for ml_pt in must_link_set:
        if ml_pt !=point and ml_pt[4]!= -1 and assumed_pt_cluster != ml_pt[4]:
            penalty += must_link_penalty

    #return positive penalty for  sentiment documents
    for cl_pt in cannot_link_set:
        if cl_pt !=point and cl_pt[4] != -1 and assumed_pt_cluster == cl_pt[4]:
            penalty += cannot_link_penalty

    return penalty

    #handles case where no point is assigned to a cluster center
    #takes in all data points assigned to individual clusters and asks for clustering again with new centroids
def update_centroids(num_clusters, data):
    #num_columns = len(df.shape[1])
    #make this dynamic: take as many columns as are there in the dataframe
    # clusters_with_no_pts = df[:4].nunique() - num_clusters
    #choose random samples as cluster centres
    # if (clusters_with_no_pts>0):
    #     centers = df.sample(n=clusters_with_no_pts)

    #find mean by cluster value and call eucledian distance again
    centroids = []
    c0 = data[data[:,4]== 0.0]
    c1 = data[data[:,4]== 1.0]
    c2 = data[data[:,4]== 2.0]

    center_0 = np.mean(c0[:,0:4], axis=0)
    center_1 = np.mean(c1[:,0:4], axis=0)
    center_2 = np.mean(c2[:,0:4], axis=0)

    print('center_0-->',center_0)
    print('center_1-->',center_1)
    print('center_2-->',center_2)

    # for i in range(num_clusters):
    #     centroids = center_0

    centroids = [center_0, center_1, center_2]
    return centroids

    # mask_list = []
    # for mask_no in range(num_clusters):
    #     mask_list.append(df['cluster'] == mask_no)

    # df_with_centroids = [] 
    # for mask in mask_list:
    #     df_with_centroids.append(df[mask])

    # centers_df = pd.DataFrame([])
    # for dataframe in df_with_centroids:
    #     centers_df.concat(dataframe.mean(axis=1))

    # print(centers_df)
    # return centers_df


#constraint filtering: post k-means

#constraints: must link [1,3,5, 4,2,5]
# k_means(5, df, [], must_link, cannot_link)
# assign initial centres
# assigns clusters to -1
# calculates cluster assignment
# prev_clusters = initial clusters
# while (calculated_clusters = prev_clusters)
#   calculate_new_centroids
#   calculate_eucledian_dist
#   penalize constraints
#   new cluster assignment
k_means(3, df, [])


SyntaxError: invalid syntax (2814763704.py, line 103)