In [1]:
import json
import sys

In [2]:
# Import the tweet files
with open("./tweets.json", "r") as read_file:
    data = json.load(read_file)

with open("./initialSeeds.txt") as s:
    seeds = list(s)

In [3]:
# Put seeds into list of ints
seeds = list(map(str.rstrip, seeds))
seeds = [seed.replace(',', '') for seed in seeds]
seeds = list(map(int, seeds))


In [4]:
# Turn tweet text string into list of words
for tweet in data:

    # isolate text, make all lowercase, and split on whitespace
    tweet['text'] = tweet['text'].lower().split()
    
    # making the decision to leave in hashtags and handles
    # and remove retweet from 0th position
    if tweet['text'][0] == 'rt':
        tweet['text'] = tweet['text'][1:]

In [5]:
# Helper functions to calculate jaccard distance

def intersect(a, b):
    intersection = [value for value in a if value in b] 
    return len(intersection)

def union(a, b): 
    union = list(set(a) | set(b)) 
    return len(union)

def jaccard(a, b):
    # remove duplicated words in a list
    a = set(a)
    b = set(b)
    return (1. - abs(float(intersect(a, b)))/abs(float(union(a, b))))

In [6]:
# Testing

print(intersect(['the', 'long', 'march'],['ides', 'of', 'march']), 'should be 1')
print(union(['the', 'long', 'march'],['ides', 'of', 'march']), 'should be 5')
print(jaccard(['the', 'long', 'march'],['ides', 'of', 'march']), 'should be 0.8')
print(jaccard(['the', 'long', 'march'],['the', 'long', 'march']), 'should be 0.0')
print(jaccard(['the', 'long', 'march'],['a', 'short', 'april']), 'should be 1.0')

1 should be 1
5 should be 5
0.8 should be 0.8
0.0 should be 0.0
1.0 should be 1.0


In [7]:
K = 25

In [8]:
centroid_list = seeds[0:K]
# print(centroidList)

In [9]:
# Helper function to get full tweet object from id:

def getTweet(id):
    return next((x for x in data if x['id'] == id), None)

jaccard(['the', 'long', 'march'],['a', 'short', 'april'])

1.0

In [10]:
# Loop through tweets and assign them a centroid.
def assign_tweets_to_centroids(centList):
    for tweet in data:

        # if already a centroid, set its assigned centroid to itself and go to next tweet
        if tweet['id'] in centList:
            tweet['centroid'] = tweet['id']
            continue

        min_dist = 1 # set our starting Jaccard dist to 1 (max)

        for centroid in centList:

            # calculate the jacard distance to each centroid
            dist = jaccard(tweet['text'],getTweet(centroid)['text'])

            # if it's less than the already assigned centroid, reset it
            if dist < min_dist:
                tweet['centroid'] = centroid        

# assign_tweets_to_centroids()

In [11]:
# for centroid in centroidList:
#     print(centroid)
# cluster = (tweet for tweet in data if tweet['centroid'] == centroid)
# for x in cluster:
#     print(x)

In [12]:
# find k cluster centers

def get_cluster(centroid):
    return (tweet for tweet in data if tweet['centroid'] == centroid)

def update_centroids(K, centList):
    
    new_centroid_list = []
    
    # for each centroid group defined by the id of the centroid
    # loop through each point in the group and find it's average 
    # distance from all the other points in the group
    for centroid in centList:
        
        # only want tweets corresponding to this centroid
        cluster_size = sum(1 for i in get_cluster(centroid))
        
        # initial values
        new_centroid_id = centroid
        min_avg_dist = 1
        
        # loop through the tweets in the cluster
        for current_tweet in get_cluster(centroid):

            #init total dist for min_avg_dist calculation
            total_dist = 0
            
            # compare each one to all the others in the cluster
            for comparison_tweet in get_cluster(centroid):
                total_dist += jaccard(current_tweet['text'], comparison_tweet['text'])
            
            # calculate avg_dist
            avg_dist = total_dist / cluster_size
            
            # update min_avg_dist and new_centroid_id if necessary
            if avg_dist < min_avg_dist:
                min_avg_dist = avg_dist
                new_centroid_id = current_tweet['id']
        
        # create the new centroid list from the new_centroid_id's
        new_centroid_list.append(new_centroid_id)
        
    print('{0}/{1} new centroids chosen'.format(K-intersect(new_centroid_list, centList), len(centList)))
    
    return new_centroid_list
                
        
           
# print(intersect(update_centroids(K, centroidList), centroidList))
# assign_tweets_to_centroids(centroid_list)
# print(centroid_list)



In [13]:
# RUN THE ALGORITHM

if len(centroid_list) > K:
    print('You must supply at least K seed values')
    sys.exit()

if K<2:
    print('K must be an integer value greater than 1')
    sys.exit()
    
# initialize centroids    
assign_tweets_to_centroids(centroid_list)

# While our centroids are still updating
while (intersect(update_centroids(K, centroid_list), centroid_list) < K):

    
    
    # update the centroid_list
    centroid_list = update_centroids(K, centroid_list)
    
    # assign tweets to their closest centroid
    assign_tweets_to_centroids(centroid_list)

print('CONVERGENCE!')

4/24 new centroids chosen
4/24 new centroids chosen
1/24 new centroids chosen
1/24 new centroids chosen
0/24 new centroids chosen
CONVERGENCE!


In [14]:
# Visualize initial clustering
# print(centroid_list)

# centroid_count = [0]*K

# for tweet in data:
#     for x in range(len(centroid_list)):
#         if tweet['centroid'] == centroid_list[x]:
#             centroid_count[x] += 1
# print(centroid_count)        

In [15]:
# for centroid in centroid_list:
#     cluster_list = []
#     for current_tweet in get_cluster(centroid):
#         cluster_list.append(current_tweet['id'])

In [16]:
clusters = []
for centroid in centroid_list:
    cluster = []
    for tweet in get_cluster(centroid):
        cluster.append(tweet['id'])
    clusters.append(cluster)
    
f = open("output.txt","w+")
for idx, centroid in enumerate(centroid_list):
    tweets = ', '.join(str(s) for s in clusters[idx])
    f.write('{0} {1}\n'.format(idx, tweets))
f.close()