In [27]:
pip install tweet-preprocessor



In [28]:
import pandas as pd
import preprocessor as p
import random

# Preprocessing

In [29]:
data_url = "https://raw.githubusercontent.com/eliannaevans/kmeans_tweet_clustering/af88a362c710aeb7733eb47deca085da59fd95c3/bbchealth.txt"
data = pd.read_csv(data_url, sep='|', names=['id','time','tweet'])
data

Unnamed: 0,id,time,tweet
0,585978391360221184,Thu Apr 09 01:31:50 +0000 2015,Breast cancer risk test devised http://bbc.in/...
1,585947808772960257,Wed Apr 08 23:30:18 +0000 2015,GP workload harming care - BMA poll http://bbc...
2,585947807816650752,Wed Apr 08 23:30:18 +0000 2015,Short people's 'heart risk greater' http://bbc...
3,585866060991078401,Wed Apr 08 18:05:28 +0000 2015,New approach against HIV 'promising' http://bb...
4,585794106170839041,Wed Apr 08 13:19:33 +0000 2015,Coalition 'undermined NHS' - doctors http://bb...
...,...,...,...
3924,384766023120871424,Mon Sep 30 19:45:43 +0000 2013,Baby born after ovaries 'reawakened' http://bb...
3925,384700230920175617,Mon Sep 30 15:24:17 +0000 2013,Identical triplets born against odds http://bb...
3926,384678543088562178,Mon Sep 30 13:58:06 +0000 2013,Hospital failed to make improvements http://bb...
3927,384678542455222273,Mon Sep 30 13:58:06 +0000 2013,New patient targets pledge for NHS http://bbc....


In [30]:
# remove id and timestamp
data = data.drop(labels=['id', 'time'], axis=1)
data

Unnamed: 0,tweet
0,Breast cancer risk test devised http://bbc.in/...
1,GP workload harming care - BMA poll http://bbc...
2,Short people's 'heart risk greater' http://bbc...
3,New approach against HIV 'promising' http://bb...
4,Coalition 'undermined NHS' - doctors http://bb...
...,...
3924,Baby born after ovaries 'reawakened' http://bb...
3925,Identical triplets born against odds http://bb...
3926,Hospital failed to make improvements http://bb...
3927,New patient targets pledge for NHS http://bbc....


In [31]:
p.set_options(p.OPT.MENTION, p.OPT.URL)
cleaned_tweets = data

for i, tweet in enumerate(data['tweet']):
  tweet = tweet.lower() # bring all text to lowercase
  tweet.replace('#', '') # remove hash symbol
  cleaned_tweets.loc[i, 'tweet'] = p.clean(tweet) # remove @ mentions, urls

cleaned_tweets

Unnamed: 0,tweet
0,breast cancer risk test devised
1,gp workload harming care - bma poll
2,short people's 'heart risk greater'
3,new approach against hiv 'promising'
4,coalition 'undermined nhs' - doctors
...,...
3924,baby born after ovaries 'reawakened'
3925,identical triplets born against odds
3926,hospital failed to make improvements
3927,new patient targets pledge for nhs


# K-means clustering functions

In [32]:
def init_centroids(cleaned_tweets, k): # pick random points to be the first centroids
  centroids = []
  for cluster in range(k):
    row = None
    while True:
      row = random.randint(0, len(cleaned_tweets.index))
      if row not in centroids: break
    centroids.append(row)
  print("Initial centroids picked")
  return centroids

In [33]:
# Jaccard distance function
def jaccard_distance(tweet1, tweet2):
  return 1 - (1.0*intersection(tweet1, tweet2) / union(tweet1, tweet2))

# intersection function
def intersection(tweet1, tweet2):
  intersection = 0
  tweet1_words = list(dict.fromkeys(tweet1.split())) # remove duplicate words
  tweet2_words = list(dict.fromkeys(tweet2.split()))

  for word in tweet1_words:
    if word in tweet2_words:
        intersection += 1

  return intersection

# union function
def union(tweet1, tweet2):
  tweet1_words = list(dict.fromkeys(tweet1.split())) # remove duplicate words
  tweet2_words = list(dict.fromkeys(tweet2.split()))
  union = len(tweet1_words) + len(tweet2_words)

  for word1 in tweet1_words:
    for word2 in tweet2_words:
      if word1 == word2: union -= 1

  return union

In [34]:
# function to get tweet given row_index
def twt(cleaned_tweets, row_index):
  return cleaned_tweets.loc[row_index, 'tweet']

In [35]:
def assign_cluster(cleaned_tweets, centroids):
  # create 'clusters' via dictionary to be added to
  cluster_dict = {}
  for centroid in centroids:
    cluster_dict[centroid] = []

  # assign points to clusters based on centroid distance
  for row_index in range(len(cleaned_tweets.index)):
    smallest_distance = jaccard_distance(twt(cleaned_tweets, centroids[0]), twt(cleaned_tweets, row_index)) # auto set smallest distance to first centroid
    closest_centroid = centroids[0]

    # find closest centroid to given tweet
    for centroid in centroids:
      distance = jaccard_distance(twt(cleaned_tweets, centroid), twt(cleaned_tweets, row_index))
      if distance < smallest_distance:
        smallest_distance = distance
        closest_centroid = centroid

    # assign tweet to respective cluster
    cluster_dict[closest_centroid].append(row_index)

  print("Cluster assigned")
  return cluster_dict

In [36]:
# compute sum of squared error
def sse(cleaned_tweets, cluster_dict):
  sse = 0
  for centroid in list(cluster_dict.keys()):
    for tweet in cluster_dict[centroid]:
      sse += (jaccard_distance(twt(cleaned_tweets, centroid), twt(cleaned_tweets, tweet)))**2.0
  return sse

In [37]:
def new_centroid(cleaned_tweets, cluster_list):
  new_centroid = cluster_list[0]
  smallest_avg_distance = 0
  for point in cluster_list:
    smallest_avg_distance += jaccard_distance(twt(cleaned_tweets, new_centroid), twt(cleaned_tweets, point))
  smallest_avg_distance /= 1.0*len(cluster_list)

  for pot_centroid in cluster_list:
    avg_distance = 0
    for point in cluster_list:
      avg_distance += jaccard_distance(twt(cleaned_tweets, pot_centroid), twt(cleaned_tweets, point))
    avg_distance /= 1.0*len(cluster_list)
    if avg_distance < smallest_avg_distance:
      smallest_avg_distance = avg_distance
      new_centroid = pot_centroid
    
  return new_centroid

def find_centroids(cleaned_tweets, cluster_dict): # reassign centroids based on cluster average distance
  new_cluster_dict = {}
  for old_centroid in list(cluster_dict.keys()):
    cluster = cluster_dict[old_centroid]
    new_cluster_dict[new_centroid(cleaned_tweets, cluster)] = cluster_dict[old_centroid]

  print("New centroids calculated")
  return new_cluster_dict

In [53]:
def get_cluster_sizes(cluster_dict):
  cluster_sizes = {}
  for centroid, cluster_num in zip(list(cluster_dict.keys()), range(1, len(list(cluster_dict.keys())) + 1)):
    cluster_sizes[cluster_num] = len(cluster_dict[centroid])
  return cluster_sizes

In [55]:
# function for running the k-means clustering algorithm
def k_means_cluster(cleaned_tweets, k):
  centroids = init_centroids(cleaned_tweets, k) # initialize random points as centroids
  while True:
    cluster_dict = assign_cluster(cleaned_tweets, centroids) # assign points to each centroid, getting dictionary with centroids as keys, member points as values
    new_cluster_dict = find_centroids(cleaned_tweets, cluster_dict) # reassign centroid as smallest distance to all member tweets
    centroids = list(new_cluster_dict.keys())
    print(f"SSE : {sse(cleaned_tweets, cluster_dict)}")
    if cluster_dict == new_cluster_dict: break # centroids have not changed, clustering is complete
    else: cluster_dict = new_cluster_dict # another iteration is required
  return {'Value of K': k, 'SSE': sse(cleaned_tweets, cluster_dict), 'Size of each cluster': get_cluster_sizes(cluster_dict)}

# Run K-means clustering

In [42]:
table = pd.DataFrame(columns=['Value of K', 'SSE', 'Size of each cluster'])
'''for k in [2, 5, 10, 20, 50, 100]:
  table.append(k_means_cluster(cleaned_tweets, k))
  print(f"Finished k = {k}")

table'''

'for k in [2, 5, 10, 20, 50, 100]:\n  table.append(k_means_cluster(cleaned_tweets, k))\n  print(f"Finished k = {k}")\n\ntable'

In [48]:
k_10_dict = k_means_cluster(cleaned_tweets, 10) # rumtime approx. 5 minutes
table = table.append(k_10_dict, ignore_index=True)
table

Initial centroids picked
Cluster assigned
New centroids calculated
Cluster assigned
New centroids calculated
Cluster assigned
New centroids calculated


Unnamed: 0,Value of K,SSE,Size of each cluster


In [49]:
k_10_dict

{'SSE': 3279.904661786569,
 'Size of each cluster': {0: 1689,
  1: 679,
  2: 160,
  3: 316,
  4: 466,
  5: 323,
  6: 132,
  7: 70,
  8: 20,
  9: 74},
 'Value of K': 10}

In [50]:
k_20_dict = k_means_cluster(cleaned_tweets, 20) # runtime approx. 2 minutes
table = table.append(k_20_dict, ignore_index=True)
table

Initial centroids picked
Cluster assigned
New centroids calculated
Cluster assigned
New centroids calculated
Cluster assigned
New centroids calculated


Unnamed: 0,Value of K,SSE,Size of each cluster
0,20,3165.994578,"{0: 1066, 1: 555, 2: 212, 3: 118, 4: 59, 5: 32..."


In [52]:
k_20_dict

{'SSE': 3165.9945778208366,
 'Size of each cluster': {0: 1066,
  1: 555,
  2: 212,
  3: 118,
  4: 59,
  5: 322,
  6: 266,
  7: 253,
  8: 52,
  9: 239,
  10: 210,
  11: 41,
  12: 52,
  13: 305,
  14: 16,
  15: 37,
  16: 23,
  17: 29,
  18: 19,
  19: 55},
 'Value of K': 20}

In [54]:
k_50_dict = k_means_cluster(cleaned_tweets, 50) # runtime approx. 1 minute
table = table.append(k_50_dict, ignore_index=True)
print(k_50_dict)
table

Initial centroids picked
Cluster assigned
New centroids calculated
Cluster assigned
New centroids calculated
Cluster assigned
New centroids calculated
{'Value of K': 50, 'SSE': 2956.6062790790884, 'Size of each cluster': {1: 634, 2: 111, 3: 93, 4: 7, 5: 42, 6: 35, 7: 324, 8: 97, 9: 17, 10: 112, 11: 25, 12: 32, 13: 49, 14: 44, 15: 25, 16: 18, 17: 4, 18: 34, 19: 59, 20: 224, 21: 65, 22: 212, 23: 100, 24: 14, 25: 21, 26: 62, 27: 49, 28: 9, 29: 229, 30: 29, 31: 138, 32: 23, 33: 20, 34: 24, 35: 17, 36: 93, 37: 10, 38: 43, 39: 58, 40: 119, 41: 103, 42: 47, 43: 88, 44: 9, 45: 22, 46: 8, 47: 61, 48: 183, 49: 44, 50: 43}}


Unnamed: 0,Value of K,SSE,Size of each cluster
0,20,3165.994578,"{0: 1066, 1: 555, 2: 212, 3: 118, 4: 59, 5: 32..."
1,10,3279.904662,"{0: 1689, 1: 679, 2: 160, 3: 316, 4: 466, 5: 3..."
2,50,2956.606279,"{1: 634, 2: 111, 3: 93, 4: 7, 5: 42, 6: 35, 7:..."


In [56]:
k_75_dict = k_means_cluster(cleaned_tweets, 75) # runtime approx. 1 minute
table = table.append(k_75_dict, ignore_index=True)
print(k_75_dict)
table

Initial centroids picked
Cluster assigned
New centroids calculated
SSE : 2981.7272247368487
Cluster assigned
New centroids calculated
SSE : 2872.8246808665217
Cluster assigned
New centroids calculated
SSE : 2863.884970034587
{'Value of K': 75, 'SSE': 2863.884970034587, 'Size of each cluster': {1: 358, 2: 7, 3: 10, 4: 22, 5: 65, 6: 126, 7: 17, 8: 52, 9: 54, 10: 134, 11: 134, 12: 26, 13: 44, 14: 18, 15: 129, 16: 16, 17: 11, 18: 90, 19: 71, 20: 26, 21: 14, 22: 14, 23: 100, 24: 8, 25: 14, 26: 33, 27: 104, 28: 29, 29: 14, 30: 130, 31: 22, 32: 26, 33: 31, 34: 4, 35: 7, 36: 10, 37: 9, 38: 48, 39: 153, 40: 16, 41: 157, 42: 6, 43: 200, 44: 29, 45: 15, 46: 26, 47: 7, 48: 34, 49: 20, 50: 21, 51: 79, 52: 38, 53: 96, 54: 14, 55: 10, 56: 16, 57: 19, 58: 49, 59: 38, 60: 19, 61: 29, 62: 22, 63: 121, 64: 22, 65: 37, 66: 218, 67: 19, 68: 24, 69: 13, 70: 47, 71: 4, 72: 66, 73: 11, 74: 22, 75: 185}}


Unnamed: 0,Value of K,SSE,Size of each cluster
0,20,3165.994578,"{0: 1066, 1: 555, 2: 212, 3: 118, 4: 59, 5: 32..."
1,10,3279.904662,"{0: 1689, 1: 679, 2: 160, 3: 316, 4: 466, 5: 3..."
2,50,2956.606279,"{1: 634, 2: 111, 3: 93, 4: 7, 5: 42, 6: 35, 7:..."
3,75,2863.88497,"{1: 358, 2: 7, 3: 10, 4: 22, 5: 65, 6: 126, 7:..."


In [57]:
k_100_dict = k_means_cluster(cleaned_tweets, 100) # runtime approx. 1 minute
table = table.append(k_100_dict, ignore_index=True)
print(k_100_dict)
table

Initial centroids picked
Cluster assigned
New centroids calculated
SSE : 2880.152715819818
Cluster assigned
New centroids calculated
SSE : 2800.7582888822503
Cluster assigned
New centroids calculated
SSE : 2797.411761199592
{'Value of K': 100, 'SSE': 2797.411761199592, 'Size of each cluster': {1: 286, 2: 6, 3: 14, 4: 31, 5: 15, 6: 44, 7: 27, 8: 39, 9: 77, 10: 6, 11: 11, 12: 60, 13: 35, 14: 105, 15: 10, 16: 323, 17: 149, 18: 34, 19: 23, 20: 17, 21: 40, 22: 24, 23: 26, 24: 35, 25: 7, 26: 96, 27: 38, 28: 3, 29: 4, 30: 9, 31: 10, 32: 71, 33: 22, 34: 47, 35: 4, 36: 11, 37: 90, 38: 13, 39: 65, 40: 7, 41: 59, 42: 67, 43: 10, 44: 55, 45: 15, 46: 30, 47: 132, 48: 40, 49: 12, 50: 8, 51: 179, 52: 26, 53: 175, 54: 10, 55: 46, 56: 51, 57: 56, 58: 11, 59: 18, 60: 25, 61: 6, 62: 14, 63: 5, 64: 10, 65: 14, 66: 14, 67: 99, 68: 46, 69: 34, 70: 14, 71: 15, 72: 83, 73: 3, 74: 8, 75: 18, 76: 10, 77: 11, 78: 6, 79: 57, 80: 28, 81: 2, 82: 26, 83: 35, 84: 79, 85: 22, 86: 47, 87: 12, 88: 36, 89: 43, 90: 24, 91

Unnamed: 0,Value of K,SSE,Size of each cluster
0,20,3165.994578,"{0: 1066, 1: 555, 2: 212, 3: 118, 4: 59, 5: 32..."
1,10,3279.904662,"{0: 1689, 1: 679, 2: 160, 3: 316, 4: 466, 5: 3..."
2,50,2956.606279,"{1: 634, 2: 111, 3: 93, 4: 7, 5: 42, 6: 35, 7:..."
3,75,2863.88497,"{1: 358, 2: 7, 3: 10, 4: 22, 5: 65, 6: 126, 7:..."
4,100,2797.411761,"{1: 286, 2: 6, 3: 14, 4: 31, 5: 15, 6: 44, 7: ..."


In [58]:
k_200_dict = k_means_cluster(cleaned_tweets, 200) # runtime approx. 2 minutes
table = table.append(k_200_dict, ignore_index=True)
print(k_200_dict)
table

Initial centroids picked
Cluster assigned
New centroids calculated
SSE : 2657.9659112467966
Cluster assigned
New centroids calculated
SSE : 2602.8524768320613
Cluster assigned
New centroids calculated
SSE : 2592.0857182769487
Cluster assigned
New centroids calculated
SSE : 2590.9611364740717
Cluster assigned
New centroids calculated
SSE : 2588.1119708875726
Cluster assigned
New centroids calculated
SSE : 2587.6209763383895
{'Value of K': 200, 'SSE': 2587.6209763383895, 'Size of each cluster': {1: 199, 2: 43, 3: 11, 4: 13, 5: 115, 6: 71, 7: 53, 8: 13, 9: 12, 10: 26, 11: 18, 12: 39, 13: 71, 14: 27, 15: 9, 16: 2, 17: 3, 18: 11, 19: 154, 20: 3, 21: 19, 22: 6, 23: 11, 24: 48, 25: 3, 26: 36, 27: 3, 28: 34, 29: 8, 30: 41, 31: 40, 32: 11, 33: 9, 34: 88, 35: 11, 36: 118, 37: 14, 38: 26, 39: 6, 40: 58, 41: 56, 42: 3, 43: 18, 44: 3, 45: 21, 46: 19, 47: 27, 48: 104, 49: 36, 50: 21, 51: 14, 52: 33, 53: 8, 54: 1, 55: 24, 56: 5, 57: 40, 58: 14, 59: 87, 60: 4, 61: 11, 62: 16, 63: 6, 64: 104, 65: 14, 6

Unnamed: 0,Value of K,SSE,Size of each cluster
0,20,3165.994578,"{0: 1066, 1: 555, 2: 212, 3: 118, 4: 59, 5: 32..."
1,10,3279.904662,"{0: 1689, 1: 679, 2: 160, 3: 316, 4: 466, 5: 3..."
2,50,2956.606279,"{1: 634, 2: 111, 3: 93, 4: 7, 5: 42, 6: 35, 7:..."
3,75,2863.88497,"{1: 358, 2: 7, 3: 10, 4: 22, 5: 65, 6: 126, 7:..."
4,100,2797.411761,"{1: 286, 2: 6, 3: 14, 4: 31, 5: 15, 6: 44, 7: ..."
5,200,2587.620976,"{1: 199, 2: 43, 3: 11, 4: 13, 5: 115, 6: 71, 7..."


In [59]:
k_500_dict = k_means_cluster(cleaned_tweets, 500) # runtime approx. 5 minutes
table = table.append(k_500_dict, ignore_index=True)
print(k_500_dict)
table

Initial centroids picked
Cluster assigned
New centroids calculated
SSE : 2278.444182942876
Cluster assigned
New centroids calculated
SSE : 2220.403787579672
Cluster assigned
New centroids calculated
SSE : 2207.763254100611
Cluster assigned
New centroids calculated
SSE : 2202.691794662369
Cluster assigned
New centroids calculated
SSE : 2200.832655996093
Cluster assigned
New centroids calculated
SSE : 2199.1060343908016
{'Value of K': 500, 'SSE': 2199.1060343908016, 'Size of each cluster': {1: 38, 2: 12, 3: 4, 4: 15, 5: 7, 6: 18, 7: 8, 8: 32, 9: 9, 10: 4, 11: 13, 12: 1, 13: 14, 14: 3, 15: 3, 16: 17, 17: 34, 18: 14, 19: 1, 20: 5, 21: 2, 22: 2, 23: 7, 24: 21, 25: 5, 26: 6, 27: 9, 28: 5, 29: 5, 30: 10, 31: 4, 32: 4, 33: 8, 34: 4, 35: 5, 36: 10, 37: 7, 38: 1, 39: 6, 40: 8, 41: 1, 42: 18, 43: 66, 44: 4, 45: 8, 46: 4, 47: 4, 48: 1, 49: 15, 50: 3, 51: 14, 52: 11, 53: 19, 54: 5, 55: 3, 56: 17, 57: 1, 58: 1, 59: 18, 60: 18, 61: 29, 62: 23, 63: 7, 64: 10, 65: 13, 66: 4, 67: 15, 68: 11, 69: 12, 70:

Unnamed: 0,Value of K,SSE,Size of each cluster
0,20,3165.994578,"{0: 1066, 1: 555, 2: 212, 3: 118, 4: 59, 5: 32..."
1,10,3279.904662,"{0: 1689, 1: 679, 2: 160, 3: 316, 4: 466, 5: 3..."
2,50,2956.606279,"{1: 634, 2: 111, 3: 93, 4: 7, 5: 42, 6: 35, 7:..."
3,75,2863.88497,"{1: 358, 2: 7, 3: 10, 4: 22, 5: 65, 6: 126, 7:..."
4,100,2797.411761,"{1: 286, 2: 6, 3: 14, 4: 31, 5: 15, 6: 44, 7: ..."
5,200,2587.620976,"{1: 199, 2: 43, 3: 11, 4: 13, 5: 115, 6: 71, 7..."
6,500,2199.106034,"{1: 38, 2: 12, 3: 4, 4: 15, 5: 7, 6: 18, 7: 8,..."


In [60]:
table.to_excel('table.xlsx')