In [1]:
import numpy as np
import re
import pandas as pd
from collections import Counter
from math import log

In [2]:
# Sample tweets used as the input corpus (replace with your actual dataset).
# Each entry is one raw, uncleaned tweet: URLs, @mentions, #hashtags and
# punctuation are all still present at this point.
tweets = [
    "@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/iFz9FAn2Pa and https://t.co/xX6ghGFzCC and https://t.co/I2NlzdxNo8 advice Talk to your neighbours family to exchange phone numbers create contact list with phone numbers of neighbours schools employer chemist GP set up online shopping accounts if poss adequate supplies of regular meds but not over order",
    "Coronavirus Australia: Woolworths to give elderly, disabled dedicated shopping hours amid COVID-19 outbreak https://t.co/bInCA9Vp8P",
    "\"My food stock is not the only one which is empty... PLEASE, don't panic, THERE WILL BE ENOUGH FOOD FOR EVERYONE if you do not take more than you need. Stay calm, stay safe. #COVID19france #COVID_19 #COVID19 #coronavirus #confinement #Confinementotal #ConfinementGeneral https://t.co/zrlG0Z520j\"",
    "\"Me, ready to go at supermarket during the #COVID19 outbreak. Not because I'm paranoid, but because my food stock is litteraly empty. The #coronavirus is a serious thing, but please, don't panic. It causes shortage... #CoronavirusFrance #restezchezvous #StayAtHome #confinement https://t.co/usmuaLq72n\"",
    "As news of the region’s first confirmed COVID-19 case came out of Sullivan County last week, people flocked to area stores to purchase cleaning supplies, hand sanitizer, food, toilet paper and other goods, @Tim_Dodson reports https://t.co/cfXch7a2lU",
    "Cashier at grocery store was sharing his insights on #Covid_19 To prove his credibility he commented \"I'm in Civics class so I know what I'm talking about\". https://t.co/ieFDNeHgDO"
]

In [3]:
# Step 1: Data Cleaning
def clean_text(text):
    """Strip tweet noise from ``text`` and return it lower-cased.

    The following are deleted, in this order: URLs (anything starting with
    ``http`` up to the next whitespace), ``@mentions`` and ``#hashtags``
    (the whole word, not just the symbol), and finally every character that
    is neither an ASCII letter nor whitespace. Whitespace itself is left
    untouched, so runs of spaces may remain where something was removed.
    """
    removal_patterns = (
        r'http\S+',       # URLs
        r'@\S+|#\S+',     # mentions (@username) and hashtags (#hashtag)
        r'[^a-zA-Z\s]',   # digits, punctuation and any other non-letter
    )
    # Order matters: URLs and tags must go before punctuation is stripped,
    # otherwise their leading markers would already be gone.
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    return text.lower()

In [4]:
# Run every raw tweet through the cleaner, preserving the original order
cleaned_tweets = list(map(clean_text, tweets))

In [5]:
# Step 2: Tokenization
def tokenize(text):
    """Split ``text`` on runs of whitespace and return the list of tokens.

    Leading/trailing whitespace produces no empty tokens; an empty or
    all-whitespace string yields an empty list.
    """
    tokens = text.split()
    return tokens

In [6]:
# One token list per cleaned tweet, in the same order as `cleaned_tweets`
tokenized_text = list(map(tokenize, cleaned_tweets))

In [7]:
# Step 3: Term Frequency Calculation
def calculate_tf(tokens):
    """Return the relative frequency of each distinct term in ``tokens``.

    The result maps term -> (occurrences of term) / (total number of tokens),
    with terms in order of first appearance. An empty token list gives an
    empty dict.
    """
    n_tokens = len(tokens)
    frequencies = {}
    for term, occurrences in Counter(tokens).items():
        frequencies[term] = occurrences / n_tokens
    return frequencies

In [8]:
# Per-document term-frequency dicts, aligned with `tokenized_text`
tf_scores = list(map(calculate_tf, tokenized_text))

In [9]:
# Step 4: TF-IDF Calculation -- inverse document frequency (IDF) component
def calculate_idf(documents, term):
    """Return the inverse document frequency of ``term`` across ``documents``.

    A document counts as containing the term when ``term in doc`` is true.
    The value is the natural log of (number of documents) / (number of
    documents containing the term); when no document contains the term the
    result is 0.
    """
    containing = len([doc for doc in documents if term in doc])
    if containing == 0:
        # Term is absent everywhere: avoid dividing by zero
        return 0
    return log(len(documents) / containing)

In [10]:
def calculate_tfidf(documents, tf):
    """Weight one document's term frequencies by inverse document frequency.

    ``tf`` maps term -> term frequency for a single document; ``documents``
    is the whole corpus handed to ``calculate_idf``. Returns a dict with the
    same terms, in the same order, mapped to ``tf * idf``.
    """
    return {
        term: frequency * calculate_idf(documents, term)
        for term, frequency in tf.items()
    }

In [11]:
# TF-IDF weights for each document; IDF is computed against the full tokenized corpus
tfidf_scores = [
    calculate_tfidf(tokenized_text, doc_tf)
    for doc_tf in tf_scores
]

In [12]:
# Step 5: Vectorize the data (one row per tweet, one column per term)
# Vocabulary: every distinct term seen in any document, in sorted order
all_terms = sorted({term for doc_tf in tf_scores for term in doc_tf})

# Each row holds a document's TF-IDF weights in vocabulary order;
# terms the document does not contain get a weight of 0
vectorized_data = [
    [doc_tfidf.get(term, 0) for term in all_terms]
    for doc_tfidf in tfidf_scores
]

# Convert the nested lists into a numpy array for the clustering step
X = np.array(vectorized_data)

In [13]:
# Step 6: K-means Clustering
def k_means(X, k, max_iters=100, random_state=None):
    """Cluster the rows of ``X`` into ``k`` groups with Lloyd's k-means.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to cluster. It is converted to a float array internally; the
        caller's object is never modified.
    k : int
        Number of clusters; must satisfy ``1 <= k <= n_samples``.
    max_iters : int, default 100
        Upper bound on assignment/update rounds.
    random_state : int, optional
        Seed for choosing the initial centroids. When ``None`` (default) the
        global ``np.random`` state is used, so ``np.random.seed`` still
        controls the result.

    Returns
    -------
    centroids : ndarray of shape (k, n_features), float
        Final cluster centres.
    clusters : ndarray of shape (n_samples,), float
        Index of the centroid each row was assigned to. Stored as floats
        (as before) so existing callers that do ``int(label)`` keep working.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D or ``k`` is outside ``1..n_samples``.
    """
    # Work in floating point: with an integer X the centroid array would
    # inherit the integer dtype and every mean written into it would be
    # silently truncated.
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")

    n_samples = X.shape[0]
    if not 1 <= k <= n_samples:
        raise ValueError(
            f"k must be between 1 and the number of samples ({n_samples}), got {k}"
        )

    # Initialise centroids from k distinct data points (fancy indexing copies,
    # so updating centroids below never writes into X).
    sampler = np.random if random_state is None else np.random.default_rng(random_state)
    centroids = X[sampler.choice(n_samples, k, replace=False)]
    clusters = np.zeros(n_samples)

    for _ in range(max_iters):
        # Assignment step: distance from every point to every centroid in one
        # shot, shape (n_samples, k); ties go to the lowest centroid index.
        distances = np.linalg.norm(
            X[:, np.newaxis, :] - centroids[np.newaxis, :, :], axis=2
        )
        clusters[:] = np.argmin(distances, axis=1)

        # Update step: move each centroid to the mean of its members.
        prev_centroids = centroids.copy()
        for i in range(k):
            members = X[clusters == i]
            # An empty cluster keeps its previous centroid
            if len(members) > 0:
                centroids[i] = members.mean(axis=0)

        # Converged once a full round leaves every centroid unchanged
        if np.array_equal(centroids, prev_centroids):
            break

    return centroids, clusters

In [14]:
# Apply K-means clustering
k = 2  # Example: We choose 2 clusters
# k_means picks its starting centroids at random; seed NumPy's global RNG so
# that re-running the notebook from a fresh kernel yields the same clusters.
np.random.seed(42)
centroids, clusters = k_means(X, k)

In [15]:
# Step 7: Display the clusters
# Print one line per tweet, numbering tweets from 1. Labels are cast to int
# for display because k_means stores them in a float array.
for i, cluster in enumerate(clusters):
    print(f"Tweet {i + 1}: Cluster {int(cluster)}")

Tweet 1: Cluster 0
Tweet 2: Cluster 0
Tweet 3: Cluster 1
Tweet 4: Cluster 0
Tweet 5: Cluster 0
Tweet 6: Cluster 0
