In [30]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [43]:
# Step 1: Load the data from CSV using pandas
def load_data_from_df(data, y=3):
    if y==3:
        return data
    elif y!=3:
        data = data[data['y']==y]["text"]
    texts = data.tolist()  #data.dropna().tolist()  # Ensure there are no NaN values
    return texts

def load_data(task):
    if(task == "news"):
        df_1 = pd.read_csv("..\\data\\orig\\main\\news\\BuzzFeed_fake_news_content.csv")
        df_2 = pd.read_csv("..\\data\\orig\\main\\news\\BuzzFeed_real_news_content.csv")

        df_1['text'] = df_1['title'] + ' ' + df_1['text']
        df_2['text'] = df_2['title'] + ' ' + df_2['text']

        df_1 = df_1[['text']]
        df_2 = df_2[['text']]

        df_1['y'] = 1
        df_2['y'] = 0

        df1_train, df1_test = train_test_split(df_1, random_state=42)
        df2_train, df2_test = train_test_split(df_2, random_state=36)

        df1 = pd.concat([df1_train, df2_train], ignore_index=True)
        df1 = df1.sample(frac=1).reset_index(drop=True)
        df1.dropna(inplace=True)        
        return df1
    
    if(task == "spam"):
        df = pd.read_csv("../data/orig/main/spam/data.csv", encoding='ISO-8859-1')
        df = df[["v1", "v2"]]
        df["v1"] = df["v1"].apply(lambda x: 1 if x=="spam" else 0)
        df.rename(columns={"v1":"y","v2":"text"}, inplace=True)
        df = df.sample(frac=1).reset_index(drop=True)
        df1, df2 = train_test_split(df, test_size=0.2, random_state=65)
        df1.dropna(inplace=True)
        df1 = pd.concat([df1[df1['y']==1], df1[df1['y']==0].sample(n=602)], ignore_index=True)
        return df1
    df = pd.read_csv("../data/orig/main/sentiment/data.csv", encoding='latin-1', header=None)
    df = df[[0,5]]
    df.rename(columns={0:'y',5:'text'}, inplace=True)
    df['y'] = df['y'].apply(lambda x: 1 if x==4 else 0)
    df = df.sample(frac=1).reset_index(drop=True)
    df1, df2  = train_test_split(df, random_state=46, test_size=0.3)
    df1.dropna(inplace=True)
    n = 2500
    df1 = pd.concat([df1[df1['y']==1].sample(n=n), df1[df1['y']==0].sample(n=n)], ignore_index=True)
    return df1


In [46]:


# Step 2: Convert the text data to a TF-IDF matrix
def tfidf_matrix(texts):
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf = vectorizer.fit_transform(texts)
    return tfidf

# Step 3: Determine optimal clusters using the elbow method and silhouette score
def find_optimal_clusters(tfidf_matrix, max_k):
    inertias = []
    silhouette_scores = []
    idx = 0
    max_sil = 0
    main_model = None
    
    for k in range(2, max_k + 1):
        model = KMeans(n_clusters=k)
        model.fit(tfidf_matrix)
        
        inertia = model.inertia_  # Sum of squared distances to nearest cluster center
        inertias.append(inertia)
        
        # Calculate the silhouette score
        silhouette_avg = silhouette_score(tfidf_matrix, model.labels_)
        silhouette_scores.append(silhouette_avg)

        if(silhouette_avg>max_sil):
            idx = k
            max_sil = silhouette_avg
            main_model = model
        
        # print(f"Cluster: {k}, Inertia: {inertia:.5f}, Silhouette Score: {silhouette_avg:.5f}")

    return inertias, silhouette_scores, idx, max_sil, main_model

# Step 4: Plot the elbow graph and silhouette scores
def plot_metrics(inertias, silhouette_scores, max_k):
    ks = range(2, max_k + 1)
    
    plt.figure(figsize=(12, 6))

    # Plot Inertia (Elbow Method)
    plt.subplot(1, 2, 1)
    plt.plot(ks, inertias, marker='o')
    plt.xlabel("Number of Clusters (k)")
    plt.ylabel("Inertia")
    plt.title("Elbow Method for Optimal k")

    # Plot Silhouette Score
    plt.subplot(1, 2, 2)
    plt.plot(ks, silhouette_scores, marker='o', color='green')
    plt.xlabel("Number of Clusters (k)")
    plt.ylabel("Silhouette Score")
    plt.title("Silhouette Score for Optimal k")

    plt.show()

def get_mini_train_data(task):
    return "../data/orig/processed/train/"+task+"-data-mini.csv"

def find_cluster_groupings(task, max_k):
    csv_file = get_mini_train_data(task) # Replace with your CSV file path

    texts = load_data(task)
    texts = texts[texts['y']==0]["text"]
    texts = texts.tolist() 
    tfidf = tfidf_matrix(texts)
    inertias, silhouette_scores, idx, max_sil, model0 = find_optimal_clusters(tfidf, max_k)
    print(f"Task: {task}, Max at {idx}, Silhouette Score: ", max_sil)
    # plot_metrics(inertias, silhouette_scores, max_k)

    texts = load_data(task)    
    texts = texts[texts['y']==1]["text"]
    texts = texts.tolist() 
    tfidf = tfidf_matrix(texts)
    inertias, silhouette_scores, idx, max_sil, model1 = find_optimal_clusters(tfidf, max_k)
    print(f"Task: {task}, Max at {idx}, Silhouette Score: ", max_sil)
    # plot_metrics(inertias, silhouette_scores, max_k)
    return model0, model1


In [33]:
tasks = {
    'news':{
        'max_k':3
    },
    'spam':{
        'max_k':5
    },
    'sentiment':{
        'max_k':7
    }
}

In [47]:
for task in tasks:
    model0, model1 = find_cluster_groupings(task, tasks[task]['max_k'])
    tasks[task]['model0'] = model0
    tasks[task]['model1'] = model1

    df = load_data(task)
    df0 = df[df['y']==0]
    df0['fsl_category'] = tasks[task]['model0'].fit_predict(tfidf_matrix(df0['text']))
    print(df0['fsl_category'].value_counts())
    df1 = df[df['y']==1]
    df1['fsl_category'] = tasks[task]['model1'].fit_predict(tfidf_matrix(df1['text']))
    print(df1['fsl_category'].value_counts())
    print(task)
    df = pd.concat([df1, df0], ignore_index=True)  
    df.to_csv('../data/syn/mid/fsl-cat-'+task+'.csv', index=False)

Task: news, Max at 3, Silhouette Score:  0.008976069687305396
Task: news, Max at 3, Silhouette Score:  0.019413893294439644


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df0['fsl_category'] = tasks[task]['model0'].fit_predict(tfidf_matrix(df0['text']))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['fsl_category'] = tasks[task]['model1'].fit_predict(tfidf_matrix(df1['text']))


fsl_category
0    52
1     8
2     8
Name: count, dtype: int64
fsl_category
1    31
2    23
0    14
Name: count, dtype: int64
news
Task: spam, Max at 4, Silhouette Score:  0.008337666591837307
Task: spam, Max at 5, Silhouette Score:  0.014071664507477865


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df0['fsl_category'] = tasks[task]['model0'].fit_predict(tfidf_matrix(df0['text']))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['fsl_category'] = tasks[task]['model1'].fit_predict(tfidf_matrix(df1['text']))


fsl_category
3    447
2     69
1     62
0     24
Name: count, dtype: int64
fsl_category
2    203
1    190
4    102
3     80
0     31
Name: count, dtype: int64
spam
Task: sentiment, Max at 7, Silhouette Score:  0.006558214956034779
Task: sentiment, Max at 7, Silhouette Score:  0.0077828788292062315
fsl_category
0    1761
2     187
1     180
3     115
5     101
4      88
6      68
Name: count, dtype: int64
fsl_category
2    1766
0     164
3     134
1     131
5     125
6     123
4      57
Name: count, dtype: int64
sentiment


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df0['fsl_category'] = tasks[task]['model0'].fit_predict(tfidf_matrix(df0['text']))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['fsl_category'] = tasks[task]['model1'].fit_predict(tfidf_matrix(df1['text']))
