# In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np

# Expects a CSV with a 'subjectArea' column (the field of study) and a
# 'Top_Two_Keywords' column (the keyword text that gets clustered).
# Example: data = pd.read_csv('your_file.csv')
input_file = "data.csv"  # Replace with your CSV file path
data = pd.read_csv(input_file)

# Fail early with an actionable message when a required column is absent,
# instead of a bare KeyError surfacing in the middle of the clustering loop.
required_columns = ("subjectArea", "Top_Two_Keywords")
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise KeyError(
        f"{input_file!r} is missing required column(s) {missing_columns}; "
        f"available columns: {list(data.columns)}"
    )

# TF-IDF vectorizer; ngram_range=(1, 2) captures unigrams and bigrams.
# It is re-fitted for every field of study inside the clustering loop.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Unique fields of study to cluster. Missing values are dropped first: NaN
# never compares equal to itself, so a NaN "field" would select zero rows.
field_of_study_list = data["subjectArea"].dropna().unique()

# Clustering results keyed by field of study
field_clustering_results = {}

# Upper bound on clusters per field (clamped below for small fields).
num_clusters = 5  # You can modify this depending on your data

for field in field_of_study_list:
    # Rows of this field that actually carry keyword text. `.copy()` yields an
    # independent frame, so adding the 'cluster' column below never writes
    # into a slice of `data` (avoids pandas' SettingWithCopyWarning).
    field_mask = (data["subjectArea"] == field) & data["Top_Two_Keywords"].notna()
    field_data = data[field_mask].copy()
    if field_data.empty:
        print(f"Skipping field {field!r}: no rows with keyword text")
        continue

    # Text fed to TF-IDF; cast to str so non-string cells do not break the
    # vectorizer's tokenizer.
    sentences = field_data["Top_Two_Keywords"].astype(str)

    # Compute the TF-IDF matrix (the shared vectorizer is re-fitted per field).
    # fit_transform raises ValueError when no usable token is found; skip that
    # field rather than aborting the remaining ones.
    try:
        tfidf_matrix = vectorizer.fit_transform(sentences)
    except ValueError as err:
        print(f"Skipping field {field!r}: {err}")
        continue

    # KMeans cannot fit more clusters than there are samples, so clamp the
    # requested count to the number of rows in this field.
    n_clusters = min(num_clusters, tfidf_matrix.shape[0])
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(tfidf_matrix)

    # Attach each row's cluster label to this field's frame
    field_data["cluster"] = kmeans.labels_

    # Largest cluster = label with the highest member count
    largest_cluster = np.argmax(np.bincount(kmeans.labels_))

    # Positional indices (within field_data) of the largest cluster's rows.
    # Not used further in this cell; kept as a module-level name.
    largest_cluster_indices = np.where(kmeans.labels_ == largest_cluster)[0]

    # Centroid of the largest cluster, one weight per TF-IDF feature
    centroid = kmeans.cluster_centers_[largest_cluster]

    # Up to 10 features with the highest centroid weight, heaviest first.
    # The feature-name array is fetched once instead of once per keyword.
    top_keywords_indices = centroid.argsort()[-10:][::-1]
    feature_names = vectorizer.get_feature_names_out()
    top_keywords = [feature_names[i] for i in top_keywords_indices]

    # Store the results for this field
    field_clustering_results[field] = {
        'top_keywords': top_keywords,
        'field_data': field_data,
        'largest_cluster': largest_cluster,
        'cluster_centroids': kmeans.cluster_centers_
    }

# Report each field of study with its top keywords, followed by a blank line
for field in field_clustering_results:
    results = field_clustering_results[field]
    print(
        f"Field of Study: {field}",
        f"Top Keywords: {results['top_keywords']}",
        "",
        sep="\n",
    )


# Output of the cell above:
# KeyError: 'Top_Two_Keywords'  (data.csv has no column with this name)