In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np

# Assuming you have a DataFrame data with columns 'Extracted_keyword' and 'field_of_study'
# Example: data = pd.read_csv('your_file.csv')
input_file = "data_chula.csv"  # Replace with your CSV file path
data = pd.read_csv(input_file)
# Prepare the TfidfVectorizer with ngram_range to capture bigrams or trigrams
vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # (1, 2) captures unigrams and bigrams

# Apply KMeans clustering for each field of study
field_of_study_list = data["subjectArea"].unique()  # List of unique fields of study

# Create a dictionary to store the clustering results by field of study
field_clustering_results = {}

for field in field_of_study_list:
    # Filter the data for the current field of study
    field_data = data[data["subjectArea"] == field]
    
    # Prepare the text data for TF-IDF
    sentences = field_data["extracted_keywords"]
    
    # Compute the TF-IDF matrix
    tfidf_matrix = vectorizer.fit_transform(sentences)
    
    # Apply KMeans clustering (choose the number of clusters)
    num_clusters = 5  # You can modify this depending on your data
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(tfidf_matrix)
    
    # Add the cluster labels to the DataFrame for the current field of study
    field_data['cluster'] = kmeans.labels_
    
    # Identify the largest cluster by size
    largest_cluster = np.argmax(np.bincount(kmeans.labels_))
    
    # Get the indices of the largest cluster
    largest_cluster_indices = np.where(kmeans.labels_ == largest_cluster)[0]
    
    # Find the centroid of the largest cluster
    centroid = kmeans.cluster_centers_[largest_cluster]
    
    # Get the top keywords of the centroid (sorted by their importance)
    top_keywords_indices = centroid.argsort()[-10:][::-1]  # Top 10 keywords
    top_keywords = [vectorizer.get_feature_names_out()[i] for i in top_keywords_indices]
    
    # Store the results in the dictionary
    field_clustering_results[field] = {
        'top_keywords': top_keywords,
        'field_data': field_data,
        'largest_cluster': largest_cluster,
        'cluster_centroids': kmeans.cluster_centers_
    }

# Print the top keywords for each field of study
for field, results in field_clustering_results.items():
    print(f"Field of Study: {field}")
    print(f"Top Keywords: {results['top_keywords']}")
    print()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  field_data['cluster'] = kmeans.labels_
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  field_data['cluster'] = kmeans.labels_
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  field_data['cluster'] = kmeans.labels_
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

Field of Study: MATE
Top Keywords: ['ni', 'tio2', 'pla', 'graphene', 'cellulose', 'zno', 'films', 'geopolymer', 'nr', 'concrete']

Field of Study: BUSI
Top Keywords: ['innovation', 'board', 'takeover', 'brand', 'csr', 'corporate', 'smes', 'commerce', 'financial', 'earnings']

Field of Study: HEAL
Top Keywords: ['pharmacists', 'balance', 'training', 'foot', 'exercise', 'foot diabetic', 'diabetic', 'turnover', 'srp', 'arch']

Field of Study: CHEM
Top Keywords: ['nr', 'co2', 'films', 'cu2', 'pla', 'tio2', 'bc', 'cmc', 'ni', 'cd']

Field of Study: MEDI
Top Keywords: ['hiv', 'aki', 'liver', 'covid', 'pd', 'cancer', 'kidney', 'sleep', 'health', 'hpv']

Field of Study: MULT
Top Keywords: ['covid', 'ethanol', 'pdl', '19', 'covid 19', 'tlc', 'flour', 'slag', 'banana', 'geopolymer']

Field of Study: PHYS
Top Keywords: ['proton', 'galaxies', 'jet', 'tev', 'theories', 'jets', 'dark', 'alma', 'squark', 'neutrino']

Field of Study: NEUR
Top Keywords: ['schizophrenia', 'mdd', 'pd', 'stroke', 'igm', '

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  field_data['cluster'] = kmeans.labels_
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  field_data['cluster'] = kmeans.labels_
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  field_data['cluster'] = kmeans.labels_
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np

# Load data
input_file = "data_chula.csv"  # Replace with your CSV file path
data = pd.read_csv(input_file)

# Prepare the TfidfVectorizer with ngram_range to capture only unigrams (single words)
vectorizer = TfidfVectorizer(ngram_range=(1, 1))  # Ensure only single words are captured

# Get the list of unique fields of study
field_of_study_list = data["subjectArea"].unique()

# List to store the results
results = []

for field in field_of_study_list:
    # Filter data for the current field of study
    field_data = data[data["subjectArea"] == field]
    
    # Extract the text for TF-IDF
    sentences = field_data["extracted_keywords"]
    
    # Compute the TF-IDF matrix
    tfidf_matrix = vectorizer.fit_transform(sentences)
    
    # Apply KMeans clustering
    num_clusters = 5  # Define the number of clusters
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(tfidf_matrix)
    
    # Assign cluster labels to the DataFrame
    field_data['cluster'] = kmeans.labels_
    
    # Identify the largest cluster by size
    largest_cluster = np.argmax(np.bincount(kmeans.labels_))
    
    # Find the centroid of the largest cluster
    centroid = kmeans.cluster_centers_[largest_cluster]
    
    # Identify the single top keyword from the centroid
    top_keyword_index = centroid.argmax()  # Index of the highest value in the centroid
    top_keyword = vectorizer.get_feature_names_out()[top_keyword_index]  # Get the keyword
    
    # Append the field and the top keyword to the results
    results.append({"field_of_study": field, "top_keyword": top_keyword})

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Save the DataFrame to a CSV file
output_file = "top_keywords_by_field.csv"
results_df.to_csv(output_file, index=False)

print(f"Results saved to {output_file}")


Results saved to top_keywords_by_field.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  field_data['cluster'] = kmeans.labels_
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  field_data['cluster'] = kmeans.labels_
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  field_data['cluster'] = kmeans.labels_
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

In [8]:
results_df

Unnamed: 0,field_of_study,top_keyword
0,MATE,ni
1,BUSI,innovation
2,HEAL,pharmacists
3,CHEM,cu2
4,MEDI,hiv
5,MULT,covid
6,PHYS,proton
7,NEUR,mdd
8,CENG,pt
9,ENGI,steel


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
corpus = [
    'donation , organ',
]
X = vectorizer.fit_transform(corpus)
for i in X:
  print(i)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2 stored elements and shape (1, 2)>
  Coords	Values
  (0, 0)	0.7071067811865475
  (0, 1)	0.7071067811865475


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

input_file = "data_chula.csv"  # Replace with your CSV file path
data = pd.read_csv(input_file)

field_data = data[data["subjectArea"] == 'MEDI']
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    
# Prepare the text data for TF-IDF
sentences = field_data["extracted_keywords"]

# Compute the TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(sentences)

# Apply KMeans clustering (choose the number of clusters)
num_clusters = 5  # You can modify this depending on your data
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(tfidf_matrix)

x = tfidf_matrix.toarray()
for i in x[0]:
  print(i)





0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


In [None]:
data.loc[data['subjectArea'] == 'MEDI']

Unnamed: 0.1,Unnamed: 0,title,publicationName,abstract,keywords,subjectArea,publication_date,combined,extracted_keywords
4,5,The influence of neighbor effect and urbanizat...,Progress in Transplantation,natco rights reserved introduction population ...,Choice Consumer wellness Decision-making Neigh...,MEDI,01/03/2018,The influence of neighbor effect and urbanizat...,"donation , organ"
6,8,Lowered quality of life in mood disorders is a...,Journal of Evaluation in Clinical Practice,john wiley sons ltd. rationale aims major af...,bipolar disorder child abuse depressive disord...,MEDI,01/08/2018,Lowered quality of life in mood disorders is a...,"hrqol , tsh"
8,10,Current practice of diagnosis and management o...,Journal of Critical Care,elsevier inc. purpose resource limited setti...,Acute kidney injury ICU Practice Resource limi...,MEDI,01/08/2018,Current practice of diagnosis and management o...,"aki , settings"
11,15,Falls among physically active elderly in senio...,Clinical Interventions in Aging,maneeprom et al purpose mixed method study a...,Elderly Fall Fall prevention Perception Senior...,MEDI,01/01/2018,Falls among physically active elderly in senio...,"fall , falls"
12,16,Clif-sofa and urine neutrophil gelatinase-asso...,Journal of the Medical Association of Thailand,medical association thailand rights reserved o...,Acute-on-chronic liver failure Cirrhosis Morta...,MEDI,01/11/2018,Clif-sofa and urine neutrophil gelatinase-asso...,"clif , aclf"
...,...,...,...,...,...,...,...,...,...
16289,20185,Comparison of full-endoscopic and tubular-base...,European Spine Journal,author(s exclusive licence springer verlag gmb...,Decompression Full-endoscopic spine surgery Lu...,MEDI,01/08/2023,Comparison of full-endoscopic and tubular-base...,"decompression , surgery"
16291,20187,Effect of Smilax spp. and Phellinus linteus co...,BMC Complementary Medicine and Therapies,author(s).background prevalence breast cancer ...,Adjuvant drug Breast cancer Herbal medicine Ph...,MEDI,01/12/2023,Effect of Smilax spp. and Phellinus linteus co...,"pss , pl"
16299,20195,Clinical outcomes of low-dose pharmacokinetic-...,Haemophilia,john wiley sons ltd. introduction despite re...,coagulation factor VIII haemophilia A pharmaco...,MEDI,01/01/2023,Clinical outcomes of low-dose pharmacokinetic-...,"fviii , ehl"
16310,20206,Inhibition of histone deacetylase 6 destabiliz...,Journal of Biomedical Science,author(s).background leading cause cancer rela...,Extracellular signal-regulated kinase (ERK) Gl...,MEDI,01/12/2023,Inhibition of histone deacetylase 6 destabiliz...,"hdac6 , erk"


In [22]:
from gensim.models import Word2Vec
df = pd.read_csv('data_chula2.csv')
df.loc[df['subjectArea'] == 'MEDI']
sample = df.loc[df['subjectArea'] == 'MEDI'][:10]
sample

w2v_model = Word2Vec(sentences=df5['Tokenized'], vector_size=100, window=5, min_count=1, workers=4)

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject