### Config

In [1]:
import json
import pandas as pd
import numpy as np
import os

In [2]:
import spacy
# Small English spaCy pipeline
nlp_spacy = spacy.load("en_core_web_sm")

import nltk
# NLTK resources used by this notebook: tokenizer models, stop-word lists, and the
# WordNet corpus. WordNetLemmatizer needs 'wordnet' at lemmatization time; without
# this download a fresh environment fails with a LookupError in the pre-processing cell.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import string
import re

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gabrielvictorgomesferreira/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gabrielvictorgomesferreira/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import gensim
from gensim.models.phrases import Phraser, Phrases

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# Parse the JSON configuration file and keep the data directory it points to
with open('../config/config.json', 'r') as config_handle:
    config = json.loads(config_handle.read())

# Folder that holds every input file read further down
file_path = config["data_loc"]

### Datasets

In [6]:
# Locate the trait dictionary inside the configured data folder
file_name = "Trait dictionary.txt"
final_path = os.path.join(file_path, file_name)

# One phrase per line: lowercase it, trim surrounding whitespace, skip blank lines,
# and de-duplicate by collecting into a set
with open(final_path, "r", encoding="utf-8") as trait_file:
    trait_phrases = {line.lower().strip() for line in trait_file if line.strip()}

print(f"Number of good phrases in the trait dictionary: {len(trait_phrases)}")
print(trait_phrases)

Number of good phrases in the trait dictionary: 22719
{'orbit symmetry', 'calculated horizontal distance resulting from voluntary locomotion in an experimental apparatus', 'extraembryonic tissue physiology trait', 'tissue weight of the crop', 'aorta wall total intracellular protein', 'pancreas weight as a percentage of body weight', 'cerebral hemispheres morphology trait', 'kidney-specific lymphocyte tracer radioactivity measurement', 'benign hepatic tumor incidence', 'testis rna composition', 'kcl response/sensitivity', 'atrial auricle morphology trait', 'total litter size', 'dressed carcass muscle-to-bone ratio', 'meat fatty acid', 'liver mineral', 'left renal fat pad weight', 'yolk sac color', 'otolithic membrane', 'post-insult time to onset of hsv encephalitis', 'milk fatty acid trans-11,cis-15-c18', 'mast cell development trait', 'gaba neuron number', 'atrial septal morphology trait', 'percentage of study population developing kidney tubular degeneration during a period of time', 

### Pre-Processing

In [7]:
# Set stop words
stop_words = set(stopwords.words('english'))

# Set lemmatizer
lemmatizer = WordNetLemmatizer()

# Translation table that deletes every ASCII punctuation character.
# Built once here instead of once per abstract.
punctuation_table = str.maketrans("", "", string.punctuation)


def preprocess_abstract(text):
    """Turn one raw abstract into a list of cleaned, lemmatized tokens.

    Steps, in order: lowercase, delete ASCII punctuation, delete digit runs,
    tokenize with NLTK, drop English stop words, lemmatize every token as a noun,
    and discard empty or one-character tokens.

    Note: punctuation is deleted rather than replaced by a space, so hyphenated
    words are fused ("genome-wide" becomes "genomewide").
    """
    text = text.lower()
    text = text.translate(punctuation_table)
    text = re.sub(r"\d+", "", text)
    tokens = [token for token in word_tokenize(text) if token not in stop_words]
    tokens = [lemmatizer.lemmatize(token, pos=wordnet.NOUN) for token in tokens]
    return [token.strip() for token in tokens if token.strip() and len(token) > 1]


# Read json file
file_name = "QTL_text.json"
final_path = os.path.join(file_path, file_name)
df = pd.read_json(final_path)

# Keep only the rows with Category == 1. The explicit .copy() makes df_processed an
# independent frame, so adding a column below neither triggers SettingWithCopyWarning
# nor depends on whether pandas returned a view of df.
df_processed = df.loc[df['Category'] == 1, ['Abstract', 'Category']].copy()

# One token list per abstract (index and Series name are inherited from 'Abstract')
abstract_tokenized = df_processed['Abstract'].apply(preprocess_abstract)

df_processed['abstract_nltk'] = abstract_tokenized
abstract_tokenized

0        [previous, study, qtl, carcass, composition, m...
3        [wholegenome, quantitative, trait, locus, qtl,...
5        [partial, genome, scan, using, microsatellite,...
7        [background, rate, pubertal, development, wean...
10       [previously, quantitative, trait, locus, qtl, ...
                               ...                        
11203    [copy, number, variation, cnvs, major, source,...
11204    [body, size, important, indicator, growth, hea...
11207    [genomewide, association, study, gwas, perform...
11217    [gastrointestinal, nematode, gin, infection, n...
11220    [animal, temperament, defined, consistent, beh...
Name: Abstract, Length: 1007, dtype: object

### Phrase Extraction

#### gensim.models.phrases

In [8]:
## Bigram
# Learn which adjacent token pairs should be merged, then freeze the model for lookup
bigram_model = Phrases(abstract_tokenized, min_count=2, threshold=10)
bigram = Phraser(bigram_model)

# Apply the bigram model to each abstract, keeping both the token list and its joined string
bigram_token = []
bigram_text = []
for tokens in abstract_tokenized:
    merged = bigram[tokens]
    bigram_token.append(merged)
    bigram_text.append(" ".join(merged))

## Trigram
# A second phrase pass over the bigram-merged corpus lets merged pairs join a third token
trigram_model = Phrases(bigram[abstract_tokenized], min_count=2, threshold=10)
trigram = Phraser(trigram_model)
trigram_token = [trigram[bigram[tokens]] for tokens in abstract_tokenized]
trigram_text = [" ".join(merged) for merged in trigram_token]

print(bigram_token)
print(bigram_text)

[['previous_study', 'qtl', 'carcass_composition', 'meat_quality', 'identified', 'commercial', 'finisher', 'cross', 'main_objective', 'current_study', 'confirm', 'fine_map', 'qtl', 'ssc_ssc', 'genotyping', 'increased', 'number', 'individual', 'marker', 'analyze_data', 'using', 'combined_linkage', 'linkage_disequilibrium', 'analysis', 'method', 'modified', 'version', 'method', 'excludes', 'linkage_disequilibrium', 'information', 'analysis', 'enabling', 'comparison', 'result', 'based', 'linkage', 'information', 'result', 'based', 'combined_linkage', 'linkage_disequilibrium', 'information', 'nine', 'additional', 'paternal_halfsib', 'family_genotyped', 'marker', 'resulting', 'total', 'animal_genotyped', 'marker', 'ssc_ssc', 'respectively', 'qtl_affecting', 'meat_color', 'ssc', 'confirmed', 'whereas', 'qtl_affecting', 'lm', 'weight', 'could_confirmed', 'combined_linkage', 'linkage_disequilibrium', 'analysis', 'resulted', 'identification', 'new', 'significant', 'effect', 'trait', 'chromosome'

In [9]:
from keybert import KeyBERT

In [10]:
# Flatten the corpus into one string: every token of every abstract, space separated
abstract_strings = (" ".join(tokens) for tokens in abstract_tokenized)
doc = " ".join(abstract_strings)
doc

'previous study qtl carcass composition meat quality identified commercial finisher cross main objective current study confirm fine map qtl ssc ssc genotyping increased number individual marker analyze data using combined linkage linkage disequilibrium analysis method modified version method excludes linkage disequilibrium information analysis enabling comparison result based linkage information result based combined linkage linkage disequilibrium information nine additional paternal halfsib family genotyped marker resulting total animal genotyped marker ssc ssc respectively qtl affecting meat color ssc confirmed whereas qtl affecting lm weight could confirmed combined linkage linkage disequilibrium analysis resulted identification new significant effect trait chromosome heritabilities qtl effect ranged analysis contributed accurate positioning qtl characterized phenotypic effect however result showed even greater marker density required take full advantage linkage disequilibrium infor

In [11]:
# Ask KeyBERT for keyphrases of one or two words from the concatenated corpus string
keyphrase_lengths = (1, 2)
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(doc, keyphrase_ngram_range=keyphrase_lengths)
print(keywords)

[('qtl meattype', 0.6647), ('genotype meat', 0.6383), ('meat phenotypic', 0.6104), ('trait meattype', 0.6039), ('meattype breed', 0.5792)]


#### TF-IDF Analysis

In [67]:
# Fit TF-IDF on the bigram-merged abstracts (one space-joined string per abstract)
tfidf_vec = TfidfVectorizer(stop_words='english')
bigram_documents = [" ".join(tokens) for tokens in bigram_token]
tfidf_matrix = tfidf_vec.fit_transform(bigram_documents)

# Average every term's TF-IDF weight over all abstracts and index the result by term
tfidf_scores = np.asarray(tfidf_matrix.mean(axis=0)).flatten()
vocabulary = tfidf_vec.get_feature_names_out()
tfidf_word = {term: weight for term, weight in zip(vocabulary, tfidf_scores)}

# Ten terms with the largest mean weight, split into parallel tuples of terms and scores
ranked_terms = sorted(tfidf_word.items(), key=lambda pair: pair[1], reverse=True)
top_words = ranked_terms[:10]
top_words_list, top_scores_list = zip(*top_words)

for term, weight in zip(top_words_list, top_scores_list):
    print(f"{term}: {round(weight, 4)}")

qtl: 0.0634
snp: 0.0497
trait: 0.0476
gene: 0.041
region: 0.0292
association: 0.027
ssc: 0.0269
associated: 0.0263
study: 0.0261
analysis: 0.0259


### Word2Vec Analysis

In [68]:
# Define the Word2Vec model.
# seed fixes the random initialisation and sampling; workers=1 removes the
# thread-scheduling nondeterminism of multi-threaded training. Together they make
# the similarity tables below repeatable from run to run.
# NOTE(review): per the gensim docs, reproducibility across separate interpreter
# launches may also require a fixed PYTHONHASHSEED - confirm for this environment.
word2vec_model = gensim.models.Word2Vec(
    vector_size=100,
    window=5,
    min_count=10,
    seed=42,
    workers=1)

# Build the vocabulary from the bigram-merged abstracts
word2vec_model.build_vocab(bigram_token)

# Train on the same corpus; the returned tuple is displayed as the cell output
word2vec_model.train(bigram_token, total_examples=word2vec_model.corpus_count, epochs=word2vec_model.epochs)


(529133, 726605)

In [69]:
# Maps each top TF-IDF word to its 20 nearest Word2Vec neighbours, stored as "neighbour:similarity"
most_similar_dict = {}

# Print and collect values
for top_word in top_words_list:
    neighbours = word2vec_model.wv.most_similar(top_word, topn=20)
    neighbour_names = [name for name, similarity in neighbours]
    print(f"{top_word} → {', '.join(neighbour_names)}")

    most_similar_dict[top_word] = [
        ":".join((str(name), str(round(similarity, 4)))) for name, similarity in neighbours
    ]

# One row per top word, one column per neighbour rank
df_test = pd.DataFrame.from_dict(most_similar_dict, orient='index')
row_count, neighbour_count = df_test.shape
df_test.index = range(1, row_count + 1)
df_test.columns = [f"Similar {rank + 1}" for rank in range(neighbour_count)]

# Prepend the word itself and its rounded mean TF-IDF score
rounded_scores = [round(score, 4) for score in top_scores_list]
df_test.insert(0, "Top Word", top_words_list)
df_test.insert(1, "TF-IDF Score", rounded_scores)

df_test

qtl → qtls, mapped, detected, chromosome, suggestive, previously, reported, chromosomewide, scrofa, sus, oar, coincided, mb, chromosomal, confirmed, near, significant, identified, position, genomewise
snp → intron, exon, three, haplotype, singlenucleotide, two, revealed, single, nucleotide, five, flanking, showed, polymorphism, six, seven, bp, within, coding, sequence, untranslated
trait → affecting, quantitative, locus, carcass, fatness, growth, fertility, endocrine, quality, meat, related, bone, eggshell, influencing, udder, performance, affect, underlying, production, qtls
gene → candidate, positional, mutation, functional, involved, potential, bovine, pathway, plausible, several, region, coding, variant, associated, causal, porcine, promoter, function, novel, promising
region → identified, located, previously, reported, within, novel, mb, several, annotated, chromosome, vicinity, one, mapped, near, chromosomal, close, detected, harboring, confirmed, oar
association → genomewide, gw

Unnamed: 0,Top Word,TF-IDF Score,Similar 1,Similar 2,Similar 3,Similar 4,Similar 5,Similar 6,Similar 7,Similar 8,...,Similar 11,Similar 12,Similar 13,Similar 14,Similar 15,Similar 16,Similar 17,Similar 18,Similar 19,Similar 20
1,qtl,0.0634,qtls:0.9782,mapped:0.9724,detected:0.9317,chromosome:0.9312,suggestive:0.9239,previously:0.9229,reported:0.8938,chromosomewide:0.8863,...,oar:0.8797,coincided:0.8756,mb:0.8755,chromosomal:0.8727,confirmed:0.87,near:0.8668,significant:0.864,identified:0.8637,position:0.861,genomewise:0.8578
2,snp,0.0497,intron:0.9169,exon:0.9058,three:0.9053,haplotype:0.8999,singlenucleotide:0.8855,two:0.8835,revealed:0.8746,single:0.8709,...,flanking:0.8633,showed:0.8433,polymorphism:0.8403,six:0.8385,seven:0.8369,bp:0.8328,within:0.8298,coding:0.816,sequence:0.8066,untranslated:0.8009
3,trait,0.0476,affecting:0.9285,quantitative:0.9024,locus:0.8686,carcass:0.8546,fatness:0.8384,growth:0.8341,fertility:0.8252,endocrine:0.8002,...,related:0.7882,bone:0.7873,eggshell:0.7775,influencing:0.777,udder:0.7655,performance:0.7637,affect:0.7561,underlying:0.75,production:0.7474,qtls:0.7433
4,gene,0.041,candidate:0.9747,positional:0.9279,mutation:0.8651,functional:0.8483,involved:0.8476,potential:0.8346,bovine:0.8147,pathway:0.8001,...,region:0.7841,coding:0.7736,variant:0.7731,associated:0.7713,causal:0.7629,porcine:0.7618,promoter:0.7592,function:0.7555,novel:0.7523,promising:0.7452
5,region,0.0292,identified:0.965,located:0.9543,previously:0.9487,reported:0.9185,within:0.9146,novel:0.9121,mb:0.9089,several:0.9049,...,vicinity:0.8873,one:0.8861,mapped:0.8855,near:0.8813,chromosomal:0.8797,close:0.8788,detected:0.877,harboring:0.8651,confirmed:0.8645,oar:0.8618
6,association,0.027,genomewide:0.9416,gwas:0.9278,analysis:0.8952,conducted:0.8713,approach:0.8664,performed:0.8657,gwa:0.8579,perform:0.8557,...,revealed:0.8367,mapping:0.836,singlenucleotide:0.8358,genome:0.8188,method:0.8147,test:0.8114,carried:0.811,wide:0.8106,previous:0.8081,metaanalysis:0.8031
7,ssc,0.0269,scrofa:0.9156,sus:0.9087,mb:0.8723,chromosome:0.8643,cm:0.8572,position:0.8527,mapped:0.8521,sw:0.8498,...,detected:0.846,one:0.844,sscx:0.8438,qtls:0.8407,respectively:0.8391,significant:0.839,qtl:0.8363,found:0.8328,near:0.8285,confirmed:0.8284
8,associated,0.0263,significantly:0.891,also:0.8579,close:0.8355,harbouring:0.827,found:0.8256,several:0.82,positional:0.8143,bovine:0.8097,...,reported:0.7948,mutation:0.7946,promising:0.7924,promoter:0.7882,encoding:0.7877,dgat:0.7874,linked:0.7874,gamma:0.7858,receptor:0.7857,contains:0.7854
9,study,0.0261,objective:0.9257,identify:0.9189,detect:0.916,aim:0.912,gwas:0.8907,present:0.8837,conducted:0.8774,genomic:0.862,...,association:0.8514,genome:0.8487,performed:0.8402,approach:0.8382,map:0.8379,carried:0.8211,gwa:0.8187,previous:0.8179,fine:0.8176,genomewide:0.8131
10,analysis,0.0259,regression:0.9606,mapping:0.9538,linkage:0.9523,disequilibrium:0.9467,approach:0.9289,genomewide:0.9266,microsatellite:0.9261,marker:0.9179,...,nonparametric:0.9038,based:0.9035,test:0.9012,method:0.9003,covering:0.8962,association:0.8952,microsatellites:0.8941,regional:0.8893,combined:0.889,ldla:0.8887


### Phrase Matching Analysis

In [29]:
# Per abstract, keep only the tokens that the phrase model merged (they contain "_")
# and turn the "_" joiner back into a space so they can be compared with the
# space-separated phrases of the trait dictionary.
bi_good_phrases = [
    [token.replace("_", " ") for token in tokens if "_" in token]
    for tokens in bigram_token
]

# Unique phrases over the whole corpus. A comprehension is used instead of a
# top-level `for doc in ...` loop so the global `doc` string consumed by the
# KeyBERT cell is not overwritten with a list.
bi_unique = {phrase for phrases in bi_good_phrases for phrase in phrases}

# The label previously said "in the trait dictionary", but this count is of phrases
# extracted from the abstracts.
print(f"Number of unique phrases extracted from the abstracts: {len(bi_unique)}")
print(bi_unique)

Number of good phrases in the trait dictionary: 3144
{'connective tissue', 'investigated whether', 'would useful', 'mean corpuscular', 'feed efficient', 'linecross halfsib', 'microsatellites covering', 'hock joint', 'different parity', 'broilerleghorn cross', 'holstein bull', 'contribute understanding', 'however little', 'cast capn', 'line divergently', 'large white', 'insight genetic', 'genetic variance', 'worm egg', 'limited number', 'day open', 'fi rfi', 'due low', 'buffalo cattle', 'little known', 'growth rate', 'mapping rhm', 'strongest association', 'economic importance', 'montbéliarde cow', 'white blood', 'body dimension', 'baseline erythroid', 'mutation underlying', 'japanese black', 'racing performance', 'ratio fcr', 'allelic effect', 'maternal stillbirth', 'affymetrix axiom', 'causative mutation', 'similar position', 'triglyceride level', 'multimarker regression', 'nguni cattle', 'classical linkage', 'may regulate', 'aimed investigate', 'interval ci', 'milk bhb', 'onset puber

In [30]:

# Phrases found both among the extracted bigrams and in the trait dictionary (exact string match)
matching_phrases = bi_unique & trait_phrases

# Count total matches
match_count = len(matching_phrases)

# Share of the dictionary that was matched; an empty dictionary yields 0 instead of ZeroDivisionError
match_share = match_count / len(trait_phrases) if trait_phrases else 0.0

print(f"Total Exact Matches: {match_count}")
# The ':.2%' format multiplies by 100 and rounds in one step, which avoids float
# artifacts such as round(0.0007, 4) * 100 -> 0.06999999999999999
print(f"This represents {match_share:.2%} out of all phrases in the dictionary.")
print("Matching Phrases:", matching_phrases)

Total Exact Matches: 116
This represents 0.51% out of all phrases in the dictionary.
Matching Phrases: {'connective tissue', 'serum lipid', 'bone mineral', 'body height', 'serum ige', 'hip width', 'carcass length', 'eye muscle', 'semimembranosus muscle', 'gestation length', 'mammary gland', 'ige level', 'body length', 'heart weight', 'litter size', 'ovarian follicle', 'semen volume', 'milk urea', 'oleic acid', 'chest width', 'chest circumference', 'ear size', 'triglyceride level', 'shank length', 'feed conversion', 'leaf fat', 'udder morphology', 'gizzard weight', 'nipple number', 'fat depth', 'bone density', 'milk protein', 'body size', 'comb mass', 'heart girth', 'ovulation rate', 'lactose yield', 'serum leptin', 'udder depth', 'skeletal muscle', 'adrenal gland', 'tibia length', 'loin eye', 'adipose tissue', 'body conformation', 'fat thickness', 'hemoglobin concentration', 'antibody response', 'scrotal circumference', 'backfat thickness', 'abdominal fat', 'milk fat', 'horn length', '