### Config

In [1]:
import json
import pandas as pd
import numpy as np
import os

In [2]:
import spacy
nlp_spacy = spacy.load("en_core_web_sm")

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import string
import re

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gabrielvictorgomesferreira/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gabrielvictorgomesferreira/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import gensim
from gensim.models.phrases import Phraser, Phrases

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [5]:
# Load the config file
with open('../config/config.json', 'r') as f:
    config = json.load(f)

file_path = config["data_loc"]

### Datasets

In [6]:
# Read trait data
file_name = "Trait dictionary.txt"
final_path = os.path.join(file_path, file_name) 

# Read trait dictionary file
with open(final_path, "r", encoding="utf-8") as file:
    trait_phrases = set([phrase.lower().strip() for phrase in file if phrase.strip()])

print(f"Number of good phrases in the trait dictionary: {len(trait_phrases)}")
print(trait_phrases)

Number of good phrases in the trait dictionary: 22719
{'elastic laminae morphology trait', 'intramuscular fat weight as percent of body', 'meat glucose-6-phosphate content', 'seminiferous tubule morphology', 'total milk mineral', 'ureter physiology', 'length of intestine affected by hirschsprung disease to total length of colon ratio', 'plasma ast activity level', 'number of capillaries per skeletal muscle cell', 'ovary diameter', 'chemokine physiology', 'sagittal suture morphology trait', 'abomasum size trait', 'longissimus muscle weight', 'quinine index', 'blood non-specified leukocyte count', 'tibia toughness', 'aortic wall morphological measurement', 'neuron mitochondrion area', 'thyroid gland wet', 'colostrum growth factor content', 'milk fatty acid c18:3(n-3) percentage', 'hair cell mechanoelectric transduction trait', 'mammillary body morphology trait', 'intestinal cell morphology trait', 'plasma antibody', 'calculated heart left ventricle deoxyribonucleic acid content', 'urine 

### Pre-Processing

In [29]:
# Set stop words
stop_words = set(stopwords.words('english'))

# Set lemmatizer
lemmatizer = WordNetLemmatizer()

# Read json file
file_name = "QTL_text.json"
final_path = os.path.join(file_path, file_name) 
df = pd.read_json(final_path)

# Pre Process
df_processed = df[['Abstract', 'Category']]
df_processed = df_processed[df_processed['Category'] == 1].reset_index(drop=True)

def nlp_preprocessing(abstract_tokenized):
    abstract_tokenized = abstract_tokenized.apply(lambda doc: doc.lower()) # Lowercasing txt
    abstract_tokenized = abstract_tokenized.apply(lambda doc: doc.translate(str.maketrans("", "", string.punctuation))) # Removing punctuations from the text
    # abstract_tokenized = abstract_tokenized.apply(lambda doc: re.sub(r"\d+", "", doc)) # Removing numbers from the text
    abstract_tokenized = abstract_tokenized.apply(lambda doc: [token for token in word_tokenize(doc) if token not in stop_words]) # Tokenizing and removing stop words from the text
    abstract_tokenized = abstract_tokenized.apply(lambda doc: [lemmatizer.lemmatize(token, pos=wordnet.NOUN) for token in doc]) # Convert words to their base
    abstract_tokenized = abstract_tokenized.apply(lambda doc: [token.strip() for token in doc if token.strip() and len(token)>1]) # Removing extra space
    return abstract_tokenized



df_processed['abstract_nltk'] = nlp_preprocessing(df_processed['Abstract'])
abstract_tokenized = df_processed['abstract_nltk']
abstract_tokenized

0       [previous, study, qtl, carcass, composition, m...
1       [wholegenome, quantitative, trait, locus, qtl,...
2       [partial, genome, scan, using, microsatellite,...
3       [background, rate, pubertal, development, wean...
4       [previously, quantitative, trait, locus, qtl, ...
                              ...                        
1002    [copy, number, variation, cnvs, major, source,...
1003    [body, size, important, indicator, growth, hea...
1004    [genomewide, association, study, gwas, perform...
1005    [gastrointestinal, nematode, gin, infection, n...
1006    [animal, temperament, defined, consistent, beh...
Name: abstract_nltk, Length: 1007, dtype: object

In [30]:
abstract_clean = [" ".join(token) for token in abstract_tokenized]
abstract_clean[0]

'previous study qtl carcass composition meat quality identified commercial finisher cross main objective current study confirm fine map qtl ssc4 ssc11 genotyping increased number individual marker analyze data using combined linkage linkage disequilibrium analysis method modified version method excludes linkage disequilibrium information analysis enabling comparison result based linkage information result based combined linkage linkage disequilibrium information nine additional paternal halfsib family genotyped 18 marker resulting total 1855 animal genotyped 15 13 marker ssc4 ssc11 respectively qtl affecting meat color ssc4 confirmed whereas qtl affecting lm weight could confirmed combined linkage linkage disequilibrium analysis resulted identification new significant effect 14 trait chromosome heritabilities qtl effect ranged 18 132 analysis contributed accurate positioning qtl characterized phenotypic effect however result showed even greater marker density required take full advanta

### Phrase Extraction

#### gensim.models.phrases

In [31]:
## Bigram
bigram = Phraser(Phrases(abstract_tokenized, min_count=2, threshold=10))
bigram_token = [bigram[doc] for doc in abstract_tokenized]
bigram_text = [" ".join(token) for token in bigram_token]

## Trigram
trigram = Phraser(Phrases(bigram[abstract_tokenized], min_count=2, threshold=10))
trigram_token = [trigram[bigram[doc]] for doc in abstract_tokenized]
trigram_text = [" ".join(token) for token in trigram_token]

print(bigram_token)
print(bigram_text)

[['previous_study', 'qtl', 'carcass_composition', 'meat_quality', 'identified', 'commercial', 'finisher', 'cross', 'main_objective', 'current_study', 'confirm', 'fine_map', 'qtl', 'ssc4', 'ssc11', 'genotyping', 'increased', 'number', 'individual', 'marker', 'analyze_data', 'using', 'combined_linkage', 'linkage_disequilibrium', 'analysis', 'method', 'modified', 'version', 'method', 'excludes', 'linkage_disequilibrium', 'information', 'analysis', 'enabling', 'comparison', 'result', 'based', 'linkage', 'information', 'result', 'based', 'combined_linkage', 'linkage_disequilibrium', 'information', 'nine', 'additional', 'paternal_halfsib', 'family_genotyped', '18', 'marker', 'resulting', 'total', '1855', 'animal_genotyped', '15', '13', 'marker', 'ssc4', 'ssc11', 'respectively', 'qtl_affecting', 'meat_color', 'ssc4', 'confirmed', 'whereas', 'qtl_affecting', 'lm', 'weight', 'could_confirmed', 'combined_linkage', 'linkage_disequilibrium', 'analysis', 'resulted', 'identification', 'new', 'signif

#### Next method

In [32]:
from rake_nltk import Rake

In [33]:
r = Rake(stopwords=stop_words, punctuations=string.punctuation, min_length=1, max_length=3)
r.extract_keywords_from_sentences(df_processed['Abstract'])
print(r.get_ranked_phrases_with_scores())

[(9.0, 'transmissible spongiform encephalopathy'), (9.0, 'spp1c .- 430g'), (9.0, 'spp1c .- 1301g'), (9.0, 'spp1c .- 1251c'), (9.0, 'spp1c .* 40a'), (9.0, 'sexually dimorphic ungulates'), (9.0, 'noire du velay'), (9.0, 'medial suspensory ligament'), (9.0, 'lolium arundinaceum schreb'), (9.0, 'keyhole lymphet haemocyanin'), (9.0, 'intensively managed dairies'), (9.0, 'http :// www'), (9.0, 'http :// www'), (9.0, 'http :// bovineqtl'), (9.0, 'expressing f4 fimbria'), (9.0, 'cytochrome p450 a19'), (8.8, 'north american jurisdictions'), (8.8, 'mycobacterium avium subsp'), (8.8, 'mycobacterium avium ssp'), (8.75, 'depigmented iris sectors'), (8.666666666666666, 'maximization lasso algorithm'), (8.666666666666666, 'gastric inhibitory polypeptide'), (8.666666666666666, 'abomasal lymph node'), (8.666666666666666, 'abomasal lymph node'), (8.659574468085106, 'bovine spongiform encephalopathy'), (8.607476635514018, 'commercial packing plant'), (8.586538461538462, 'packed cell volumes'), (8.5833333

In [34]:
def nlp_preprocessing(abstract_tokenized):
    abstract_tokenized = abstract_tokenized.apply(lambda doc: doc.lower()) # Lowercasing txt
    abstract_tokenized = abstract_tokenized.apply(lambda doc: doc.translate(str.maketrans("", "", string.punctuation))) # Removing punctuations from the text
    # abstract_tokenized = abstract_tokenized.apply(lambda doc: re.sub(r"\d+", "", doc)) # Removing numbers from the text
    abstract_tokenized = abstract_tokenized.apply(lambda doc: [token for token in word_tokenize(doc) if token not in stop_words]) # Tokenizing and removing stop words from the text
    abstract_tokenized = abstract_tokenized.apply(lambda doc: [lemmatizer.lemmatize(token, pos=wordnet.NOUN) for token in doc]) # Convert words to their base
    abstract_tokenized = abstract_tokenized.apply(lambda doc: [token.strip() for token in doc if token.strip() and len(token)>1]) # Removing extra space
    return abstract_tokenized

In [35]:
key_phrases_rake = []
for score, key_phrase in r.get_ranked_phrases_with_scores():
    if score > 5:
        key_phrases_rake.append(key_phrase)

key_phrases_rake = nlp_preprocessing(pd.Series(key_phrases_rake))
key_phrases_rake = key_phrases_rake.apply(lambda phrase: "_".join(phrase))
key_phrases_rake = pd.Series(key_phrases_rake.tolist())
key_phrases_rake

0        transmissible_spongiform_encephalopathy
1                                     spp1c_430g
2                                    spp1c_1301g
3                                    spp1c_1251c
4                                      spp1c_40a
                          ...                   
12352                                 tick_count
12353                            cm_respectively
12354                       detected_marker_lead
12355                            measurement_day
12356                                     fdr_10
Length: 12357, dtype: object

In [36]:
pd.concat([abstract_tokenized, key_phrases_rake], ignore_index=True)

0        [previous, study, qtl, carcass, composition, m...
1        [wholegenome, quantitative, trait, locus, qtl,...
2        [partial, genome, scan, using, microsatellite,...
3        [background, rate, pubertal, development, wean...
4        [previously, quantitative, trait, locus, qtl, ...
                               ...                        
13359                                           tick_count
13360                                      cm_respectively
13361                                 detected_marker_lead
13362                                      measurement_day
13363                                               fdr_10
Length: 13364, dtype: object

In [37]:
abstract_tokenized

0       [previous, study, qtl, carcass, composition, m...
1       [wholegenome, quantitative, trait, locus, qtl,...
2       [partial, genome, scan, using, microsatellite,...
3       [background, rate, pubertal, development, wean...
4       [previously, quantitative, trait, locus, qtl, ...
                              ...                        
1002    [copy, number, variation, cnvs, major, source,...
1003    [body, size, important, indicator, growth, hea...
1004    [genomewide, association, study, gwas, perform...
1005    [gastrointestinal, nematode, gin, infection, n...
1006    [animal, temperament, defined, consistent, beh...
Name: abstract_nltk, Length: 1007, dtype: object

In [38]:
# Initialize RAKE with stopwords and punctuation
r = Rake(stopwords=stop_words, punctuations=string.punctuation, min_length=1, max_length=3)

# Extract RAKE key phrases per document
df_processed["key_phrases_rake"] = df_processed["Abstract"].apply(
    lambda text: r.extract_keywords_from_text(text) or [phrase for score, phrase in r.get_ranked_phrases_with_scores() if score > 5]
)

# Preprocess key phrases and merge with documents
df_processed["processed_key_phrases"] = df_processed["key_phrases_rake"].apply(lambda phrases: ["_".join(nlp_preprocessing(pd.Series([phrase]))[0]) for phrase in phrases])

# Merge key phrases back into original documents
df_processed["abstract_nltk_plus_keyphrases_token"] = df_processed.apply(
    lambda row: row["abstract_nltk"] + row["processed_key_phrases"], axis=1
)

df_processed['abstract_nltk_plus_keyphrases_text'] = df_processed['abstract_nltk_plus_keyphrases_token'].apply(lambda token: " ".join(token))
df_processed

Unnamed: 0,Abstract,Category,abstract_nltk,key_phrases_rake,processed_key_phrases,abstract_nltk_plus_keyphrases_token,abstract_nltk_plus_keyphrases_text
0,"In a previous study, QTL for carcass compositi...",1,"[previous, study, qtl, carcass, composition, m...","[take full advantage, new significant effects,...","[take_full_advantage, new_significant_effect, ...","[previous, study, qtl, carcass, composition, m...",previous study qtl carcass composition meat qu...
1,A whole-genome quantitative trait locus (QTL) ...,1,"[wholegenome, quantitative, trait, locus, qtl,...","[entire porcine genome, based nonparametric ap...","[entire_porcine_genome, based_nonparametric_ap...","[wholegenome, quantitative, trait, locus, qtl,...",wholegenome quantitative trait locus qtl scan ...
2,A partial genome scan using microsatellite mar...,1,"[partial, genome, scan, using, microsatellite,...","[porcine resource population, marker loci sw13...","[porcine_resource_population, marker_locus_sw1...","[partial, genome, scan, using, microsatellite,...",partial genome scan using microsatellite marke...
3,BACKGROUND: The rate of pubertal development a...,1,"[background, rate, pubertal, development, wean...","[usmarc resource population, specific hydroxys...","[usmarc_resource_population, specific_hydroxys...","[background, rate, pubertal, development, wean...",background rate pubertal development weaning e...
4,"Previously, quantitative trait loci (QTL) for ...",1,"[previously, quantitative, trait, locus, qtl, ...","[quantitative trait loci, loin eye area, janus...","[quantitative_trait_locus, loin_eye_area, janu...","[previously, quantitative, trait, locus, qtl, ...",previously quantitative trait locus qtl tenthr...
...,...,...,...,...,...,...,...
1002,Copy number variations (CNVs) are a major sour...,1,"[copy, number, variation, cnvs, major, source,...","[pathway enrichment analyses, ovine infinium h...","[pathway_enrichment_analysis, ovine_infinium_h...","[copy, number, variation, cnvs, major, source,...",copy number variation cnvs major source struct...
1003,Body size is an important indicator of growth ...,1,"[body, size, important, indicator, growth, hea...","[wide association studies, gene transcription ...","[wide_association_study, gene_transcription_ac...","[body, size, important, indicator, growth, hea...",body size important indicator growth health sh...
1004,A genome-wide association study (GWAS) was per...,1,"[genomewide, association, study, gwas, perform...","[wide association study, somatic cell score, i...","[wide_association_study, somatic_cell_score, i...","[genomewide, association, study, gwas, perform...",genomewide association study gwas performed id...
1005,Gastrointestinal nematode (GIN) infections hav...,1,"[gastrointestinal, nematode, gin, infection, n...","[underlying genetic mechanisms, population str...","[underlying_genetic_mechanism, population_stru...","[gastrointestinal, nematode, gin, infection, n...",gastrointestinal nematode gin infection negati...


In [40]:
"entire_porcine_genome" in (df_processed["abstract_nltk_plus_keyphrases_text"][1])

True

In [41]:
df_processed["abstract_nltk_plus_keyphrases_token"]

0       [previous, study, qtl, carcass, composition, m...
1       [wholegenome, quantitative, trait, locus, qtl,...
2       [partial, genome, scan, using, microsatellite,...
3       [background, rate, pubertal, development, wean...
4       [previously, quantitative, trait, locus, qtl, ...
                              ...                        
1002    [copy, number, variation, cnvs, major, source,...
1003    [body, size, important, indicator, growth, hea...
1004    [genomewide, association, study, gwas, perform...
1005    [gastrointestinal, nematode, gin, infection, n...
1006    [animal, temperament, defined, consistent, beh...
Name: abstract_nltk_plus_keyphrases_token, Length: 1007, dtype: object

#### TF-IDF Analysis

In [42]:
# Compute TF-IDF
tfidf_vec = TfidfVectorizer()
tfidf_matrix = tfidf_vec.fit_transform(df_processed["abstract_nltk_plus_keyphrases_text"])

tfidf_scores = np.asarray(tfidf_matrix.mean(axis=0)).flatten()
tfidf_word = dict(zip(tfidf_vec.get_feature_names_out(), tfidf_scores))

# Sort highest ranked words
top_words = sorted(tfidf_word.items(), key=lambda x: x[1], reverse=True)[:10]
top_words_list, top_scores_list = zip(*top_words)

for word, score in top_words:
    print(f"{word}: {round(score, 4)}")

qtl: 0.0567
trait: 0.0427
snp: 0.0426
gene: 0.037
region: 0.0262
association: 0.0242
associated: 0.0237
study: 0.0235
analysis: 0.0232
effect: 0.0231


### Word2Vec Anlysis

In [45]:
# Define model Word2Vec model
word2vec_model = gensim.models.Word2Vec(
    vector_size=100,
    window=5,
    min_count=10)

# Build Vocabulary
word2vec_model.build_vocab(df_processed["abstract_nltk_plus_keyphrases_token"])

# Train model
word2vec_model.train(df_processed["abstract_nltk_plus_keyphrases_token"], total_examples=word2vec_model.corpus_count, epochs=word2vec_model.epochs)


(555191, 829860)

In [46]:
# Dictionary to save values
most_similar_dict = {}

# Print and collect vales
for word in top_words:
    similar_words = word2vec_model.wv.most_similar(word[0], topn=20)
    print(f"{word[0]} → {', '.join([w[0] for w in similar_words])}")

    most_similar_dict[word[0]] = [(str(w[0]) + ":" + str(round(w[1], 4))) for w in similar_words]

# Create data-frame
df_test = pd.DataFrame.from_dict(most_similar_dict, orient='index')
df_test.index = range(1, len(df_test) + 1)
df_test.columns = [f"Similar {i+1}" for i in range(df_test.shape[1])]
df_test.insert(0, "Top Word", top_words_list)
df_test.insert(1, "TF-IDF Score", [round(score, 4) for score in top_scores_list])

df_test

qtl → qtls, mapped, previously, detected, reported, suggestive, locus, identified, chromosome, one, chromosomewide, 13, region, chromosomal, 14, several, putative, four, coincided, ssc
trait → affecting, quantitative, locus, growth, fatness, fertility, economically, influencing, underlying, meat, carcass, affect, quality, eggshell, production, qtls, related, important, bone, endocrine
snp → intron, five, singlenucleotide, exon, single, nucleotide, three, showed, haplotype, polymorphism, flanking, two, revealed, six, seven, within, array, ga, bovine, coding
gene → candidate, positional, mutation, functional, involved, bovine, several, variant, potential, region, plausible, coding, pathway, porcine, novel, promoter, promising, causative, known, reported
region → identified, previously, located, novel, reported, mb, one, within, detected, several, mapped, bovine, positional, close, chromosome, found, candidate, mutation, putative, chromosomal
association → gwas, genomewide, analysis, gwa,

Unnamed: 0,Top Word,TF-IDF Score,Similar 1,Similar 2,Similar 3,Similar 4,Similar 5,Similar 6,Similar 7,Similar 8,...,Similar 11,Similar 12,Similar 13,Similar 14,Similar 15,Similar 16,Similar 17,Similar 18,Similar 19,Similar 20
1,qtl,0.0567,qtls:0.9567,mapped:0.937,previously:0.9293,detected:0.9203,reported:0.8984,suggestive:0.8893,locus:0.8824,identified:0.8763,...,chromosomewide:0.8507,13:0.8425,region:0.8412,chromosomal:0.8379,14:0.8371,several:0.835,putative:0.8288,four:0.8262,coincided:0.825,ssc:0.8159
2,trait,0.0427,affecting:0.9374,quantitative:0.9067,locus:0.877,growth:0.869,fatness:0.8465,fertility:0.8464,economically:0.8411,influencing:0.814,...,carcass:0.7991,affect:0.7949,quality:0.7881,eggshell:0.7827,production:0.7808,qtls:0.7781,related:0.7751,important:0.7728,bone:0.772,endocrine:0.7681
3,snp,0.0426,intron:0.9281,five:0.9278,singlenucleotide:0.9232,exon:0.9175,single:0.9038,nucleotide:0.9013,three:0.8991,showed:0.8962,...,flanking:0.8851,two:0.8812,revealed:0.8738,six:0.8722,seven:0.8625,within:0.8602,array:0.8411,ga:0.8371,bovine:0.8348,coding:0.8319
4,gene,0.037,candidate:0.975,positional:0.9419,mutation:0.8856,functional:0.8466,involved:0.8374,bovine:0.8347,several:0.831,variant:0.8241,...,plausible:0.8078,coding:0.8007,pathway:0.7995,porcine:0.7984,novel:0.7967,promoter:0.7887,promising:0.7865,causative:0.7814,known:0.7803,reported:0.7766
5,region,0.0262,identified:0.9681,previously:0.9477,located:0.935,novel:0.9195,reported:0.9143,mb:0.912,one:0.9089,within:0.9052,...,mapped:0.878,bovine:0.8776,positional:0.8763,close:0.8728,chromosome:0.8699,found:0.8685,candidate:0.8673,mutation:0.8657,putative:0.8647,chromosomal:0.8628
6,association,0.0242,gwas:0.9388,genomewide:0.9362,analysis:0.9053,gwa:0.8908,conducted:0.8811,singlenucleotide:0.8716,perform:0.867,study:0.8669,...,result:0.8561,approach:0.8532,regression:0.8406,imputed:0.8354,wide:0.8276,metaanalysis:0.8271,previous:0.8266,mapping:0.8242,array:0.8239,highdensity:0.8215
7,associated,0.0237,also:0.911,significantly:0.8823,found:0.8652,several:0.8396,one:0.8261,qtls:0.824,fabp4:0.8238,near:0.8223,...,located:0.8123,coincided:0.8068,close:0.8029,receptor:0.8,bovine:0.7971,promoter:0.7935,promising:0.7927,identified:0.7866,strongly:0.786,plausible:0.7843
8,study,0.0235,objective:0.9406,identify:0.9335,present:0.9223,detect:0.9218,aim:0.9181,gwas:0.8873,conducted:0.887,previous:0.8847,...,perform:0.8494,genome:0.8453,approach:0.8398,map:0.8384,scan:0.8365,gwa:0.8343,performed:0.8319,genomic:0.8275,genomewide:0.8262,investigate:0.8118
9,analysis,0.0232,regression:0.9568,genomewide:0.9473,disequilibrium:0.943,mapping:0.9401,linkage:0.9354,approach:0.9232,gwa:0.918,gwas:0.9113,...,test:0.9104,marker:0.9059,association:0.9053,microsatellite:0.9049,software:0.9047,performed:0.8949,combined:0.8933,conducted:0.8887,perform:0.8873,revealed:0.8848
10,effect,0.0231,additive:0.9485,genomewise:0.9455,chromosomewise:0.9413,nominal:0.938,respectively:0.9373,24:0.9274,ssc2:0.9271,window:0.9254,...,15:0.9231,21:0.9219,25:0.9215,010:0.9208,reached:0.9206,dominance:0.9206,11:0.9194,chromosomewide:0.9193,block:0.9178,fdr:0.9175


### Phrase Matching Analysis

In [50]:
ngram_unique = set()
ngram_good_phrases = [[token for token in doc if "_" in token] for doc in df_processed["abstract_nltk_plus_keyphrases_token"]]
ngram_good_phrases = [[token.replace("_", " ") for token in doc] for doc in ngram_good_phrases]

for doc in ngram_good_phrases:
    ngram_unique.update(doc) 

print(f"Number of good phrases in the trait dictionary: {len(ngram_unique)}")
print(ngram_unique)

Number of good phrases in the trait dictionary: 8859
{'utilize natural variation', 'candidate gene research', 'respiratory disease trait', 'haemonchus contortus infestation', 'porcine myog gene', 'whereas logistic regression', 'pig affect growth', 'like meat quality', 'identify novel variant', 'layer chicken line', '76 microsatellite marker', 'cause serious problem', 'trait data set', 'adipose tissue mass', 'high production potential', 'respiratory health score', 'provide new information', 'latter block', 'twinning rate qtl', 'holstein bull population', 'baseline erythroid trait', 'based selection strategy', 'milk lgb content', 'commercial poultry flock', 'skeletal muscle', 'genetic variability underlying', 'fresh sperm motility', '96 baluchi sheep', 'improving semen trait', 'wide linkage analysis', 'domestication process relies', 'feed intake', 'relevant pathway linked', 'trigger cell autophagy', 'validated 54 qtl', 'cox transformation method', 'myocyte enhancer factor', 'ovine hsp90a

In [51]:

matching_phrases = ngram_unique & trait_phrases

# Count total matches
match_count = len(matching_phrases)


print(f"Total Exact Matches: {match_count}")
print(f"This represents {round(match_count/len(trait_phrases),4)*100}% out of all phrases in the dictionary.")
print("Matching Phrases:", matching_phrases)

Total Exact Matches: 101
This represents 0.44% out of all phrases in the dictionary.
Matching Phrases: {'thigh muscle weight', 'subcutaneous fat thickness', 'mammary gland development', 'somatic cell score', 'fatty acid', 'fertility trait', 'lipid metabolism', 'milk urea content', 'milk fat concentration', 'body weight', 'abdominal fat pad', 'nervous system development', 'feeding behavior', 'cd8 cell', 'cannon bone circumference', 'loin eye area', 'lumbar vertebra', 'white fat cell', 'milk fat yield', 'skeletal muscle', 'feeder per day', 'milk yield', 'net feed efficiency', 'meat trait', 'platelet distribution width', 'pulmonary artery pressure', 'milk somatic cell', 'skeletal muscle cell', 'corpus luteum number', 'body size trait', 'male fertility trait', 'body weight gain', 'antibody response', 'bone mineral content', 'female fertility', 'eye muscle depth', 'milk protein', 'plasma antibody level', 'milk protein concentration', 'milk protein yield', 'egg weight', 'back fat thickness',

In [56]:
from fuzzywuzzy import fuzz



In [71]:
count = 0
for ngram in ngram_unique:
    for trait_phrase in trait_phrases:
        similarity = fuzz.ratio(ngram, trait_phrase)
        if similarity > 80:
            count += 1
            print(f"{count} Similarity: {round(similarity, 4)} --> {ngram} -- is similar to -- {trait_phrase}")

1 Similarity: 82 --> respiratory disease trait -- is similar to -- respiratory system trait
2 Similarity: 85 --> adipose tissue mass -- is similar to -- adipose tissue
3 Similarity: 81 --> milk lgb content -- is similar to -- milk wap content
4 Similarity: 88 --> milk lgb content -- is similar to -- milk btn content
5 Similarity: 88 --> milk lgb content -- is similar to -- milk cla content
6 Similarity: 81 --> milk lgb content -- is similar to -- milk fat content
7 Similarity: 81 --> milk lgb content -- is similar to -- milk xor content
8 Similarity: 86 --> skeletal muscle -- is similar to -- skeletal muscle size
9 Similarity: 81 --> skeletal muscle -- is similar to -- skeletal muscle length
10 Similarity: 86 --> skeletal muscle -- is similar to -- skeletal muscle cell
11 Similarity: 86 --> skeletal muscle -- is similar to -- skeletal muscle mass
12 Similarity: 100 --> skeletal muscle -- is similar to -- skeletal muscle
13 Similarity: 83 --> skeletal muscle -- is similar to -- skeletal

KeyboardInterrupt: 

In [None]:
###### Diclaimer ###### This code was generated by ChatGPT. Pronpt: "Help me to apply a similar comparasion logic as the above fuzzy match logic but levraging TF-IDF instead."

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Convert lists into documents (needed for TF-IDF)
all_phrases = list(ngram_unique) + list(trait_phrases)  # Combine both lists

# Compute TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(all_phrases)

# Compute cosine similarity between ngrams (first set) and trait_phrases (second set)
ngram_vectors = tfidf_matrix[:len(ngram_unique)]  # First N documents are ngram_unique
trait_vectors = tfidf_matrix[len(ngram_unique):]  # Last N documents are trait_phrases

# Compute similarity scores
similarity_matrix = cosine_similarity(ngram_vectors, trait_vectors)

# Iterate through matches and print high-scoring pairs
threshold = 0.965  # Adjust for stricter or looser matching
count = 0
for i, ngram in enumerate(ngram_unique):
    for j, trait_phrase in enumerate(trait_phrases):
        similarity = similarity_matrix[i, j]
        if similarity > threshold:
            count += 1
            print(f"{count} Similarity: {round(similarity, 2)} --> {ngram} -- is similar to -- {trait_phrase}")
            



1 Similarity: 1.0 --> skeletal muscle -- is similar to -- skeletal muscle
2 Similarity: 1.0 --> total serum ige -- is similar to -- serum total ige
3 Similarity: 1.0 --> body weight gain -- is similar to -- body weight gain
4 Similarity: 1.0 --> leg muscle weight -- is similar to -- leg muscle weight
5 Similarity: 1.0 --> red blood cell -- is similar to -- red blood cell
6 Similarity: 1.0 --> cd4 cd8 -- is similar to -- cd4/cd8
7 Similarity: 1.0 --> eye muscle depth -- is similar to -- eye muscle depth
8 Similarity: 1.0 --> milk fat yield -- is similar to -- milk fat yield
9 Similarity: 1.0 --> female fertility -- is similar to -- female fertility
10 Similarity: 1.0 --> egg production trait -- is similar to -- egg production trait
11 Similarity: 1.0 --> packed cell volume -- is similar to -- packed cell volume
12 Similarity: 1.0 --> meat trait -- is similar to -- meat trait
13 Similarity: 1.0 --> mammary gland morphology -- is similar to -- mammary gland morphology
14 Similarity: 1.0 -