In [1]:
import os  
import sys  
from sklearn import feature_extraction  
from sklearn.feature_extraction.text import TfidfTransformer  
from sklearn.feature_extraction.text import CountVectorizer  


In [2]:
corpus=["A multi-context CNN ensemble for small lesion detection",
    "The impact of machine learning on patient care: A systematic review ",
    "The role of medical smartphone apps in clinical decision-support: A literature review",
    "Pharmacological therapy selection of type 2 diabetes based on the SWARA and modified MULTIMOORA methods under a fuzzy environment"]

In [8]:
vectorizer=CountVectorizer()         #This class converts the words in the text into a word frequency matrix. The matrix element a[I][j] represents the word frequency of j under the class I text 
transformer=TfidfTransformer()       #The class counts the tf-idf weight of each word  
X=vectorizer.fit_transform(corpus)   #Converts text to word frequency matrix
tfidf=transformer.fit_transform(X)   #count tf-idf，  
word=vectorizer.get_feature_names()  #Gets all the words in the word bag model
weight=tfidf.toarray()               #The tf-idf matrix is extracted, and element a[I][j] represents the tf-idf weight of word j in class I text 
for i in range(len(weight)):         #Print the tf-idf word weight for each type of text
    print("-------Here print No.",i,u"kind of text tf-idf weight------" )
    #for j in range(len(word)):  
    print(list(zip(word,weight[i])))

-------Here print No. 0 kind of text tf-idf weight------
[('and', 0.0), ('apps', 0.0), ('based', 0.0), ('care', 0.0), ('clinical', 0.0), ('cnn', 0.3535533905932738), ('context', 0.3535533905932738), ('decision', 0.0), ('detection', 0.3535533905932738), ('diabetes', 0.0), ('ensemble', 0.3535533905932738), ('environment', 0.0), ('for', 0.3535533905932738), ('fuzzy', 0.0), ('impact', 0.0), ('in', 0.0), ('learning', 0.0), ('lesion', 0.3535533905932738), ('literature', 0.0), ('machine', 0.0), ('medical', 0.0), ('methods', 0.0), ('modified', 0.0), ('multi', 0.3535533905932738), ('multimoora', 0.0), ('of', 0.0), ('on', 0.0), ('patient', 0.0), ('pharmacological', 0.0), ('review', 0.0), ('role', 0.0), ('selection', 0.0), ('small', 0.3535533905932738), ('smartphone', 0.0), ('support', 0.0), ('swara', 0.0), ('systematic', 0.0), ('the', 0.0), ('therapy', 0.0), ('type', 0.0), ('under', 0.0)]
-------Here print No. 1 kind of text tf-idf weight------
[('and', 0.0), ('apps', 0.0), ('based', 0.0), ('car

In [11]:
import nltk
import math
import string

from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import*

In [12]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sallyqian\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sallyqian\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [18]:
text1 = "A multi-context CNN ensemble for small lesion detection"
text2 = "The impact of machine learning on patient care: A systematic review "
text3 = "A novel method of motor imagery classification using eeg signal"

In [19]:
def get_tokens(text):
    lower = text.lower()
    remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
    no_punctuation = lower.translate(remove_punctuation_map)
    tokens = nltk.word_tokenize(no_punctuation)

    return tokens

In [20]:
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))

    return stemmed

In [21]:
def tf(word, count):
    return count[word] / sum(count.values())
def n_containing(word, count_list):
    return sum(1 for count in count_list if word in count)
def idf(word, count_list):
    return math.log(len(count_list)) / (1 + n_containing(word, count_list))
def tfidf(word, count, count_list):
    return tf(word, count) * idf(word, count_list)

In [22]:
def count_term(text):
    tokens = get_tokens(text)
    filtered = [w for w in tokens if not w in stopwords.words('english')]
    stemmer = PorterStemmer()
    stemmed = stem_tokens(filtered, stemmer)
    count = Counter(stemmed)
    return count

def main():
    texts = [text1, text2, text3]
    countlist = []
    for text in texts:
        countlist.append(count_term(text))
    for i, count in enumerate(countlist):
        print("Top words in document {}".format(i + 1))
        scores = {word: tfidf(word, count, countlist) for word in count}
        sorted_words = sorted(scores.items(), key = lambda x: x[1], reverse=True)
        for word, score in sorted_words[:5]:
            print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))

if __name__ == "__main__":
    main()

Top words in document 1
	Word: multicontext, TF-IDF: 0.09155
	Word: cnn, TF-IDF: 0.09155
	Word: ensembl, TF-IDF: 0.09155
	Word: small, TF-IDF: 0.09155
	Word: lesion, TF-IDF: 0.09155
Top words in document 2
	Word: impact, TF-IDF: 0.07847
	Word: machin, TF-IDF: 0.07847
	Word: learn, TF-IDF: 0.07847
	Word: patient, TF-IDF: 0.07847
	Word: care, TF-IDF: 0.07847
Top words in document 3
	Word: novel, TF-IDF: 0.06866
	Word: method, TF-IDF: 0.06866
	Word: motor, TF-IDF: 0.06866
	Word: imageri, TF-IDF: 0.06866
	Word: classif, TF-IDF: 0.06866
