# Article Labeling & Lexical Fields Finding

In [75]:
import pandas as pd
import numpy as np
import sys
import re
from tqdm import tqdm
import pickle

## Add cprofile for evaulation of a function's speed

In [2]:
import cProfile,pstats, io
def profile(fct):
    """ a decorator for the function 
        use by writing @profile before any function that needs evaluation"""
    def inner(*args,**kwargs):
        pr = cProfile.Profile()
        pr.enable()
        retval = fct(*args,**kwargs)
        s=i0.StringIO()
        sortBy = 'cumulative'
        ps = pstats.Stats(pr,stream = s).sort_stats(sortBy)
        ps.print_stats()
        print (s.getvalue())
        return retval

## Download Unlabelled articles

In [3]:
import json

raw_json_data = []
with open('./data/20200420_20200714_business_articles.json') as f:
    for line in f:
        raw_json_data.append(json.loads(line))


In [4]:
print ("data type",type (raw_json_data))
print ("json",type (raw_json_data[0]))
print ("keys",raw_json_data[0].keys())
print ("length", len(raw_json_data))
#print (raw_json_data[0])

data type <class 'list'>
json <class 'dict'>
keys dict_keys(['published', 'link', 'message', 'Feed', 'title', '@version', 'author', '@timestamp', 'full-text', 'type'])
length 416307


## Fetching Company Names & Related Names(52 companies)

In [5]:
# fetching company names (52 companies)
df = pd.read_excel (r'./data/comapny_name-related_words.xlsx', header = None)
# Lower
df[0] = [row[0].lower() for index, row in df.iterrows()] 
# Split company name and related names
split = np.array([row[0].split(";") for index, row in df.iterrows()])
df["company_name"] = split[:,0]
df["related_name"] = split[:,1]
df.drop(columns = [0], inplace=True)
# build dictionary of related name of companies
dict_companies = {}
company_names = df["company_name"].unique()
for name in company_names:
    df_tmp = df[df["company_name"] == name]
    dict_companies[name] = list(df_tmp["related_name"])
dict_companies.keys()

dict_keys(['21st century fox', 'activision blizzard', 'adobe ', 'advanced micro devices', 'akamai technologies', 'akamai tecnologies', 'alexion pharmaceuticals', 'amazon', 'american airlines group', 'amgen', 'analog devices', 'apple', 'autodesk', 'automatic data processing', 'baidu', 'bed bath & beyond', 'biogen', 'ca technologies', 'celgene', 'cerner', 'cisco ', 'cognizant', 'comcast', 'discovery communications', 'dish network', 'ebay', 'electronic arts', 'equinix', 'expeditors international', 'facebook', 'alphabet', 'intel', 'liberty global', 'liberty interactive', 'linear technology', 'marriott international', 'mattle', 'mattel', 'mckesson ', 'mckesson', 'microsoft', 'netflix', 'nvidia', 'paypal', 'qualcomm', 'starbucks', 'stericycle', 'tesla motors', 'texas instruments', 'the priceline group', 'universal display ', 'universal display'])

## Extracting url, title & full_text of each article:

In [9]:
urls = list()
plain_texts = list()
titles = list()
labels = list()

min_article_size = 2000
for article in raw_json_data:
    plain_text = article.get('full-text')
    title = article.get('title')
    url = article.get('link')
    if (plain_text and "Article `download()` failed" != plain_text[:27] and "Please enable cookies" != plain_text[:21] and len(plain_text)>min_article_size):
        plain_texts.append(plain_text)
        urls.append(url)
        titles.append(title)
        labels.append(list())
       

## Build DataFrame with extacted data

In [10]:
#Statistics
# 358192 removing "Article `download()` failed" 
# 340987 removing "Article `download()` failed" and "Please enable cookies"
# 215039 removing "Article `download()` failed" and "Please enable cookies" and size<min_article_size = 2000
data = np.array([urls,titles, plain_texts, labels]).T
columns=["url", "title", "plain_text", "label"]
df_articles = pd.DataFrame(data=data, columns=columns)

  """


In [11]:
df_articles.tail()

Unnamed: 0,url,title,plain_text,label
215034,http://rssfeeds.usatoday.com/~/t/0/0/usatodayc...,Michigan partygoers test positive for COVID-19...,Michigan partygoers test positive for COVID-19...,[]
215035,https://www.washingtontimes.com/news/2020/jul/...,Coast Guard officials decline to testify on ra...,"NEW LONDON, Conn. (AP) - A planned congression...",[]
215036,https://www.denverpost.com/2020/07/08/united-a...,"United Airlines will slash nearly 36,000 jobs ...",United Airlines plans to furlough as many as 3...,[]
215037,https://www.washingtontimes.com/news/2020/jul/...,The Latest: Pence says CDC will issue guidance...,WASHINGTON - Vice President Mike Pence says th...,[]
215038,https://www.washingtontimes.com/news/2020/jul/...,US rejects nearly all Chinese claims in South...,WASHINGTON (AP) - The Trump administration esc...,[]


## Cleaning full_text of articles

In [12]:
from sklearn.feature_extraction.text import CountVectorizer 
from nltk.tokenize import word_tokenize

In [None]:
# Remove every non-letter/number character
#n_articles = 10000
#df_cleaned = df_articles.head(n_articles).copy(deep= True)
df_cleaned = df_articles.copy(deep= True)
for index, row in df_cleaned.iterrows():
    row["plain_text"] = row["plain_text"].lower()
    row["plain_text"]= re.sub(r'\s+', ' ', row["plain_text"])
    #[:punct:], ,[^0-9], [^a-z]
    #row["plain_text"] = re.sub("[^a-z],[^:punct:],[^0-9]", ' ', row["plain_text"])
    row["plain_text"] = re.sub("[^a-z0-9]", ' ', row["plain_text"])
    #row["plain_text"] = re.sub(r'\s+', ' ', row["plain_text"])
df_cleaned["plain_text"][0][:100]

## Find Stop Words & Removing them from plain text

In [31]:
import nltk
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [32]:
# Remove all stop words from plain text
for index, row in df_cleaned.iterrows():
    for stop_word in stop_words:
        row["plain_text"] = re.sub(' '+stop_word+' ', ' ', row["plain_text"])
    row["plain_text"] = re.sub(r'\s+', ' ', row["plain_text"])
df_cleaned["plain_text"][0]

'eliminated masterchef contestant harry foster hit back unfair criticism judge melissa leong show first female judge 40 faced barrage trolling haters taking aim everything behaviour set fashion sense despite eliminated tuesday night episode harry nothing good things say melbourne based food writer could truth eliminated masterchef australia contestant harry foster pictured hit back unfair criticism judge melissa leong queen love harry told huffpost australia energetic passionate really vibrant asked accusations melissa rude biased show said could truth three judges received overwhelmingly positive response fans melissa copped backlash vocal minority queen show first female judge 40 faced barrage trolling haters taking aim everything behaviour set fashion sense many praised fashion sense positivity others claim waits feedback jock zonfrillo andy allen repeating chance melissa leong original idea masterchef continue wait others tell think dish one viewer tweeted another added new judge m

## Labeling Articles with Company Names 
### Check if Articles Talk of Companies

In [33]:
for index, row in df_cleaned.iterrows(): # initialize labels
    row['label'] = []
    
for index, row in df_cleaned.iterrows():
    for company in company_names:
        if company in row["plain_text"]:
            row['label'].append(company)
        else:
            for related_name in dict_companies[company]:
                if related_name in row["plain_text"]:
                    row['label'].append(company)
                    break
df_cleaned["label"].head()

0                                  []
1                                  []
2    [advanced micro devices, nvidia]
3                                  []
4                             [apple]
Name: label, dtype: object

In [6]:
# Getting data from csv
PATH = "./data/"
file = "cleaned_articles_200k"
df_cleaned = pd.read_csv(PATH + file + ".csv") 

In [8]:
df_cleaned.head()

Unnamed: 0,url,title,plain_text,label
0,https://www.dailymail.co.uk/tvshowbiz/article-...,MasterChef's Harry Foster hits back at claims ...,eliminated masterchef contestant harry foster ...,[]
1,https://www.washingtontimes.com/news/2020/jun/...,"Protest arrests logjam tests NYC legal system,...",new york ap wave arrests new york city protest...,[]
2,https://www.dailymail.co.uk/news/article-83114...,Labour's Anneliese Dodds says she will REFUSE ...,a top shadow minister today said enough eviden...,"['advanced micro devices', 'nvidia']"
3,http://feeds.reuters.com/~r/Reuters/worldNews/...,Civil unrest rages in Minneapolis over raciall...,minneapolis reuters peaceful rallies gave way ...,[]
4,https://www.dailymail.co.uk/news/article-82734...,Australia 'beats the cr*p' out of coronavirus ...,australia beating c p coronavirus six states t...,['apple']


### Get number of articles with labels

In [7]:
labeled = 0
for index, row in df_cleaned.iterrows():
    if row["label"]:
        labeled +=1
print ("There are %d labeled articles in the %d articles of the corpus"%(labeled, len (df_cleaned["label"])))      

There are 215039 labeled articles in the 215039 articles of the corpus


## Count Number of Articles that each Company is Associated to.

In [9]:
# init
dict_count = {}
for company in company_names: dict_count[company]= 0
    
for index, row in df_cleaned.iterrows():
    for company in company_names:
        if company in row["label"]:
            dict_count[company]+=1
dict_count          

companies_w_articles = list()
for company in company_names:
    if dict_count[company]>0:
        companies_w_articles.append(company)
print ("there are %d companies with associated articles over the %d total companies"%(len(companies_w_articles),len(company_names)) )
#dict_count

there are 52 companies with associated articles over the 52 total companies


## Tf.Idf to get top 20 words for each company (that have articles related to them)

In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer 

In [None]:
# Tf.Idf on Companies that have Associated Articles 
relevant_words_tfidf = {}
for company in tqdm(companies_w_articles): # for all companies in companies_w_articles

    #tfidf_vectorizer=TfidfVectorizer(stop_words = {'english'},ngram_range = (1,3), binary = True)
    tfidf_vectorizer=TfidfVectorizer(stop_words = {'english'},ngram_range = (1,1))
    plain_text_list = list()
    company_article = ""
    for index, row in df_cleaned.iterrows():
        if company in row["label"]:
            company_article = company_article+ " "+ row["plain_text"]
            plain_text_list.append(row["plain_text"])
    
    plain_text_list.insert(0,company_article)
    tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(plain_text_list)

    #Get the tf-idf scores for the words in the company article complication.
    first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0] 

    # place tf-idf values in a pandas data frame 
    df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"]) 
    df = df.sort_values(by=["tfidf"],ascending=False).head(40) # Take top 40 words
    
    relevant_words_tfidf[company] = list(zip(list(df.index),list(df["tfidf"])))
    #print (relevant_words_tfidf[company])

 23%|██▎       | 12/52 [41:22<5:56:30, 534.77s/it]

In [37]:
# load dictionary 
PATH = "./data/"
file = "relevant_words_tfidf_200k"
a_file = open(PATH + file + ".json", "r")
relevant_words_tfidf = json.load(a_file)
relevant_words_tfidf = dict(relevant_words_tfidf)

In [23]:
#relevant_words_tfidf['21st century fox']

In [60]:
#companies_w_articles

# Word Vectorization

In [55]:
# for a give model, we want to get the first 20 words related to a company of: companies_w_articles
# And store everything into a dictionary like for tf.idf
def getTopWords(model, n_words, companies):
        #Word2Vec.most_similar(positive=[], negative=[], topn=10, restrict_vocab=None, indexer=None)
    relevant_words = {}
    for company in companies_w_articles:
        if company in model.wv.vocab:
            relevant_words[company] = model.most_similar(company,topn=n_words)
    return relevant_words

In [28]:
# importing all necessary modules 
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 
#nltk.download('punkt')
import warnings 
warnings.filterwarnings(action = 'ignore') 
  
import gensim 
from gensim.models import Word2Vec 

import sklearn
from sklearn.feature_extraction.text import CountVectorizer

import collections

from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

In [22]:
data = [] 
  
# iterate through each article in the file 
data = [word_tokenize(plain_text) for plain_text in df_cleaned["plain_text"]]

In [26]:
PATH = "./data/"
file = "list_tokenized_pt"
with open (PATH +file, 'rb') as fp:
    data_test = pickle.load(fp)

## Global Matrix factorization to get top 20 words of a company

In [23]:
# LSA
# HAL (Hyper Analogue Language)
# CBOW

# Create CBOW model 
model_cbow = Word2Vec(data, min_count = 1, size = 100, window = 5)


In [31]:
PATH = "./data/models/"
file = "CBOW_model_200k"
word_vectors = KeyedVectors.load(PATH + file+".kv", mmap='r')
model_cbow = word_vectors

In [32]:
print (model_cbow.most_similar('google'))
vec = model_cbow['king'] - model_cbow['man'] + model_cbow['woman']
print ()
print (model_cbow.most_similar([vec]))
print()
print(model_cbow.similarity('apple', 'man'))

[('googles', 0.7780945301055908), ('apple', 0.7284873723983765), ('googl', 0.704851508140564), ('alphabet', 0.6509729623794556), ('spotify', 0.6320379972457886), ('facebook', 0.6315559148788452), ('microsoft', 0.6176584362983704), ('alphabets', 0.6062008142471313), ('apps', 0.605197548866272), ('stadia', 0.5996901988983154)]

[('king', 0.8107793927192688), ('godfather', 0.5998413562774658), ('thatcher', 0.5920987129211426), ('mitford', 0.5835937261581421), ('altimus', 0.5723137259483337), ('chemouny', 0.5638600587844849), ('atwood', 0.5631056427955627), ('macbeth', 0.5596096515655518), ('enid', 0.5574297308921814), ('antoinette', 0.5557938814163208)]

-0.056162722


## Local context window methods to get top 20 words on a company

In [25]:
#skip- gram

# Create Skip Gram model 
model_sg = gensim.models.Word2Vec(data, min_count = 1, size = 100, 
                                             window = 5, sg = 4) 

In [34]:
PATH = "./data/models/"
file = "skip-gram_model_test"
word_vectors = KeyedVectors.load(PATH + file+".kv", mmap='r')
model_sg = word_vectors

In [35]:
print (model_sg.most_similar('apple'))
vec = model_sg['king'] - model_sg['man'] +model_sg['woman']
print ()
print (model_sg.most_similar([vec]))
print()
print(model_sg.similarity('apple', 'man'))

[('google', 0.7284873723983765), ('iphone', 0.6975679397583008), ('spotify', 0.666739821434021), ('googles', 0.6604712009429932), ('earpods', 0.6569204926490784), ('android', 0.6541101932525635), ('apples', 0.6331382393836975), ('homekit', 0.6308133602142334), ('aapl', 0.6295210719108582), ('ios', 0.6281814575195312)]

[('king', 0.8107793927192688), ('godfather', 0.5998413562774658), ('thatcher', 0.5920987129211426), ('mitford', 0.5835937261581421), ('altimus', 0.5723137259483337), ('chemouny', 0.5638600587844849), ('atwood', 0.5631056427955627), ('macbeth', 0.5596096515655518), ('enid', 0.5574297308921814), ('antoinette', 0.5557938814163208)]

-0.056162722


## GloVe to get top 20 words of a company

In [28]:
# GloVe is a global log-bilinear regression model
from gensim.scripts.glove2word2vec import glove2word2vec
#glove_input_file = 'glove.txt'
#word2vec_output_file = 'word2vec.txt'
#glove2word2vec(glove_input_file, word2vec_output_file)

In [36]:
import gensim.downloader as api
glove_model = api.load('glove-wiki-gigaword-300')
#https://github.com/stanfordnlp/GloVe

In [72]:
print (glove_model.most_similar('apple'))
vec = glove_model['king'] - glove_model['man'] +glove_model['woman']
print ()
print (glove_model.most_similar([vec]))
print()
print(glove_model.similarity('apple', 'man'))

[('iphone', 0.5987042188644409), ('macintosh', 0.5836331248283386), ('ipod', 0.5761123895645142), ('microsoft', 0.5663833022117615), ('ipad', 0.5628098249435425), ('intel', 0.5457563400268555), ('ibm', 0.5286195278167725), ('google', 0.5282472372055054), ('imac', 0.5072520971298218), ('software', 0.4962984323501587)]

[('king', 0.8065859079360962), ('queen', 0.689616322517395), ('monarch', 0.5575490593910217), ('throne', 0.5565374493598938), ('princess', 0.5518684387207031), ('mother', 0.5142154693603516), ('daughter', 0.5133156776428223), ('kingdom', 0.5025345087051392), ('prince', 0.5017740726470947), ('elizabeth', 0.49080315232276917)]

0.090478964


In [31]:
import os
import sys
DIR = "./data/"
embeddings_index = {}

# Pre-trained Glove
#if option == 1:
with open(os.path.join(DIR, 'glove.840B.300d.txt')) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 2195884 word vectors.


In [32]:
np.dot(embeddings_index["apple"],embeddings_index["iphone"])

34.061054

In [None]:
#wv = KeyedVectors.load("./data/" + 'glove.840B.300d.txt')

## fasttext

# Combine all the models into one & score each word

In [59]:

relevant_words_glove = getTopWords(glove_model,20,companies_w_articles)
relevant_words_cbow = getTopWords(model_cbow,20,companies_w_articles)
relevant_words_sg = getTopWords(model_sg,20,companies_w_articles)
print ("relevant_words_glove",len(relevant_words_glove))
print ("relevant_words_cbow",len(relevant_words_cbow))
print ("relevant_words_sg",len(relevant_words_sg))


relevant_words_glove 23
relevant_words_cbow 24
relevant_words_sg 24


# ------------------- Annexe Testing -------------------------------

## Python program to generate word vectors using Word2Vec 

In [57]:
# importing all necessary modules 
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 
#nltk.download('punkt')
import warnings 
  
warnings.filterwarnings(action = 'ignore') 
  
import gensim 
from gensim.models import Word2Vec 

import sklearn
from sklearn.feature_extraction.text import CountVectorizer

import collections

In [62]:
# Apply 2 Word2Vec models to articles   

data = [] 
  
# iterate through each article in the file 
for i in clean_articles: 
    temp = [] 
    # tokenize the article into words 
    for j in word_tokenize(i): 
        temp.append(j.lower()) 
  
    data.append(temp) 

# Create CBOW model 
model1 = gensim.models.Word2Vec(data, min_count = 1,  
                              size = 100, window = 5) 
  
# Print results 
print("Cosine similarity between 'australia' " + 
               "and 'melbourne' - CBOW : ", 
    model1.similarity('melbourne', 'australia')) 

print(model1.wv.most_similar('melbourne'))
    

# Create Skip Gram model 
model2 = gensim.models.Word2Vec(data, min_count = 1, size = 100, 
                                             window = 5, sg = 4) 
  
# Print results 
print("Cosine similarity between 'australia' " +
          "and 'melbourne' - Skip Gram : ", 
    model2.similarity('melbourne', 'australia')) 
print(model2.wv.most_similar('melbourne'))


NameError: name 'clean_articles' is not defined

In [59]:
# FOR GENSIN USING CBOW Manipulations

# enumerate data it is trained on
for i, word in enumerate(model1.wv.vocab):
    if i == 5:
        break
    print(word)

NameError: name 'model1' is not defined

In [60]:
len(data)
# Show frequencies
#print("Original List : ",data)
data_flat = []
for line in data:
    for word in line:
        data_flat.append(word)


ctr = collections.Counter(data_flat)
#print("Frequency of the elements in the List : ",ctr)
ctr["the"] # count of word "the"

0

## Using tf.itf

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Vectorizer
vectorizer = TfidfVectorizer(use_idf=True)
X = vectorizer.fit_transform(clean_articles)

#print(vectorizer.get_feature_names()[:10])
#print(X.shape)
#print(vectorizer.get_stop_words())
#print(vectorizer.get_params(deep=True))

n_articles, n_distinct_words = X.shape
print(n_articles, n_distinct_words)

collect_word_importance = []
#place tf-idf values in a pandas data frame 
for tf_idf_vector_id in range(n_articles):
    
    tf_idf_vector=X[tf_idf_vector_id]
    #print (tf_idf_vector.todense().sum())
    #print (tf_idf_vector.T.todense())
    df = pd.DataFrame(tf_idf_vector.T.todense(), index=vectorizer.get_feature_names(), columns=["tfidf"]) 
    df_word_importance = df.sort_values(by=["tfidf"],ascending=False)
    word_importance_list = np.array(df_word_importance.index)
    collect_word_importance.append(word_importance_list)


In [None]:
# Each line corresponds to the highest scored words in the article of same index.
collect_word_importance = np.array(collect_word_importance)
collect_word_importance

In [None]:
# TfidfTransformer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
import numpy as np

In [None]:
# TfidfTransformer
#TfidfTransformer(*, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)[source]


corpus = ['this is the first document',
           'this document is the second document',
          'and this is the third one',
           'is this the first document']
vocabulary = ['this', 'document', 'first', 'is', 'second', 'the',
               'and', 'one']
pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),
                  ('tfid', TfidfTransformer())]).fit(corpus)
pipe['count'].transform(corpus).toarray()
pipe['tfid'].idf_
pipe.transform(corpus).shape


In [None]:
pipe = Pipeline([('count', CountVectorizer()),
                  ('tfid', TfidfTransformer())]).fit(clean_articles)
pipe['count'].transform(clean_articles).toarray().shape
print (pipe['tfid'].idf_)
Tfidf_res = pipe.transform(clean_articles)
Tfidf_res.shape

In [None]:
Tfidf_res

In [None]:
#### Tutorial

#Dataset and Imports
import pandas as pd 
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer 
 
# this is a very toy example, do not try this at home unless you want to understand the usage differences 
docs=["the house had a tiny little mouse", 
"the cat saw the mouse", 
"the mouse ran away from the house", 
"the cat finally ate the mouse", 
"the end of the mouse story"
]

In [None]:
# Initialize CountVectorizer
#instantiate CountVectorizer() 
cv=CountVectorizer() 
# this steps generates word counts for the words in your docs 
word_count_vector=cv.fit_transform(docs)
word_count_vector.shape
# 5 texts, 9 distinct words -> gives the count for each word in each text

In [None]:
word_count_vector[0]

In [None]:
#Compute the IDF values
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
tfidf_transformer.fit(word_count_vector)

In [None]:
# print idf values 
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names(),columns=["idf_weights"]) 
 
# sort ascending 
df_idf.sort_values(by=['idf_weights'])

In [None]:
# Compute the TFIDF score for your documents
# count matrix 
count_vector=cv.transform(docs) #<==> word_count_vector

# tf-idf scores 
tf_idf_vector=tfidf_transformer.transform(count_vector)

In [None]:
tf_idf_vector

In [None]:
feature_names = cv.get_feature_names() 
 
#get tfidf vector for FFFFFFFFFirst document 
first_document_vector=tf_idf_vector[0] 
 
#print the scores (Tf-idf scores of first document)
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"]) 
df.sort_values(by=["tfidf"],ascending=False)

In [None]:
# Tfidfvectorizer Usage - Compute all at Once

 
# settings that you use for count vectorizer will go here 
tfidf_vectorizer=TfidfVectorizer(use_idf=True) 
#fitted_vectorizer=tfidf_vectorizer.fit(docs)               # This method would work too
#tfidf_vectorizer_vectors=fitted_vectorizer.transform(docs)  
 
# just send in all your docs here 
tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(docs)

In [None]:
# get the first vector out (for the first document) 
first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0] 
 
# place tf-idf values in a pandas data frame 
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"]) 
df.sort_values(by=["tfidf"],ascending=False)
