In [1]:
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import pandas as pd
import re
import string
from typing import List

In [10]:
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# init NLP objects
stopwords = nltk.corpus.stopwords.words('english') #+ nltk.corpus.stopwords.words('french')
porter_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

In [80]:
type(porter_stemmer)

nltk.stem.porter.PorterStemmer

In [2]:
# preprocessing functions

#defining the function to remove punctuation
def remove_punctuation(text: str):
    punctuation_free = "".join([i for i in text if i not in string.punctuation])
    return punctuation_free

def tokenization(text: str) -> List[str]:
    # tokens = re.split('W+',text)
    l_tokens = re.findall(r'(?i)((?:[a-z]|\')+)', text)
    return l_tokens

def remove_stopwords(l_tokens: List[str]) -> List[str]:
    l_output = [token for token in l_tokens if token not in stopwords]
    return l_output

def stemming(l_tokens: List[str]) -> List[str]:
    l_stem_text = [porter_stemmer.stem(word) for word in l_tokens]
    return l_stem_text

def lemmatizer(l_tokens: List[str]) -> List[str]:
    l_lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in l_tokens]
    return l_lemm_text

def filter_len(l_tokens: List[str], limit: int) -> List[str]:
    l_filtered = [token for token in l_tokens if len(token) > limit]
    return l_filtered

In [2]:
df = pd.read_parquet("../data/01_raw/medium_extract.parquet")
df = df.drop_duplicates()

In [6]:
# df.day.value_counts()
# df.drop_duplicates()
df

Unnamed: 0,title,author,link,text,day,index
0,Why decorators in Python are pure genius | by ...,Ari Joury,https://towardsdatascience.com/why-decorators-...,"Towards Data Science\nMar 9, 2021\nSave\nIf th...",2021-03-09,0
1,The Ultimate Guide to Cracking Product Case In...,Emma Ding,https://towardsdatascience.com/the-ultimate-gu...,"Towards Data Science\nMar 9, 2021\nSave\nWritt...",2021-03-09,1
2,fit() vs predict() vs fit_predict() in Python ...,Giorgos Myrianthous,https://towardsdatascience.com/fit-vs-predict-...,"Towards Data Science\nMar 9, 2021\nSave\nsciki...",2021-03-09,2
3,Stepwise Regression Tutorial in Python | by Ry...,Ryan Kwok,https://towardsdatascience.com/stepwise-regres...,"Towards Data Science\nMar 9, 2021\nSave\nHow d...",2021-03-09,3
4,7 Must-Know Data Wrangling Operations with Pyt...,Soner Yıldırım,https://towardsdatascience.com/7-must-know-dat...,"Towards Data Science\nMar 9, 2021\nSave\nA com...",2021-03-09,4
...,...,...,...,...,...,...
1769,Expanding Review Inequality in Madden | by Cod...,"Cody Glickman, PhD",https://towardsdatascience.com/expanding-revie...,"Towards Data Science\nNov 23, 2021\nSave\nIn 2...",2021-11-23,1769
1770,Geocoding Locations with Turf Mapbox SDK | by ...,Charmaine Chui,https://towardsdatascience.com/geocoding-locat...,"Towards Data Science\nNov 23, 2021\nSave\nMost...",2021-11-23,1770
1771,A No-Code Method of Mapping UFO Sightings with...,Shreya Chaudhary,https://towardsdatascience.com/a-no-code-metho...,"Towards Data Science\nNov 23, 2021\nSave\nIn t...",2021-11-23,1771
1772,"Fines Migrate in Espresso, but Not Far: Part 3...",Robert McKeon Aloe,https://towardsdatascience.com/fines-migrate-i...,"Towards Data Science\nNov 23, 2021\nSave\nOne ...",2021-11-23,1772


In [5]:
df["index"] = df.index

In [25]:
# Apply text preprocessing
df = df.assign(text = df.text.str.lower())
df = df.assign(text = df.text.apply(lambda x: tokenization(x)))
df = df.assign(text = df.text.apply(lambda x: remove_stopwords(x)))
df = df.assign(text = df.text.apply(lambda x: stemming(x)))
df = df.assign(text = df.text.apply(lambda x: lemmatizer(x)))
df = df.assign(text = df.text.apply(lambda x: filter_len(x, 2)))

# Adapt format for sklearn's TF-IDF
df = df.assign(text = df.text.apply(lambda x: " ".join(x)))

In [27]:
df.text.head()

0    toward data scienc mar save one thing make pyt...
1    toward data scienc mar save written emma ding ...
2    toward data scienc mar save scikit learn commo...
3    toward data scienc mar save find mean data min...
4    toward data scienc mar save comprehens practic...
Name: text, dtype: object

In [94]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [96]:
type(df_test)

pandas.core.frame.DataFrame

In [82]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
TF_IDF = vectorizer.fit(df_train.text)

In [83]:
df_enc = pd.DataFrame(TF_IDF.transform(df_test.text).toarray())

In [84]:
df_enc

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31097,31098,31099,31100,31101,31102,31103,31104,31105,31106
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
341,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
342,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [85]:
type(TF_IDF.get_feature_names_out())

numpy.ndarray

In [86]:
ax_labels = TF_IDF.get_feature_names_out()
n_labels = ax_labels.shape[0]

d_labels = {i: ax_labels[i] for i in range(n_labels)}

In [87]:
df_enc = df_enc.rename(columns=d_labels)

In [88]:
df_enc

Unnamed: 0,aaa,aaai,aab,aac,aachen,aad,aadidev,aaf,aafdb,aagesen,...,zwiebel,zwift,zwikirsch,zxcvbnm,zxh,zykov,zyl,zyrobot,zyte,zzhu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
341,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
342,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [89]:
doc_id = 2

s_doc = df_enc.loc[doc_id, :]
s_doc.sort_values(ascending=False)[:5].index.tolist()

['dashboard', 'chart', 'dash', 'graph', 'style']

In [90]:
df_test

Unnamed: 0,title,author,link,text,day
300,I Forgot How to Spellcheck | by Victor Shepele...,Victor Shepelev,https://towardsdatascience.com/i-forgot-how-to...,toward data scienc may save start rebuild worl...,2021-05-06
1075,How To Delete A Column In Pandas | Towards Dat...,Giorgos Myrianthous,https://towardsdatascience.com/how-to-delete-a...,toward data scienc aug save delet column panda...,2021-08-11
535,Serie A (football) — a simple dashboard with P...,Xiao Wang,https://towardsdatascience.com/create-a-simple...,toward data scienc mar save sinc last time pla...,2021-03-01
1318,Anonymise Sensitive Data in a Pandas DataFrame...,Rhys Kilian,https://towardsdatascience.com/anonymise-sensi...,toward data scienc sep save common scenario en...,2021-09-01
561,The paradox in protecting personal information...,Simone Jeurissen,https://towardsdatascience.com/the-paradox-in-...,toward data scienc mar save walk street maastr...,2021-03-01
...,...,...,...,...,...
101,A Simple Guide to Linear Regression using Pyth...,Frank Andrade,https://towardsdatascience.com/a-simple-guide-...,toward data scienc oct save one first machin l...,2021-10-18
1714,Visual Studio Code is Now Available as a Web A...,Dario Radečić,https://towardsdatascience.com/visual-studio-c...,toward data scienc nov save visual studio code...,2021-11-04
107,Secure Password Handling in Python | by Martin...,Martin Heinz,https://towardsdatascience.com/secure-password...,toward data scienc oct save almost everi appli...,2021-10-18
1656,What is a Data Hub?. Why Architectures such as...,Christianlauer,https://towardsdatascience.com/what-is-a-data-...,toward data scienc jan save data hub data exch...,2021-01-18


In [91]:
df_test = pd.read_parquet("../data/03_primary/articles.parquet")

In [92]:
df_test

Unnamed: 0,title,author,link,text,day
0,Why decorators in Python are pure genius | by ...,Ari Joury,https://towardsdatascience.com/why-decorators-...,toward data scienc mar save one thing make pyt...,2021-03-09
1,The Ultimate Guide to Cracking Product Case In...,Emma Ding,https://towardsdatascience.com/the-ultimate-gu...,toward data scienc mar save written emma ding ...,2021-03-09
2,fit() vs predict() vs fit_predict() in Python ...,Giorgos Myrianthous,https://towardsdatascience.com/fit-vs-predict-...,toward data scienc mar save scikit learn commo...,2021-03-09
3,Stepwise Regression Tutorial in Python | by Ry...,Ryan Kwok,https://towardsdatascience.com/stepwise-regres...,toward data scienc mar save find mean data min...,2021-03-09
4,7 Must-Know Data Wrangling Operations with Pyt...,Soner Yıldırım,https://towardsdatascience.com/7-must-know-dat...,toward data scienc mar save comprehens practic...,2021-03-09
...,...,...,...,...,...
1812,Climate change: Interactive Plotly Dash to dis...,Francesco Tontarelli,https://towardsdatascience.com/climate-change-...,toward data scienc may save author camilla mas...,2021-05-28
1813,Where eBay Went Right — and Wrong — with AI: W...,Wilson Pang,https://towardsdatascience.com/where-ebay-went...,toward data scienc may save follow adapt real ...,2021-05-28
1814,Automatically track all your EDA using Sweetvi...,Francois Bertrand,https://towardsdatascience.com/automatically-t...,toward data scienc may save sweetviz great lib...,2021-05-28
1815,The White Rhinos of Linguistics | by Yaning Wu...,Yaning Wu,https://towardsdatascience.com/the-white-rhino...,toward data scienc may save piec best experien...,2021-05-28


In [98]:
type(TF_IDF)
type(vectorizer)

sklearn.feature_extraction.text.TfidfVectorizer

In [100]:
pickle TF_IDF

In [7]:
import pickle
import joblib

tf_idf = joblib.load("../data/06_models/tf_idf.pkl")


In [8]:
tf_idf

In [115]:
df

Unnamed: 0,title,author,link,text,day
0,Why decorators in Python are pure genius | by ...,Ari Joury,https://towardsdatascience.com/why-decorators-...,"Towards Data Science\nMar 9, 2021\nSave\nIf th...",2021-03-09
1,The Ultimate Guide to Cracking Product Case In...,Emma Ding,https://towardsdatascience.com/the-ultimate-gu...,"Towards Data Science\nMar 9, 2021\nSave\nWritt...",2021-03-09
2,fit() vs predict() vs fit_predict() in Python ...,Giorgos Myrianthous,https://towardsdatascience.com/fit-vs-predict-...,"Towards Data Science\nMar 9, 2021\nSave\nsciki...",2021-03-09
3,Stepwise Regression Tutorial in Python | by Ry...,Ryan Kwok,https://towardsdatascience.com/stepwise-regres...,"Towards Data Science\nMar 9, 2021\nSave\nHow d...",2021-03-09
4,7 Must-Know Data Wrangling Operations with Pyt...,Soner Yıldırım,https://towardsdatascience.com/7-must-know-dat...,"Towards Data Science\nMar 9, 2021\nSave\nA com...",2021-03-09
...,...,...,...,...,...
1769,Expanding Review Inequality in Madden | by Cod...,"Cody Glickman, PhD",https://towardsdatascience.com/expanding-revie...,"Towards Data Science\nNov 23, 2021\nSave\nIn 2...",2021-11-23
1770,Geocoding Locations with Turf Mapbox SDK | by ...,Charmaine Chui,https://towardsdatascience.com/geocoding-locat...,"Towards Data Science\nNov 23, 2021\nSave\nMost...",2021-11-23
1771,A No-Code Method of Mapping UFO Sightings with...,Shreya Chaudhary,https://towardsdatascience.com/a-no-code-metho...,"Towards Data Science\nNov 23, 2021\nSave\nIn t...",2021-11-23
1772,"Fines Migrate in Espresso, but Not Far: Part 3...",Robert McKeon Aloe,https://towardsdatascience.com/fines-migrate-i...,"Towards Data Science\nNov 23, 2021\nSave\nOne ...",2021-11-23


In [10]:
df.head()

Unnamed: 0,title,author,link,text,day
0,Why decorators in Python are pure genius | by ...,Ari Joury,https://towardsdatascience.com/why-decorators-...,"Towards Data Science\nMar 9, 2021\nSave\nIf th...",2021-03-09
1,The Ultimate Guide to Cracking Product Case In...,Emma Ding,https://towardsdatascience.com/the-ultimate-gu...,"Towards Data Science\nMar 9, 2021\nSave\nWritt...",2021-03-09
2,fit() vs predict() vs fit_predict() in Python ...,Giorgos Myrianthous,https://towardsdatascience.com/fit-vs-predict-...,"Towards Data Science\nMar 9, 2021\nSave\nsciki...",2021-03-09
3,Stepwise Regression Tutorial in Python | by Ry...,Ryan Kwok,https://towardsdatascience.com/stepwise-regres...,"Towards Data Science\nMar 9, 2021\nSave\nHow d...",2021-03-09
4,7 Must-Know Data Wrangling Operations with Pyt...,Soner Yıldırım,https://towardsdatascience.com/7-must-know-dat...,"Towards Data Science\nMar 9, 2021\nSave\nA com...",2021-03-09


In [22]:
df_test = pd.read_parquet("../data/05_model_input/test.parquet")
tf_idf = joblib.load("../data/06_models/tf_idf.pkl")

df_test["index"] = df_test.index
df_test = df_test.rename(columns={"index": "id"})
df_enc = pd.DataFrame(tf_idf.transform(df_test.text).toarray())

# Create column id / label mapping
ax_labels = tf_idf.get_feature_names_out()
n_labels = ax_labels.shape[0]
d_labels = {i: ax_labels[i] for i in range(n_labels)}


In [31]:
df_test = df_test.reset_index(drop=True)

In [15]:
df_enc

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31277,31278,31279,31280,31281,31282,31283,31284,31285,31286
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
360,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# doc_id = 2
n_topics = 5

d_topics = {}

for index, row in df_enc.iterrows():
    # print(row)
    # print(index)
    # article_id = df_test.loc[index, "id"]
    # s_doc = df_enc.loc[doc_id, :]
    article_id = df_test.loc[index, "id"]
    l_topics = row.sort_values(ascending=False)[:n_topics].index.tolist()
    l_topics = [d_labels[elt] for elt in l_topics]

    d_topics[article_id] = l_topics
    break



In [39]:
d_topics

{1507: ['arm', 'sampl', 'bandit', 'regret', 'algorithm']}

In [4]:
import pickle
import joblib

d_res = joblib.load("../data/07_model_output/test_predictions.pkl")

In [1]:
import pandas as pd

df_train = df_test = pd.read_parquet("../data/05_model_input/train.parquet")
df_test = pd.read_parquet("../data/05_model_input/test.parquet")

In [3]:
pd.options.display.max_colwidth = 200

df_test#[["id", "title"]]#.iloc[1507].title

Unnamed: 0,id,title,author,link,text,day
526,526,"Feature Selection, Binning, ANOVA, polynomial features, log transform, automatic feature selection | Towards Data Science",Ibrahim Kovan,https://towardsdatascience.com/an-overview-of-data-preprocessing-features-enrichment-automatic-feature-selection-60b0c12d75ad?source=collection_archive---------14-----------------------,toward data scienc aug save dataset render suitabl data train machin learn predict made algorithm yield success result look dataset seen featur import other impact output exampl better result obta...,2021-08-02
354,354,Top Python libraries for Image Augmentation in Computer Vision | by Kenneth Leung | Towards Data Science,Kenneth Leung,https://towardsdatascience.com/top-python-libraries-for-image-augmentation-in-computer-vision-2566bed0533e?source=collection_archive---------2-----------------------,toward data scienc aug save deep learn task like comput vision highli depend larg number imag train techniqu like transfer learn reduc amount data need dataset suffici qualiti quantiti varieti sti...,2021-08-02
168,168,Image Classification of PCBs and its Web Application (Flask) | by Utkarsh Ankit | Towards Data Science,Utkarsh Ankit,https://towardsdatascience.com/image-classification-of-pcbs-and-its-web-application-flask-c2b26039924a?source=collection_archive---------8-----------------------,toward data scienc aug save hello blog creat imag classif model pcb print circuit board detect defect pcb classifi good bad creat deep learn model tri get best possibl result along proper visualis...,2021-08-02
135,135,Variational Bayesian inference with normalizing flows: a simple example | by Fraser Lewis | Towards Data Science,Fraser Lewis,https://towardsdatascience.com/variational-bayesian-inference-with-normalizing-flows-a-simple-example-1db109d91062?source=collection_archive---------7-----------------------,toward data scienc aug save variat infer methodolog watch larg complex bayesian model potenti becom approach least comput fit natur domain machin learn articl demonstr normal flow briefli introduc...,2021-08-02
937,937,What does Vertex AI do?. Your one-stop shop for machine learning | by Omer Mahmood | Towards Data Science,Omer Mahmood,https://towardsdatascience.com/what-does-vertex-ai-do-d30014024f55?source=collection_archive---------9-----------------------,toward data scienc aug save post cover common task typic machin learn workflow vertex bring togeth tool need achiev one unifi user interfac today data scientist grappl challeng manual piec togeth ...,2021-08-02
...,...,...,...,...,...,...
1074,1074,Machine Learning with only SQL — Using BigQuery to Identify the Target Audience for Shared Bike | by Wan Chung Huang | Towards Data Science,Wan Chung Huang,https://towardsdatascience.com/machine-learning-with-only-sql-using-bigquery-to-identify-the-target-audience-for-shared-bike-aa3a4041be3a?source=collection_archive---------18-----------------------,toward data scienc aug save mani reason use python machin learn matter case share machin learn sql understand key featur influenc durat bike share trip observ develop data warehous averag durat bi...,2021-08-02
1315,1315,Build a Dash app with Python in 7 minutes | by Natassha Selvaraj | Towards Data Science,Natassha Selvaraj,https://towardsdatascience.com/build-a-dash-app-with-python-in-7-minutes-72b6cca7d268?source=collection_archive---------3-----------------------,toward data scienc aug save start work capston project month ago want creat interact dashboard machin learn model want dashboard form web applic display live updat new data enter system explor man...,2021-08-02
383,383,Styleformer: Convert Casual Text to Formal Text and Vice Versa | by Eric Fillion | Towards Data Science,Eric Fillion,https://towardsdatascience.com/styleformer-convert-casual-text-to-formal-text-and-vice-versa-9cdc52abeaf5?source=collection_archive---------31-----------------------,toward data scienc aug save styleform brand new python librari allow chang style text use power transform model call tutori focus abil convert casual text formal text vice versa chang formal text ...,2021-08-02
1273,1273,Ensemble Classification: A Brief Overview With Examples | by Pranav Thaenraj | Towards Data Science,Pranav Thaenraj,https://towardsdatascience.com/ensemble-classification-a-brief-overview-with-examples-3dac25613073?source=collection_archive---------25-----------------------,toward data scienc aug save note articl final articl seri articl regard classif transport poi data first articl look use variou machin learn model classifi record airport stop train station second...,2021-08-02


In [36]:
# df_test

In [5]:
d_res#[1417]

{526: ['bin', 'featur', 'dataset', 'column', 'select'],
 354: ['augment', 'imag', 'star', 'librari', 'github'],
 168: ['imag', 'train', 'pcb', 'model', 'folder'],
 135: ['surrog', 'distribut', 'elbo', 'model', 'flow'],
 937: ['vertex', 'model', 'imag', 'train', 'automl'],
 1544: ['imag', 'train', 'pcb', 'model', 'folder'],
 1253: ['price', 'elast', 'product', 'competitor', 'cross'],
 237: ['mito', 'data', 'column', 'new', 'pivot'],
 478: ['perceptilab', 'model', 'layer', 'graphic', 'learn'],
 650: ['label', 'snorkel', 'model', 'weak', 'gener'],
 514: ['augment', 'imag', 'star', 'librari', 'github'],
 1103: ['summar', 'ctrlsum', 'text', 'summari', 'queri'],
 968: ['imag', 'train', 'pcb', 'model', 'folder'],
 1569: ['queri', 'field', 'elasticsearch', 'document', 'movi'],
 584: ['imag', 'train', 'pcb', 'model', 'folder'],
 65: ['queri', 'field', 'elasticsearch', 'document', 'movi'],
 30: ['perceptilab', 'model', 'layer', 'graphic', 'learn'],
 904: ['imag', 'train', 'pcb', 'model', 'folder