In [1]:
pip install keybert transformers bertopic multi_rake spacy gensim pyLDAvis wordcloud TextBlob vaderSentiment

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!python -m spacy download en_core_web_lg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)
[K     |████████████████████████████████| 827.9 MB 1.3 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [3]:
import transformers
import pandas as pd
import re

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
data = pd.read_csv('/content/drive/MyDrive/RedditFooddata.csv')
#data = pd.read_csv('/content/drive/MyDrive/RedditEconomicdata.csv')

In [6]:
documents = []
for i in range(len(data)):
    text = data['MsgBody'][i]
    if len(text) > 4: # exclude comments with five or less than five words
        documents.append(re.sub("[^a-zA-Z]+ |[^ ]+\.[^ ]+ | [^ ]+/.[^ ]+ ", "", text))


In [7]:
comment_str = ''
for doc in documents:
  comment_str += doc + ' '

In [8]:
from transformers import pipeline


In [9]:
summarizer = pipeline("summarization")
summarized = summarizer(comment_str[:1000])

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 (https://huggingface.co/sshleifer/distilbart-cnn-12-6)


In [10]:
print(summarized)

[{'summary_text': ' A truly disturbing amount of mayo I miss Orange County Asian food . This is a *brilliantlydefensive title . I like it black and blue once in a while . I prefer ribeyes mediumishCooked toand rested 10min under foil under foil .'}]


In [11]:
txtsentiment = pipeline("sentiment-analysis", truncation=True)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


In [12]:
i = 0
for doc in documents[:50]:
  print(i, txtsentiment(doc))
  i += 1 

0 [{'label': 'POSITIVE', 'score': 0.9990378618240356}]
1 [{'label': 'POSITIVE', 'score': 0.9923766851425171}]
2 [{'label': 'NEGATIVE', 'score': 0.9974379539489746}]
3 [{'label': 'POSITIVE', 'score': 0.9922562837600708}]
4 [{'label': 'NEGATIVE', 'score': 0.998417854309082}]
5 [{'label': 'NEGATIVE', 'score': 0.9721264243125916}]
6 [{'label': 'POSITIVE', 'score': 0.9793857336044312}]
7 [{'label': 'NEGATIVE', 'score': 0.9987534284591675}]
8 [{'label': 'NEGATIVE', 'score': 0.9996272325515747}]
9 [{'label': 'NEGATIVE', 'score': 0.9919641017913818}]
10 [{'label': 'NEGATIVE', 'score': 0.9536261558532715}]
11 [{'label': 'POSITIVE', 'score': 0.9991777539253235}]
12 [{'label': 'NEGATIVE', 'score': 0.992902934551239}]
13 [{'label': 'NEGATIVE', 'score': 0.9843872785568237}]
14 [{'label': 'NEGATIVE', 'score': 0.9979062080383301}]
15 [{'label': 'NEGATIVE', 'score': 0.9982938170433044}]
16 [{'label': 'NEGATIVE', 'score': 0.9982938170433044}]
17 [{'label': 'NEGATIVE', 'score': 0.9996563196182251}]
18 [

In [13]:
txtclsfcn_pipe = pipeline("text-classification")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


In [14]:
for doc in documents[:50]:
  print(txtclsfcn_pipe(doc))

[{'label': 'POSITIVE', 'score': 0.9990378618240356}]
[{'label': 'POSITIVE', 'score': 0.9923766851425171}]
[{'label': 'NEGATIVE', 'score': 0.9974379539489746}]
[{'label': 'POSITIVE', 'score': 0.9922562837600708}]
[{'label': 'NEGATIVE', 'score': 0.998417854309082}]
[{'label': 'NEGATIVE', 'score': 0.9721264243125916}]
[{'label': 'POSITIVE', 'score': 0.9793857336044312}]
[{'label': 'NEGATIVE', 'score': 0.9987534284591675}]
[{'label': 'NEGATIVE', 'score': 0.9996272325515747}]
[{'label': 'NEGATIVE', 'score': 0.9919641017913818}]
[{'label': 'NEGATIVE', 'score': 0.9536261558532715}]
[{'label': 'POSITIVE', 'score': 0.9991777539253235}]
[{'label': 'NEGATIVE', 'score': 0.992902934551239}]
[{'label': 'NEGATIVE', 'score': 0.9843872785568237}]
[{'label': 'NEGATIVE', 'score': 0.9979062080383301}]
[{'label': 'NEGATIVE', 'score': 0.9982938170433044}]
[{'label': 'NEGATIVE', 'score': 0.9982938170433044}]
[{'label': 'NEGATIVE', 'score': 0.9996563196182251}]
[{'label': 'POSITIVE', 'score': 0.99905997514724

In [15]:
print(txtclsfcn_pipe(comment_str[:512]))

[{'label': 'NEGATIVE', 'score': 0.9853014349937439}]


In [16]:
from keybert import KeyBERT

kw_model = KeyBERT()
keywords = kw_model.extract_keywords(comment_str)

In [17]:
print(keywords)

[('meatscooked', 0.46), ('mediumishcooked', 0.4582), ('fooddelicious', 0.4516), ('sauceprecooked', 0.4423), ('tastydefinitely', 0.435)]


In [18]:
kw_model.extract_keywords(comment_str, keyphrase_ngram_range=(1, 1), stop_words=None)

[('meatscooked', 0.46),
 ('mediumishcooked', 0.4582),
 ('fooddelicious', 0.4516),
 ('sauceprecooked', 0.4423),
 ('tastydefinitely', 0.435)]

In [19]:
kw_model.extract_keywords(comment_str, keyphrase_ngram_range=(1, 2), stop_words=None)

[('ribeyes mediumishcooked', 0.5231),
 ('food colour', 0.5026),
 ('fried ribeye', 0.4988),
 ('food colouring', 0.4928),
 ('rib meat', 0.491)]

In [20]:
from bertopic import BERTopic

In [21]:
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(documents)



In [22]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,2542,-1_it_and_that_the
1,0,259,0_canada_drive_ontario_trip
2,1,124,1_looks_amazing_good_that
3,2,104,2_removed___
4,3,101,3_poutine_quebec_fries_cheese
...,...,...,...
147,146,11,146_weather_comforting_lifeyou_snowed
148,147,11,147_imgemotet52qh551792_imgemotet52qh551792loo...
149,148,11,148_syrup_youreal_diverse_brussel
150,149,11,149_thank_courage_indeed_above


In [23]:
topic_model.get_topic(0)


[('canada', 0.02120557602063367),
 ('drive', 0.020837627559547312),
 ('ontario', 0.019746389300418537),
 ('trip', 0.01630906542598834),
 ('place', 0.013395472071464302),
 ('from', 0.013262444102394381),
 ('away', 0.011845331877736585),
 ('toronto', 0.011638089792722716),
 ('live', 0.011303668590657826),
 ('detroit', 0.011059103294083472)]

In [24]:
topic_model.visualize_topics()

In [25]:
topic_model.visualize_barchart()

In [26]:
from multi_rake import Rake
rake = Rake()
keywords = rake.apply(comment_str)
print(keywords[:10])


[('barleythis interests me😋', 9.0), ('blah blah blah', 9.0), ('variable flameive fucked', 9.0), ('bitch auntie helen', 9.0), ('mom’s bún mọci’ll', 9.0), ('incredibly dryno sauceshame', 9.0), ('paranoid delusional fantasy', 9.0), ('sheltered theocratic scum', 9.0), ('ambient tempphotoshoot foodsee', 9.0), ('theirplatter xburgers xsubtotaltaxtipit', 9.0)]


In [27]:
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [28]:
lemmatizer = WordNetLemmatizer()

In [29]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
print(stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
{'off', 'its', 'before', 'now', 'yourself', "hadn't", 'll', 'were', 'been', 'own', 'will', "you'd", 'themselves', 'wasn', 'any', 'your', 'such', 'why', 'while', 'down', 'only', 'these', 'between', "don't", "needn't", 'against', 'out', 'over', 'yours', 'after', 't', 'me', "you've", 'very', 'who', 'we', 'which', 'but', 'because', 'that', 'each', 'i', "aren't", 'my', 'further', "won't", 'during', 'them', 'myself', 'more', "shouldn't", 'so', 'having', 'than', 'itself', 'can', 'as', 'him', 'the', "isn't", 'it', 'weren', 'am', 'nor', 'where', 'yourselves', 'm', 'do', 'into', 'for', 'had', 'up', 'our', 'being', 'does', "hasn't", 'd', 'was', 'until', "weren't", 'from', 'below', 's', 'haven', 'there', 'is', 'same', 'hadn', "haven't", 'doesn', "wouldn't", 'needn', "mustn't", 'an', 'if', 'ours', 'some', 'at', 'you', "mightn't", 'he', 'just', 'above', 'too', 'has', 'aren', 'mustn'

In [30]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [31]:

def text_tokenize(texttoken):
    txt = texttoken
    txt = txt.lower()
    tokens = nltk.tokenize.word_tokenize(txt)
    tokens = [t for t in tokens if len(t) > 2]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    tokens = [t for t in tokens if t not in stop_words]
    return tokens

In [32]:
tfidf_doc = []
for doc in documents:
  tfidf_doc.append(doc)

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
tfidf_vec = vectorizer.fit_transform(tfidf_doc)

In [34]:
vectorizer.get_feature_names_out()

array(['0325', '05', '0jncztdhu8o', ..., 'çok', 'الكنافة', '油条'],
      dtype=object)

In [35]:
print(pd.DataFrame(tfidf_vec.toarray()))

      0      1      2      3      4      5      6      7      8      9      \
0       0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
1       0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
2       0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
3       0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
4       0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
...     ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
7519    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
7520    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
7521    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
7522    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
7523    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   

      ...  14531  14532  14533  14534  14535  14536  14537  145

In [36]:
import spacy
from gensim import corpora
from gensim import models
import pyLDAvis
import pyLDAvis.gensim_models

import warnings
warnings.filterwarnings('ignore')


Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated since Python 3.3,and in 3.9 it will stop working



In [37]:
nlp = spacy.load('en_core_web_lg')


In [38]:
doc_spacy_tokens = []
for document in documents: 
    doc_tokens = []
    for token in nlp(document):
        if((token.is_stop is False) or (len(token) > 2) or (nlp.vocab.has_vector(str(token)))):
          doc_tokens.append(str(token.lemma_))
    doc_spacy_tokens.append(doc_tokens)

In [39]:
doc_dictionary = corpora.Dictionary(doc_spacy_tokens)
doc_corpus = [doc_dictionary.doc2bow(sentence) for sentence in doc_spacy_tokens] 
tfidf = models.TfidfModel(doc_corpus)
doc_corpus_tfidf = tfidf[doc_corpus]

In [40]:
no_topics = 10
lda_model = models.ldamodel.LdaModel(doc_corpus_tfidf, id2word=doc_dictionary, num_topics=no_topics)


In [41]:
for topic in lda_model.show_topics(num_topics=no_topics, num_words=10):
    print(topic) 

for topic_proportion in lda_model[doc_corpus_tfidf[0]]: 
    print(topic_proportion)


(0, '0.014*"thank" + 0.012*"-PRON-" + 0.008*"be" + 0.008*"the" + 0.008*"!" + 0.006*"not" + 0.006*"a" + 0.006*"." + 0.006*"do" + 0.006*"to"')
(1, '0.016*"/" + 0.014*"!" + 0.011*")" + 0.009*"r" + 0.008*"[" + 0.007*"belgian" + 0.005*"-PRON-" + 0.005*"😋" + 0.005*"be" + 0.005*"mildlypenis"')
(2, '0.006*"waffle" + 0.005*"-PRON-" + 0.004*"." + 0.004*"look" + 0.004*"in" + 0.004*"perfection" + 0.003*"be" + 0.003*"that" + 0.003*"the" + 0.003*"a"')
(3, '0.007*"-PRON-" + 0.007*"a" + 0.007*"be" + 0.007*"the" + 0.006*"sound" + 0.006*"." + 0.006*"and" + 0.005*"of" + 0.005*"cookie" + 0.004*"to"')
(4, '0.013*"-PRON-" + 0.012*"?" + 0.010*"look" + 0.009*"the" + 0.009*"be" + 0.008*"that" + 0.008*"do" + 0.007*"!" + 0.007*"a" + 0.007*"good"')
(5, '0.011*"*" + 0.005*"be" + 0.005*":)" + 0.005*"too" + 0.005*"-PRON-" + 0.004*"look" + 0.004*"." + 0.004*"incredible" + 0.003*"Blaukraut" + 0.003*"😂"')
(6, '0.010*"look" + 0.009*"great" + 0.009*"!" + 0.008*"where" + 0.006*"awesome" + 0.006*"yummy" + 0.006*"?" + 0.005

In [42]:
lda_model.print_topics(num_topics=no_topics, num_words=10)

[(0,
  '0.014*"thank" + 0.012*"-PRON-" + 0.008*"be" + 0.008*"the" + 0.008*"!" + 0.006*"not" + 0.006*"a" + 0.006*"." + 0.006*"do" + 0.006*"to"'),
 (1,
  '0.016*"/" + 0.014*"!" + 0.011*")" + 0.009*"r" + 0.008*"[" + 0.007*"belgian" + 0.005*"-PRON-" + 0.005*"😋" + 0.005*"be" + 0.005*"mildlypenis"'),
 (2,
  '0.006*"waffle" + 0.005*"-PRON-" + 0.004*"." + 0.004*"look" + 0.004*"in" + 0.004*"perfection" + 0.003*"be" + 0.003*"that" + 0.003*"the" + 0.003*"a"'),
 (3,
  '0.007*"-PRON-" + 0.007*"a" + 0.007*"be" + 0.007*"the" + 0.006*"sound" + 0.006*"." + 0.006*"and" + 0.005*"of" + 0.005*"cookie" + 0.004*"to"'),
 (4,
  '0.013*"-PRON-" + 0.012*"?" + 0.010*"look" + 0.009*"the" + 0.009*"be" + 0.008*"that" + 0.008*"do" + 0.007*"!" + 0.007*"a" + 0.007*"good"'),
 (5,
  '0.011*"*" + 0.005*"be" + 0.005*":)" + 0.005*"too" + 0.005*"-PRON-" + 0.004*"look" + 0.004*"." + 0.004*"incredible" + 0.003*"Blaukraut" + 0.003*"😂"'),
 (6,
  '0.010*"look" + 0.009*"great" + 0.009*"!" + 0.008*"where" + 0.006*"awesome" + 0.006*

In [43]:
lda_visualization = pyLDAvis.gensim_models.prepare(lda_model, doc_corpus_tfidf, doc_dictionary)
pyLDAvis.save_html(lda_visualization, '/content/drive/MyDrive/redditeconomics_lda_result.html')


In [44]:
from gensim.models import CoherenceModel
coherence_model_lda = CoherenceModel(model=lda_model, texts=doc_spacy_tokens, dictionary=doc_dictionary, coherence='c_v') # by changing the values for the "coherence" parameter, you can use different coherence methods (e.g., 'u_mass', 'c_v', 'c_uci', 'c_npmi')
coherence_lda = coherence_model_lda.get_coherence()
print(coherence_lda)

0.3865748720432256


In [45]:
top_topics = lda_model.top_topics(doc_corpus) 
avg_topic_coherence = sum([t[1] for t in top_topics]) / no_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

Average topic coherence: -5.1202.


In [46]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [47]:
# Extract words from each top topic for wordlcoud
topic_words = ''
for i in range(no_topics):
    for j in range(10):
        topic_words += top_topics[i][0][j][1] + ' '

In [None]:
wordcloud = WordCloud(background_color ='white').generate(topic_words)                      
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()

In [49]:
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [50]:
words = TextBlob(topic_words)
print(words.sentiment)

Sentiment(polarity=0.7642857142857143, subjectivity=0.7928571428571429)


In [51]:
analyzer = SentimentIntensityAnalyzer()
print(analyzer.polarity_scores(topic_words))

{'neg': 0.014, 'neu': 0.702, 'pos': 0.284, 'compound': 0.9893}


In [52]:
for document in documents[:50]:
  doc = nlp(document)
  for token in doc:
    print(token.text, token.pos_, token.tag_, token.is_stop)

This DET DT True
is AUX VBZ True
a DET DT True
* PUNCT NFP False
brilliantlydefensive ADJ JJ False
title NOUN NN False
. PUNCT . False
I PRON PRP True
like VERB VBP False
it PRON PRP True
black ADJ JJ False
and CCONJ CC True
blue ADJ JJ False
once ADV RB True
in ADP IN True
a DET DT True
whileNot PROPN NNP False
so ADV RB True
much ADV RB True
a DET DT True
matter NOUN NN False
of ADP IN True
taste NOUN NN False
but CCONJ CC True
a DET DT True
matter NOUN NN False
of ADP IN True
mood NOUN NN False
for ADP IN True
me PRON PRP True
. PUNCT . False
That SCONJ IN True
what PRON WP True
I PRON PRP True
was AUX VBD True
thinking VERB VBG False
how ADV WRB True
long ADV RB False
per ADP IN True
sideminutes NOUN NNS False
? PUNCT . False
Did AUX VBD True
you PRON PRP True
taste VERB VB False
it PRON PRP True
? PUNCT . False
Fucking INTJ UH False
A NOUN NN True
! PUNCT . False
A DET DT True
truly ADV RB False
disturbing ADJ JJ False
amount NOUN NN True
of ADP IN True
mayo NOUN NN False
I PRON P

In [53]:
noun_tokens = []
doc = nlp(comment_str)
for token in doc:
  if(token.pos_ == 'NOUN'):
    noun_tokens.append(token.text)

In [54]:
txtsentiment = pipeline("sentiment-analysis", truncation=True)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


In [55]:
noun_str = ' '.join(noun_tokens)
noun_str

'title matter taste matter mood sideminutes A amount mayo food mess time charcoal fire time day mall guy cut ribs beTheres comparison apples oranges beef food taste meat Tax tomin side ribeyes toand foil reason bowl maggots isgood taste opinion isgood taste opinion center peanut butter cup sauce syrup strawberries waffles ketchup Shit comment reddit comment complaintCarbs ninja watermelon kitchenthis dish fusion didcups rice cup half water things so clearAt store sausage cheese muffins bacon baguette person styles name eat stuff pig SeAL Sausage bacon eggs phoGarlic lemon grass cilantro plenty cilantro dishes garnish bean sprouts basil broth cooking fish sauce ginger cardamom version upvotes cocktopus turkey recipe rancheros size burgers burrito duckQuackapus duck bun thing day dinner days huevos friend steak ribeye thing potatos waffles hash sausage link fruit cafe menu item season hemisphere plate garlic parsley pepper corn mustard pepper salt ground rib burgerChopped arugula Emoji s

In [56]:
print(txtsentiment(noun_str))

[{'label': 'NEGATIVE', 'score': 0.9966613054275513}]


In [57]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1,2),stop_words='english')
tfidf_vec = vectorizer.fit([comment_str])

In [58]:
tfidf_vectors = vectorizer.get_feature_names_out()

In [59]:
tfidf_vectors[:100]

array(['0325', '0325 ds_c', '05', '05 17', '0jncztdhu8o',
       '0jncztdhu8o fbclid', '0roper', '0roper good', '10', '10 10',
       '10 im', '10 lot', '10 love', '10 pizza', '10 state', '10 using',
       '10 whats', '100', '100 notice', '1000060068',
       '1000060068 defaultskuid', '100ml', '100ml dry', '105g',
       '105g heavy', '10k', '10k calories', '10min', '10min foil', '10oz',
       '10oz beefroughlyherecourse', '10pizza', '10pizza box', '10th',
       '10th dentist', '10x', '10x percentage', '11', '11 30pmi',
       '11 https', '115g', '115g melted', '11am', '11am 1pm', '11yrs',
       '11yrs people', '12', '12 15min', '12 count', '12hour',
       '12hour flight', '13', '13 99', '1385', '1385 format', '13a',
       '13a tierhunt', '13am', '13am looks', '140g', '140g sugar', '145g',
       '145g corn', '15', '15 im', '15 laptop', '15 wow', '150c',
       '150c fanpour', '150g', '150g dark', '150k', '150k smokers', '155',
       '155 seasoned', '15g', '15g butter', '15g li

In [60]:
# Code below is used to build a custom model for keyword extraction using 
# spacy Noun words and distilbert embedding for them
# then finding cosine similarity. 
# This is widely a quoted method from BERTopic model. Link already given
# This code is NOT COMPLETE!
from transformers import AutoModel, AutoTokenizer
model_name = "distilbert-base-uncased"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
# This code fails in Colab due to large embedding
main_tokens = tokenizer(noun_tokens, padding=True, return_tensors="pt")
mainword_embeddings = model(**main_tokens)["pooler_output"]
