In [39]:
import os
from pathlib import Path
import json
import spacy
import pandas as pd
from spacy_download import load_spacy
from sklearn.feature_extraction.text import TfidfVectorizer


# Will download the model if it isn't installed yet
nlp = load_spacy("en_core_web_lg")  

In [40]:
df = pd.read_json('./cleaned_kaggle_data.json', encoding='utf-8', lines=True)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date,cleaned_text,text
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23,4 million Americans roll sleeve Omicron target...,Over 4 Million Americans Roll Up Sleeves For O...
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23,American Airlines Flyer charge ban life Punchi...,"American Airlines Flyer Charged, Banned For Li..."
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23,23 Funniest tweet Cats Dogs Week Sept. 17 23 d...,23 Of The Funniest Tweets About Cats And Dogs ...
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23,Funniest Tweets parent Week Sept. 17 23 accide...,The Funniest Tweets From Parents This Week (Se...
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22,woman call cop Black Bird Watcher lose Lawsuit...,Woman Who Called Cops On Black Bird-Watcher Lo...


In [41]:
categories = df.groupby('category')
group_names = list(categories.groups.keys())
print(group_names)

['ARTS & CULTURE', 'BLACK VOICES', 'BUSINESS', 'COLLEGE', 'COMEDY', 'CRIME', 'DIVORCE', 'EDUCATION', 'ENTERTAINMENT', 'ENVIRONMENT', 'FIFTY', 'FOOD & DRINK', 'GOOD NEWS', 'GREEN', 'HEALTHY LIVING', 'HOME & LIVING', 'IMPACT', 'LATINO VOICES', 'MEDIA', 'MONEY', 'PARENTING', 'PARENTS', 'POLITICS', 'QUEER VOICES', 'RELIGION', 'SCIENCE', 'SPORTS', 'STYLE & BEAUTY', 'TASTE', 'TECH', 'TRAVEL', 'U.S. NEWS', 'WEDDINGS', 'WEIRD NEWS', 'WELLNESS', 'WOMEN', 'WORLD NEWS', 'WORLDPOST']


In [42]:
vectorizer = TfidfVectorizer()

In [43]:
word_dictionary = {}

In [51]:
for group in group_names:
    category_df = df[df['category'] == group]
    category_df['text'] = [" ".join([token.text.lower() for token in nlp(doc) if not token.is_stop and not token.is_punct]) for doc in category_df['text'].values]
    print(category_df['text'].values)

    tfidf_matrix = vectorizer.fit_transform(category_df['text'].values)
    feature_names = vectorizer.get_feature_names_out()

    sums = tfidf_matrix.sum(axis=0)

    # Connecting term to its sums frequency
    data = []
    for col, term in enumerate(feature_names):
        data.append((term, sums[0, col]))

    ranking = sorted(data, key=lambda x: x[1], reverse=True)
    top_15_words = ranking[:15]

    word_dictionary[group] = top_15_words

    print(f'Top 15 words by TF-IDF for category {group}:')
    for word, score in top_15_words:
        print(f"{word}: {score:.4f}")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_df['text'] = [" ".join([token.text.lower() for token in nlp(doc) if not token.is_stop and not token.is_punct]) for doc in category_df['text'].values]


['new documentary captures complexity child immigrants mija director isabel castro combined music documentaries style euphoria clueless tell nuanced immigration story'
 'reboot clever navel gazey look inside tv reboots starring keegan michael key judy greer johnny knoxville hulu follows revival fictional early 2000s sitcom'
 'meet alex aster tiktoker changing publishing industry better colombian american author new book lightlark rejected numerous times went directly readers support'
 ...
 'think look chemical brothers concert film set hit theaters amid cheers occasional theater speakers duo danced songs eventually'
 'matthew marks discusses new la gallery obvious choice recruit ellsworth kelly inaugural exhibition 15th solo matthew'
 'allard van hoorn urban songline explores relationship sound space materials photos video recent exhibition storefront art architecture shifting connection song space nomadic']
Top 15 words by TF-IDF for category ARTS & CULTURE:
art: 83.6395
new: 52.8132


In [52]:
word_dictionary

{'group': ['gülen',
  'même',
  'aécio',
  'césar',
  'déjà',
  'détente',
  'génération',
  'médecins',
  'mélenchon',
  'méxico',
  'méxicoleaks',
  'république',
  'são',
  'cárdenas',
  'márquez'],
 'ARTS & CULTURE': [('art', 83.63949069818536),
  ('new', 52.8131740526297),
  ('artist', 50.021757938486616),
  ('photos', 43.58640330420436),
  ('world', 38.931196533094536),
  ('artists', 32.59012658923796),
  ('women', 30.717697785915956),
  ('like', 29.874106767424884),
  ('book', 29.68123912206789),
  ('life', 26.83242427446607),
  ('year', 25.863769551404683),
  ('time', 25.728323701579875),
  ('work', 24.99027747511592),
  ('people', 24.96772855340489),
  ('trump', 23.701004428356132)],
 'BLACK VOICES': [('black', 140.790856353749),
  ('new', 56.77524408482455),
  ('police', 55.58627806040897),
  ('people', 49.17559274356935),
  ('white', 44.52010607734269),
  ('said', 41.39137219085332),
  ('women', 40.241070238711416),
  ('year', 40.119275638793894),
  ('says', 36.3343623810646

In [53]:
with open('./top_15_per_category.json', 'w', encoding='utf-8') as file:
    json.dump(word_dictionary, file, indent=4)