In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from collections import Counter


In [68]:
corrections = [
    'The Internal Revenue Service will kick off the approaching tax season with a backlog of at least 10 million unprocessed returns from last year, according to a new report by the National Taxpayer Advocate.',
    'The pile of returns remaining are from the “most challenging year taxpayers and tax professionals have ever experienced,” the advocate, Erin M. Collins, wrote in her report.',
    'One reason for the backlog: Stimulus payments to the people from the government during the pandemic have been largely routed through the I.R.S., which Ms. Collins said was already short staffed.',
    'The vast majority of taxpayers — 77 percent — received refunds in 2021, but tens of millions of them experienced delays. Ms. Collins called last year’s situation “horrendous” from the standpoint of taxpayers. Although the backlog is not too different from last season’s, it is far higher than the backlog the I.R.S. typically faced before the pandemic began.']

corruptions = [
    'The latest Covid-19 wave has left millions of Americans scrambling for tests, braving long lines in the cold at pop-up sites or searching furiously online for kits to use at home. But for a select group of employees at some of the country’s largest companies, tests are free and often readily available.',
    'Without an adequate federal system for developing and distributing rapid tests, companies have put their own testing services in place.',
    'Google will send full-time employees in the United States free at-home tests that deliver results within minutes and retail for more than $70 each. BlackRock, an investment firm that manages nearly $10 trillion in assets, offers tele-health supervision as employees self-administer rapid tests for international travel. At JPMorgan Chase, bankers, including those at its retail sites, can order at-home rapid tests from an internal company site.',
    'Some companies are using the tests to call their staff back to the office. For others, at-home Covid testing has become the newest wellness benefit, a perk to keep employees healthy and working — even from their couches — while providing peace of mind.']

all_corrections = ' '.join(corrections) 
all_corruptions = ' '.join(corruptions)

In [87]:
vectorizer = TfidfVectorizer(use_idf=True)
vectorizer.fit_transform([all_corrections, all_corruptions])
feats = np.array(vectorizer.get_feature_names())

In [94]:
transformed_corrections = np.array(vectorizer.transform([all_corrections]).todense())[0]
transformed_corruptions = np.array(vectorizer.transform([all_corruptions]).todense())[0]

In [102]:
# most frequest correction words 
feats[np.argsort(transformed_corrections)][::-1][:50]

array(['the', 'of', 'backlog', 'from', 'last', 'taxpayers', 'collins',
       'year', 'report', 'season', 'advocate', 'ms', 'experienced',
       'pandemic', 'is', 'tax', 'returns', 'have', 'to', 'in', 'which',
       'received', 'refunds', 'remaining', 'routed', 'revenue', 'said',
       'horrendous', 'higher', 'her', 'reason', 'largely', 'it',
       'million', 'not', 'new', 'one', 'national', 'most', 'wrote',
       'payments', 'people', 'kick', 'percent', 'majority', 'pile',
       'least', '2021', 'professionals', 'service'], dtype='<U13')

In [99]:
# most frequent corruption words 
feats[np.argsort(transformed_corruptions)][::-1] 

array(['tests', 'at', 'for', 'the', 'employees', 'home', 'companies',
       'an', 'their', 'rapid', 'in', 'of', 'to', 'and', 'sites', 'retail',
       'testing', 'free', 'some', 'has', 'covid', 'that', 'from', 'are',
       'order', 'others', 'or', 'minutes', 'peace', 'own', 'firm',
       'federal', 'even', 'perk', 'place', 'pop', 'each', 'providing',
       'put', 'readily', 'online', 'furiously', 'often', 'manages',
       'jpmorgan', 'keep', 'international', 'kits', 'largest',
       'including', 'latest', 'left', 'lines', 'long', 'healthy',
       'office', 'health', 'mind', 'more', 'group', 'google', 'nearly',
       'developing', 'newest', 'full', 'offers', 'distributing',
       'company', 'deliver', 'using', 'available', 'assets', 'those',
       'as', 'time', 'travel', 'trillion', 'united', 'up', 'use',
       'americans', 'country', 'administer', 'wave', 'wellness',
       'adequate', 'while', '70', '19', 'within', 'without', 'working',
       'tele', 'back', 'bankers', 'be