In [21]:
import numpy as np
import pandas as pd
import pickle
import gensim
from nltk.tokenize import regexp_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

### Notes on Model Refinement
This code is similar to that in the MS_Prediction notebook, but it contains an extra function (check_topics) designed to 
check the LDA output for a test query. If the primary LDA topic maps to an empty string (which only happens when no consistent theme could
be detected in the training data, so no name was given), then the secondary topics, if there are any, are examined. If any of them is associated 
with a named topic, the first such topic is promoted to the primary position.

In [258]:
wnl = WordNetLemmatizer()

def func_lemmatize(words):
    """Lemmatize each word using its part-of-speech tag where one applies.

    Every word is tagged with ``pos_tag``; the lower-cased first letter of the
    tag is passed to the WordNet lemmatizer when it is one of 'a', 'r', 'n' or
    'v'. Words with any other tag are returned unchanged.

    Returns a list with one entry per input word, in the same order.
    """
    # NOTE(review): pos_tag appears to emit Penn Treebank tags, in which
    # adjectives start with 'J', so the 'a' entry probably never matches and
    # adjectives pass through unlemmatized. Confirm before changing -- the
    # trained model may have been built with this exact preprocessing.
    def _lemma_for(token, tag):
        pos = tag[0].lower()
        if pos in ['a','r','n','v']:
            return wnl.lemmatize(token, pos)
        return token

    return [_lemma_for(token, tag) for token, tag in pos_tag(words)]

stop = set(stopwords.words('english'))

pattern = r'(\w+)'


def clean_query(query):
    """Turn a raw text query into a list of cleaned, lower-case tokens.

    Steps: tokenize with ``pattern``, keep purely alphabetic tokens,
    lemmatize with ``func_lemmatize``, lower-case, then drop stop words.

    Parameters
    ----------
    query : str
        Free text to be classified.

    Returns
    -------
    list of str
        Lower-cased, lemmatized tokens with English stop words removed.
    """
    tokenized = regexp_tokenize(query,pattern)
    indiv_words = [word for word in tokenized if word.isalpha()]
    # Lemmatize before lower-casing so the tagger still sees the original case.
    lemmatized = func_lemmatize(indiv_words)
    # Lower-case BEFORE the stop-word test: the entries in `stop` are compared
    # case-sensitively, so testing the original-case token let capitalised stop
    # words (e.g. "The" at the start of a sentence) through as "the".
    # NOTE(review): if the training corpus was cleaned with the old ordering,
    # confirm this does not create a train/inference mismatch.
    lowered = (word.lower() for word in lemmatized)
    words = [word for word in lowered if word not in stop]

    return words

    
def check_topics(topics_dict,primary_topic,secondary_topics):
    """Promote the first named secondary topic when the primary topic is unnamed.

    Parameters
    ----------
    topics_dict : mapping
        Maps topic ids to topic names; an empty name means "unnamed".
    primary_topic : hashable
        Id of the topic currently ranked first.
    secondary_topics : list
        Ids of the remaining candidate topics, in rank order. Not modified.

    Returns
    -------
    tuple
        ``(primary_topic, secondary_topics)``. If the primary topic is unnamed
        and a named secondary topic exists, the first such topic becomes the
        primary and is removed from the returned secondary list; the unnamed
        former primary is dropped. Otherwise the inputs are returned unchanged
        (the list as a copy).

    Raises
    ------
    KeyError
        If a topic id is missing from ``topics_dict``.
    """
    # Work on a copy so the caller's list is never mutated.
    remaining = list(secondary_topics)

    # A named primary topic needs no adjustment.
    if topics_dict[primary_topic]:
        return primary_topic, remaining

    for position, candidate in enumerate(remaining):
        if topics_dict[candidate]:
            # First named secondary topic wins; take it out of the secondaries.
            del remaining[position]
            return candidate, remaining

    # No named alternative exists (or there were no secondaries at all).
    return primary_topic, remaining

 
def topic_predict(query, threshold=0.5):
    """Print the primary topic, and any secondary topics, predicted for a query.

    Almost the same as the topic_prediction function defined elsewhere (see the
    MS_Prediction script). The only difference here is that the output is run
    through ``check_topics`` so that a named secondary topic is promoted when
    the primary topic has no name.

    Parameters
    ----------
    query : str
        Free text to classify.
    threshold : float, optional
        A topic qualifies as secondary when its score divided by the top score
        exceeds this value. Defaults to 0.5.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    tokenized_input = clean_query(query)

    corpus = id2word.doc2bow(tokenized_input)

    # NOTE(review): presumably seeds NumPy's global RNG so that repeated
    # predictions are repeatable; this only matters if the model draws from
    # the global generator -- TODO confirm.
    np.random.seed(4)
    output = list(lda_model[corpus])

    # The model can return no (topic, score) pairs at all, e.g. for an empty
    # query or one with no known words; previously this raised IndexError.
    if not output:
        print('primary topic: (none detected)')
        return

    ordered = sorted(output,key=lambda x:x[1],reverse=True)
    # Determine primary topic
    top_topic, top_score = ordered[0][0], ordered[0][1]
    # Assign secondary topics: those scoring close enough to the top topic
    secondary_topics = [pair[0] for pair in ordered[1:] if pair[1] / top_score > threshold]

    # Check LDA output and try to bump any non-empty-string 2ndary topics to 1ary if necessary
    primary_topic, secondary_topics = check_topics(topics_dict,top_topic,secondary_topics)

    print(f'primary topic: {topics_dict[primary_topic]}')

    if secondary_topics:
        print('-' * 10, '\n', 'other topics:')
        for topic in secondary_topics:
            print(topics_dict[topic])

In [3]:
# Load the per-record table; its 'lda_topic' column is used further down to
# inspect the records assigned to a given topic.
df_ht = pd.read_csv('models/14passes_265_topics_df_ht_lda.csv')

In [291]:
# Load the mapping of topic id -> topic name.
# NOTE: pickle can execute arbitrary code while loading; only load trusted files.
filename = 'models/topics_dict.pkl'
# Use a context manager so the file handle is closed after loading.
with open(filename,'rb') as topics_file:
    topics_dict = pickle.load(topics_file)

In [292]:
topics_dict

{0: 'mexican telecommunications',
 1: 'education & student life',
 2: 'tfr',
 3: 'intl big business',
 4: 'drugs, clinical trials, approvals',
 5: 'robots & robotics',
 6: 'indian tech & business',
 7: 'meat substitutes',
 8: 'fish',
 9: 'israeli tech & business',
 10: 'tech m&a',
 11: '11',
 12: 'vacation rental',
 13: 'laptops, mobile devices, gadgets',
 14: 'audio tech',
 15: 'climate science',
 16: 'intl govt relations',
 17: 'digital advertising',
 18: 'tfr',
 19: 'marijuana & CBD',
 20: 'ridesharing services e-scooters & e-bikes',
 21: 'tfr',
 22: 'wireless charging technology',
 23: 'diversity & discrimination',
 24: 'basketball',
 25: 'mass transit',
 26: 'e-commerce & online delivery',
 27: 'european alternative energy',
 28: 'menstrual health',
 29: 'cancer treatments & trials',
 30: 'tfr',
 31: 'apple devices',
 32: 'motor racing',
 33: 'food poisoning, allergies, household chemicals',
 34: 'health insurance & washington legislation',
 35: 'asian business',
 36: 'astrophysic

In [13]:
# Load the saved gensim LDA model that topic_predict queries.
lda_model = gensim.models.ldamodel.LdaModel.load('models/14passes_265_topics_ldamodel_ht') 

In [14]:
# Load the dictionary that topic_predict uses (via doc2bow) to turn tokens
# into a bag-of-words.
# NOTE: pickle can execute arbitrary code while loading; only load trusted files.
filename = 'models/14passes_265_topics_ldamodel_ht.id2word'
# Use a context manager so the file handle is closed after loading.
with open(filename,'rb') as file:
    id2word = pickle.load(file)

In [146]:
# Ids of every topic whose name is exactly 'tfr'; the cell displays how many there are.
tfr_keys = [topic_id for topic_id, name in topics_dict.items() if name == 'tfr']
len(tfr_keys)

55

In [293]:
# This cell finds any topics that were assigned a number in string form rather
# than a name (because no unifying theme could be detected from reading titles
# in the LDA-modeled training data) and changes those names to empty strings.
# The same is done for topics named "tfr", where there were "too few records"
# to determine a theme. It also revises some earlier topic names and applies
# title case.

nums_to_remove = ['1','2','3','4','5','6']

# Strip a single trailing digit from names such as 'video games3'.
# The `value and` guard skips empty names: without it `value[-1]` raises
# IndexError, which made this cell crash when re-run on an already-cleaned dict.
topics_dict = {key:value[:-1] if value and value[-1] in nums_to_remove else value
               for (key, value) in topics_dict.items()}

# Blank out 'tfr' placeholders, then anything of three characters or fewer
# (which also catches the number-only names).
topics_dict = {key:value.replace('tfr','') for (key, value) in topics_dict.items()}
topics_dict = {key:'' if len(value) <= 3 else value for (key, value) in topics_dict.items()}

# Topic 227 is blanked explicitly.
topics_dict[227] = ''

topics_dict = {key:value.replace('intl','international') for (key, value) in topics_dict.items()}

topics_dict = {key:value.title() for (key, value) in topics_dict.items()}

# Hand-written overrides, applied after title-casing so their capitalisation
# (e.g. 'CBD', 'FAANG', 'UK') is kept as written.
changes = {148:'Public policy & Political Ideology',221:'Sports',64:'Mobile Networks, 5G',118:'Genetic Engineering',
           90:'Latino-American Business',99:'UK Affairs',20:'Ridesharing Services, E-Scooters & E-Bikes',
           19:'Marijuana & CBD',144:'FAANG',222:"Children's Health"
          }

topics_dict.update(changes)

topics_dict

{0: 'Mexican Telecommunications',
 1: 'Education & Student Life',
 2: '',
 3: 'International Big Business',
 4: 'Drugs, Clinical Trials, Approvals',
 5: 'Robots & Robotics',
 6: 'Indian Tech & Business',
 7: 'Meat Substitutes',
 8: 'Fish',
 9: 'Israeli Tech & Business',
 10: 'Tech M&A',
 11: '',
 12: 'Vacation Rental',
 13: 'Laptops, Mobile Devices, Gadgets',
 14: 'Audio Tech',
 15: 'Climate Science',
 16: 'International Govt Relations',
 17: 'Digital Advertising',
 18: '',
 19: 'Marijuana & CBD',
 20: 'Ridesharing Services, E-Scooters & E-Bikes',
 21: '',
 22: 'Wireless Charging Technology',
 23: 'Diversity & Discrimination',
 24: 'Basketball',
 25: 'Mass Transit',
 26: 'E-Commerce & Online Delivery',
 27: 'European Alternative Energy',
 28: 'Menstrual Health',
 29: 'Cancer Treatments & Trials',
 30: '',
 31: 'Apple Devices',
 32: 'Motor Racing',
 33: 'Food Poisoning, Allergies, Household Chemicals',
 34: 'Health Insurance & Washington Legislation',
 35: 'Asian Business',
 36: 'Astrop

In [295]:
# Persist the cleaned topic-name mapping.
filename = 'models/14passes_265_topics_dict_sep8.pkl'
# The context manager closes the handle, so the pickle is fully flushed to
# disk before the cell finishes.
with open(filename,'wb') as out_file:
    pickle.dump(topics_dict,out_file)

In [290]:
# First sample passage. NOTE: this value is never used -- `s` is reassigned
# immediately below, before topic_predict is called.
s = 'Pro-choice activists say that state lawmakers across the country are trying to restrict abortion at a pace not \
seen in decades. So what will this mean for a decades-long fight over the issue in America? \
On a Friday night, Julie gets ready to go out with her partner while her two boys curl up on the sofa \
to watch a Disney movie with their babysitter. \
It is a typical happy family scene, one that Julie probably never envisaged when, aged just 19, she was raped and \
took the decision to have an abortion. \
"I come from a small town in Ohio. All German Catholics, very conservative. So when I found out I was pregnant I \
panicked. I didnt know what to do. I knew that I could not have this baby," she says..'


# Second sample passage; this is the text that is actually classified.
s = "The vast majority of NFL previews attempt to predict what is most likely to happen in the upcoming season. I suspect \
you have read about 40 or 50 of them by now. Every year, just before the season begins, I like to drop a slightly different \
look at what's to come. Today, I'm going to give you an explanation of how each and every team in the NFL is capable of \
winning Super Bowl LVI"

# Prints the predicted primary topic and any secondary topics for `s`.
topic_predict(s)

primary topic: Studies And Polls
---------- 
 other topics:
Sports
Sports/Esports


In [None]:

'''
stopwords to add: vice, games, tech, health, tech by vice
'''


'''
change intl to international
gene editing, genetic engineering -> genetic engineering
change, e.g. intl banking to banking
221 should be sports
clinical care should be clinical patient care
5 topics, only top 2 correct
148 should be public policy & political ideology
food allergies etc seems to take 1st place, precedence over better predicted topics
but did not come up when a paragraph was put in that specifically mentioned allergies.
puerto rico topic came up here:



Audio streaming & apps, 2nd: 61:
'i like music, any kind of music'

primary: 61. 2: studies & polls, 3: sports, 4: sports/esports
"The vast majority of NFL previews attempt to predict what is most likely to happen in the upcoming season. I suspect 
you've read about 40 or 50 of them by now. Every year, just before the season begins, I like to drop a slightly different 
look at what's to come. Today, I'm going to give you an explanation of how each and every team in the NFL is capable of 
winning Super Bowl LVI  \

bloodborne diseases & vaccines; 2nd: 61
At the time, COVID-19 didn’t exist, and many never would have imagined a virus like this ripping through the country like 
wildfire. Many of the illnesses we vaccinate against today are gone or mostly gone. We don’t see smallpox, polio or mumps 
ravaging our population, so it can be easy to forget the damage that these diseases can do, and to think that vaccinations 
are no longer a necessity. But COVID-19 changed that.

Melanie See’s first bout of odd symptoms began in 2005. Suddenly she started sweating a lot. She rapidly lost 10 pounds. 
She got dizzy walking from the bedroom to the couch. She started lactating even though she was not nursing a baby. After 
a slew of laboratory tests, See, then 45, was diagnosed with Graves’ disease, an autoimmune disorder that makes thyroid 
hormones surge.
“Young influencers on YouTube and social media have the money to live with less,” said Jenna Brown, a young YouTuber who 
makes videos about lifestyle trends. “They have the option to purposely live like they’re broke despite making a year’s 
salary every month.” Brown also illuminated the issue of minimalism being followed because of its aesthetic instead of 
its true purpose (intentional living). “A lot of the time, people who make videos about minimalism focus too much on 
looking like a minimalist instead of actually being one.”


Four years ago, when I was a 22-year-old college student in Virginia, I found out I was pregnant. I knew I was late, 
maybe a bit longer than a week, but that wasn’t unusual – I’d always had irregular periods. Like I had many times before, 
I picked up a test just in case. It came back positive.
I didn’t tell anyone at school I was pregnant, but I called my mom. She assured me it would be OK, and told me to schedule 
an appointment nearby, rather than drive home to New York. She was right; I probably didn’t need a seven-hour car ride of 
contemplative dread.
I never considered keeping it, and the why is not important. Regardless of my reasoning, know that no one wants to have an 
abortion. It is a decision made out of personal necessity. I still think about it sometimes, riddled with whispers of 
unexpected guilt that I wonder if all women experience. The one feeling I didn’t expect when thinking about my abortion 
was gratitude for its legality.


61 (second):
As a mum, I understand why parents would be tempted to give consent for their teen to get jabbed. After 16 months of 
doing the Covid Hokey Cokey – in out, in out – and watching happy, confident offspring grow anxious and despondent, 
it’s such a relief to see kids get back to normal at school this week. Anything, literally anything, to avoid having 
your child’s education disrupted again, you might think.

But you’d be wrong. Parents and children are being bribed, almost threatened, to agree to a treatment which Professor 
Adam Finn of the JCVI says the latest data, from paediatric cardiologists in the States, suggests may have long-term 
side effects. It’s outrageous. “If you don’t let your 13-year-old have the vaccine, and cases spike, we may have to 
shut the schools again.” That’s what they’re basically saying, but it’s simply not true. Other European countries didn’t 
close schools for as long as we did; some barely shut them at all. 

1st: food poisoning, allergies, household chemicals. 2nd: 221:
Concussions are mild traumatic brain injuries. They occur in a wide range of sports and affect all athletes, from 
professional players to little leaguers.
Sports concussion has become a significant problem. In recent years, it has made headlines with reports about the 
consequences of returning to play too soon, as well as research findings into the long-term effects of the injury.
Recognizing concussion and providing proper treatment is especially important for younger athletes because it typically 
takes them longer than adults to fully recover.
In addition, coaches, parents, and school administrators must be aware that concussion causes a wide range of symptoms 
and can interfere not only with sports participation, but with school and social relationships. Most athletes will 
fully recover from concussion, and understanding the varied symptoms can help with the healing process.


"Exercising is good for you, but sometimes you can injure yourself when you play sports or exercise. Accidents, poor \
training practices, or improper gear can cause them. Some people get hurt because they are not in shape. Not warming up \
or stretching enough can also lead to injuries. \
If you get hurt, stop playing. Continuing to play or exercise can cause more harm. Treatment often begins with the RICE \
(Rest, Ice, Compression, and Elevation) method to relieve pain, reduce swelling, and speed healing. Other possible \
treatments include pain relievers, keeping the injured area from moving, rehabilitation, and sometimes surgery."


1st: food poisoning, allergies, household chemicals. 2nd: fitness, exercise & diet

73 (Vice):
In recent days, images of the shiny, new warehouse complex emblazoned with a giant blue Amazon logo—and its impoverished environs with its unpaved roads and cardboard roofs have gone viral on social media, a stark display of globalization. 
Melanie See’s first bout of odd symptoms began in 2005. Suddenly she started sweating a lot. She rapidly lost 10 pounds. She got dizzy walk ing from the bedroom to the couch. She started lactating even though she was not nursing a baby. After a slew of laboratory tests, See, then 45, was diagnosed with Graves’ disease, an autoimmune disorder that makes thyroid hormones surge.
Joe Biden was supposed to be the man of the hour: a calming presence exuding decency, moderation and trust. As a candidate, he sold himself as a transitional president, a fatherly figure in the mold of George H.W. Bush who would restore dignity and prudence to the Oval Office after the mendacity and chaos that came before. It’s why I voted for him, as did so many others who once tipped red.
We find ourselves commemorating the first great jihadist victory over America, in 2001, right after delivering the second great jihadist victory over America, in 2021. The 9/11 memorial at the World Trade Center — water cascading into one void, and then trickling, out of sight, into another — has never felt more fitting.



primary topic: bacteria & viruses
---------- 
 other topics:
food poisoning, allergies, household chemicals

'"children could be set on a path to developing allergies, asthma and eczema before they are born. \
Analysis of a baby’s first stool, known as meconium, shows that a lack of certain biochemicals and gut \
bacteria normally seen in the faeces is linked with a higher risk of allergies and other conditions. \
Allergic conditions such as food allergies, hay fever, asthma and eczema are caused by the immune system \
overreacting to harmless compounds in the environment. Many studies have found links between such immune system \
reactivity and a lower diversity of gut bacteria, or microbiome. One idea is that a diverse ecosystem of \
beneficial bacteria helps to “train” the developing immune system to tolerate non-harmful compounds."'


primary topic: food poisoning, allergies, household chemicals
---------- 
 other topics:
apps, gadgets & devices

"APRIL showers bring May flowers, or so the saying goes. But those beautiful spring blooms – and their plentiful \
pollen – mean sneezing, runny noses and itchy eyes for many people. If this is you, and if you live in the UK, you can \
become a citizen sensor this spring by downloading the #BritainBreathing app and using it to record any allergy symptoms \
you develop. Doing so will help researchers learn more about when allergy symptoms are occurring at a population level \
and what the precise triggers are. Even if you don’t live in the UK, you can still download the app …"




148 (Vice):
More than a century ago, anti-vaccination groups also opposed vaccines for the sake of medical liberty and belief in 
alternative medicines. But they differed from groups today in several key ways.
We are a country that could not keep a demagogue from the White House; could not stop an insurrectionist mob from 
storming the Capitol; could not win (or at least avoid losing) a war against a morally and technologically 
retrograde enemy; cannot conquer a disease for which there are safe and effective vaccines; and cannot bring 
itself to trust the government, the news media, the scientific establishment, the police or any other institution 
meant to operate for the common good.

Also 148:
Infections are already forcing mass quarantines, and fear and high prevalence rates may further threaten 
in-person schooling once again — despite the indisputable evidence of the severe cost to kids.
How should schools adapt to the wide-ranging effects of the pandemic? How can they address the devastating inequality 
in American education that the pandemic both revealed and magnified? How do we help kids recover and thrive? We brought 
together six experts to explore these questions.

Also 148:

Corporate money has a powerful and malign influence on so many aspects of American life. But even by that low standard, 
events this week in a New York bankruptcy court are shocking. The legal system has effectively allowed one of the country’s 
richest families to buy its way out of accountability for what a White House commission called “America’s national 
nightmare” of mass opioid addiction.

Also 148:
Perhaps the form of Roe’s eventual downfall was a surprise. Few thought that Roe’s fatal case would be over Texas’s new 
abortion law, with its privatized enforcement system of bounty-hunting civil suits designed to elide judicial review. 
And among a sea of legal observers, only Cardozo law professor Kate Shaw seems to have predicted that the court would 
dispose of a long-established constitutional right in so rushed and perfunctory a proceeding as a late-night order on 
the shadow docket. But this outcome was never in doubt. Trump promised to appoint antichoice judges. He kept that promise. 
This week his three appointees – Neil Gorsuch, Brett Kavanaugh and Amy Coney Barrett, joined by Samuel Alito and 
Clarence Thomas – did what all of them know they were put on the court to do. They allowed the first state to outlaw 
abortion within its borders.

Also 148:
But if these are the politics of the plan, what about the policy? Does the announcement bring the UK, above all 
England (since health and care are largely devolved) any closer to having its social care problem solved? Not much, 
is the short answer, although that is not to say there is nothing of value in the new schemes. The commitment to 
raise standards is welcome, even if the £500m allocated for workforce improvements won’t be enough.

Also 148: 
This notion, that the only intelligent response to a threat to women’s rights is to be calm, blasé, and preemptively 
assured that nothing very bad or important will result, has been weaponized with particular insidiousness over the 
course of the abortion debate during the past five years. In the halls of power, contempt for abortion rights activists 
was nearly complete.

Also 148 (5th):
Recent decades have witnessed what Dennis Chong, a political scientist at the University of Southern California, 
describes in an email as “a demographic realignment of political tolerance in the U.S. that first became evident 
in the late 1980s-early 1990s. Before that, Chong pointed out, “the college educated, and younger generations, were 
among the most tolerant groups in the society of all forms of social and political nonconformity.” Since the 1990s, 
“these groups have become significantly less tolerant of hate speech pertaining to race, gender and social identities. 
Chong argued that “the expansion of equal rights for racial and ethnic minorities, women, L.G.B.T.Q. and other groups 
that have suffered discrimination has caused a re-evaluation of the harms of slurs and other derogatory expressions in 
professional and social life.


Also 148:
a loose constellation of ideas that is changing the way that mostly white, educated, left-leaning Americans view 
the world. This credo still lacks a definitive name: it is variously known as left-liberal identity politics, 
social-justice activism or, simply, wokeness.

Also 148:
'Germany’s long-simmering anger with the European Central Bank is again coming to the boil. It is hard to justify perennial bond purchases and negative rates when German inflation is near 4pc and rising, the highest since the Reunification boom in the early 1990s.

Political realities are forcing the ECB’s ultra-dovish governing council to prepare for bond tapering sooner than it wants – and sooner than it should, if you are a New Keynesian – in order to head off a bust-up with Europe’s anchor power.'

Also 148 (2nd: lgbtq+ issues):
"There is a difference between believing in “trans rights” and believing in “gender-identity ideology.” That’s the \
subtly important distinction that fuels Helen Joyce’s “Trans: When Ideology Meets Reality,” a book that offers an \
intelligent, thorough rejoinder to an idea that has swept across much of the liberal world seemingly overnight. \
According to Joyce, a longtime staffer at The Economist, most people “understand the call for ‘trans rights’ to mean \
compassionate concessions that enable a suffering minority to live full lives, in safety and dignity.” Joyce endorses \
this idea. Her bête noire is what she calls gender-identity ideology, which holds that everyone has a “gender identity,” \
an internal sense of being male or female (or both or neither), that is, in most tellings, innate and immutable, \
“something like a sexed soul.” When someone’s gender identity conflicts with their body, and/or with how society views \
their body, that person is transgender."




152,216,global big business,158
Ocasio-Cortez (left) spoke out in an interview with CNN on Tuesday, slamming Abbott (right) as ignorant of reproductive science but appearing to trip over her own phrasing. 'I don't know if he is familiar with a menstruating person's body. In fact, I do know that he's not familiar with a woman -- with a female or menstruating person's body,'
152:
A civilization “is born stoic and dies epicurean,” wrote historian Will Durant about the Babylonians. Our civilization was born optimistic and enlightened, at least by the standards of the day. Now it feels as if it’s fading into paranoid senility.
152:
Does marijuana cause psychotic disorders such as schizophrenia, and do associated symptoms like paranoia lead to violent crimes? That’s what writer Alex Berenson is claiming. As part of his new book promotion, Berenson published a New York Times op-ed that also blames the drug for “sharp increases in murders and aggravated assaults” purportedly observed in some states that allow adult recreational marijuana use.


video games3,employment issues,litigation & court rulings
The American economy runs on poverty, or at least the constant threat of it. Americans like their goods cheap and their services plentiful and the two of them, together, require a sprawling labor force willing to work tough jobs at crummy wages. On the right, the barest glimmer of worker power is treated as a policy emergency, and the whip of poverty, not the lure of higher wages, is the appropriate response.

clinical care, video games3:
By the time we were sent into the June lockdown, the gaping holes in our health system came to haunt us. While we have always complained about the poor state of the hospitals and the lack of investment, the pandemic ripped the sticking plaster off our health sector. Those of us able to access private healthcare did not raise our voices enough for better state-run health facilities.


mid-east politics, terrorism:
"A religious leader was stopped at the Ahilyabai Holkar airport in India's Madhya Pradesh state for carrying a human skull and bones in her luggage, authorities said. Sadhvi Yogmata Sachdeva, a resident of neighbouring Ujjain, was booked to board a Vistara Airlines flight to Delhi when she was found with the human remains. During luggage scanning, security personnel asked her to open her bag and, to their shock, the skull was discovered inside. When interrogated by the Central Industrial Security Force (CISF) and Aerodrome police station’s officer, the woman said she was taking the ashes of a fellow monk for immersion in Haridwar, a city by the Ganges famous among Hindu pilgrims."

'''

In [223]:
# Topic id whose training records are inspected below.
n = 73
# Upper bound on how many records are printed.
max_records = 550

test_df = df_ht[df_ht['lda_topic'] == n]
print(f'{len(test_df)} records:','\n','*' * 10)
# Bound the loop explicitly rather than relying on a bare `except` to stop at
# the end of the frame; a bare except also hid genuine errors (for example a
# missing column) and swallowed KeyboardInterrupt.
# NOTE(review): columns 8 and 11 are selected by position; judging by the
# printed output they look like the article title and the site section --
# confirm, and prefer column names if available.
for i in range(min(max_records, len(test_df))):
    print(test_df.iloc[i,8])
    print(test_df.iloc[i,11])
    print('-' * 15)

1194 records: 
 **********
Canada’s Favorite ‘Authentic’ ‘Pasta’ ‘Experience’ Hacked With ‘Military Grade Algorithms’
Tech by VICE
---------------
Springing to Life with VT Pro Design's Kinetic Wall
Tech by VICE
---------------
SWARM
Tech by VICE
---------------
Statue Celebrating Doctor Who Experimented on Slaves Moved From Central Park
Health
---------------
This GeoCities Advent Calendar Is a Christmas Miracle
Tech by VICE
---------------
The Portal
Tech by VICE
---------------
I Never Feel at Home in a New Place Until I Have These Three Things
Games
---------------
Terror, Shipwreck, Guns: 24 Hours in a Karachi Ambulance
Health
---------------
Humans Battle Electricity in a Williamsburg Brownstone Exhibit
Tech by VICE
---------------
Why Did Police Kill an Alleged Small-Time Hacker?
Tech by VICE
---------------
​72 Hours of Pwnage: A Paranoid N00b Goes to Def Con
Tech by VICE
---------------
A Second Opinion
Tech by VICE
---------------
A UFO Sighting Expert Shows Us His Most Convi