In [2]:
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
import gensim
import gensim.downloader as api
from gensim.utils import tokenize
from gensim.parsing.preprocessing import remove_stopwords
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
import os

In [2]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
disaster_df=pd.read_csv('..\data\disaster\\all_disasters.csv')

### Title Processing

In [4]:
### given a dataframe with a 'title' feature, returns a new df consisting of the title and a list of its tokens.

def process_titles(dataframe):
    titles=dataframe['title'].values
    titles=[remove_stopwords(title) for title in titles]
    titles=[tokenize(title, deacc="True", lowercase="True") for title in titles]
    titles_df=dataframe[['title']]
    titles_df['processed']=[list(gen) for gen in titles]
    return titles_df
    

In [32]:
def process_body(dataframe):
    text_body=dataframe['body'].values
    text_body=[remove_stopwords(body) for body in text_body]
    text_body=[tokenize(body, deacc="True", lowercase="True") for body in text_body]
    body_df=dataframe[['body']]
    body_df['processed']=[list(gen) for gen in text_body]
    return body_df

In [None]:
#takes a topic probability output and returns the topic with max probability.
def probs_to_topic(probs):
    assigned_topic=-1
    max_prob=0
    for topic, prob in probs:
        if prob > max_prob:
            assigned_topic=topic
            max_prob=prob
    return assigned_topic

# LDA Training

First run: Just doing this on the data I mined. 10 categories.

In [50]:
X=disaster_titles['processed']
disaster_corpus=[wiki_dict.doc2bow(text) for text in X]

In [79]:
lda = LdaModel(corpus=disaster_corpus, num_topics=10, id2word=wiki_dict, passes=10, iterations=10)

In [80]:
topics=[probs_to_topic(lda[title]) for title in disaster_corpus]
disaster_titles['topic']=topics

In [82]:
disaster_titles['topic'].value_counts()

8    1176
6     508
7     448
9     269
4     207
5     197
3     195
2      97
0      92
1      67
Name: topic, dtype: int64

In [52]:
#takes a topic probability output and returns the topic with max probability.
def probs_to_topic(probs):
    assigned_topic=-1
    max_prob=0
    for topic, prob in probs:
        if prob > max_prob:
            assigned_topic=topic
            max_prob=prob
    return assigned_topic

That didn't perform great. It might need more training. Let's try on body text.

In [10]:
X=disaster_body['processed']
body_corpus=[wiki_dict.doc2bow(text) for text in X]

In [51]:
lda = LdaModel(corpus=body_corpus, num_topics=6, id2word=wiki_dict, passes=25, iterations=10)

2020-10-25 22:16:07,240 : INFO : using symmetric alpha at 0.16666666666666666
2020-10-25 22:16:07,241 : INFO : using symmetric eta at 0.16666666666666666
2020-10-25 22:16:07,259 : INFO : using serial LDA version on this node
2020-10-25 22:16:07,336 : INFO : running online (multi-pass) LDA training, 6 topics, 10 passes over the supplied corpus of 3256 documents, updating model once every 2000 documents, evaluating perplexity every 3256 documents, iterating 10x with a convergence threshold of 0.001000
2020-10-25 22:16:07,337 : INFO : PROGRESS: pass 0, at document #2000/3256
2020-10-25 22:16:07,835 : INFO : merging changes from 2000 documents into a model of 3256 documents
2020-10-25 22:16:07,889 : INFO : topic #3 (0.167): 0.010*"said" + 0.005*"storm" + 0.005*"firefighters" + 0.004*"earthquake" + 0.004*"fire" + 0.003*"magnitude" + 0.003*"we" + 0.003*"fires" + 0.003*"weather" + 0.003*"according"
2020-10-25 22:16:07,891 : INFO : topic #0 (0.167): 0.017*"said" + 0.006*"earthquake" + 0.006*"s

2020-10-25 22:16:12,483 : INFO : topic #0 (0.167): 0.021*"tornado" + 0.020*"said" + 0.008*"damage" + 0.005*"tornadoes" + 0.004*"storm" + 0.004*"we" + 0.003*"weather" + 0.003*"ef" + 0.003*"hit" + 0.003*"homes"
2020-10-25 22:16:12,485 : INFO : topic #4 (0.167): 0.006*"said" + 0.004*"earthquake" + 0.003*"firefighters" + 0.003*"game" + 0.003*"earthquakes" + 0.003*"storm" + 0.003*"magnitude" + 0.002*"news" + 0.002*"fire" + 0.002*"hurricane"
2020-10-25 22:16:12,487 : INFO : topic #1 (0.167): 0.018*"hurricane" + 0.014*"said" + 0.010*"storm" + 0.009*"delta" + 0.007*"louisiana" + 0.005*"laura" + 0.005*"winds" + 0.004*"fire" + 0.004*"friday" + 0.004*"landfall"
2020-10-25 22:16:12,492 : INFO : topic diff=0.249860, rho=0.464840
2020-10-25 22:16:12,494 : INFO : PROGRESS: pass 3, at document #2000/3256
2020-10-25 22:16:12,998 : INFO : merging changes from 2000 documents into a model of 3256 documents
2020-10-25 22:16:13,042 : INFO : topic #2 (0.167): 0.017*"storm" + 0.012*"hurricane" + 0.010*"weathe

2020-10-25 22:16:17,586 : INFO : topic #1 (0.167): 0.020*"hurricane" + 0.013*"said" + 0.012*"delta" + 0.009*"louisiana" + 0.008*"storm" + 0.006*"laura" + 0.005*"fire" + 0.004*"landfall" + 0.004*"gulf" + 0.004*"coast"
2020-10-25 22:16:17,589 : INFO : topic #2 (0.167): 0.021*"storm" + 0.015*"hurricane" + 0.013*"weather" + 0.009*"winds" + 0.009*"said" + 0.009*"storms" + 0.009*"mph" + 0.007*"tropical" + 0.006*"rain" + 0.006*"center"
2020-10-25 22:16:17,591 : INFO : topic #5 (0.167): 0.024*"earthquake" + 0.012*"magnitude" + 0.009*"said" + 0.008*"quake" + 0.006*"felt" + 0.006*"earthquakes" + 0.004*"miles" + 0.004*"tsunami" + 0.004*"reported" + 0.004*"according"
2020-10-25 22:16:17,593 : INFO : topic #3 (0.167): 0.007*"hurricanes" + 0.007*"game" + 0.006*"said" + 0.005*"miami" + 0.004*"storm" + 0.004*"play" + 0.003*"yards" + 0.003*"we" + 0.003*"games" + 0.003*"week"
2020-10-25 22:16:17,598 : INFO : topic diff=0.142054, rho=0.362072
2020-10-25 22:16:17,600 : INFO : PROGRESS: pass 6, at document

2020-10-25 22:16:22,897 : INFO : topic #4 (0.167): 0.005*"san" + 0.005*"jose" + 0.004*"earthquakes" + 0.003*"goal" + 0.002*"firefighters" + 0.002*"minute" + 0.002*"vancouver" + 0.002*"scored" + 0.002*"seattle" + 0.002*"portland"
2020-10-25 22:16:22,899 : INFO : topic #2 (0.167): 0.022*"storm" + 0.014*"hurricane" + 0.014*"weather" + 0.010*"winds" + 0.009*"storms" + 0.009*"mph" + 0.009*"said" + 0.008*"tropical" + 0.007*"rain" + 0.006*"center"
2020-10-25 22:16:22,901 : INFO : topic #0 (0.167): 0.020*"tornado" + 0.019*"said" + 0.008*"damage" + 0.004*"we" + 0.004*"tornadoes" + 0.004*"firefighters" + 0.003*"storm" + 0.003*"ef" + 0.003*"fire" + 0.003*"help"
2020-10-25 22:16:22,904 : INFO : topic #1 (0.167): 0.022*"hurricane" + 0.013*"said" + 0.013*"delta" + 0.010*"louisiana" + 0.008*"storm" + 0.007*"laura" + 0.005*"landfall" + 0.005*"fire" + 0.004*"coast" + 0.004*"article"
2020-10-25 22:16:22,906 : INFO : topic #5 (0.167): 0.027*"earthquake" + 0.014*"magnitude" + 0.009*"quake" + 0.008*"said" 

In [53]:
topics=[probs_to_topic(lda[text]) for text in body_corpus]
disaster_body['topic']=topics

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  disaster_body['topic']=topics


In [54]:
disaster_body['topic'].value_counts()

0    1316
1     558
2     506
5     428
3     368
4      80
Name: topic, dtype: int64

In [55]:
disaster_df['topic']=topics

In [60]:
for title in disaster_df[disaster_df['topic']==3]['title']:
    print(title+ "\n" + "=============="+"\n")

Remembrance Day tribute sees poppies appear on West Sussex fire engines

Mayor awaits facts on allegations of racist firefighters

Remembrance Day tribute sees poppies appear on West Sussex fire engines

Mayor awaits facts on allegations of racist firefighters

Baby Yoda Toy Gifted To Wildfire Firefighters Travels Back To California  CBS Sacramento

C.A.R.E.

Around the world in 80 seconds: Violent protests in Bangkok, Fires at Mt. Kilimanjaro, a floating house in Japan & more

Florida fires Will Muschamp

5-year-old donates Baby Yoda doll to firefighters battling wildfires  and it's a hit among crews

Boy, 5, Sends Baby Yoda Doll to Front Line Oregon Firefighters: In Case You Get Lonely

Fires in Vacaville and South San Francisco Rattle Nerves Friday  NBC Bay Area

Firefighters destroy $90,000 Mustang Shelby GT500 ... here's why

Firefighters walk miles for Hastings mum with months to live

Prince Georges Countys newest firefighters trained through pandemic

Multi-Alarm Fire in Cambri

In [None]:
#more data:
#https://www.kaggle.com/vstepanenko/disaster-tweets
#start with this, remove the "non-disaster" tweets, only take tweets from news sources

In [3]:
tweets_df=pd.read_csv('..\data\kaggle\\tweets.csv')

In [6]:
mask=tweets_df['target']==0

tweets_df.drop(tweets_df[mask].index, inplace=True)

In [9]:
tweets_df.drop(columns=['target', 'id', 'location'], inplace=True)

In [12]:
tweets_df['keyword'].unique()

array(['ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse', 'army',
       'arson', 'arsonist', 'attack', 'attacked', 'avalanche',
       'bioterror', 'bioterrorism', 'blaze', 'bleeding', 'blew%20up',
       'blight', 'blizzard', 'blood', 'bloody', 'body%20bag',
       'body%20bagging', 'body%20bags', 'bomb', 'bombed', 'bombing',
       'bridge%20collapse', 'buildings%20burning',
       'buildings%20on%20fire', 'burned', 'burning', 'bush%20fires',
       'casualties', 'casualty', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapsed', 'collide', 'collided', 'collision',
       'crash', 'crashed', 'crushed', 'curfew', 'cyclone', 'damage',
       'danger', 'dead', 'death', 'deaths', 'debris', 'deluge',
       'demolish', 'demolished', 'derail', 'derailed', 'derailment',
       'desolate', 'destroyed', 'destruction', 'detonate', 'devastated',
       'devastation', 'disaster', 'displaced', 'drought',

In [16]:
#focusing on natural disasters
unwanted=['airplane%20accident', 'annihilated', 'annihilation', 'apocalypse',
         'bioterror', 'bioterrorism', 'bomb', 'bombed', 'bombing', 'detonate', 'hijack', 'hijacker',
         'hijacking', 'hostage', 'hostages', 'mass%20murder',
       'mass%20murderer', 'massacre', 'riot',
       'rioting','outbreak', 'quarantine', 'suicide%20bomb', 'suicide%20bomber', 'suicide%20bombing', 'upheaval',
         'war%20zone','weapon', 'weapons']
mask=tweets_df['keyword'].isin(unwanted)
tweets_df.drop(tweets_df[mask].index, inplace=True)

## Final Topic Model Training

I'm doing this using the aylien disaster news dataset because I trust it more than the stuff I mined. It's big enough to build a full dictionary off of. Let's include stemming in our preprocessing now.

In [37]:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
import re

In [21]:
aylien_df=pd.read_json('../data/aylien/natural_disasters_aylien_news_data.jsonl', lines=True)

In [25]:
aylien_df.to_csv('../data/aylien/natural_disasters_aylien_news_data.csv')

In [29]:
X=aylien_df[['body']]

In [34]:
X = process_body(X)

In [38]:
snow=SnowballStemmer("english")

In [41]:
X['stemmed']=[[snow.stem(token) for token in word_list] for word_list in X['processed']]

In [43]:
X.to_csv('../data/aylien/aylien_body_processed.csv')

In [8]:
X=pd.read_csv('..\..\..\..\Local Data\project_5_data\\aylien\\aylien_body_processed.csv')

In [44]:
dictionary=Dictionary(X['stemmed'])

In [45]:
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)

In [1]:
corpus = [dictionary.doc2bow(doc) for doc in X['stemmed']]

NameError: name 'X' is not defined

In [55]:
lda = LdaModel(corpus=corpus, num_topics=6, id2word=dictionary, passes=20, iterations=20)

In [56]:
lda.print_topics()

[(0,
  '0.020*"water" + 0.013*"year" + 0.009*"climat" + 0.007*"chang" + 0.006*"it" + 0.006*"drought" + 0.006*"govern" + 0.005*"flood" + 0.005*"citi" + 0.005*"level"'),
 (1,
  '0.027*"fire" + 0.011*"burn" + 0.010*"firefight" + 0.009*"australia" + 0.009*"bushfir" + 0.008*"south" + 0.008*"home" + 0.008*"state" + 0.008*"condit" + 0.007*"temperatur"'),
 (2,
  '0.029*"earthquak" + 0.017*"magnitud" + 0.015*"quak" + 0.013*"report" + 0.012*"a" + 0.011*"mile" + 0.011*"damag" + 0.010*"erupt" + 0.009*"island" + 0.008*"hit"'),
 (3,
  '0.013*"i" + 0.007*"it" + 0.007*"island" + 0.006*"we" + 0.006*"home" + 0.006*"t" + 0.005*"famili" + 0.005*"polic" + 0.005*"a" + 0.005*"hous"'),
 (4,
  '0.024*"storm" + 0.022*"flood" + 0.017*"hurrican" + 0.016*"rain" + 0.012*"weather" + 0.012*"wind" + 0.011*"warn" + 0.010*"dorian" + 0.007*"south" + 0.007*"expect"'),
 (5,
  '0.021*"flood" + 0.014*"district" + 0.013*"rain" + 0.011*"state" + 0.011*"water" + 0.009*"heavi" + 0.008*"india" + 0.007*"offici" + 0.007*"affect" + 

In [57]:
topics=[probs_to_topic(lda[text]) for text in corpus]
X['topic']=topics

In [58]:
X['topic'].value_counts()

5    6497
4    6058
3    5860
0    5404
2    5306
1    5100
Name: topic, dtype: int64

In [71]:
for body in X[X['topic']==3]['body'].head(30):
    print(body+ "\n" + "=============="+"\n")

Hawaii is marking the first anniversary of one of the largest and most destructive volcanic eruptions in its history. Lava flowed from the Kilauea Volcano for months, wiping out more than 700 homes on the Big Island. The eruption produced enough lava to cover a two-lane highway stretching from Boston to Seattle – and it would be more than 70 feet high.

The molten rock that consumed much of Leilani Estates covers almost 14 square miles and at some points it is 100 feet thick. The lava has long hardened but, as CBS News' Carter Evans learned, the disaster has also hardened the resolve of the people living there.

For four months, the eruption of Hawaii's Kilauea volcano offered a spectacular combination of beauty and destruction. The most hard-hit neighborhood was Leilani Estates, where resident Stacy Welch watched lava from fissure 8 pour onto her property. At one point it fountained lava about 250 feet in the air.

Fissure 8 was one of the most active of 24 lava-filled cracks in the g

Unnamed: 0,body,processed,stemmed,topic
0,"Dharwad: With water sources getting dry, Dharw...","[dharwad, with, water, sources, getting, dry, ...","[dharwad, with, water, sourc, get, dri, dharwa...",0
1,Hawaii is marking the first anniversary of one...,"[hawaii, marking, anniversary, largest, destru...","[hawaii, mark, anniversari, largest, destruct,...",3
2,"LEILANI ESTATES, Hawaii — A year after a volca...","[leilani, estates, hawaii, a, year, volcano, h...","[leilani, estat, hawaii, a, year, volcano, haw...",3
3,"MOSCOW (UrduPoint News / Sputnik - 25th May, 2...","[moscow, urdupoint, news, sputnik, th, may, th...","[moscow, urdupoint, news, sputnik, th, may, th...",5
4,Cyclone Fani has devastated the state of Odish...,"[cyclone, fani, devastated, state, odisha, dam...","[cyclon, fani, devast, state, odisha, damag, n...",5
...,...,...,...,...
34220,Parts of Darwin's central business district ha...,"[parts, darwin, s, central, business, district...","[part, darwin, s, central, busi, district, eva...",2
34221,Severe thunderstorms have dumped hail and wind...,"[severe, thunderstorms, dumped, hail, winds, b...","[sever, thunderstorm, dump, hail, wind, brough...",1
34222,With dams and creeks bone dry in drought-stric...,"[with, dams, creeks, bone, dry, drought, stric...","[with, dam, creek, bone, dri, drought, stricke...",1
34223,Roofs have been torn off buildings and thrown ...,"[roofs, torn, buildings, thrown, half, kilomet...","[roof, torn, build, thrown, half, kilometr, aw...",3


In [74]:
lda.save("..\gensim_data\\trained_model.tmp")

In [75]:
dictionary.save("..\gensim_data\\dictionary.tmp")