In [2]:
import pandas as pd
import numpy as np
import gensim
from gensim.utils import tokenize
from gensim.parsing.preprocessing import remove_stopwords
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from nltk.stem.snowball import SnowballStemmer


# Model

In [11]:
trained_model=LdaModel.load('../gensim_data/trained_model.tmp')
dictionary=Dictionary.load('../gensim_data/Dictionary.tmp')
X=pd.read_csv('..\..\..\..\Local Data\project_5_data\\aylien\\aylien_body_processed.csv')

In [12]:
X

Unnamed: 0.1,Unnamed: 0,body,processed,stemmed
0,0,"Dharwad: With water sources getting dry, Dharw...","['dharwad', 'with', 'water', 'sources', 'getti...","['dharwad', 'with', 'water', 'sourc', 'get', '..."
1,1,Hawaii is marking the first anniversary of one...,"['hawaii', 'marking', 'anniversary', 'largest'...","['hawaii', 'mark', 'anniversari', 'largest', '..."
2,2,"LEILANI ESTATES, Hawaii — A year after a volca...","['leilani', 'estates', 'hawaii', 'a', 'year', ...","['leilani', 'estat', 'hawaii', 'a', 'year', 'v..."
3,3,"MOSCOW (UrduPoint News / Sputnik - 25th May, 2...","['moscow', 'urdupoint', 'news', 'sputnik', 'th...","['moscow', 'urdupoint', 'news', 'sputnik', 'th..."
4,4,Cyclone Fani has devastated the state of Odish...,"['cyclone', 'fani', 'devastated', 'state', 'od...","['cyclon', 'fani', 'devast', 'state', 'odisha'..."
...,...,...,...,...
34220,34220,Parts of Darwin's central business district ha...,"['parts', 'darwin', 's', 'central', 'business'...","['part', 'darwin', 's', 'central', 'busi', 'di..."
34221,34221,Severe thunderstorms have dumped hail and wind...,"['severe', 'thunderstorms', 'dumped', 'hail', ...","['sever', 'thunderstorm', 'dump', 'hail', 'win..."
34222,34222,With dams and creeks bone dry in drought-stric...,"['with', 'dams', 'creeks', 'bone', 'dry', 'dro...","['with', 'dam', 'creek', 'bone', 'dri', 'droug..."
34223,34223,Roofs have been torn off buildings and thrown ...,"['roofs', 'torn', 'buildings', 'thrown', 'half...","['roof', 'torn', 'build', 'thrown', 'half', 'k..."


In [4]:
snow=SnowballStemmer("english")

In [5]:
#given a probability prediction from the LDA model of form
#[(topic1, prob1), (topic2, prob2),...]  
#returns the topic number with the the highest assigned probabiltiy.

def probs_to_topic(probs):
    assigned_topic=-1
    max_prob=0
    for topic, prob in probs:
        if prob > max_prob:
            assigned_topic=topic
            max_prob=prob
    return assigned_topic

**Topic Index Reference**
(These are not exact rules, and are topics classified by the unsupervised trained LDA model)

0: Global Warming/Drought/Climate disasters.

1: Fires

2: Earthquakes/Volcanos/Seismic Events

3: Urban/Other (This is a weird one -- I think here were lots of airline accidents in the training data, and any article that talks about the urban ramifications of a disaster tends to get sorted here.).

4: Storms/Hurricanes

5: Floods/Rains


In [6]:
trained_model.print_topics()

[(0,
  '0.020*"water" + 0.013*"year" + 0.009*"climat" + 0.007*"chang" + 0.006*"it" + 0.006*"drought" + 0.006*"govern" + 0.005*"flood" + 0.005*"citi" + 0.005*"level"'),
 (1,
  '0.027*"fire" + 0.011*"burn" + 0.010*"firefight" + 0.009*"australia" + 0.009*"bushfir" + 0.008*"south" + 0.008*"home" + 0.008*"state" + 0.008*"condit" + 0.007*"temperatur"'),
 (2,
  '0.029*"earthquak" + 0.017*"magnitud" + 0.015*"quak" + 0.013*"report" + 0.012*"a" + 0.011*"mile" + 0.011*"damag" + 0.010*"erupt" + 0.009*"island" + 0.008*"hit"'),
 (3,
  '0.013*"i" + 0.007*"it" + 0.007*"island" + 0.006*"we" + 0.006*"home" + 0.006*"t" + 0.005*"famili" + 0.005*"polic" + 0.005*"a" + 0.005*"hous"'),
 (4,
  '0.024*"storm" + 0.022*"flood" + 0.017*"hurrican" + 0.016*"rain" + 0.012*"weather" + 0.012*"wind" + 0.011*"warn" + 0.010*"dorian" + 0.007*"south" + 0.007*"expect"'),
 (5,
  '0.021*"flood" + 0.014*"district" + 0.013*"rain" + 0.011*"state" + 0.011*"water" + 0.009*"heavi" + 0.008*"india" + 0.007*"offici" + 0.007*"affect" + 

## Preprocessing/Prediction

In [7]:
#input : for lack of a better assumption, let's assume that the input will be a dataframe that has one article per row,
#        and a feature named "body" of it's unprocessed body text as a string.
#        this could include title text as well, but didn't want to put too many assumptions on the input

#output: the same dataframe with three columns appended: token list, corpus (where the corpus is the token ids), 
##and predicted category

def body_topic(dataframe):
    text_body=dataframe['body'].values
    text_body=[remove_stopwords(body) for body in text_body]
    text_body=[tokenize(body, deacc="True", lowercase="True") for body in text_body]
    text_body=[[snow.stem(token) for token in word_list] for word_list in text_body]
    body_df=dataframe[['body']]
    dataframe['tokens']=[list(gen) for gen in text_body]
    dataframe['corpus']=[dictionary.doc2bow(doc) for doc in dataframe['tokens']]
    dataframe['predicted_topic']= [probs_to_topic(topic_probs)for topic_probs in trained_model.get_document_topics(dataframe['corpus'])]
    return dataframe

In [9]:
X

Unnamed: 0.1,Unnamed: 0,body,processed,stemmed
0,0,"Dharwad: With water sources getting dry, Dharw...","['dharwad', 'with', 'water', 'sources', 'getti...","['dharwad', 'with', 'water', 'sourc', 'get', '..."
1,1,Hawaii is marking the first anniversary of one...,"['hawaii', 'marking', 'anniversary', 'largest'...","['hawaii', 'mark', 'anniversari', 'largest', '..."
2,2,"LEILANI ESTATES, Hawaii — A year after a volca...","['leilani', 'estates', 'hawaii', 'a', 'year', ...","['leilani', 'estat', 'hawaii', 'a', 'year', 'v..."
3,3,"MOSCOW (UrduPoint News / Sputnik - 25th May, 2...","['moscow', 'urdupoint', 'news', 'sputnik', 'th...","['moscow', 'urdupoint', 'news', 'sputnik', 'th..."
4,4,Cyclone Fani has devastated the state of Odish...,"['cyclone', 'fani', 'devastated', 'state', 'od...","['cyclon', 'fani', 'devast', 'state', 'odisha'..."
...,...,...,...,...
34220,34220,Parts of Darwin's central business district ha...,"['parts', 'darwin', 's', 'central', 'business'...","['part', 'darwin', 's', 'central', 'busi', 'di..."
34221,34221,Severe thunderstorms have dumped hail and wind...,"['severe', 'thunderstorms', 'dumped', 'hail', ...","['sever', 'thunderstorm', 'dump', 'hail', 'win..."
34222,34222,With dams and creeks bone dry in drought-stric...,"['with', 'dams', 'creeks', 'bone', 'dry', 'dro...","['with', 'dam', 'creek', 'bone', 'dri', 'droug..."
34223,34223,Roofs have been torn off buildings and thrown ...,"['roofs', 'torn', 'buildings', 'thrown', 'half...","['roof', 'torn', 'build', 'thrown', 'half', 'k..."


In [10]:
process_body(X.head(50))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['tokens']=[list(gen) for gen in text_body]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['corpus']=[dictionary.doc2bow(doc) for doc in dataframe['tokens']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['predicted_topic']= [probs_to_topic(topic_probs)for topic_probs i

Unnamed: 0.1,Unnamed: 0,body,processed,stemmed,tokens,corpus,predicted_topic
0,0,"Dharwad: With water sources getting dry, Dharw...","['dharwad', 'with', 'water', 'sources', 'getti...","['dharwad', 'with', 'water', 'sourc', 'get', '...","[dharwad, with, water, sourc, get, dri, dharwa...","[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...",0
1,1,Hawaii is marking the first anniversary of one...,"['hawaii', 'marking', 'anniversary', 'largest'...","['hawaii', 'mark', 'anniversari', 'largest', '...","[hawaii, mark, anniversari, largest, destruct,...","[(25, 1), (46, 3), (50, 1), (54, 2), (56, 1), ...",3
2,2,"LEILANI ESTATES, Hawaii — A year after a volca...","['leilani', 'estates', 'hawaii', 'a', 'year', ...","['leilani', 'estat', 'hawaii', 'a', 'year', 'v...","[leilani, estat, hawaii, a, year, volcano, haw...","[(19, 4), (37, 1), (41, 2), (44, 3), (46, 2), ...",3
3,3,"MOSCOW (UrduPoint News / Sputnik - 25th May, 2...","['moscow', 'urdupoint', 'news', 'sputnik', 'th...","['moscow', 'urdupoint', 'news', 'sputnik', 'th...","[moscow, urdupoint, news, sputnik, th, may, th...","[(40, 1), (58, 1), (72, 1), (86, 1), (91, 3), ...",5
4,4,Cyclone Fani has devastated the state of Odish...,"['cyclone', 'fani', 'devastated', 'state', 'od...","['cyclon', 'fani', 'devast', 'state', 'odisha'...","[cyclon, fani, devast, state, odisha, damag, n...","[(1, 1), (2, 1), (6, 1), (29, 2), (55, 2), (58...",5
5,5,Image caption\n \n ...,"['image', 'caption', 'thousands', 'saplings', ...","['imag', 'caption', 'thousand', 'sapl', 'plant...","[imag, caption, thousand, sapl, plant, hs, die...","[(1, 1), (33, 1), (45, 3), (69, 1), (90, 2), (...",0
6,6,The acute water crisis raging across the state...,"['the', 'acute', 'water', 'crisis', 'raging', ...","['the', 'acut', 'water', 'crisi', 'rage', 'sta...","[the, acut, water, crisi, rage, state, summer,...","[(0, 1), (17, 3), (19, 1), (20, 1), (22, 1), (...",5
7,7,Make sense of the 2019 Lok Sabha elections a...,"['make', 'sense', 'lok', 'sabha', 'elections',...","['make', 'sens', 'lok', 'sabha', 'elect', 'res...","[make, sens, lok, sabha, elect, result, may, t...","[(1, 1), (2, 1), (14, 2), (19, 2), (22, 1), (2...",5
8,8,"CHENNAI, India, May 20 (Thomson Reuters Founda...","['chennai', 'india', 'may', 'thomson', 'reuter...","['chennai', 'india', 'may', 'thomson', 'reuter...","[chennai, india, may, thomson, reuter, foundat...","[(2, 1), (8, 1), (18, 1), (19, 2), (29, 2), (3...",3
9,9,A hurricane is a large rotating storm that for...,"['a', 'hurricane', 'large', 'rotating', 'storm...","['a', 'hurrican', 'larg', 'rotat', 'storm', 'f...","[a, hurrican, larg, rotat, storm, form, tropic...","[(1, 1), (2, 2), (10, 1), (14, 1), (17, 1), (2...",0


In [13]:
pip freeze

absl-py==0.10.0
anaconda-client==1.7.2
anaconda-navigator==1.9.12
argon2-cffi @ file:///C:/ci/argon2-cffi_1596828585465/work
astunparse==1.6.3
async-generator==1.10
Note: you may need to restart the kernel to use updated packages.attrs @ file:///tmp/build/80754af9/attrs_1600298409949/work
backcall==0.2.0
backports.functools-lru-cache==1.6.1
backports.tempfile==1.0





backports.weakref==1.0.post1
beautifulsoup4 @ file:///tmp/build/80754af9/beautifulsoup4_1601924105527/work
bleach @ file:///tmp/build/80754af9/bleach_1600439572647/work
blis==0.4.1
bokeh==2.2.1
brotlipy==0.7.0
cachetools==4.1.1
catalogue==1.0.0
certifi==2020.6.20
cffi @ file:///C:/ci/cffi_1600699246375/work
chardet==3.0.4
click==7.1.2
clyent==1.2.2
colorama @ file:///tmp/build/80754af9/colorama_1603211150991/work
conda==4.9.1
conda-build==3.20.4
conda-package-handling @ file:///C:/ci/conda-package-handling_1603003327818/work
conda-verify==3.4.2
cryptography @ file:///C:/ci/cryptography_1601046905460/work
cupy-cuda101==8.0.0
cycler==0.10.0
cymem==2.0.3
Cython==0.29.14
decorator==4.4.2
defusedxml==0.6.0
entrypoints==0.3
fastrlock==0.5
filelock==3.0.12
Flask==1.1.2
fsspec @ file:///tmp/build/80754af9/fsspec_1602684995936/work
future==0.18.2
gast==0.3.3
gensim==3.8.3
glob2==0.7
google-auth==1.22.1
google-auth-oauthlib==0.4.1
google-pasta==0.2.0
grpcio==1.32.0
h5py==2.10.0
idna @ file:///t