In [24]:
import pandas as pd
import numpy as np
import gensim
from gensim.utils import tokenize
from gensim.parsing.preprocessing import remove_stopwords
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
import re
from ast import literal_eval

# Model

In [7]:
# Load the previously trained LDA model and the token dictionary saved with it.
# NOTE: gensim's load() unpickles the file, so only point these at trusted artifacts.
trained_model = LdaModel.load('../gensim_data/trained_model.tmp')
dictionary = Dictionary.load('../gensim_data/Dictionary.tmp')

# Forward slashes are used for consistency with the paths above: they work on
# Windows as well as POSIX, and avoid the invalid escape sequences ("\.", "\L",
# "\p") that the original backslash-separated, non-raw string literal contained.
X = pd.read_csv('../../../../Local Data/project_5_data/aylien/aylien_body_processed.csv')

In [32]:
# English Snowball stemmer; process_body uses it to stem every token.
snow = SnowballStemmer(language="english")

In [31]:
def probs_to_topic(probs):
    """Return the topic number with the highest assigned probability.

    probs is a probability prediction from the LDA model of the form
    [(topic1, prob1), (topic2, prob2), ...].

    Returns -1 when probs is empty or when no probability is greater than 0.
    When several topics share the highest probability, the first one listed wins.
    """
    # Running (topic, probability) winner; starts at the "no topic" sentinel.
    leader = (-1, 0)
    for pair in probs:
        topic, prob = pair
        # Strict comparison so an earlier topic is never displaced by a tie.
        leader = (topic, prob) if prob > leader[1] else leader
    return leader[0]

**Topic Index Reference**
(These are not exact rules; they are approximate labels for the topics learned by the unsupervised LDA model.)

0: Global Warming/Drought/Climate disasters.

1: Fires

2: Earthquakes/Volcanos/Seismic Events

3: Urban/Other (This is a weird one -- I think there were lots of airline accidents in the training data, and any article that talks about the urban ramifications of a disaster tends to get sorted here.).

4: Storms/Hurricanes

5: Floods/Rains


In [44]:
# Display each topic of the loaded model as a weighted list of its top (stemmed) terms.
trained_model.print_topics()

[(0,
  '0.020*"water" + 0.013*"year" + 0.009*"climat" + 0.007*"chang" + 0.006*"it" + 0.006*"drought" + 0.006*"govern" + 0.005*"flood" + 0.005*"citi" + 0.005*"level"'),
 (1,
  '0.027*"fire" + 0.011*"burn" + 0.010*"firefight" + 0.009*"australia" + 0.009*"bushfir" + 0.008*"south" + 0.008*"home" + 0.008*"state" + 0.008*"condit" + 0.007*"temperatur"'),
 (2,
  '0.029*"earthquak" + 0.017*"magnitud" + 0.015*"quak" + 0.013*"report" + 0.012*"a" + 0.011*"mile" + 0.011*"damag" + 0.010*"erupt" + 0.009*"island" + 0.008*"hit"'),
 (3,
  '0.013*"i" + 0.007*"it" + 0.007*"island" + 0.006*"we" + 0.006*"home" + 0.006*"t" + 0.005*"famili" + 0.005*"polic" + 0.005*"a" + 0.005*"hous"'),
 (4,
  '0.024*"storm" + 0.022*"flood" + 0.017*"hurrican" + 0.016*"rain" + 0.012*"weather" + 0.012*"wind" + 0.011*"warn" + 0.010*"dorian" + 0.007*"south" + 0.007*"expect"'),
 (5,
  '0.021*"flood" + 0.014*"district" + 0.013*"rain" + 0.011*"state" + 0.011*"water" + 0.009*"heavi" + 0.008*"india" + 0.007*"offici" + 0.007*"affect" + 

## Preprocessing/Prediction

In [58]:
def process_body(dataframe):
    """Tokenise article bodies and assign each its most likely LDA topic.

    Input: for lack of a better assumption, a dataframe with one article per
    row and a feature named "body" holding its unprocessed body text as a
    string. (This could include title text as well, but no further assumptions
    are placed on the input until Heroku has been looked into.)

    Output: a new dataframe with four columns:
      body            - the body values as passed in
      tokens          - list of stemmed tokens for the body
      corpus          - bag-of-words for the tokens, as (token id, count) pairs
                        produced by ``dictionary.doc2bow``
      predicted_topic - topic number chosen by ``probs_to_topic`` (-1 if none)

    The input dataframe is not modified. Raises KeyError if there is no
    "body" column.
    """
    # Missing bodies become empty strings so one bad row cannot abort the batch;
    # such a row ends up with no tokens and an empty bag-of-words.
    raw_bodies = dataframe['body'].fillna('').astype(str)

    # NOTE(review): stop words are stripped before lower-casing, so capitalised
    # stop words probably survive (the printed topics contain "i", "it", "we").
    # Left as-is because the trained model presumably saw the same preprocessing
    # -- confirm before reordering these steps.
    token_lists = [
        [
            snow.stem(token)
            # deacc/lowercase are real booleans here; the original passed the
            # strings "True", which only worked because non-empty strings are truthy.
            for token in tokenize(remove_stopwords(body), deacc=True, lowercase=True)
        ]
        for body in raw_bodies
    ]
    corpus = [dictionary.doc2bow(tokens) for tokens in token_lists]

    # Score documents one at a time. This reads the corpus built above (the
    # original referenced an undefined global, `disaster_body`) and also keeps
    # an empty dataframe from being mistaken for a single empty document.
    predicted_topics = [
        probs_to_topic(trained_model.get_document_topics(bow)) for bow in corpus
    ]

    # assign() returns a fresh frame, so no columns are written onto a slice of
    # the caller's dataframe (avoids SettingWithCopyWarning).
    return dataframe[['body']].assign(
        tokens=token_lists,
        corpus=corpus,
        predicted_topic=predicted_topics,
    )