In [42]:
import json, time, re, nltk, hdbscan, spacy, string
import psycopg2 as pg2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sys import getsizeof
from datetime import datetime
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from psycopg2.extras import RealDictCursor, Json
from spacy.lang.en.examples import sentences

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

np.random.seed(42)

%matplotlib inline
sns.set_context('poster')
sns.set_style('white')
sns.set_color_codes()
plot_kwds = {'alpha' : 0.5, 's' : 80, 'linewidths':0}

In [2]:
def filename_format_log(file_path, 
                        logfile = 'assets/file_log.txt', 
                        now = round(time.time()), 
                        file_description = None): 
   
    try:
        ext = re.search('(?<!^)(?<!\.)\.(?!\.)', file_path).start() 
    except:
        raise NameError('Please enter a relative path with a file extension.') 
    
    stamp = re.search('(?<!^)(?<!\.)[a-z]+_[a-z]+(?=\.)', file_path).start()
    formatted_name = f'{file_path[:stamp]}{now}_{file_path[stamp:]}'  
    if not file_description:
        file_description = f'Word list saved at: {time.asctime(time.gmtime(now))}'
    with open(logfile, 'a+') as f:
        f.write(f'{formatted_name}: {file_description}\n')
    return formatted_name, now, file_description

In [4]:
!ls 'assets/'

1547416339_crisislex_df.csv    file_log.txt
1547416339_crisislex_tfidf.csv


Reading in the csv to format as a dataframe.

In [5]:
df = pd.read_csv('assets/1547416339_crisislex_df.csv', low_memory=False)

In [11]:
df.head()

Unnamed: 0,tweet id,tweet,label,type,processed,clean_processed,lemm_clean_processed
0,'262596552399396864',I've got enough candles to supply a Mexican fa...,0,hurricane,i've got enough candles to supply a mexican fa...,"['i', 've', 'got', 'enough', 'candles', 'to', ...",i ve got enough candle to supply a mexican family
1,'263044104500420609',Sandy be soooo mad that she be shattering our ...,1,hurricane,sandy be soooo mad that she be shattering our ...,"['sandy', 'be', 'soooo', 'mad', 'that', 'she',...",sandy be soooo mad that she be shattering our ...
2,'263309629973491712',@ibexgirl thankfully Hurricane Waugh played it...,0,hurricane,ibexgirl thankfully hurricane waugh played it ...,"['ibexgirl', 'thankfully', 'hurricane', 'waugh...",ibexgirl thankfully hurricane waugh played it ...
3,'263422851133079552',@taos you never got that magnificent case of B...,0,hurricane,taos you never got that magnificent case of bu...,"['taos', 'you', 'never', 'got', 'that', 'magni...",tao you never got that magnificent case of bur...
4,'262404311223504896',"I'm at Mad River Bar &amp; Grille (New York, N...",0,hurricane,"i'm at mad river bar &amp; grille (new york, n...","['i', 'm', 'at', 'mad', 'river', 'bar', 'amp',...",i m at mad river bar amp grille new york ny URL


In [34]:
df.isna().sum()

tweet id                0
tweet                   1
label                   2
type                    2
processed               2
clean_processed         2
lemm_clean_processed    3
dtype: int64

In [37]:
df = df.dropna(axis = 0, inplace = False)

In [24]:
len(df.iloc[0]['lemm_clean_processed'])

49

In [39]:
def add_spacy_data(dataset, feature_column):
    '''
    Grabs the verb, adverb, noun, and stop word Parts of Speech (POS) 
    tokens and pushes them into a new dataset. returns an 
    enriched dataset.
    
    Parameters:
    
    dataset (dataframe): the dataframe to parse
    feature_column (string): the column to parse in the dataset.
    
    Returns: 
    dataframe
    '''
    
    verbs = []
    nouns = []
    adverbs = []
    corpus = []
    nlp = spacy.load('en_core_web_md')
    ##
    for i in range (len(dataset)):
        if i % 1000 == 0:
            print(f"Extracting verbs and topics from record {i+1} of {len(dataset)}")
        
        tweet = dataset.iloc[i][feature_column]
        doc = nlp(tweet)
        spacy_dataframe = pd.DataFrame()
        
        for token in doc:
            if token.lemma_ == "-PRON-":
                    lemma = token.text
            else:
                lemma = token.lemma_
            row = {
                "Word": token.text,
                "Lemma": lemma,
                "PoS": token.pos_,
                "Stop Word": token.is_stop
            }
            spacy_dataframe = spacy_dataframe.append(row, ignore_index = True)
        verbs.append(" ".join(spacy_dataframe["Lemma"][spacy_dataframe["PoS"] == "VERB"].values))
        nouns.append(" ".join(spacy_dataframe["Lemma"][spacy_dataframe["PoS"] == "NOUN"].values))
        adverbs.append(" ".join(spacy_dataframe["Lemma"][spacy_dataframe["PoS"] == "ADV"].values))
        corpus_clean = " ".join(spacy_dataframe["Lemma"][spacy_dataframe["Stop Word"] == False].values)
        corpus_clean = re.sub(r'[^A-Za-z0-9]+', ' ', corpus_clean)   
        corpus.append(corpus_clean)
    dataset['Verbs'] = verbs
    dataset['Nouns'] = nouns
    dataset['Adverbs'] = adverbs
    dataset['Corpus'] = corpus
    return dataset

def prep_corpus(raw_string):
    '''Single use of add_spacy_data to enable pipelining 
    data into predictions
    
    Parameters:
    raw_string (string): String to be parsed
    
    Returns:
    parsed string
    '''

    verbs = []
    nouns = []
    adverbs = []
    corpus = []
    nlp = spacy.load('en_core_web_sm')

    doc = nlp(raw_string)
    spacy_dataframe = pd.DataFrame()
    for token in doc:
        if token.lemma_ == "-PRON-":
                lemma = token.text
        else:
            lemma = token.lemma_
        row = {
            "Word": token.text,
            "Lemma": lemma,
            "PoS": token.pos_,
            "Stop Word": token.is_stop
        }
        spacy_dataframe = spacy_dataframe.append(row, ignore_index = True)
    verbs.append(" ".join(spacy_dataframe["Lemma"][spacy_dataframe["PoS"] == "VERB"].values))
    nouns.append(" ".join(spacy_dataframe["Lemma"][spacy_dataframe["PoS"] == "NOUN"].values))
    adverbs.append(" ".join(spacy_dataframe["Lemma"][spacy_dataframe["PoS"] == "ADV"].values))
    corpus_clean = " ".join(spacy_dataframe["Lemma"][spacy_dataframe["Stop Word"] == False].values)
    corpus_clean = re.sub(r'[^A-Za-z0-9]+', ' ', corpus_clean)   

    return corpus_clean

In [66]:
def map_popular_terms(dataset, feature_column, disaster_column):
    '''Function that counts the frequency of occurences of words in a dataset
    column. Returns a new dataset with those frequencies'''
    frequencies = pd.DataFrame()
    disaster = dataset[disaster_column].unique().tolist()
    for i in range (len(disaster)):
        disaster_corpus = str(dataset[feature_column][dataset[disaster_column] == disaster[i]].tolist())
        tokens = disaster_corpus.split(" ")
        counts = Counter(tokens)
        frequencies = frequencies.append({
            "disaster": disaster[i],
            "Most Common Terms": counts.most_common(n=100)
        }, ignore_index=True)
#     frequencies['disaster'] = frequencies['Most Common Terms'].astype(int)
    return frequencies

In [15]:
def map_common_words(dataset):
    '''Maps common words from across multiple columns in a dataset to 
    identify terms that show up in all columns. Normally used with the 
    outputs of map_popular_terms. returns the common words'''
    common_words = []
    for words in dataset['Most Common Terms'][0]:
        common_words.append(words[0])

    for i in range (0, len(dataset)):
        check_list = []
        disaster_list = dataset['Most Common Terms'][i]
        for words in disaster_list:
            check_list.append(words[0])
        common_words = [x for x in common_words if x  in check_list]
    return common_words

In [16]:
def get_common_frequency(term_list, frequency_list):
    '''Finds the frequency of occurence of terms in a list and then
    returns them in a new dataframe organized by year'''
    common_word_frequency_per_disaster = pd.DataFrame()
    for i in range(0, len(term_list)):
        word_frequency = []
        for j in range(0, len(frequency_list)):
            current_disaster = frequency_list['disaster'][j]
            current_disaster_terms = frequency_list['Most Common Terms'][j]
            for words in current_disaster_terms:
                    if term_list[i] in words[0]:
                        word_frequency.append(words[1])
                        #print(words[1])
                        break
        current_word = term_list[i]
        common_word_frequency_per_disaster[str(current_word)] = word_frequency
    common_word_frequency_per_disaster["disaster"] = np.arange(1970,2019)
    common_word_frequency_per_disaster = common_word_frequency_per_disaster.set_index("v")
    return common_word_frequency_per_disaster

In [None]:
add_spacy_data(df, 'lemm_clean_processed')

In [40]:
df.head()

Unnamed: 0,tweet id,tweet,label,type,processed,clean_processed,lemm_clean_processed,Verbs,Nouns,Adverbs,Corpus
0,'262596552399396864',I've got enough candles to supply a Mexican fa...,0,hurricane,i've got enough candles to supply a mexican fa...,"['i', 've', 'got', 'enough', 'candles', 'to', ...",i ve got enough candle to supply a mexican family,ve get supply,candle family,,i ve get enough candle to supply a mexican family
1,'263044104500420609',Sandy be soooo mad that she be shattering our ...,1,hurricane,sandy be soooo mad that she be shattering our ...,"['sandy', 'be', 'soooo', 'mad', 'that', 'she',...",sandy be soooo mad that she be shattering our ...,be be shatter,door hurricanesandy,soooo,sandy be soooo mad that she be shatter our doo...
2,'263309629973491712',@ibexgirl thankfully Hurricane Waugh played it...,0,hurricane,ibexgirl thankfully hurricane waugh played it ...,"['ibexgirl', 'thankfully', 'hurricane', 'waugh...",ibexgirl thankfully hurricane waugh played it ...,play wait go,ibexgirl hurricane waugh one moment tho,thankfully,ibexgirl thankfully hurricane waugh play it co...
3,'263422851133079552',@taos you never got that magnificent case of B...,0,hurricane,taos you never got that magnificent case of bu...,"['taos', 'you', 'never', 'got', 'that', 'magni...",tao you never got that magnificent case of bur...,get send thank,case tweet,never,tao you never get that magnificent case of bur...
4,'262404311223504896',"I'm at Mad River Bar &amp; Grille (New York, N...",0,hurricane,"i'm at mad river bar &amp; grille (new york, n...","['i', 'm', 'at', 'mad', 'river', 'bar', 'amp',...",i m at mad river bar amp grille new york ny URL,m grille,river bar amp york ny url,,i m at mad river bar amp grille new york ny url


In [67]:
map_popular_terms(df, 'Corpus', 'type')

Unnamed: 0,Most Common Terms,disaster
0,"[(hurricane, 4838), (be, 3988), (the, 3851), (...",hurricane
1,"[(the, 8852), (to, 7542), (flood, 7231), (be, ...",flood
2,"[(to, 4190), (the, 4180), (be, 3199), (oklahom...",tornado


In [62]:
frequencies.head()