In [2]:
#!/usr/bin/env python3

'''
Authors: Daniel M. Low
License: Apache 2.0
'''
import os
import re
import string
from collections import Counter

import numpy as np
import pandas as pd
import stanza
import textacy
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer


ModuleNotFoundError: No module named 'textacy'

In [None]:
# One-time setup: fetch the English models and build the pipeline here, so the
# later cells reuse `nlp` instead of reloading it for every post.
stanza.download(lang='en')
nlp = stanza.Pipeline(processors='tokenize,ner', lang='en')




In [34]:
# Sanity check of the pipeline: tag a two-sentence example, then show the NER
# tag of every token followed by the entity spans that were recognised.
post = "Mayor Cuomo may lockdown NYC. I live in Brooklyn."
post = nlp(post)
print('\n'.join(
    f'token: {token.text}\tner: {token.ner}'
    for sent in post.sentences
    for token in sent.tokens
))
print(post.ents)

token: Mayor	ner: O
token: Cuomo	ner: S-PERSON
token: may	ner: O
token: lockdown	ner: O
token: NYC	ner: S-GPE
token: .	ner: O
token: I	ner: O
token: live	ner: O
token: in	ner: O
token: Brooklyn	ner: S-GPE
token: .	ner: O
[{
  "text": "Cuomo",
  "type": "PERSON",
  "start_char": 6,
  "end_char": 11
}, {
  "text": "NYC",
  "type": "GPE",
  "start_char": 25,
  "end_char": 28
}, {
  "text": "Brooklyn",
  "type": "GPE",
  "start_char": 40,
  "end_char": 48
}]


In [50]:
# Pull named entities (locations by default) out of a piece of text
def extract_entity(doc, entity = 'GPE'):
    '''
    Return the text of the named entities found in `doc`.

    doc: raw text. It is parsed here with the module-level `nlp` pipeline, which is
         loaded once in an earlier cell. Do not lowercase the text beforehand: per
         the original note, "nyc" is not recognised, only "NYC".
    entity: entity type to keep (compared with each entity's `.type`), or 'all' to
            keep every entity regardless of type.

    Returns a list of entity strings in the order the pipeline reports them.

    NOTE(review): the labels seen in this notebook's outputs are 'GPE' and 'PERSON';
    the remaining labels presumably follow the tag set of the loaded NER model - verify.
    '''
    found = nlp(doc).ents
    if entity == 'all':
        return [ent.text for ent in found]
    # Keep only the entities whose type matches the requested label, e.g. 'NYC' for 'GPE'
    return [ent.text for ent in found if ent.type == entity]
    

In [51]:


# Try the extractor on one example per mode: locations, people, and every entity.
for text, label in [
    ("I live in Brooklyn, NYC", 'GPE'),
    ("I live in Brooklyn, Cuomo's town", 'PERSON'),
    ("I live in Brooklyn, Cuomo's town, on April 21st, 1990", 'all'),
]:
    print(extract_entity(text, entity = label))

['Brooklyn', 'NYC']
['Cuomo']
['Brooklyn', 'Cuomo', 'April 21st, 1990']


In [110]:
def count_words(doc, words=None):
    '''
    Count how often any of the given words or phrases occurs in `doc`.

    doc: text to search.
    words: iterable of words or multi-word phrases, e.g.
        ['corona','virus','coronavirus', 'pandemic', 'epidemic', 'quarantine', 'covid', 'covid19']
        or ["don't want to wake up", 'kill myself']. None or empty means nothing to search for.

    Both the text and the search terms are lowercased, stripped of punctuation other
    than apostrophes (needed for phrases like "don't want to live") and have their
    whitespace collapsed. A term only matches whole whitespace-delimited tokens, so
    'virus' does not match inside 'coronavirus'.

    Returns (counter, appears): the total number of matches over all terms, and 1 if
    there was at least one match, else 0.
    '''
    def _normalize(text):
        # Lowercase, drop punctuation except apostrophes, collapse runs of whitespace
        stripped = re.sub(r"[^\w\d'\s]+", '', text.lower())
        return ' '.join(stripped.split())

    text = _normalize(doc)
    counter = 0
    for word in words or []:
        target = _normalize(word)
        if not target:
            continue
        # (?<!\S) / (?!\S) anchor the match to token boundaries, so single words behave
        # like an exact token comparison while multi-word phrases can still be found.
        pattern = r'(?<!\S)' + re.escape(target) + r'(?!\S)'
        counter += len(re.findall(pattern, text))

    appears = 1 if counter > 0 else 0
    return counter, appears

def punctuation_count(doc):
    '''
    Return the number of punctuation characters in `doc` as a plain int.

    Apostrophes, '<', '>' and '_' are deliberately not counted (something like
    m_c_a_t doesn't really reflect punctuation). The backslash is counted.
    '''
    # Raw string so that the backslash is an explicit member of the set
    punctuation = set(r'!"#$%&()*+,-./:;=?@[\]^`{|}~')
    return sum(1 for c in doc if c in punctuation)

def liwc(input_path, document=None):
    '''
    Count, for every LIWC category, how many stemmed tokens of `document` match the category's words.

    input_path: prefix that 'liwc.npy' and 'categories.txt' are appended to, so a
        directory must be given with its trailing path separator.
    document: text to score; None is treated as an empty document.

    Returns (liwc_vector, names): one count per category, in the order of
    categories.txt, and the matching feature names prefixed with 'liwc_'.
    '''
    # liwc.npy is read back with .item() and used as a dict (category -> words), i.e. it
    # is a pickled object array, which np.load refuses to read unless allow_pickle=True.
    # NOTE(review): unpickling can execute arbitrary code - only load a trusted file here.
    liwc_dict = np.load(input_path + 'liwc.npy', allow_pickle=True).item()
    categories = pd.read_csv(input_path + 'categories.txt', index_col=0)['0'].tolist()

    # Tokenise and stem once; the result does not depend on the category
    stemmer = SnowballStemmer(language='english')
    document_tokenized = [n.strip(string.punctuation).lower() for n in (document or '').split()]
    stem_counts = Counter(stemmer.stem(word) for word in document_tokenized)

    liwc_vector = []
    for category in categories:
        category_words = liwc_dict.get(category) or []
        # rstrip removes the trailing space that dictionary words can carry
        liwc_vector.append(int(sum(stem_counts[word.rstrip()] for word in category_words)))
    names = ['liwc_'+n for n in categories]
    return liwc_vector, names

In [None]:
def tfidf(X_train_sentences = [], X_test_sentences=[], lower_case = True, ngram_range = (1,2), max_features=512, min_df=2, max_df=0.8, model = 'vector'):
    """
    Build TF-IDF features for a corpus.

    TfidfVectorizer is CountVectorizer followed by TfidfTransformer: the former converts text documents
    to a sparse matrix of token counts, the latter converts that count matrix to a normalized
    Term Frequency-Inverse Document Frequency (tf-idf) representation, a metric of word importance.
    The vectorizer is fitted on the train sentences only and merely applied to the test sentences,
    so that no information from the test set leaks into the features.

    X_train_sentences: documents to fit on.
    X_test_sentences: documents to transform with the fitted vectorizer; may be None or empty.
    lower_case, ngram_range, max_features, min_df, max_df: passed through to TfidfVectorizer.
    model: {vector, sequential} depending on what the model takes as input: vector (svm, random forest),
        sequential (lstm). 'sequential' uses an identity analyzer; any other value is treated as 'vector'.
        NOTE(review): scikit-learn documents that a callable analyzer replaces preprocessing,
        tokenization and n-gram extraction, so lower_case, ngram_range and the stop words
        presumably have no effect in 'sequential' mode - confirm.

    Returns (train_vectors, test_vectors, feature_names) when test sentences were given,
    otherwise (train_vectors, feature_names). Vectors are dense arrays and feature names
    are prefixed with 'tfidf_'.
    """
    vectorizer_kwargs = dict(lowercase=lower_case, ngram_range=ngram_range, stop_words=stopwords.words('english'),
                             max_features=max_features, min_df=min_df, max_df=max_df)
    if model == 'sequential':
        vectorizer_kwargs['analyzer'] = lambda x: x
    vectorizer = TfidfVectorizer(**vectorizer_kwargs)

    # len() instead of truthiness so that None, lists and numpy arrays are all handled
    has_test = X_test_sentences is not None and len(X_test_sentences) > 0

    train_vectors = vectorizer.fit_transform(X_train_sentences).toarray()
    if has_test:
        test_vectors = vectorizer.transform(X_test_sentences).toarray()

    # get_feature_names() was removed in scikit-learn 1.2; prefer its replacement when available
    if hasattr(vectorizer, 'get_feature_names_out'):
        raw_names = vectorizer.get_feature_names_out()
    else:
        raw_names = vectorizer.get_feature_names()
    feature_names = ['tfidf_'+str(n) for n in raw_names]

    if has_test:
        return train_vectors, test_vectors, feature_names
    return train_vectors, feature_names


In [None]:
def extract_NLP_features(doc, features):
    '''
    Extract the requested per-document features from a single post.

    doc: text of the post.
    features: collection of feature-group names; the groups handled here are
        'sentiment', 'covid19', 'suicidality', 'punctuation', 'liwc', 'basic_count'
        and 'readability'. Other names (e.g. 'tfidf', which is computed on the whole
        corpus rather than per post) are ignored.

    Returns (feature_vector, feature_names): two flat lists of equal length, in the
    order the groups are listed above.
    '''
    feature_vector = []
    feature_names  = []

    if 'sentiment' in features:
        # don't lowercase or remove punctuation, but maybe preprocess emojis
        sid = SentimentIntensityAnalyzer()
        scores = sid.polarity_scores(doc)
        score_keys = ['neg', 'neu', 'pos', 'compound']
        names = ['sent_neg','sent_neu', 'sent_pos', 'sent_compound']
        assert len(scores) == len(names)
        # Append the score values in a fixed order; appending the dict itself would
        # contribute its keys once the groups are flattened below.
        feature_vector.append([scores[key] for key in score_keys])
        feature_names.append(names)

    if 'covid19' in features:
        # todo: see if we should add more by looking through COVID19_Support to see how people are mentioning it
        words = ['corona','virus','coronavirus', 'pandemic', 'epidemic', 'quarantine', 'covid', 'covid19']
        counter, appears = count_words(doc,words=words)
        feature_vector.append([counter, appears])
        feature_names.append(['covid19_total', 'covid19_boolean'])

    if 'suicidality' in features:
        # todo: should we add any more? A suicide researcher gave me some more. We should be good, but we could go through SuicideWatch
        # Phrases are lowercase because count_words lowercases the text before searching.
        words = ['commit suicide', 'jump off a bridge', 'i want to overdose', 'i will overdose', 'thinking about overdose',
                 'kill myself', 'killing myself', 'hang myself', 'hanging myself', 'cut myself', 'cutting myself',
                 'hurt myself', 'hurting myself', 'want to die', 'wanna die', "don't want to wake up", "don't wake up",
                 'never want to wake up', "don't want to be alive", 'want to be alive anymore', 'wish it would all end',
                 'done with living', 'want it to end', 'it all ends tonight', 'end my life', 'live anymore',
                 'living anymore', 'life anymore', 'be dead', 'take it anymore', 'think about death', 'hopeless',
                 'no one will miss me', 'if i live or die', 'i hate my life', 'shoot me', 'kill me']
        counter, appears = count_words(doc,words=words)
        feature_vector.append([counter, appears])
        feature_names.append(['suicidality_total', 'suicidality_boolean'])

    if 'punctuation' in features:
        count = punctuation_count(doc)
        feature_vector.append([count])
        feature_names.append(['punctuation'])

    if 'liwc' in features:
        input_path = './../data/input/liwc_english_dictionary/'
        vector, names = liwc(input_path = input_path, document = doc)
        feature_vector.append(vector)
        feature_names.append(names)

    if 'basic_count' in features:
        # https://chartbeat-labs.github.io/textacy/build/html/getting_started/quickstart.html
        # Expected shape of basic_counts, e.g.:
        # {'n_sents': 3, 'n_words': 73, 'n_chars': 414, 'n_syllables': 134, 'n_unique_words': 57,
        #  'n_long_words': 30, 'n_monosyllable_words': 38, 'n_polysyllable_words': 19}
        # NOTE(review): TextStats presumably expects a parsed spaCy Doc rather than a raw
        # string, and basic_counts only exists in older textacy releases - confirm
        # against the installed version.
        basic_counts = textacy.TextStats(doc).basic_counts
        feature_vector.append(list(basic_counts.values()))
        feature_names.append(list(basic_counts.keys()))

    if 'readability' in features:
        # Expected shape of readability_stats, e.g.:
        # {'flesch_kincaid_grade_level': 15.56, 'flesch_reading_ease': 26.84, 'smog_index': 17.51,
        #  'gunning_fog_index': 20.14, 'coleman_liau_index': 16.33, 'automated_readability_index': 17.45,
        #  'lix': 65.43, 'gulpease_index': 44.62, 'wiener_sachtextformel': 11.86}
        # NOTE(review): same caveat as for basic_counts - confirm that the installed
        # textacy version exposes readability_stats.
        readability_stats = textacy.TextStats(doc).readability_stats
        feature_vector.append(list(readability_stats.values()))
        feature_names.append(list(readability_stats.keys()))

    # Flatten the per-group lists into one vector / one list of names
    feature_vector = [n for i in feature_vector for n in i]
    feature_names  = [n for i in feature_names for n in i]
    return feature_vector, feature_names

    

In [None]:

features = ['sentiment', 'covid19', 'suicidality', 'punctuation', 'liwc', 'basic_count', 'readability',  'tfidf']

input_dir = './../data/input/'
# One feature table per input file, keyed by file name
subreddit_features = {}
for file in sorted(os.listdir(input_dir)):
    # input_dir also holds the liwc_english_dictionary/ folder, which is not a csv of posts
    if not os.path.isfile(input_dir + file):
        continue
    df_subreddit = pd.read_csv(input_dir + file)
    # Remove paragraph splits, double spaces and curly quotes; done once for all posts.
    # Missing posts become empty strings so that the string methods below do not fail.
    posts = [post.replace('\n', ' ').replace('  ',' ').replace('“', '').replace('”', '')
             for post in df_subreddit.posts.fillna('').astype(str)]

    # Per-post features: one row per post, the names are the same for every post
    rows = []
    feature_names = []
    for post in posts:
        feature_vector, feature_names = extract_NLP_features(post, features)
        rows.append(feature_vector)
    df_subreddit_features = pd.DataFrame(rows, columns=feature_names)

    if 'tfidf' in features:
        # tfidf is done with the whole corpus, not for each sentence, or with the whole train set
        # (if we're doing a split, which we may not be for cluster analysis).
        # todo: play with the params a bit to figure out how many max_features so that it's not too
        # sparse and captures important words per subreddit.
        train_vectors, tfidf_names = tfidf(X_train_sentences = posts, X_test_sentences=[], lower_case = True, ngram_range = (1,2), max_features=512, min_df=2, max_df=0.8, model = 'vector')
        df_tfidf = pd.DataFrame(train_vectors, columns=tfidf_names)
        df_subreddit_features = pd.concat([df_subreddit_features, df_tfidf], axis=1)

    subreddit_features[file] = df_subreddit_features
        
        
        