In [None]:
"""
@author Chase Fensore
BMI 550: Assignment 2
Inspiration from: https://www.kaggle.com/code/shivamb/extensive-text-data-feature-engineering
"""
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from keras.preprocessing import sequence, text
from keras.layers import Input, Embedding

from nltk import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob

import datetime as dt
import pandas as pd
import numpy as np
import warnings
import string

import nltk
nltk.download('averaged_perceptron_tagger')

from nltk.stem.porter import *
from nltk.corpus import stopwords
# Shared NLP resources used by the feature-engineering cells below.
stemmer = PorterStemmer()  # used by preprocess_text() to stem each token

# De-duplicated English stopword list from NLTK.
# (Set `stop_words = []` instead to disable stopword counting.)
try:
    stop_words = list(set(stopwords.words('english')))
except LookupError:
    # The stopwords corpus is not installed on a fresh environment (only the
    # POS tagger is downloaded in the import cell); fetch it once and retry.
    nltk.download('stopwords')
    stop_words = list(set(stopwords.words('english')))

# NOTE: this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Characters counted by the punctuation_count feature.
punctuation = string.punctuation

In [None]:
# Helper functions
# functions to get polarity and subjectivity of text using the TextBlob module
def get_polarity(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Parameters
    ----------
    text : str or bytes
        Text to score. ``bytes`` input is decoded as UTF-8 first.

    Returns
    -------
    float
        ``TextBlob(text).sentiment.polarity``, or ``0.0`` if the text cannot
        be scored (e.g. it is not a string, or TextBlob raises).
    """
    try:
        # Python 3 `str` is already unicode. The previous `unicode(text, 'utf-8')`
        # call raised NameError, which the bare `except` swallowed, so this
        # function always returned 0.0. Only raw bytes need decoding.
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        pol = TextBlob(text).sentiment.polarity
    except Exception:
        # Best-effort feature: fall back to a neutral score rather than abort
        # the whole DataFrame.apply(). KeyboardInterrupt is no longer swallowed.
        pol = 0.0
    return pol

def get_subjectivity(text):
    """Return the TextBlob sentiment subjectivity of ``text``.

    Parameters
    ----------
    text : str or bytes
        Text to score. ``bytes`` input is decoded as UTF-8 first.

    Returns
    -------
    float
        ``TextBlob(text).sentiment.subjectivity``, or ``0.0`` if the text
        cannot be scored (e.g. it is not a string, or TextBlob raises).
    """
    try:
        # Python 3 `str` is already unicode. The previous `unicode(text, 'utf-8')`
        # call raised NameError, which the bare `except` swallowed, so this
        # function always returned 0.0. Only raw bytes need decoding.
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        subj = TextBlob(text).sentiment.subjectivity
    except Exception:
        # Best-effort feature: fall back to 0.0 rather than abort the whole
        # DataFrame.apply(). KeyboardInterrupt is no longer swallowed.
        subj = 0.0
    return subj

# Part-of-speech tag groups, keyed by the coarse name that callers pass to
# pos_check() as `flag`. Each value is the list of fine-grained tags counted
# for that group.
pos_dic = dict(
    noun='NN NNS NNP NNPS'.split(),
    pron='PRP PRP$ WP WP$'.split(),
    verb='VB VBD VBG VBN VBP VBZ'.split(),
    adj='JJ JJR JJS'.split(),
    adv='RB RBR RBS WRB'.split(),
)

# function to count the words in a given sentence whose part-of-speech tag belongs to a given group
def pos_check(x, flag):
    """Count the words in ``x`` whose POS tag belongs to the ``flag`` group.

    Parameters
    ----------
    x : str
        Text to tag with TextBlob.
    flag : str
        Key of ``pos_dic`` selecting the tag group (e.g. ``'noun'``).

    Returns
    -------
    int
        Number of tagged tokens whose tag is listed in ``pos_dic[flag]``.
        Best-effort: if tagging fails or ``flag`` is unknown, the count
        accumulated so far (0 in the common case) is returned.
    """
    cnt = 0
    try:
        # Look the tag group up once instead of once per token.
        wanted_tags = pos_dic[flag]
        for tup in TextBlob(x).tags:
            # Each entry of `.tags` is indexed as (word, tag).
            if tup[1] in wanted_tags:
                cnt += 1
    except Exception:
        # Keep the original best-effort behaviour for ordinary errors, but no
        # longer swallow KeyboardInterrupt/SystemExit as the bare `except` did.
        # NOTE(review): a missing NLTK resource would also land here and
        # silently yield 0 for every row - confirm the counts are non-zero.
        pass
    return cnt

In [None]:
# Load the train and test splits of the fall-report data.
train = pd.read_csv("fallreports_2023-9-21_train.csv")
test = pd.read_csv("fallreports_2023-9-21_test.csv")

# From here on each split is processed INDEPENDENTLY to avoid data leakage.

# Force the free-text column to str in both splits so the string-based feature
# extractors below can be applied to every row. The author's later note says
# missing descriptions then show up as the literal text "nan".
for split_df in (train, test):
    split_df['fall_description'] = split_df['fall_description'].astype(str)

In [None]:
"""
Make simple text features:

# Feature 28: Word Density of the Complete Essay - average length of the words used in the essay
# Feature 29: Punctuation Count in the Complete Essay - total number of punctuation marks in the essay
# Feature 30: Upper Case Count in the Complete Essay - total number of upper-case words in the essay
# Feature 32: Stopword Count in the Complete Essay - total number of stopwords in the essay
"""

def add_simple_text_features(df, text_col='fall_description'):
    """Add surface-level text statistics for ``text_col`` to ``df`` in place.

    Columns written (in this order): ``char_count``, ``word_count``,
    ``word_density``, ``punctuation_count``, ``title_word_count``,
    ``upper_case_word_count``, ``stopword_count``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a string column ``text_col``; it is modified in place.
    text_col : str, default 'fall_description'
        Name of the text column to describe.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenience.
    """
    texts = df[text_col]
    # Split on whitespace once and reuse the tokens for every word-level count.
    tokens = texts.apply(lambda x: x.split())
    # `stop_words` is a list; a set makes each membership test O(1).
    stopword_lookup = set(stop_words)

    df['char_count'] = texts.apply(len)
    df['word_count'] = tokens.apply(len)
    # The +1 keeps the ratio finite when a description has no words.
    df['word_density'] = df['char_count'] / (df['word_count'] + 1)
    df['punctuation_count'] = texts.apply(lambda x: sum(ch in punctuation for ch in x))
    df['title_word_count'] = tokens.apply(lambda words: sum(wrd.istitle() for wrd in words))
    df['upper_case_word_count'] = tokens.apply(lambda words: sum(wrd.isupper() for wrd in words))
    df['stopword_count'] = tokens.apply(lambda words: sum(wrd.lower() in stopword_lookup for wrd in words))
    return df


# TRAINING DATA and TEST DATA: identical extraction, applied to each split
# separately (every feature is computed per row, so nothing is shared).
add_simple_text_features(train)
add_simple_text_features(test)


# Check results
#test[['char_count', 'word_count', 'word_density', 'punctuation_count', 'title_word_count', 'upper_case_word_count', 'stopword_count']].head(10)

In [None]:
""" 
Make advanced text features:

# ADVANCED TEXT FEATURES:
# Feature 33: Article Polarity - TextBlob sentiment polarity of the fall description
# Feature 34: Article Subjectivity - TextBlob sentiment subjectivity of the fall description
# Number of POSs
#   Feature 35: Noun Count - number of words tagged as nouns in the fall description
#   Feature 36: Verb Count - number of words tagged as verbs in the fall description
#   Feature 37: Adjective Count - number of words tagged as adjectives in the fall description
#   Feature 38: Adverb Count - number of words tagged as adverbs in the fall description
#   Feature 39: Pronoun Count - number of words tagged as pronouns in the fall description

# FUTURE WORK: 
# Readability measures: from https://pypi.org/project/py-readability-metrics/
# Dale Chall Readability: based on the use of familiar words, rather than syllable or letter counts. Reading tests show that readers usually find it easier to read, process and recall a passage if they find the words familiar.
# Automated Readability Index (ARI): Unlike the other indices, the ARI, along with the Coleman-Liau, relies on a factor of characters per word, instead of the usual syllables per word. ARI is widely used on all types of texts.
#   SMOG: The SMOG Readability Formula (Simple Measure of Gobbledygook) is a popular method to use on health literacy materials.
"""


def add_sentiment_pos_features(df, text_col='fall_description'):
    """Add sentiment scores and part-of-speech counts for ``text_col`` in place.

    Writes ``polarity`` and ``subjectivity`` (via get_polarity /
    get_subjectivity) followed by ``noun_count``, ``verb_count``,
    ``adj_count``, ``adv_count`` and ``pron_count`` (via pos_check).
    Returns the same ``df`` object.
    """
    texts = df[text_col]
    df['polarity'] = texts.apply(get_polarity)
    df['subjectivity'] = texts.apply(get_subjectivity)

    # Output column -> pos_dic group; dict order fixes the column order.
    pos_columns = {
        'noun_count': 'noun',
        'verb_count': 'verb',
        'adj_count': 'adj',
        'adv_count': 'adv',
        'pron_count': 'pron',
    }
    for column_name, pos_group in pos_columns.items():
        df[column_name] = texts.apply(lambda x, flag=pos_group: pos_check(x, flag))
    return df


# TRAINING DATA:
add_sentiment_pos_features(train)

# TEST DATA:
add_sentiment_pos_features(test)





In [None]:
# Drop the training examples whose fall description is the literal string "nan"
# (author's note: these were "NA" / 'nan' in the source data).
# `.copy()` makes the filtered frame independent of the original, so the column
# assignments in later cells do not act on a slice (the resulting
# SettingWithCopyWarning would be hidden by the global warnings filter).
train = train[train['fall_description'] != "nan"].copy()
# 3 fall reports were excluded from train set because they were "NA" / 'nan'


In [None]:
# Now, generate n-grams for TRAIN, TEST:
def preprocess_text(raw_text):
    '''
        Master preprocessing step applied to a text before n-gram extraction.

        Lowercases ``raw_text``, splits it on whitespace and stems every token
        with the module-level ``stemmer``. Stopwords are NOT removed.
        Returns the stemmed tokens joined by single spaces.
    '''
    lowered_tokens = raw_text.lower().split()
    stemmed_tokens = map(stemmer.stem, lowered_tokens)
    return " ".join(stemmed_tokens)


# Preprocess (lowercase + stem) both splits before generating n-grams.
train['pp_fall_description'] = [preprocess_text(tr) for tr in train['fall_description']]
test['pp_fall_description'] = [preprocess_text(te) for te in test['fall_description']]


# VECTORIZE N-GRAMS: word-level (1- to 3-grams, capped at 1000 features).
# Kept under its own name so the fitted vocabulary behind the word-count
# (and later TF-IDF) columns is not lost when the char-level model is built.
word_vectorizer = CountVectorizer(ngram_range=(1, 3), analyzer="word", tokenizer=None, preprocessor=None,
                                  max_features=1000)
# Fit on TRAIN only; the test split is transformed with the train vocabulary.
word_training_vectors = word_vectorizer.fit_transform(train['pp_fall_description']).toarray()
word_test_vectors = word_vectorizer.transform(test['pp_fall_description']).toarray()


# VECTORIZE N-GRAMS: Character-level (character n-grams only from text inside word boundaries)
char_vectorizer = CountVectorizer(ngram_range=(1, 3), analyzer="char_wb", tokenizer=None, preprocessor=None,
                                  max_features=1000)
char_training_vectors = char_vectorizer.fit_transform(train['pp_fall_description']).toarray()
char_test_vectors = char_vectorizer.transform(test['pp_fall_description']).toarray()

# Backward compatibility: `vectorizer` used to end this cell bound to the
# character-level model.
vectorizer = char_vectorizer

In [117]:
# TF-IDF re-weighting of the word-level n-gram counts.
# The IDF statistics are learned from the training counts only and then reused
# unchanged for the test counts.
tfidf_trans = TfidfTransformer().fit(word_training_vectors)
tfidf_train = tfidf_trans.transform(word_training_vectors)  # train: fitted above, now transformed
tfidf_test = tfidf_trans.transform(word_test_vectors)  # test: transform only

<71x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 3698 stored elements in Compressed Sparse Row format>

In [124]:
# Check nans
# Train
#nan_train = train.isna()
# Test
#nan_test = test.isna()
#nan_counts_train = nan_train.sum()
#print(nan_counts_train)


def fill_numeric_nans_with_median(df, columns):
    """Replace NaNs in each of ``columns`` with that column's own median.

    ``df`` is modified in place and also returned. Each column is assigned
    back explicitly: ``df[col].fillna(..., inplace=True)`` operates on an
    intermediate object and does not reliably update ``df`` under pandas
    Copy-on-Write.
    """
    for column in columns:
        df[column] = df[column].fillna(df[column].median())
    return df


# Read most recent feat file
# NOTE(review): these CSVs are written by the last cell of this notebook, so a
# fresh Restart & Run All needs them to exist already - confirm intended order.
train = pd.read_csv("train_folds.csv")
test = pd.read_csv("test_feats.csv")


# Identify numerical columns (Train, test same names)
numerical_columns = train.select_dtypes(include=[np.number]).columns


# TRAIN: fill NaN values with the train column median.
fill_numeric_nans_with_median(train, numerical_columns)

# TEST: fill NaN values with the test column median (splits are handled
# independently). Columns that exist only in train are skipped instead of
# raising a KeyError.
fill_numeric_nans_with_median(test, [column for column in numerical_columns if column in test.columns])



In [None]:
# Boolean NaN masks for both splits (the test mask is kept for ad-hoc inspection).
nan_train = train.isna()
nan_test = test.isna()

# Per-column NaN totals for the training split.
nan_counts_train = nan_train.sum(axis=0)
print(nan_counts_train)

In [116]:
# Finally, combine features (they are written to CSV in a later cell).

def _rows_as_series(matrix, index):
    """Wrap each row of a 2-D array as one cell of a Series labelled by ``index``.

    Passing the target frame's own index matters: a Series built without one
    gets a fresh 0..n-1 index and pandas aligns on labels during column
    assignment, so a frame with a non-default index (e.g. after rows were
    filtered out) would get misaligned or NaN cells. A row-count mismatch now
    raises ValueError instead of being silently padded.
    """
    return pd.Series(list(matrix), index=index)


# train
train['word_training_vectors'] = _rows_as_series(word_training_vectors, train.index)
train['char_training_vectors'] = _rows_as_series(char_training_vectors, train.index)
train['tfidf_train'] = _rows_as_series(tfidf_train.toarray(), train.index)

# test (same column names as train, even though they say "training"/"train")
test['word_training_vectors'] = _rows_as_series(word_test_vectors, test.index)
test['char_training_vectors'] = _rows_as_series(char_test_vectors, test.index)
test['tfidf_train'] = _rows_as_series(tfidf_test.toarray(), test.index)

# Sanity check: shape of the first test row's word-level vector (positional lookup).
test['word_training_vectors'].iloc[0].shape

(1000,)

In [127]:
# Add a constant "ALL" column to both splits (demographic characteristic used
# for subgroups; every row gets the value 1).
for split_df in (train, test):
    split_df['ALL'] = 1


# Finally, write to csv.
# NOTE(review): the DataFrame index is written as an extra unnamed column and
# array-valued cells are stored as their string form - confirm that downstream
# readers expect this.

# Train output includes folds; excludes 3 samples with nan for fall description.
# (Previously also written as 'train_feats.csv'.)
train.to_csv('train_folds.csv')

# Test output does NOT exclude any nan fall descriptions.
test.to_csv('test_feats.csv')