In [61]:
# Install the small English spaCy pipeline that spacy.load("en_core_web_sm") needs in later cells.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
     --------------------------------------- 12.8/12.8 MB 18.2 MB/s eta 0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [62]:
from afinn import Afinn
import pandas as pd
import re
from labMTsimple.storyLab import *
import nltk
from nltk.tree import Tree
from nltk.tree import ParentedTree
from nltk.tokenize import sent_tokenize
import numpy as np
import spacy

# Fetch the NLTK resources used below: 'punkt' backs the sentence/word tokenizers and
# 'averaged_perceptron_tagger' backs nltk.pos_tag.
# NOTE(review): 'large_grammars' is not referenced by any visible cell — confirm it is still needed.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('large_grammars')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ckhoe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ckhoe\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package large_grammars to
[nltk_data]     C:\Users\ckhoe\AppData\Roaming\nltk_data...
[nltk_data]   Package large_grammars is already up-to-date!


True

## Sentiment

In [63]:
# Lower-case the three column headers so every later cell can address them as
# 'user', 'post' and 'label'.
column_renames = {'User': 'user', 'Post': 'post', 'Label': 'label'}
reddit_data = reddit_data.rename(columns=column_renames)

In [64]:
# Shared AFINN scorer: get_sentiment() and the 'afinn_score' column both call afn.score.
afn = Afinn()

def get_sentiment(text):
    """Classify ``text`` by the sign of its AFINN score.

    Returns 'positive' for a score above zero, 'negative' for a score below
    zero, and 'neutral' otherwise.
    """
    polarity = afn.score(text)
    if polarity > 0:
        return 'positive'
    if polarity < 0:
        return 'negative'
    return 'neutral'

In [65]:
# Add the raw AFINN score and its coarse polarity label as per-post columns.
for column_name, scorer in (('afinn_score', afn.score), ('sentiment', get_sentiment)):
    reddit_data[column_name] = reddit_data['post'].apply(scorer)

## Personal Pronoun Count and Ratio

In [66]:
def get_pronoun_count(text):
    """Count first-person pronouns in ``text``.

    Matches the whole words I, me, mine, myself, we, us, ours and ourselves,
    ignoring case, and returns the number of occurrences.
    """
    first_person = r'\b(I|me|mine|myself|we|us|ours|ourselves)\b'
    return sum(1 for _ in re.finditer(first_person, text, re.IGNORECASE))

# Per-post first-person pronoun count, plus each post's share of the total count
# over all posts (the share is NaN when that total is zero).
pronoun_counts = reddit_data['post'].apply(get_pronoun_count)
reddit_data['personal_pronoun_count'] = pronoun_counts
reddit_data['personal_pronoun_itra_ratio'] = pronoun_counts / pronoun_counts.sum()

## Number of Sentences

In [67]:
def count_sentences(text):
    """Return how many sentences NLTK's sentence tokenizer finds in ``text``."""
    return len(nltk.sent_tokenize(text))

# Store the per-post sentence count as a feature column.
sentence_counts = reddit_data['post'].apply(count_sentences)
reddit_data['num_sentences'] = sentence_counts

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ckhoe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ckhoe\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Definite Articles

In [68]:
def count_articles(text):
    """Count occurrences of the definite article "the" in ``text``.

    Only the whole word is matched, case-insensitively.
    """
    definite_article = r'\b(the)\b'
    return sum(1 for _ in re.finditer(definite_article, text, re.IGNORECASE))

# Store the per-post definite-article count as a feature column.
article_counts = reddit_data['post'].apply(count_articles)
reddit_data["num_articles"] = article_counts

## Pronouns

In [69]:
def count_pronouns(text):
    """Count the tokens in ``text`` that NLTK tags as pronouns.

    The text is word-tokenized and POS-tagged with NLTK; a token counts when
    its tag is one of PRP, PRP$, WP or WP$.
    """
    pronoun_tags = ('PRP', 'PRP$', 'WP', 'WP$')
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    return sum(tag in pronoun_tags for _, tag in tagged_tokens)

# Store the per-post count of pronoun-tagged tokens as a feature column.
tagged_pronoun_counts = reddit_data['post'].apply(count_pronouns)
reddit_data["num_pronouns"] = tagged_pronoun_counts

## Tree Height

In [70]:
# Load the spaCy pipeline once at notebook level; get_tree_height() parses every post with it.
snlp = spacy.load("en_core_web_sm")

def tree_height(node):
    """Return the height of the subtree rooted at ``node``.

    A node without children has height 1; otherwise the height is one more
    than the tallest child subtree. ``node`` only needs a ``children``
    iterable.
    """
    child_heights = [tree_height(child) for child in node.children]
    return 1 + max(child_heights, default=0)

def get_tree_height(text):
    """Return ``(mean, max)`` of the parse-tree heights of the sentences in ``text``.

    The text is parsed with the shared ``snlp`` pipeline and ``tree_height`` is
    applied to each sentence root. Returns ``(None, None)`` when the parse
    contains no sentences.
    """
    per_sentence = []
    for sentence in snlp(text).sents:
        per_sentence.append(tree_height(sentence.root))

    if len(per_sentence) == 0:
        return None, None

    return np.mean(per_sentence), max(per_sentence)



In [71]:
# get_tree_height yields one (mean, max) pair per post; transpose the pairs into two columns.
tree_height_pairs = reddit_data['post'].apply(get_tree_height)
mean_heights, max_heights = zip(*tree_height_pairs)
reddit_data["mean_tree_height"] = mean_heights
reddit_data["max_tree_height"] = max_heights

## LabMT

In [72]:
# Lexicon triples returned by emotionFileReader, keyed by language, so the
# labMT data is read once per kernel instead of once per post.
_LABMT_LEXICON_CACHE = {}


def _load_labmt_lexicon(lang):
    """Return the (labMT, labMTvector, labMTwordList) triple for ``lang``, loading it only once."""
    if lang not in _LABMT_LEXICON_CACHE:
        _LABMT_LEXICON_CACHE[lang] = emotionFileReader(stopval=0.0, lang=lang, returnVector=True)
    return _LABMT_LEXICON_CACHE[lang]


def get_strict_match(text):
    """Return the labMT valence of ``text`` after applying a stop window of 1.0.

    The word-frequency vector produced by ``emotion`` is filtered through
    ``stopper(..., stopVal=1.0)`` and then scored with ``emotionV``.

    The English lexicon is cached across calls rather than re-read for every
    post.
    NOTE(review): this assumes emotion(), stopper() and emotionV() do not
    mutate the lexicon objects they are given — confirm against labMTsimple.
    """
    labMT, labMTvector, labMTwordList = _load_labmt_lexicon('english')

    # Only the frequency vector is needed; the unstopped valence from emotion()
    # is discarded, exactly as the original code overwrote it.
    _, textFvec = emotion(text, labMT, shift=True, happsList=labMTvector)
    textStoppedVec = stopper(textFvec, labMTvector, labMTwordList, stopVal=1.0)
    return emotionV(textStoppedVec, labMTvector)

# Store the per-post labMT valence as a feature column.
labmt_scores = reddit_data['post'].apply(get_strict_match)
reddit_data["labmt_valence"] = labmt_scores

## Verb Phrase Length

In [73]:
def max_vp_length(text):
    """Return the length, in tokens, of the longest verb-led run in ``text``.

    A run starts at a token tagged VERB and keeps growing while the following
    tokens are tagged VERB, NOUN, PRON, ADJ, ADV, ADP, DET or NUM; any other
    tag ends the run. Returns 0 when the text contains no VERB token.

    The text is parsed with the notebook-level ``snlp`` pipeline instead of
    calling ``spacy.load`` on every invocation, which reloaded the model once
    per post.
    """
    continuation_tags = {"NOUN", "PRON", "ADJ", "ADV", "ADP", "DET", "NUM"}
    doc = snlp(text)

    max_length = 0
    current_length = 0  # > 0 exactly while we are inside a verb-led run

    for token in doc:
        if token.pos_ == "VERB":
            current_length += 1
        elif current_length and token.pos_ in continuation_tags:
            current_length += 1
        else:
            # Run ended (or never started): record it and reset.
            max_length = max(max_length, current_length)
            current_length = 0

    # A run that reaches the end of the text is still open here.
    return max(max_length, current_length)
      
# Store the per-post longest verb-led run length as a feature column.
verb_phrase_lengths = reddit_data['post'].apply(max_vp_length)
reddit_data["max_verb_phrase_length"] = verb_phrase_lengths

## Normalization

In [74]:
# Start the output frame from the identifying columns; the normalized features
# are appended to it in the cells below.
reddit_data_enriched = reddit_data[['user', 'post', 'label']].copy()

In [75]:
def normalize(series):
    """Standardize ``series`` to zero mean and unit sample standard deviation.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to z-score.

    Returns
    -------
    pandas.Series
        ``(series - mean) / std``. When the standard deviation is zero (a
        constant column) or undefined (fewer than two values), the centred
        values are returned unscaled, so the feature becomes zeros rather
        than NaN/inf.
    """
    centered = series - series.mean()
    spread = series.std()
    # Dividing by a zero or NaN spread would turn the whole feature into NaN/inf.
    if pd.isna(spread) or spread == 0:
        return centered
    return centered / spread

In [76]:
# AFINN score first, then the sentiment label, which get_dummies replaces with
# one indicator column per label.
reddit_data_enriched['afinn_score'] = normalize(reddit_data['afinn_score'])
reddit_data_enriched['sentiment'] = reddit_data['sentiment']
reddit_data_enriched = pd.get_dummies(reddit_data_enriched, columns=['sentiment'])

# The remaining numeric features are z-scored one by one, in output column order.
for feature_name in (
    'personal_pronoun_count',
    'num_sentences',
    'num_articles',
    'num_pronouns',
    'mean_tree_height',
    'max_tree_height',
    'labmt_valence',
    'max_verb_phrase_length',
):
    reddit_data_enriched[feature_name] = normalize(reddit_data[feature_name])



In [77]:
# Persist the enriched feature table; index=False keeps the DataFrame index out of the CSV.
reddit_data_enriched.to_csv('reddit_data.csv', index=False)