In [1]:
import spacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def extract_features(sentence):
    doc = nlp(sentence)
    features = {
        "dependency_paths": [],
        "sentence_frames": [],
        "pos_tags": [],
        "entities": [],
    }
    
    # Dependency Paths
    for token in doc:
        if token.dep_ in ("poss", "nsubj", "dobj", "pobj"):  # Example dependencies
            features["dependency_paths"].append(f"{token.dep_}({token.head.text}, {token.text})")
    
    # Sentence Frames and POS Tags
    sentence_frame = []
    pos_pattern = []
    for token in doc:
        if token.dep_ == "poss":
            sentence_frame.append("[Family Term]'s")
        elif token.dep_ in ("amod", "acomp", "attr"):
            sentence_frame.append("[Adjective/Noun]")
        pos_pattern.append(f"{token.text} ({token.pos_})")
    
    features["sentence_frames"].append(" ".join(sentence_frame))
    features["pos_tags"] = pos_pattern
    
    # Named Entities
    for ent in doc.ents:
        if ent.label_ in ("PERSON", "NORP", "ORG", "GPE", "LOC"):
            features["entities"].append(f"{ent.text} ({ent.label_})")
    
    return features

# Example usage
sentence = "My brother's constant encouragement."
features = extract_features(sentence)
print(features)

{'dependency_paths': ['poss(brother, My)', 'poss(encouragement, brother)'], 'sentence_frames': ["[Family Term]'s [Family Term]'s [Adjective/Noun]"], 'pos_tags': ['My (PRON)', 'brother (NOUN)', "'s (PART)", 'constant (ADJ)', 'encouragement (NOUN)', '. (PUNCT)'], 'entities': []}


In [2]:
features

{'dependency_paths': ['poss(brother, My)', 'poss(encouragement, brother)'],
 'sentence_frames': ["[Family Term]'s [Family Term]'s [Adjective/Noun]"],
 'pos_tags': ['My (PRON)',
  'brother (NOUN)',
  "'s (PART)",
  'constant (ADJ)',
  'encouragement (NOUN)',
  '. (PUNCT)'],
 'entities': []}

In [3]:
sentence = "I'm here because I want to help my family, as they have supported me."
extract_features(sentence)

{'dependency_paths': ["nsubj('m, I)",
  'nsubj(want, I)',
  'poss(family, my)',
  'dobj(help, family)',
  'nsubj(supported, they)',
  'dobj(supported, me)'],
 'sentence_frames': ["[Family Term]'s"],
 'pos_tags': ['I (PRON)',
  "'m (AUX)",
  'here (ADV)',
  'because (SCONJ)',
  'I (PRON)',
  'want (VERB)',
  'to (PART)',
  'help (VERB)',
  'my (PRON)',
  'family (NOUN)',
  ', (PUNCT)',
  'as (SCONJ)',
  'they (PRON)',
  'have (AUX)',
  'supported (VERB)',
  'me (PRON)',
  '. (PUNCT)'],
 'entities': []}

In [6]:
import spacy
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import hstack

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def extract_features(sentence):
    doc = nlp(sentence)
    features = {
        "dependency_paths": [],
        "sentence_frames": [],
        "pos_tags": [],
        "entities": [],
    }
    
    # Dependency Paths
    for token in doc:
        if token.dep_ in ("poss", "nsubj", "dobj", "pobj"):  # Example dependencies
            features["dependency_paths"].append(f"{token.dep_}({token.head.text}, {token.text})")
    
    # Sentence Frames and POS Tags
    sentence_frame = []
    pos_pattern = []
    for token in doc:
        if token.dep_ == "poss":
            sentence_frame.append("[Family Term]'s")
        elif token.dep_ in ("amod", "acomp", "attr"):
            sentence_frame.append("[Adjective/Noun]")
        pos_pattern.append(f"{token.text} ({token.pos_})")
    
    features["sentence_frames"].append(" ".join(sentence_frame))
    features["pos_tags"] = pos_pattern
    
    # Named Entities
    for ent in doc.ents:
        if ent.label_ in ("PERSON", "NORP", "ORG", "GPE", "LOC"):
            features["entities"].append(f"{ent.text} ({ent.label_})")
    
    return features

familial_dataset = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/merged_themes_using_jaccard_method/merged_Familial_sentence_level_batch_1_jaccard.csv")
sentences = familial_dataset["sentence"]
y = familial_dataset["label"]

# Extract features for all sentences
data = [extract_features(sentence) for sentence in sentences]
df = pd.DataFrame(data)

# Flattening lists in the dataframe for vectorization
df['dependency_paths_str'] = df['dependency_paths'].apply(lambda x: ' '.join(x))
df['sentence_frames_str'] = df['sentence_frames'].apply(lambda x: ' '.join(x))
df['pos_tags_str'] = df['pos_tags'].apply(lambda x: ' '.join(x))
df['entities_str'] = df['entities'].apply(lambda x: ' '.join(x))

# Vectorizing text features
vectorizer = CountVectorizer()
X_dependency_paths = vectorizer.fit_transform(df['dependency_paths_str'])
X_sentence_frames = vectorizer.fit_transform(df['sentence_frames_str'])
X_pos_tags = vectorizer.fit_transform(df['pos_tags_str'])
X_entities = vectorizer.fit_transform(df['entities_str'])

# Concatenate all features into a single feature matrix
X = hstack([X_dependency_paths, X_sentence_frames, X_pos_tags, X_entities])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))


Accuracy: 0.9166666666666666
              precision    recall  f1-score   support

           0       0.93      0.98      0.96       263
           1       0.56      0.20      0.29        25

    accuracy                           0.92       288
   macro avg       0.74      0.59      0.62       288
weighted avg       0.90      0.92      0.90       288



In [7]:
df

Unnamed: 0,dependency_paths,sentence_frames,pos_tags,entities,dependency_paths_str,sentence_frames_str,pos_tags_str,entities_str
0,"[nsubj('m, i), poss(parents, my), nsubj(came, ...",[[Family Term]'s [Adjective/Noun]],"[i (PRON), 'm (AUX), here (ADV), because (SCON...",[],"nsubj('m, i) poss(parents, my) nsubj(came, par...",[Family Term]'s [Adjective/Noun],i (PRON) 'm (AUX) here (ADV) because (SCONJ) m...,
1,"[nsubj(mean, i), poss(mother, my), nsubj(been,...",[[Family Term]'s [Adjective/Noun]],"[i (PRON), mean (VERB), my (PRON), mother (NOU...",[],"nsubj(mean, i) poss(mother, my) nsubj(been, mo...",[Family Term]'s [Adjective/Noun],i (PRON) mean (VERB) my (PRON) mother (NOUN) w...,
2,"[nsubj(made, she), dobj(made, lot), pobj(of, s...",[],"[she (PRON), made (VERB), a (DET), lot (NOUN),...",[],"nsubj(made, she) dobj(made, lot) pobj(of, sacr...",,she (PRON) made (VERB) a (DET) lot (NOUN) of (...,
3,"[nsubj(guess, i), nsubj(say, you), dobj(say, s...",[[Family Term]'s],"[i (PRON), guess (VERB), you (PRON), can (AUX)...",[],"nsubj(guess, i) nsubj(say, you) dobj(say, same...",[Family Term]'s,i (PRON) guess (VERB) you (PRON) can (AUX) say...,
4,"[nsubj(am, i), nsubj(getting, formula), dobj(g...",[[Adjective/Noun] [Adjective/Noun]],"[i (PRON), am (AUX), also (ADV), here (ADV), b...",[],"nsubj(am, i) nsubj(getting, formula) dobj(gett...",[Adjective/Noun] [Adjective/Noun],i (PRON) am (AUX) also (ADV) here (ADV) becaus...,
...,...,...,...,...,...,...,...,...
1435,"[nsubj(am, i)]",[],"[i (PRON), am (AUX), here (ADV), to (PART), le...",[],"nsubj(am, i)",,i (PRON) am (AUX) here (ADV) to (PART) learn (...,
1436,"[nsubj(am, i), pobj(to, college), poss(family,...",[[Adjective/Noun] [Adjective/Noun] [Family Ter...,"[since (SCONJ), i (PRON), am (AUX), the (DET),...",[],"nsubj(am, i) pobj(to, college) poss(family, my...",[Adjective/Noun] [Adjective/Noun] [Family Term...,since (SCONJ) i (PRON) am (AUX) the (DET) firs...,
1437,"[pobj(because, that), nsubj(have, i), dobj(mak...",[[Family Term]'s [Adjective/Noun] [Adjective/N...,"[because (SCONJ), of (ADP), that (PRON), i (PR...",[],"pobj(because, that) nsubj(have, i) dobj(make, ...",[Family Term]'s [Adjective/Noun] [Adjective/Noun],because (SCONJ) of (ADP) that (PRON) i (PRON) ...,
1438,"[nsubj(told, she), dobj(told, me), nsubj(had, ...",[[Adjective/Noun]],"[she (PRON), always (ADV), told (VERB), me (PR...",[],"nsubj(told, she) dobj(told, me) nsubj(had, you...",[Adjective/Noun],she (PRON) always (ADV) told (VERB) me (PRON) ...,
