In [1]:
import nltk
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.sentiment import SentimentIntensityAnalyzer

# Shared NLP resources: the small English spaCy pipeline and NLTK's sentiment analyzer.
nlp = spacy.load("en_core_web_sm")
sia = SentimentIntensityAnalyzer()

# Two sample responses used to try out the feature pipeline.
texts = [
    "I aspire to become a doctor because I know that not only can I change my life, families, but also the people around me.",
    "I am here to get extra help in passing the course and the requirements I need to get into optometry school or PA school.",
]

# Fit a TF-IDF model over word 1-grams through 3-grams of the sample texts.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3))
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)

# Function to extract features
def extract_features(texts):
    """Build a table of hand-crafted text features, one row per input text.

    Columns produced for each text:

    * ``char_count`` -- ``len(text)``.
    * ``word_count`` -- number of whitespace-separated tokens.
    * ``avg_word_length`` -- mean length of those tokens (0 when there are none).
    * ``noun_count``, ``verb_count``, ``adj_count``, ``adv_count`` -- counts of
      the corresponding coarse POS tags assigned by the module-level ``nlp``
      spaCy pipeline.
    * ``sentiment_neg``, ``sentiment_neu``, ``sentiment_pos``,
      ``sentiment_compound`` -- the scores returned by the module-level
      ``sia`` analyzer's ``polarity_scores``.

    Parameters
    ----------
    texts : iterable of str
        The documents to featurize (a list, a pandas Series, a generator, ...).

    Returns
    -------
    pandas.DataFrame
        One row per text, in input order, with a fresh 0..n-1 index and the
        columns listed above. Empty input yields an empty frame that still
        carries those columns.
    """
    columns = [
        'char_count',
        'word_count',
        'avg_word_length',
        'noun_count',
        'verb_count',
        'adj_count',
        'adv_count',
        'sentiment_neg',
        'sentiment_neu',
        'sentiment_pos',
        'sentiment_compound',
    ]

    # Materialize once: the texts are walked twice below (raw string + parsed
    # doc), which would exhaust a one-shot iterator.
    texts = list(texts)
    if not texts:
        # Keep the schema stable so downstream column access does not break.
        return pd.DataFrame(columns=columns)

    # The POS string -> id lookups do not depend on the text; resolve them once.
    strings = nlp.vocab.strings
    pos_ids = {
        'noun_count': strings['NOUN'],
        'verb_count': strings['VERB'],
        'adj_count': strings['ADJ'],
        'adv_count': strings['ADV'],
    }

    features = []
    # nlp.pipe processes the texts in batches and yields docs in input order.
    for text, doc in zip(texts, nlp.pipe(texts)):
        # Text length features
        words = text.split()
        word_count = len(words)
        avg_word_length = sum(len(word) for word in words) / word_count if word_count > 0 else 0

        # POS tag counts, keyed by spaCy's integer id for each tag
        pos_counts = doc.count_by(spacy.attrs.POS)

        # Sentiment analysis
        sentiment = sia.polarity_scores(text)

        feature_dict = {
            'char_count': len(text),
            'word_count': word_count,
            'avg_word_length': avg_word_length,
        }
        for column, pos_id in pos_ids.items():
            feature_dict[column] = pos_counts.get(pos_id, 0)
        feature_dict['sentiment_neg'] = sentiment['neg']
        feature_dict['sentiment_neu'] = sentiment['neu']
        feature_dict['sentiment_pos'] = sentiment['pos']
        feature_dict['sentiment_compound'] = sentiment['compound']

        features.append(feature_dict)

    return pd.DataFrame(features, columns=columns)

# Run the feature extractor on the sample texts and print the resulting table.
feature_df = extract_features(texts)
print(feature_df)

# Expand the TF-IDF scores into a labelled DataFrame (one column per n-gram) for inspection.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
print("TF-IDF Features:\n", tfidf_df)


   char_count  word_count  avg_word_length  noun_count  verb_count  adj_count  \
0         119          24         4.000000           4           4          0   
1         120          24         4.041667           6           4          1   

   adv_count  sentiment_neg  sentiment_neu  sentiment_pos  sentiment_compound  
0          2            0.0          1.000          0.000              0.0000  
1          1            0.0          0.886          0.114              0.4019  
TF-IDF Features:
        also  also the  also the people        am   am here  am here to  \
0  0.133616  0.133616         0.133616  0.000000  0.000000    0.000000   
1  0.000000  0.000000         0.000000  0.120342  0.120342    0.120342   

        and   and the  and the requirements    around  ...  the people  \
0  0.000000  0.000000              0.000000  0.133616  ...    0.133616   
1  0.120342  0.120342              0.120342  0.000000  ...    0.000000   

   the people around  the requirements  the requirem

In [3]:
# Display the hand-crafted features computed for the sample texts.
feature_df

Unnamed: 0,char_count,word_count,avg_word_length,noun_count,verb_count,adj_count,adv_count,sentiment_neg,sentiment_neu,sentiment_pos,sentiment_compound
0,119,24,4.0,4,4,0,2,0.0,1.0,0.0,0.0
1,120,24,4.041667,6,4,1,1,0.0,0.886,0.114,0.4019


In [5]:
# Load the sentence-level Aspirational dataset (batch 1, Jaccard-merged, per the file name).
# NOTE(review): absolute, machine-specific path -- this cell will not run on another machine as-is.
aspirational_csv_path = "/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/merged_themes_using_jaccard_method/merged_Aspirational_sentence_level_batch_1_jaccard.csv"
merged_aspirational_df = pd.read_csv(aspirational_csv_path, encoding="utf-8")
# training_df, test_df = train_test_split(merged_aspirational_df, test_size=0.2, random_state=18, stratify=merged_aspirational_df['label'])
# training_df.reset_index(drop=True, inplace=True)
# test_df.reset_index(drop=True, inplace=True)

In [7]:
# Select the "sentence" column as the corpus to featurize.
text = merged_aspirational_df["sentence"]

In [8]:
# Hand-crafted features for every sentence in the corpus.
# Note the name: `features_df` (corpus) is distinct from the earlier `feature_df` (sample texts).
features_df = extract_features(text)
features_df

Unnamed: 0,char_count,word_count,avg_word_length,noun_count,verb_count,adj_count,adv_count,sentiment_neg,sentiment_neu,sentiment_pos,sentiment_compound
0,14,4,2.750000,0,0,0,1,0.000,1.000,0.000,0.0000
1,47,8,5.000000,1,1,1,0,0.000,0.741,0.259,0.2732
2,59,13,3.615385,2,1,2,2,0.000,0.643,0.357,0.7269
3,104,22,3.772727,1,3,1,4,0.000,1.000,0.000,0.0000
4,101,21,3.857143,4,3,2,2,0.000,0.870,0.130,0.4019
...,...,...,...,...,...,...,...,...,...,...,...
4699,62,11,4.727273,1,2,2,0,0.000,0.496,0.504,0.7269
4700,131,26,4.076923,5,6,0,2,0.166,0.644,0.190,-0.0772
4701,58,10,4.900000,1,2,2,1,0.000,0.778,0.222,0.3134
4702,58,12,3.916667,1,4,0,0,0.000,1.000,0.000,0.0000


In [9]:
# Fit a new TF-IDF vectorizer (unigrams, bigrams, and trigrams) on the corpus sentences.
# This rebinds `tfidf_vectorizer` and `tfidf_matrix`, replacing the ones fitted on the sample texts.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3))  # (1, 3) for unigrams, bigrams, and trigrams
tfidf_matrix = tfidf_vectorizer.fit_transform(text)

In [12]:
# Densify the corpus-level TF-IDF matrix into a DataFrame whose columns are the n-gram names.
ngram_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=ngram_names)
tfidf_df

Unnamed: 0,0220,0220 however,0220 however for,100,100 get,100 get everything,100 on,100 on every,101,101 is,...,zoology once,zoology once graduate,zoology to,zoology to go,zoom,zoom out,zoom out in,zynga,zynga activision,zynga activision or
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4699,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4700,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Display the loaded dataset.
merged_aspirational_df

Unnamed: 0,sentence,label,phrase
0,why am i here?,0,"[""Ever since I was little I wanted to be a doc..."
1,well why does anyone pursue a higher education?,0,"[""Ever since I was little I wanted to be a doc..."
2,to better one self and be able to succeed late...,0,"[""Ever since I was little I wanted to be a doc..."
3,ever since i was little i wanted to be a docto...,1,"[""Ever since I was little I wanted to be a doc..."
4,i always wanted to be able to help people and ...,0,"[""Ever since I was little I wanted to be a doc..."
...,...,...,...
4699,"i want to better myself, improve myself in eve...",0,['I want to be able to apply my knowledge to m...
4700,i have no problems applying myself when it com...,0,['I want to be able to apply my knowledge to m...
4701,grow mentally so that i can apply more physica...,0,['I want to be able to apply my knowledge to m...
4702,use what i learn and apply it in whatever care...,0,['I want to be able to apply my knowledge to m...


In [14]:
# Join the three per-sentence tables side by side (axis=1) so every row keeps
# its original columns plus the hand-crafted and TF-IDF features. The default
# axis=0 would stack the frames vertically instead. `features_df` is the
# corpus-level feature table; `feature_df` only covers the two sample texts.
# NOTE(review): TF-IDF column names are raw n-grams and may collide with
# existing column names (e.g. "label") -- confirm before relying on lookups.
new_feature_engineered_dataset = pd.concat(
    [merged_aspirational_df, features_df, tfidf_df],
    axis=1,
)

In [15]:
# Write the combined dataset to a CSV in the current working directory, without the index column.
# NOTE(review): the recorded output shows this cell was interrupted; presumably the frame is very large -- confirm.
new_feature_engineered_dataset.to_csv("new_feature_engineered_dataset.csv", index=False)

KeyboardInterrupt: 