In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from collections import Counter
from spacy.lang.en import English

In [None]:
# Load the two cleaned datasets from parquet files in the working directory.
# Later cells read the "text" and "genre" columns from these frames.
first = pd.read_parquet("cleaned_data_first.parquet")
last = pd.read_parquet("cleaned_data_last.parquet")

# Show the column names of `first` as this cell's output.
first.columns

In [19]:
def get_word_matrix(df, max_features=50000):
    """Build a TF-IDF unigram matrix and a binary genre-label matrix for ``df``.

    Parameters
    ----------
    df : DataFrame-like
        Must support ``df["text"]`` (the documents to vectorize) and
        ``df["genre"]`` (the labels to binarize).
        Assumes each ``genre`` entry is an iterable of labels for that
        document -- TODO confirm against the cleaned parquet schema.
    max_features : int, default 50000
        Vocabulary cap handed to ``TfidfVectorizer``. The default keeps the
        value this notebook was already using.

    Returns
    -------
    tuple
        ``(word_matrix, genre_labels)`` -- the fitted-and-transformed TF-IDF
        matrix and the fitted-and-transformed multi-label indicator matrix,
        both computed from ``df``.

    Notes
    -----
    A fresh vectorizer and binarizer are fit on every call, so the columns
    produced by two separate calls (e.g. for two datasets) are not guaranteed
    to line up with each other.
    """
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        stop_words="english",
        ngram_range=(1, 1),  # unigrams only
    )
    word_matrix = vectorizer.fit_transform(df["text"])
    print("shape of word matrix: " + str(word_matrix.shape))

    # Turn the multi-genre labels into a binary matrix
    # (row = document, column = label).
    # The labels must come from the same frame as the text: reading them from
    # a global frame would pair these documents with another dataset's genres.
    mlb = MultiLabelBinarizer()
    genre_labels = mlb.fit_transform(df["genre"])
    print("Classes:", mlb.classes_)
    print("shape of genre labels: " + str(genre_labels.shape))

    return word_matrix, genre_labels
    

In [28]:
# Shared spaCy pipeline object; generate_features calls `nlp(text)` on each document.
# max_length=5000000 is passed here, presumably so very long texts are accepted -- TODO confirm.
# NOTE(review): this instantiates the bare `English` language class rather than
# loading a trained package. Confirm via `nlp.pipe_names` that the components
# listed in `pipeline=` are actually present, because generate_features relies
# on `token.pos_` being populated.
nlp = English(pipeline=["tok2vec", "tagger", "parser", "ner"], max_length=5000000)


In [None]:
# next: generate feature matrix 

def generate_features(text):
    """Compute simple linguistic features for a single document.

    The text is run through the module-level ``nlp`` pipeline and summarised.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    dict
        Keys, in order: ``num_words`` (total token count, including
        non-alphabetic tokens), ``avg_word_length`` and ``type_token_ratio``
        (both computed over lower-cased alphabetic tokens only), then
        ``num_nouns``, ``num_verbs``, ``num_adjs`` and ``num_advs`` (counts of
        tokens whose ``pos_`` is NOUN / VERB / ADJ / ADV).
        When the text has no alphabetic tokens, the two ratio features are
        ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    doc = nlp(text)

    # One pass over the tokens; a missing tag simply counts as 0.
    pos_tally = Counter(token.pos_ for token in doc)

    alpha_tokens = [token.text.lower() for token in doc if token.is_alpha]
    n_alpha = len(alpha_tokens)
    if n_alpha:
        avg_word_length = sum(map(len, alpha_tokens)) / n_alpha
        type_token_ratio = len(set(alpha_tokens)) / n_alpha
    else:
        # Empty / punctuation-only / numeric-only text: nothing to average.
        avg_word_length = 0.0
        type_token_ratio = 0.0

    # Return the dict so that callers get one record per document.
    return {
        "num_words": len(doc),
        "avg_word_length": avg_word_length,
        "type_token_ratio": type_token_ratio,
        "num_nouns": pos_tally["NOUN"],
        "num_verbs": pos_tally["VERB"],
        "num_adjs": pos_tally["ADJ"],
        "num_advs": pos_tally["ADV"],
    }
    

def generate_feature_matrix(df):
    """Build a per-document table of linguistic features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"text"`` column; each entry is passed to
        ``generate_features``.

    Returns
    -------
    pandas.DataFrame
        One row per document in ``df`` (in the same order, with a fresh
        0..n-1 index) and one column per key of the dicts returned by
        ``generate_features``.
    """
    # Apply the per-document extractor, not this function itself.
    per_doc_records = df["text"].apply(generate_features)
    return pd.DataFrame(per_doc_records.tolist())


In [None]:
# TF-IDF word matrix (X) and binary genre labels (Y) for the `first` dataset.
X1, Y1 = get_word_matrix(first)

# Same for the `last` dataset.
X2, Y2 = get_word_matrix(last)

# Per-document linguistic feature tables for each dataset.
F1 = generate_feature_matrix(first)
F2 = generate_feature_matrix(last)


# TODO: combine the word matrix with the linguistic features for each dataset
# (X1 with F1, and X2 with F2).