In [None]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
# (Comments stay on their own lines: text after a line magic is passed to it
# as an argument.)
%load_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

In [None]:
import os
from IPython.display import display 

import spacy

In [None]:
# Directory holding the transcripts, resolved against the kernel's current
# working directory (so the notebook must be run from a directory that
# contains data/transcripts).
folder = os.path.join(os.getcwd(), "data", "transcripts")

# Recursively collect the full path of every ".docx" file below `folder`.
# os.walk yields entries in arbitrary, filesystem-dependent order, so the
# result is sorted to make the input order reproducible across machines.
# Note: os.walk silently yields nothing when `folder` does not exist, in
# which case `files` is simply an empty list.
files = sorted(
    os.path.join(dirpath, filename)
    for dirpath, _, filenames in os.walk(folder)
    for filename in filenames
    if filename.endswith(".docx")
)
        

In [None]:
from src.pipelinelib.querying import Parser, Queryable

# Load the small German spaCy model with the "ner" and "parser" pipeline
# components switched off.
# NOTE(review): spaCy normally derives sentence boundaries from the dependency
# parser; with it disabled, confirm that the project's Parser does not rely on
# spaCy sentence segmentation for the sentence-level queries.
nlp = spacy.load(
    "de_core_news_sm",
    disable=["ner", "parser"],
)

# Build the project Parser around the spaCy model; the metadata spreadsheet
# path is relative to the current working directory.
parser = Parser(
    nlp=nlp,
    metadata_path="./data/transcripts/Kopie von Transkriptionspaare_Daten.xls",
)

# Hand the collected .docx paths to the parser.
parser.read_from_files(files)

In [None]:
# Render the parser's `frame` attribute as rich output for inspection
# (presumably the table built by read_from_files — verify).
display(parser.frame)

In [None]:
from src.pipelinelib.querying import Parser, Queryable
from src.pipelinelib.text_body import TextBody

# Build a Queryable from the populated parser; every query in the following
# cells goes through this object.
queryable = Queryable.from_parser(parser)

In [None]:
# Run the query with no filters at document level and show the result.
document_df = queryable.execute(level=TextBody.DOCUMENT)
display(document_df)

In [None]:
# Run the query with no filters at paragraph level and show the result.
paragraph_df = queryable.execute(level=TextBody.PARAGRAPH)
display(paragraph_df)

In [None]:
# Run the query with no filters at sentence level and show the result.
# The query is executed once; a second identical execute() call used as the
# cell's trailing expression only recomputed and re-rendered the same result.
sentence_df = queryable.execute(level=TextBody.SENTENCE)
display(sentence_df)

In [None]:
# Example of a filtered query: restrict to couple 27 and to d=False on the
# depression filter, then materialise the result at document level.
# NOTE(review): verify whether the filter methods return a new Queryable or
# mutate `queryable` in place — the pipeline cell reuses `queryable`.
pgs = (
    queryable
    .by_couple_id(couple_id=27)
    .is_depressed(d=False)
    .execute(level=TextBody.DOCUMENT)
)
display(pgs)

In [None]:
from src.sigmund.preprocessing.words import Tokenizer, Stemmer, Lemmatizer
from src.sigmund.features.tfidf import FeatureTFIDF
from src.pipelinelib.pipeline import Pipeline
from src.sigmund.classification.naive_bayes import NaiveBayes
from src.sigmund.classification.merger import FeatureMerger
from src.sigmund.classification.linear_discriminant_analysis import LinearDiscriminantAnalysisClassifier
from src.sigmund.features.liwc import Liwc
from src.sigmund.features.vocabulary_size import VocabularySize
from src.sigmund.classification.pca import PCAReduction
from src.sigmund.extensions import *
from src.sigmund.features.pos import PartOfSpeech
from src.sigmund.features.basic_statistics import BasicStatistics

# Assemble the classification pipeline on top of the queryable.
# The upper-case keys used below (TFIDF_DOCUMENT_MF, CLASSIFICATION_*, ...)
# are presumably provided by the star import of src.sigmund.extensions —
# verify.
pipeline = Pipeline(queryable=queryable)

# Word-level preprocessing components.
pipeline.add_components([Tokenizer(), Stemmer(), Lemmatizer()])

# TF-IDF features restricted to a white list of words, classified by their
# own Naive Bayes component.
# Words previously on the list but currently left out:
# 'so', 'wirklich', 'ich', 'gerne', 'weil'
# (The call used to end in a stray trailing comma, which turned the statement
# into a discarded one-element tuple; it has been removed.)
pipeline.add_component(FeatureTFIDF(white_list=[
    'ja', 'auch', 'wenn', 'also', 'werden', 'schon', 'wir',  # high in depressed group
    'und', 'haben', 'du', 'sehr',
]))
pipeline.add_component(NaiveBayes(
    inputs=[TFIDF_DOCUMENT_MF],
    output=CLASSIFICATION_NAIVE_BAYES_TFIDF,
    voting=False,
))

# LIWC features restricted to a white list of categories, with their own
# Naive Bayes component.
pipeline.add_component(Liwc(white_list=[
    'Posemo', 'Past', 'Present', 'Future', 'Metaph',
    'Death', 'Affect', 'Incl', 'Achieve'
]))
pipeline.add_component(NaiveBayes(
    inputs=[LIWC_DOCUMENT_MF],
    output=CLASSIFICATION_NAIVE_BAYES_LIWC,
    voting=False,
))

# Part-of-speech features restricted to a white list of tags, with their own
# Naive Bayes component.
pipeline.add_component(PartOfSpeech(white_list=["ADV", "PPER", "ADJD", "VAFIN", "KON"]))
pipeline.add_component(NaiveBayes(
    inputs=[POS_DOCUMENT_MF],
    output=CLASSIFICATION_NAIVE_BAYES_POS,
    voting=False,
))

# Final component: takes the three per-feature classifier outputs as its
# inputs and runs with voting=True.
pipeline.add_component(NaiveBayes(inputs=[
    CLASSIFICATION_NAIVE_BAYES_TFIDF,
    CLASSIFICATION_NAIVE_BAYES_LIWC,
    CLASSIFICATION_NAIVE_BAYES_POS,
], output=CLASSIFICATION_NAIVE_BAYES_VOTING, voting=True))

# Run the whole pipeline with visualisation enabled and keep what it returns.
storage = pipeline.execute(visualise=True)