In [51]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [52]:
import os

import spacy
from IPython.display import display

from src.pipelinelib.querying import Parser, Queryable

folder = os.path.join(os.getcwd(), "data", "transcripts")
files = [os.path.join(root, f) for root, _, files in os.walk(folder)
         for f in files if f.endswith(".docx")]


nlp = spacy.load("de_core_news_sm", disable=["ner", "parser"])
parser = Parser(
    nlp=nlp, metadata_path="./data/transcripts/Kopie von Transkriptionspaare_Daten.xls")

parser.read_from_files(files)
display(parser.frame)


=== Parser: reading from /home/rise/Schreibtisch/Sigmund_git/sigmund/data/transcripts/Paar 81_T1_IM_FW.docx ===
=== Parser: reading from /home/rise/Schreibtisch/Sigmund_git/sigmund/data/transcripts/Paar 87_T1_IM_FW.docx ===
=== Parser: reading from /home/rise/Schreibtisch/Sigmund_git/sigmund/data/transcripts/Paar 182_T1_IM_FW.docx ===
=== Parser: reading from /home/rise/Schreibtisch/Sigmund_git/sigmund/data/transcripts/Paar 138_T1_IM_FW.docx ===
=== Parser: reading from /home/rise/Schreibtisch/Sigmund_git/sigmund/data/transcripts/Paar 47_T1_IM_FW.docx ===
=== Parser: reading from /home/rise/Schreibtisch/Sigmund_git/sigmund/data/transcripts/Paar 105_T1_IM_FW.docx ===
=== Parser: reading from /home/rise/Schreibtisch/Sigmund_git/sigmund/data/transcripts/Paar 29_T1_IM_FW.docx ===
=== Parser: reading from /home/rise/Schreibtisch/Sigmund_git/sigmund/data/transcripts/Paar 27_T1_IM_FW.docx ===
=== Parser: reading from /home/rise/Schreibtisch/Sigmund_git/sigmund/data/transcripts/Paar 58_T1_IM_F

Unnamed: 0,document_id,paragraph_id,sentence_id,couple_id,speaker,gender,is_depressed_group,depressed_person,text
0,0,0,0,81,A,W,False,,Ich dachte wir reden darüber wie wir die erste...
1,0,1,1,81,B,M,False,,Erste Wohnung?
2,0,2,2,81,A,W,False,,"Ja, ich dachte das ist ein nettes Thema, oder?"
3,0,3,3,81,B,M,False,,Ja.
4,0,4,4,81,A,W,False,,"Ich fand das süß, wie du erst angefangen hast ..."
...,...,...,...,...,...,...,...,...,...
131,9,94,131,60,B,M,True,W,Hab ich ein Schild auf dem Rücken?
132,9,94,132,60,B,M,True,W,Ladet eure Probleme bei mir ab.
133,9,95,133,60,A,W,True,W,Das sind keine Probleme.
134,9,96,134,60,B,M,True,W,Kümmerst du dich drum.


In [56]:
from src.pipelinelib.querying import Parser, Queryable
from src.pipelinelib.text_body import TextBody
from src.pipelinelib.pipeline import Pipeline
queryable = Queryable.from_parser(parser)

In [103]:
from src.sigmund import adapter
from src.sigmund.extensions import LEMMATIZED, STEMMED, TOKENS, TALKTURN
from src.sigmund.features import agreement_score as fagree
from src.sigmund.features import flesch_reading_ease as fflesch
from src.sigmund.features import talk_turn as ftalkturn
from src.sigmund.features import liwc_one_hot as fliwconehot
from src.sigmund.preprocessing import words as pwords

liwc_dict_path = r"/home/rise/Schreibtisch/Sigmund_git/sigmund/data/German_LIWC2001_Dictionary.dic"

pipeline = Pipeline(queryable = queryable, empty_pipeline=True) \
    .add_component(fflesch.FleschExtractor())\
    .add_component(pwords.Tokenizer())\
    .add_component(fliwconehot.LiwcOneHot())

    
    #.add_component(ftalkturn.TalkTurnExtractor()) \
    #.add_component(fagree.AgreementScoreExtractor(liwc_dict_path))




    
#    .add_component(adapter.Adapter(old = TOKENS, new = TALKTURN))\


In [104]:
test = pipeline.execute()

#for doc in preprocessed_depr:
#    print('TT',doc[0]._.talkturn, 'AS',doc[0]._.agreementscore)
#for doc in preprocessed_non_depr:
#    print('TT',doc[0]._.talkturn, 'AS',doc[0]._.agreementscore)

=== Starting pipeline with ['Tokenizer', 'LiwcOneHot', 'FleschExtractor', 'TalkTurnExtractor', 'AgreementScoreExtractor'] ===
Executing Tokenizer
=== Queryable is executing on Sentence level, query = '' ===
Executing LiwcOneHot
=== Queryable is executing on Sentence level, query = '' ===
Executing FleschExtractor
=== Queryable is executing on Sentence level, query = '' ===
Executing TalkTurnExtractor
=== Queryable is executing on Paragraph level, query = '' ===
Executing AgreementScoreExtractor
=== Queryable is executing on Paragraph level, query = '' ===
=== Finished pipeline execution ===


In [105]:
display(test)

{<src.pipelinelib.extension.Extension at 0x7fdd2138d850>:      document_id  paragraph_id  sentence_id speaker  \
 0              0             0            0       A   
 1              0             1            1       B   
 2              0             2            2       A   
 3              0             3            3       B   
 4              0             4            4       A   
 ..           ...           ...          ...     ...   
 131            9            94          131       B   
 132            9            94          132       B   
 133            9            95          133       A   
 134            9            96          134       B   
 135            9            96          135       B   
 
                                                   text  
 0    [Ich, dachte, wir, reden, darüber, wie, wir, d...  
 1                                     [Erste, Wohnung]  
 2    [Ja, ich, dachte, das, ist, ein, nettes, Thema...  
 3                                   

In [None]:
document_df = queryable.execute(level=TextBody.DOCUMENT)
display(document_df)

In [None]:
paragraph_df = queryable.execute(level=TextBody.PARAGRAPH)
display(paragraph_df)

In [None]:
sentence_df = queryable.execute(level=TextBody.SENTENCE)
display(sentence_df)

queryable.execute(level=TextBody.SENTENCE)

In [None]:
pgs = queryable.by_couple_id(couple_id=27) \
    .is_depressed(d=False) \
    .execute(level=TextBody.DOCUMENT)
display(pgs)