In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
import pandas as pd
pd.set_option('display.max_rows', 10) 
import spacy
from IPython.display import display

from src.pipelinelib.querying import Parser, Queryable

folder = os.path.join(os.getcwd(), "data", "transcripts")
files = [os.path.join(root, f) for root, _, files in os.walk(folder)
         for f in files if f.endswith(".docx")]


nlp = spacy.load("de_core_news_sm", disable=["ner", "parser"])
parser = Parser(
    nlp=nlp, metadata_path="./data/transcripts/Kopie von Transkriptionspaare_Daten.xls")

parser.read_from_files(files)
display(parser.frame)


=== Parser: reading from /home/rise/Schreibtisch/Sigmund_git/sigmund/data/transcripts/Paar 81_T1_IM_FW.docx ===
=== Parser: reading from /home/rise/Schreibtisch/Sigmund_git/sigmund/data/transcripts/Paar 87_T1_IM_FW.docx ===
=== Parser: reading from /home/rise/Schreibtisch/Sigmund_git/sigmund/data/transcripts/Paar 182_T1_IM_FW.docx ===
=== Parser: reading from /home/rise/Schreibtisch/Sigmund_git/sigmund/data/transcripts/Paar 138_T1_IM_FW.docx ===
=== Parser: reading from /home/rise/Schreibtisch/Sigmund_git/sigmund/data/transcripts/Paar 47_T1_IM_FW.docx ===
=== Parser: reading from /home/rise/Schreibtisch/Sigmund_git/sigmund/data/transcripts/Paar 105_T1_IM_FW.docx ===
=== Parser: reading from /home/rise/Schreibtisch/Sigmund_git/sigmund/data/transcripts/Paar 29_T1_IM_FW.docx ===
=== Parser: reading from /home/rise/Schreibtisch/Sigmund_git/sigmund/data/transcripts/Paar 27_T1_IM_FW.docx ===
=== Parser: reading from /home/rise/Schreibtisch/Sigmund_git/sigmund/data/transcripts/Paar 58_T1_IM_F

Unnamed: 0,document_id,paragraph_id,sentence_id,couple_id,speaker,gender,is_depressed_group,depressed_person,text,hamilton
0,0,0,0,81,A,W,False,,Ich dachte wir reden darüber wie wir die erste...,2
1,0,1,1,81,B,M,False,,Erste Wohnung?,4
2,0,2,2,81,A,W,False,,"Ja, ich dachte das ist ein nettes Thema, oder?",2
3,0,3,3,81,B,M,False,,Ja.,4
4,0,4,4,81,A,W,False,,"Ich fand das süß, wie du erst angefangen hast ...",2
...,...,...,...,...,...,...,...,...,...,...
126,9,94,126,60,B,M,True,W,Hab ich ein Schild auf dem Rücken?,7
127,9,94,127,60,B,M,True,W,Ladet eure Probleme bei mir ab.,7
128,9,95,128,60,A,W,True,W,Das sind keine Probleme.,43
129,9,96,129,60,B,M,True,W,Kümmerst du dich drum.,7


In [4]:
from src.pipelinelib.querying import Parser, Queryable
from src.pipelinelib.text_body import TextBody
from src.pipelinelib.pipeline import Pipeline
queryable = Queryable.from_parser(parser)

In [10]:
#from src.sigmund import adapter
from src.sigmund.extensions import *
from src.sigmund.preprocessing.words import Tokenizer, Stemmer, Lemmatizer
from src.sigmund.features import agreement_score as fagree
from src.sigmund.features import flesch_reading_ease as fflesch
from src.sigmund.features import talk_turn as ftalkturn
from src.sigmund.features import liwc as fliwc
from src.sigmund.features import pos as fpos
from src.sigmund.features import tfidf as ftfidf
from src.sigmund.preprocessing import words as pwords
from src.sigmund.classification import merger
liwc_dict_path = r"/home/rise/Schreibtisch/Sigmund_git/sigmund/data/German_LIWC2001_Dictionary.dic"

pipeline = Pipeline(queryable = queryable, empty_pipeline=True) 
pipeline.add_component(pwords.Tokenizer())
pipeline.add_component(fliwc.Liwc(white_list = ['Death', 'Metaph']))
pipeline.add_component(fpos.PartOfSpeech())
pipeline.add_component(Stemmer())
pipeline.add_component(Lemmatizer())
#pipeline.add_component(ftfidf.FeatureTFIDF())
pipeline.add_component(fflesch.FleschExtractor())
pipeline.add_component(ftalkturn.TalkTurnExtractor())
pipeline.add_component(fagree.AgreementScoreExtractor())
#pipeline.add_component(merger.FeatureMerger())
pipeline.add_component(fliwc.Liwc_Inverse(category = ['Death']))
pipeline.add_component(fliwc.Liwc_Trend(category = ['Posemo','Death', 'Metaph']))
#.add_component(adapter.Adapter(old = TOKENS_sese, new = TALKTURN))\


<src.pipelinelib.pipeline.Pipeline at 0x7f48b9db7280>

In [11]:
test = pipeline.execute(visualise = False)

=== Starting pipeline with ['Tokenizer', 'Liwc', 'PartOfSpeech', 'Stemmer', 'Lemmatizer', 'FleschExtractor', 'TalkTurnExtractor', 'AgreementScoreExtractor', 'Liwc_Inverse', 'Liwc_Trend'] ===
Executing Tokenizer
Executing Liwc
Executing PartOfSpeech
Executing Stemmer
Executing Lemmatizer
Executing FleschExtractor
Executing TalkTurnExtractor
Executing AgreementScoreExtractor
Executing Liwc_Inverse
Executing Liwc_Trend
=== Finished pipeline execution ===


In [12]:
display(test)

{tokens_sentence(preprocessing):      couple_id speaker gender  is_depressed_group  document_id  paragraph_id  \
 0           81       A      W               False            0             0   
 1           81       B      M               False            0             1   
 2           81       A      W               False            0             2   
 3           81       B      M               False            0             3   
 4           81       A      W               False            0             4   
 ..         ...     ...    ...                 ...          ...           ...   
 126         60       B      M                True            9            94   
 127         60       B      M                True            9            94   
 128         60       A      W                True            9            95   
 129         60       B      M                True            9            96   
 130         60       B      M                True            9            96

In [None]:
document_df = queryable.execute(level=TextBody.DOCUMENT)
display(document_df)

In [None]:
paragraph_df = queryable.execute(level=TextBody.PARAGRAPH)
display(paragraph_df)

In [None]:
sentence_df = queryable.execute(level=TextBody.SENTENCE)
display(sentence_df)

queryable.execute(level=TextBody.SENTENCE)

In [None]:
pgs = queryable.by_couple_id(couple_id=27) \
    .is_depressed(d=False) \
    .execute(level=TextBody.DOCUMENT)
display(pgs)