## Getting started with scikit-talk

To get started, let's load some data:

In [40]:
from sktalk.corpus.parsing.cha import ChaFile
from sktalk.corpus import Corpus

parsed_cha = ChaFile('../data/02.cha').parse()

parsed_cha.utterances[0].metadata

This data is a conversation object. We want to extract the text from it.

In [42]:
#parsed_cha.utterances
#parsed_cha.metadata

# for utt in parsed_cha.utterances:
#     print(utt)
#     print(utt.metadata)

corpus_metadata = {
    'language': 'English',
    'dialect': 'Canadian English',
    'type': 'conversational speech',
}

english = Corpus(conversations=[],
                   metadata = corpus_metadata)

# example of use:
# use kwargs to pass metadata
# use args to pass conversations
english = Corpus(parsed_cha, other_parsed_cha, # should be empty on init  
                 language = "english",
                 dialect = "canadian english",
                 type = "conversational speech")

# the user could initialize a corpus without conversations
french1 = Corpus(language = "french1",
                other_meta = "more info",
                importer = "Andreas")

# then add conversations later in a loop
for file in folder:
    conversation = ChaFile(file)
    french1.addConversation(conversation)

#chacorpus.addConversation(parsed_cha)


#parsed_cha.write('../data/02.cha')

english.return_json('../data/02.json')

TypeError: Corpus.return_json() takes 1 positional argument but 2 were given

In [12]:
# We should be able to dump the entire parsed CHA file into a json

parsed_cha.to_json('../data/02.json')
# parsed_cha.to_csv('../data/02.csv')

# parsed_cha.summarize()

AttributeError: 'Conversation' object has no attribute 'to_json'

## The `from_convokit` module

Here, we import a single class from the `csv_to_json` module, and use it.

In [2]:
import sktalk.from_convokit as ck

In [1]:
import pandas as pd
df = pd.read_csv("../data/ulwa_testdata_convokit_format.csv")

Unnamed: 0,timestamp,speaker,text,translation,conversation_id,utterance_raw,reply_to
0,1332718704,Tang,U oughs inim tï samting yan,Lorem ipsum dolor sit amet.,/ulwa1/ulwa014,U oughs inim tï samting yangama ul matï akïnakape,
1,1332732704,Yan,mbam ndul ma wandam ana,At neque fugit eum reprehenderit labore et exe...,/ulwa1/ulwa014,wimbam ndul ma wandam anapa ol welunda nïkap t...,
2,1332743704,Tang,Mï inim wandam bai anapa nd,a veritatis tempore sit vitae quaerat sed cons...,/ulwa1/ulwa014,Mï inim wandam bai anapa ndïtï ka welunda unan,
3,1332754704,Yan,lunda we ndïmïne in,,/ulwa1/ulwa014,ata welunda we ndïmïne ind,
4,1332765704,Tang,kïnakape akïnaka,,/ulwa1/ulwa014,i akïnakape akïnakap,
5,1332776704,Yan,coughs ndïmïne we ndul wa le we ndïtï akïnakap...,Et illo facere vel magni necessitatibus est as...,/ulwa1/ulwa014,[coughs] I inim oughs ka lopop mananda bai kïk...,
6,1332787704,Tang,mananda,,/ulwa1/ulwa014,n mananda ndïtï ka akïnakape wimbam,
7,1332788704,Yan,da,,/ulwa1/ulwa014,da ndïtï ka,
8,1332789704,Tang,e kïkal awi akïnakape,onsequatur amet qui nisi facilis et perferendi...,/ulwa1/ulwa014,e kïkal awi akïnakape manï lï,
9,1332999704,Yan,atïm inim.,itae quaerat sed consequatur amet,/ulwa1/ulwa014,atïm inim.,


In [4]:
corpus = ck.Corpus.from_pandas(utterances_df = df)


ID column is not present in utterances dataframe, generated ID column from dataframe index...


10it [00:00, 4043.87it/s]


In [5]:
corpus.dump(name ='testcorpus', base_path="../data/")

## The `csv_to_json` module

Here, we import a single class from the `csv_to_json` module, and use it.

In [6]:
import sktalk.corpus as cj


In [7]:
corpus = cj.Corpus("../data/ulwa_testdata_sktalk_format.csv")
corpus.return_dataframe()
corpus.df

Unnamed: 0,begin,end,participant,utterance,translation,source,utterance_raw
0,00:00:00.917,00:00:05.604,Tang,U oughs inim tï samting yan,Lorem ipsum dolor sit amet.,/ulwa1/ulwa014,U oughs inim tï samting yangama ul matï akïnakape
1,00:00:04.830,00:00:09.080,Yan,mbam ndul ma wandam ana,At neque fugit eum reprehenderit labore et exe...,/ulwa1/ulwa014,wimbam ndul ma wandam anapa ol welunda nïkap t...
2,00:00:06.090,00:00:09.450,Tang,Mï inim wandam bai anapa nd,a veritatis tempore sit vitae quaerat sed cons...,/ulwa1/ulwa014,Mï inim wandam bai anapa ndïtï ka welunda unan
3,00:00:09.534,00:00:10.333,Yan,lunda we ndïmïne in,,/ulwa1/ulwa014,ata welunda we ndïmïne ind
4,00:00:10.333,00:00:11.143,Tang,kïnakape akïnaka,,/ulwa1/ulwa014,i akïnakape akïnakap
5,00:00:11.143,00:00:18.240,Yan,coughs ndïmïne we ndul wa le we ndïtï akïnakap...,Et illo facere vel magni necessitatibus est as...,/ulwa1/ulwa014,[coughs] I inim oughs ka lopop mananda bai kïk...
6,00:00:11.477,00:00:12.205,Tang,mananda,,/ulwa1/ulwa014,n mananda ndïtï ka akïnakape wimbam
7,00:00:14.390,00:00:15.696,Yan,da,,/ulwa1/ulwa014,da ndïtï ka
8,00:00:17.972,00:00:20.722,Tang,e kïkal awi akïnakape,onsequatur amet qui nisi facilis et perferendi...,/ulwa1/ulwa014,e kïkal awi akïnakape manï lï
9,00:00:18.240,00:00:21.970,Yan,atïm inim.,itae quaerat sed consequatur amet,/ulwa1/ulwa014,atïm inim.


In [8]:
corpus.return_json()

In [9]:
corpus.json

'{"begin":{"0":"00:00:00.917","1":"00:00:04.830","2":"00:00:06.090","3":"00:00:09.534","4":"00:00:10.333","5":"00:00:11.143","6":"00:00:11.477","7":"00:00:14.390","8":"00:00:17.972","9":"00:00:18.240"},"end":{"0":"00:00:05.604","1":"00:00:09.080","2":"00:00:09.450","3":"00:00:10.333","4":"00:00:11.143","5":"00:00:18.240","6":"00:00:12.205","7":"00:00:15.696","8":"00:00:20.722","9":"00:00:21.970"},"participant":{"0":"Tang","1":"Yan","2":"Tang","3":"Yan","4":"Tang","5":"Yan","6":"Tang","7":"Yan","8":"Tang","9":"Yan"},"utterance":{"0":"U oughs inim t\\u00ef samting yan","1":"mbam ndul ma wandam ana","2":"M\\u00ef inim wandam bai anapa nd","3":"lunda we nd\\u00efm\\u00efne in","4":"k\\u00efnakape ak\\u00efnaka","5":"coughs nd\\u00efm\\u00efne we ndul wa le we nd\\u00eft\\u00ef ak\\u00efnakape malimap mat\\u00ef yawa mananda","6":"mananda","7":"da","8":"e k\\u00efkal awi ak\\u00efnakape","9":"at\\u00efm inim."},"translation":{"0":"Lorem ipsum dolor sit amet.","1":"At neque fugit eum reprehe