Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
branch: master
Fetching contributors…

Cannot retrieve contributors at this time

110 lines (109 sloc) 3.926 kb
---
# NOTE(review): the nesting below was reconstructed from the key names and the
# section comments after the original indentation was lost; confirm it against
# the code that loads this file before relying on it.

# application main params
general:
  # where all paths and following files are located
  basedirectory: '.'
  locale: 'en_US.UTF-8'
  # the following directories or files are under "basedirectory"
  index: 'index'
  dbenv: 'db'
  # log options, file is "pytextminer-log.txt"
  logsize: 1024000
  loglevel: 'debug'
  user: 'sessions'
  shared: 'shared'
  whitelist_directory: 'whitelists'
  source_file_directory: 'source_files'
  userstopwords: 'user_stopwords.csv'
  # stopwords are under the "shared" directory
  stopwords: 'stopwords/en.txt'

# extraction settings
datasets:
  # the following values can be document object fields:
  # - defined by one of the field's values
  # - constant required fields: 'content'/'label'/'id'
  doc_extraction:
    - 'title'
    - 'content'
    # - 'keywords'
  # tina csv columns declaration
  # will ignore undeclared fields
  # and warn about optional fields that are not found
  tinacsv:
    # doc_label names one of the keys declared under "fields"
    # if not found in the file, will use the field specified by "label"
    doc_label: 'acronym'
    fields:
      # required fields
      label: 'doc_id'
      content: 'abstract'
      corpus_id: 'corp_id'
      id: 'doc_id'
      # optional fields
      keywords: 'keywords'
      title: 'title'
      acronym: 'acronym'
    # csv reader params
    encoding: 'utf_8'
    dialect: 'excel'
  # pubmed.gov "medline" file export
  medline:
    # doc_label names one of the keys declared under "fields"
    # if not found in the file, will use the field specified by "label"
    doc_label: 'title'
    fields:
      label: 'PMID'
      content: 'AB'
      corpus_id: 'DP'
      id: 'PMID'
      # optional fields
      title: 'TI'
    # period_size = number of leading characters of the pub date field
    # "corpusField"
    period_size: 4
    encoding: 'ascii'
  # archive of pubmed.gov "medline" files, organized like this
  # Medline/
  #   period1/period1.txt
  #   period2/period2.txt
  # medlinearchive:
  # TODO
  # extraction size
  ngramMin: 1
  ngramMax: 4
  # tagger learning on 2 x training_tagger_size sentences of tagged nltk corpus
  # to train on the whole corpus, use ~ (null) instead of a number of sentences
  training_tagger_size: ~
  # under the "basedirectory" directory; delete this file to regenerate a new
  # one on the next start
  tagger: 'shared/tagger.pickle'
  # change this regular expression to change NGram extraction filtering
  postag_valid: '^((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?,){0,2}?(N.?.?,|\?,)+?(CD.,)??)+?((PREP.?|DET.?,|IN.?,|CC.?,|\?,)((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?,){0,2}?(N.?.?,|\?,)+?)+?)*?$'
  # postag_valid: '^((VB,|VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?,){0,2}?(N.?.?,|\?,)(CD.,)?)+?((PREP.?|DET.?,|IN.?,|CC.?,|\?,)((VB,|VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?,){0,2}?(N.?.?,|\?,))+?)*?$'
  # postag_valid: '^((VB.?,|CD.?,|JJ.?,|\?,){0,2}?(N.?.?,|\?,))+?((PREP.?|DET.?,|IN.?,|CC.?,|\?,)((VB.?,|CD.?,|JJ.?,|\?,){0,2}?(N.?.?,|\?,))+?)*?$'
  # postag_valid: '^((VB.?,|JJ.?,|\?,){0,2}?(N.?.?,|\?,))+?((IN.?,|CC.?,|\?,)((VB.?,|JJ.?,|\?,){0,2}?(N.?.?,|\?,))+?)*?$'

datamining:
  template: 'shared/gexf/gexf.default.template'
  # default values if no params are passed to gexf exporter
  DocumentGraph:
    edgethreshold:
      - 0.0001
      - 'inf'
    nodethreshold:
      - 1.00
      - 'inf'
    # proximity: 'sharedNGrams'
    proximity: 'logJaccard'
    maxdegree: 100
  NGramGraph:
    edgethreshold:
      - 0.0001
      - 'inf'
    nodethreshold:
      - 1.00
      - 'inf'
    alpha: 0.10
    # hapax: 1
    proximity: 'Cooccurrences'
    # proximity: 'EquivalenceIndex'
    # proximity: 'PseudoInclusion'

indexer:
  minCooc: 10
Jump to Line
Something went wrong with that request. Please try again.