Skip to content

Commit

Permalink
change corpora and datasets
Browse files Browse the repository at this point in the history
  • Loading branch information
nlpzhezhao committed Oct 1, 2023
1 parent 53c3365 commit 0efe195
Show file tree
Hide file tree
Showing 5 changed files with 1,017 additions and 30,017 deletions.
34 changes: 17 additions & 17 deletions .github/workflows/github-actions.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,32 +21,32 @@ jobs:
pip install -r requirements.txt
- name: Test with pytest
run: |
python preprocess.py --corpus_path corpora/book_review_bert.txt --vocab_path models/google_zh_vocab.txt --dataset_path bert_dataset.pt --processes_num 8 --seq_length 64 --data_processor bert
python pretrain.py --dataset_path bert_dataset.pt --vocab_path models/google_zh_vocab.txt --config_path models/bert/mini_config.json --output_model_path models/bert_model.bin --total_steps 10 --save_checkpoint_steps 10 --report_steps 2 --batch_size 2
python preprocess.py --corpus_path corpora/CLUECorpusSmall_bert_sampled.txt --vocab_path models/google_zh_vocab.txt --dataset_path dataset.pt --processes_num 8 --seq_length 64 --data_processor bert
python pretrain.py --dataset_path dataset.pt --vocab_path models/google_zh_vocab.txt --config_path models/bert/mini_config.json --output_model_path models/bert_model.bin --total_steps 10 --save_checkpoint_steps 10 --report_steps 2 --batch_size 2
mv models/bert_model.bin-10 models/bert_model.bin
python preprocess.py --corpus_path corpora/book_review.txt --vocab_path models/google_zh_vocab.txt --dataset_path roberta_dataset.pt --processes_num 8 --dynamic_masking --seq_length 64 --data_processor mlm
python pretrain.py --dataset_path roberta_dataset.pt --vocab_path models/google_zh_vocab.txt --config_path models/bert/mini_config.json --output_model_path models/roberta_model.bin --total_steps 10 --save_checkpoint_steps 10 --report_steps 2 --batch_size 2 --data_processor mlm --target mlm
python preprocess.py --corpus_path corpora/CLUECorpusSmall_sampled.txt --vocab_path models/google_zh_vocab.txt --dataset_path dataset.pt --processes_num 8 --dynamic_masking --seq_length 64 --data_processor mlm
python pretrain.py --dataset_path dataset.pt --vocab_path models/google_zh_vocab.txt --config_path models/bert/mini_config.json --output_model_path models/roberta_model.bin --total_steps 10 --save_checkpoint_steps 10 --report_steps 2 --batch_size 2 --data_processor mlm --target mlm
mv models/roberta_model.bin-10 models/roberta_model.bin
python preprocess.py --corpus_path corpora/book_review_bert.txt --vocab_path models/google_zh_vocab.txt --dataset_path albert_dataset.pt --processes_num 8 --seq_length 64 --data_processor albert
python pretrain.py --dataset_path albert_dataset.pt --vocab_path models/google_zh_vocab.txt --config_path models/albert/base_config.json --output_model_path models/albert_model.bin --total_steps 10 --save_checkpoint_steps 10 --report_steps 2 --batch_size 2
python preprocess.py --corpus_path corpora/CLUECorpusSmall_bert_sampled.txt --vocab_path models/google_zh_vocab.txt --dataset_path dataset.pt --processes_num 8 --seq_length 64 --data_processor albert
python pretrain.py --dataset_path dataset.pt --vocab_path models/google_zh_vocab.txt --config_path models/albert/base_config.json --output_model_path models/albert_model.bin --total_steps 10 --save_checkpoint_steps 10 --report_steps 2 --batch_size 2
mv models/albert_model.bin-10 models/albert_model.bin
python preprocess.py --corpus_path corpora/book_review.txt --vocab_path models/google_zh_vocab.txt --dataset_path gpt2_dataset.pt --processes_num 8 --seq_length 64 --data_processor lm
python pretrain.py --dataset_path gpt2_dataset.pt --vocab_path models/google_zh_vocab.txt --config_path models/gpt2/config.json --output_model_path models/gpt2_model.bin --total_steps 10 --save_checkpoint_steps 10 --report_steps 2 --batch_size 2
python preprocess.py --corpus_path corpora/CLUECorpusSmall_sampled.txt --vocab_path models/google_zh_vocab.txt --dataset_path dataset.pt --processes_num 8 --seq_length 64 --data_processor lm
python pretrain.py --dataset_path dataset.pt --vocab_path models/google_zh_vocab.txt --config_path models/gpt2/config.json --output_model_path models/gpt2_model.bin --total_steps 10 --save_checkpoint_steps 10 --report_steps 2 --batch_size 2
mv models/gpt2_model.bin-10 models/gpt2_model.bin
python preprocess.py --corpus_path corpora/book_review.txt --vocab_path models/google_zh_vocab.txt --dataset_path spanbert_dataset.pt --processes_num 8 --dynamic_masking --span_masking --seq_length 64 --data_processor mlm
python pretrain.py --dataset_path spanbert_dataset.pt --vocab_path models/google_zh_vocab.txt --config_path models/bert/mini_config.json --output_model_path models/spanbert_model.bin --total_steps 10 --save_checkpoint_steps 10 --report_steps 2 --batch_size 2 --data_processor mlm --target mlm
python preprocess.py --corpus_path corpora/CLUECorpusSmall_sampled.txt --vocab_path models/google_zh_vocab.txt --dataset_path dataset.pt --processes_num 8 --dynamic_masking --span_masking --seq_length 64 --data_processor mlm
python pretrain.py --dataset_path dataset.pt --vocab_path models/google_zh_vocab.txt --config_path models/bert/mini_config.json --output_model_path models/spanbert_model.bin --total_steps 10 --save_checkpoint_steps 10 --report_steps 2 --batch_size 2 --data_processor mlm --target mlm
mv models/spanbert_model.bin-10 models/spanbert_model.bin
python preprocess.py --corpus_path corpora/book_review_cls.txt --vocab_path models/google_zh_vocab.txt --dataset_path cls_dataset.pt --processes_num 8 --seq_length 64 --data_processor cls
python pretrain.py --dataset_path cls_dataset.pt --vocab_path models/google_zh_vocab.txt --config_path models/bert/mini_config.json --output_model_path models/cls_model.bin --total_steps 10 --save_checkpoint_steps 10 --report_steps 2 --batch_size 2 --labels_num 2 --data_processor cls --target cls
python preprocess.py --corpus_path corpora/book_review_cls.txt --vocab_path models/google_zh_vocab.txt --dataset_path dataset.pt --processes_num 8 --seq_length 64 --data_processor cls
python pretrain.py --dataset_path dataset.pt --vocab_path models/google_zh_vocab.txt --config_path models/bert/mini_config.json --output_model_path models/cls_model.bin --total_steps 10 --save_checkpoint_steps 10 --report_steps 2 --batch_size 2 --labels_num 2 --data_processor cls --target cls
mv models/cls_model.bin-10 models/cls_model.bin
python preprocess.py --corpus_path corpora/parallel_corpus_en_zh.txt --vocab_path models/google_uncased_en_vocab.txt --tgt_vocab_path models/google_zh_vocab.txt --dataset_path mt_dataset.pt --processes_num 8 --seq_length 64 --tgt_seq_length 64 --data_processor mt
python preprocess.py --corpus_path corpora/news-commentary-v13-en-zh_sampled.txt --vocab_path models/google_uncased_en_vocab.txt --tgt_vocab_path models/google_zh_vocab.txt --dataset_path mt_dataset.pt --processes_num 8 --seq_length 64 --tgt_seq_length 64 --data_processor mt
python pretrain.py --dataset_path mt_dataset.pt --vocab_path models/google_uncased_en_vocab.txt --tgt_vocab_path models/google_zh_vocab.txt --config_path models/transformer/base_config.json --output_model_path models/mt_model.bin --total_steps 10 --save_checkpoint_steps 10 --report_steps 2 --batch_size 2
mv models/mt_model.bin-10 models/mt_model.bin
python preprocess.py --corpus_path corpora/CLUECorpusSmall_5000_lines_bert.txt --vocab_path models/google_zh_vocab.txt --dataset_path pegasus_dataset.pt --processes_num 8 --seq_length 128 --tgt_seq_length 128 --dup_factor 1 --sentence_selection_strategy random --data_processor gsg
python pretrain.py --dataset_path pegasus_dataset.pt --vocab_path models/google_zh_vocab.txt --config_path models/pegasus/base_config.json --output_model_path models/pegasus_model.bin --total_steps 10 --save_checkpoint_steps 10 --report_steps 2 --batch_size 2
python preprocess.py --corpus_path corpora/CLUECorpusSmall_bert_sampled.txt --vocab_path models/google_zh_vocab.txt --dataset_path dataset.pt --processes_num 8 --seq_length 128 --tgt_seq_length 128 --dup_factor 1 --sentence_selection_strategy random --data_processor gsg
python pretrain.py --dataset_path dataset.pt --vocab_path models/google_zh_vocab.txt --config_path models/pegasus/base_config.json --output_model_path models/pegasus_model.bin --total_steps 10 --save_checkpoint_steps 10 --report_steps 2 --batch_size 2
mv models/pegasus_model.bin-10 models/pegasus_model.bin
python finetune/run_classifier.py --pretrained_model_path models/bert_model.bin --vocab_path models/google_zh_vocab.txt --config_path models/bert/mini_config.json --output_model_path models/classifier_model.bin --train_path datasets/test_data/chnsenticorp_test/train.tsv --dev_path datasets/test_data/chnsenticorp_test/dev.tsv --epochs_num 3 --batch_size 2
python inference/run_classifier_infer.py --load_model_path models/classifier_model.bin --vocab_path models/google_zh_vocab.txt --config_path models/bert/mini_config.json --test_path datasets/test_data/chnsenticorp_test/test_nolabel.tsv --prediction_path datasets/test_data/chnsenticorp_test/prediction.tsv --labels_num 2
python finetune/run_classifier.py --pretrained_model_path models/bert_model.bin --vocab_path models/google_zh_vocab.txt --config_path models/bert/mini_config.json --output_model_path models/classifier_model.bin --train_path datasets/test_data/book_review/train.tsv --dev_path datasets/test_data/book_review/dev.tsv --epochs_num 3 --batch_size 2
python inference/run_classifier_infer.py --load_model_path models/classifier_model.bin --vocab_path models/google_zh_vocab.txt --config_path models/bert/mini_config.json --test_path datasets/test_data/book_review/test_nolabel.tsv --prediction_path datasets/test_data/book_review/prediction.tsv --labels_num 2
python finetune/run_classifier.py --pretrained_model_path models/albert_model.bin --vocab_path models/google_zh_vocab.txt --config_path models/albert/base_config.json --output_model_path models/classifier_model.bin --train_path datasets/test_data/chnsenticorp_test/train.tsv --dev_path datasets/test_data/chnsenticorp_test/dev.tsv --learning_rate 4e-5 --epochs_num 3 --batch_size 2
python finetune/run_classifier_mt.py --pretrained_model_path models/bert_model.bin --vocab_path models/google_zh_vocab.txt --config_path models/bert/mini_config.json --dataset_path_list datasets/test_data/douban_test/ datasets/test_data/chnsenticorp_test/ --epochs_num 1 --batch_size 2
python finetune/run_ner.py --pretrained_model_path models/bert_model.bin --vocab_path models/google_zh_vocab.txt --config_path models/bert/mini_config.json --output_model_path models/ner_model.bin --train_path datasets/test_data/msra_ner_test/train.tsv --dev_path datasets/test_data/msra_ner_test/dev.tsv --label2id_path datasets/msra_ner/label2id.json --epochs_num 2 --batch_size 2
Expand Down
Loading

0 comments on commit 0efe195

Please sign in to comment.