# Experiment organization
Goal: Gather results from all topic modeling experiments that use the various stemming treatments for a particular corpus, then create some visualizations.
The expected tree structure for the corpus and experiments is as follows:
```
<corpus> # corpus name - 'tiger','rnc' or 'opencorpora'
├── <corpus>_<stemmer> # stemmer or lemmatization treatment name, see topic_modeling/stemming.py for language details
│   ├── <corpus>_oracleAnalysis.gz # Only present when stemmer=='oracle', the annotation labels for the words in the corpus, in sequence to match up with state files
│   ├── <corpus>_<stemmer>.mallet # mallet import-dir --output
│   ├── <corpus>_<stemmer>.tsv # output of topic_modeling/corpus_preprocessing.py for this particular treatment
│   ├── <corpus>_<stemmer>_<num_topics>_topics_<num_iters>_iters_<experiment_id> 
│   │   ├── <corpus>_<stemmer>_diagnostics.tsv # mallet train-topics metrics as tsv
│   │   ├── <corpus>_<stemmer>_diagnostics.xml # original mallet train-topics metrics format
│   │   ├── <corpus>_<stemmer>_doc_topics.txt # mallet train-topics --output-doc-topics 
│   │   ├── <corpus>_<stemmer>_entropy_metrics.tsv # metrics produced by topic_modeling/mallet_parser slot-entropy
│   │   ├── <corpus>_<stemmer>.model # mallet topic model 
│   │   ├── <corpus>_<stemmer>_postLemmatized_diagnostics.tsv # mallet train-topics metrics for the version of this model lifted to lemmas 
│   │   ├── <corpus>_<stemmer>_postLemmatized_diagnostics.xml 
│   │   ├── <corpus>_<stemmer>_postLemmatized.mallet # mallet corpus sequence file lifted to lemmas (we probably only need to create this once, but I didn't think of that earlier, so for now each experiment gets its own)
│   │   ├── <corpus>_<stemmer>_postLemmatized.model # mallet topic model lifted to lemmas
│   │   ├── <corpus>_<stemmer>_postLemmatized_state.gz # mallet topic model state file lifted to lemmas
│   │   ├── <corpus>_<stemmer>_state.gz # mallet topic model state file
│   │   ├── <corpus>_<stemmer>_top_docs.txt # mallet train-topics --output-topic-docs 
│   │   ├── <corpus>_<stemmer>_topic_keys.txt # mallet train-topics --output-topic-keys
│   │   ├── <corpus>_<stemmer>_topic_lemmas.tsv # Counts and conditional probabilities of lemmas for each topic
│   │   ├── <corpus>_<stemmer>_topic_pos.tsv  # Counts and conditional probabilities of parts-of-speech for each topic
│   │   ├── <corpus>_<stemmer>_topic_slots.tsv  # Counts and conditional probabilities of detailed morphological analyses for each topic
│   │   └── <corpus>_<stemmer>_top_terms.tsv # Raw counts of top 20 terms for each topic
└── voi_<num_topics>_topics # Variation of information between different models for the same number of topics
    └── <corpus>_<stemmer1>_<experiment1_id>_<corpus>_<stemmer2>_<experiment2_id>.tsv # Compares treatment 1 and treatment 2 

```



In [1]:
from pathlib import Path

# Corpus to analyse; the markdown above lists 'tiger', 'rnc' and 'opencorpora'.
corpus_name = "tiger"
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
corpus_root = Path("/home/virginia/workspace/topic-modeling-study") / corpus_name

# Path.glob yields entries in an unspecified, filesystem-dependent order. Sort so
# that positional picks such as experiment_folders[0] are the same on every run.
experiment_folders = sorted(corpus_root.glob(f"{corpus_name}_*/*_topics_*_iters_*"))
voi_folders = sorted(corpus_root.glob("voi_*"))
print("Num experiment folders found:", len(experiment_folders))
print("VOI folders:", voi_folders)

Num experiment folders found: 107
VOI folders: [PosixPath('/home/virginia/workspace/topic-modeling-study/tiger/voi_100_topics'), PosixPath('/home/virginia/workspace/topic-modeling-study/tiger/voi_50_topics')]


In [6]:
import pandas as pd
def _count_serialized_set_items(cell):
    """Return the number of items in a set that was serialized to text.

    The entropy-metrics TSV stores set-valued columns as text (for example
    ``"{'Volk', 'Welt'}"`` or ``"set()"``), so pandas reads each cell back
    as a string rather than as a set.

    Args:
        cell: A single cell value. Non-string values (e.g. NaN for an empty
            field) and blank strings are counted as 0.

    Returns:
        int: Number of elements in the parsed collection.

    Raises:
        ValueError, SyntaxError: If the text is not a valid Python literal.
    """
    import ast  # local import: the notebook's import cells do not provide ast

    if not isinstance(cell, str):
        return 0
    text = cell.strip()
    # "set()" is handled explicitly because ast.literal_eval only accepts it
    # on Python 3.9+.
    if not text or text == "set()":
        return 0
    return len(ast.literal_eval(text))


def parse_experiment_directory(experiment_path):
    """Collect the per-topic metrics of one experiment into a single DataFrame.

    The directory name is expected to look like
    ``<corpus>_<stemmer>_<num_topics>_topics_<num_iters>_iters_<experiment_id>``.
    Three files are read from inside the directory:

    * ``<corpus>_<stemmer>_diagnostics.tsv`` (indexed by ``id``)
    * ``<corpus>_<stemmer>_postLemmatized_diagnostics.tsv`` (indexed by ``id``);
      only its ``exclusivity`` column is kept, as ``lemma_exclusivity``
    * ``<corpus>_<stemmer>_entropy_metrics.tsv`` (indexed by ``topic``)

    Args:
        experiment_path (pathlib.Path): Path of the experiment directory.

    Returns:
        pandas.DataFrame: The diagnostics frame with ``corpus``, ``stemmer`` and
        ``experiment_id`` inserted as the first three columns, plus
        ``lemma_exclusivity``, joined on the index with the entropy metrics.
        The entropy metrics gain a ``num_lemmas_in_top_20_terms`` column holding
        the size of each topic's ``lemmas_in_20_terms`` set.

    Raises:
        ValueError: If the directory name does not follow the expected pattern.
        FileNotFoundError: If one of the expected TSV files is missing
            (raised by ``pandas.read_csv``).
    """
    dir_name = experiment_path.name
    try:
        # Peel the five fixed trailing fields off from the right so that a
        # stemmer name containing underscores stays in one piece.
        prefix, _num_topics, _, _num_iters, _, experiment_id = dir_name.rsplit("_", 5)
        corpus, treatment = prefix.split("_", 1)
    except ValueError:
        raise ValueError(
            f"Unexpected experiment directory name {dir_name!r}; expected "
            "<corpus>_<stemmer>_<num_topics>_topics_<num_iters>_iters_<experiment_id>"
        ) from None

    file_prefix = f"{corpus}_{treatment}"

    final_frame = pd.read_csv(
        experiment_path / f"{file_prefix}_diagnostics.tsv", sep="\t", index_col="id"
    )
    final_frame.insert(loc=0, column="corpus", value=corpus)
    final_frame.insert(loc=1, column="stemmer", value=treatment)
    final_frame.insert(loc=2, column="experiment_id", value=experiment_id)

    lemma_mallet_diagnostics = pd.read_csv(
        experiment_path / f"{file_prefix}_postLemmatized_diagnostics.tsv",
        sep="\t",
        index_col="id",
    )
    # Assignment aligns on the topic id index shared by both diagnostics files.
    final_frame["lemma_exclusivity"] = lemma_mallet_diagnostics["exclusivity"]

    entropy_metrics = pd.read_csv(
        experiment_path / f"{file_prefix}_entropy_metrics.tsv", sep="\t", index_col="topic"
    )
    # Count the lemmas per topic. len() of the whole column would give the
    # number of topics and broadcast that same value to every row.
    entropy_metrics["num_lemmas_in_top_20_terms"] = entropy_metrics[
        "lemmas_in_20_terms"
    ].map(_count_serialized_set_items)

    return pd.merge(final_frame, entropy_metrics, left_index=True, right_index=True)


In [7]:
# Try the parser on a single experiment folder.
test_frame = parse_experiment_directory(experiment_folders[0])
# Only a cell's last expression is displayed, so a bare `test_frame.columns`
# in the middle of the cell shows nothing. Print the names explicitly, since
# head() abbreviates the middle columns of a wide frame.
print(test_frame.columns.tolist())
test_frame.head()

Unnamed: 0_level_0,corpus,stemmer,experiment_id,tokens,document_entropy,word-length,coherence,uniform_dist,corpus_dist,eff_num_words,...,slot_entropy,pos_entropy,lemmas_to_top_20_surface_forms,slots_to_top_20_surface_forms,pos_to_top_20_surface_forms,top_20_term_set,top_20_lemma_set,lemmas_in_20_terms,top_lemmas_minus_top_term_lemmas,num_top_lemmas_excluded_by_top_terms
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,tiger,snowball,8,5495.0,4.9217,6.55,-390.2468,4.1912,3.1576,297.1045,...,5.847058,2.268032,2.1,3.25,0.3,"{'deutschland', 'mensch', 'volk', 'evangel', '...","{'Volk', 'EKD', 'Welt', 'sozial', 'Krieg', 'Ki...","{'verantworten', 'Evangele', 'Volk', 'Politike...","{'EKD', 'Christ'}",2
1,tiger,snowball,8,3142.0,3.8596,5.75,-545.4456,4.1048,3.8789,450.1671,...,5.725491,2.166171,1.55,2.9,0.25,"{'land', 'mexiko', 'sri', 'san', 'sud', 'stadt...","{'Mexiko', 'Meer', 'Land', 'Süden', 'Stadt', '...","{'Mexiko', 'Meer', 'Land', 'Philippinen', 'Nor...","{'Jaffna', 'Chamula'}",2
2,tiger,snowball,8,7373.0,5.0547,6.5,-313.2359,4.1087,2.8627,322.3903,...,5.616719,2.050576,1.45,3.0,0.35,"{'weltweit', 'siem', 'umsatz', 'million', 'erl...","{'weltweit', 'Unternehmen', 'Million', 'Aktion...","{'weltweit', 'erlösen', 'Unternehmen', 'Millio...","{'Belegschaft', 'übernehmen'}",2
3,tiger,snowball,8,9029.0,4.2369,8.4,-248.3175,4.1628,2.6433,273.8653,...,5.710308,2.005119,1.95,3.3,0.35,"{'deutschland', 'mensch', 'offent', 'arbeit', ...","{'Arbeit', 'Unternehmen', 'sozial', 'ökonomisc...","{'Arbeit', 'Politiker', 'arbeiten', 'National'...",set(),0
4,tiger,snowball,8,4267.0,4.3778,7.6,-374.0873,4.4821,3.4902,184.7049,...,5.661638,1.796045,2.15,3.65,0.35,"{'arbeitgeb', 'kundig', 'arbeitnehm', 'arbeit'...","{'Arbeit', 'Unternehmen', 'Medium', 'Gewerksch...","{'Arbeit', 'kundig', 'betreiben', 'arbeiten', ...","{'Medium', 'Arbeitszeit', 'DGB', 'schaffen'}",4
