Commit
Merge branch 'release/0.0.1'
dermatologist committed Mar 18, 2019
2 parents 348c270 + 4940ca1 commit c00274f
Showing 21 changed files with 852 additions and 327 deletions.
67 changes: 39 additions & 28 deletions README.md
@@ -1,46 +1,55 @@
# QRMine
# :flashlight: QRMine

[![QRMine](https://raw.github.com/E-Health/nlp-qrmine/master/notes/QR.jpg)](http://canehealth.com)
QRMine is a suite of qualitative research (QR) support tools in Python using Natural Language Processing (NLP) and Machine Learning (ML). QRMine is still a work in progress and is not yet ready for use.

QRMine is a suite of qualitative research (QR) support tools in Python using NLP. Currently QRMine includes:
## What it does

- gtdict: Generates a coding dictionary based on available data (Grounded Theory)
- nnet: Evaluates the accuracy of an ANN with the given set of IVs and one DV (Theory Building)
- sentiment: Creates the CNN model for sentiment analysis.
- run_sentiment: Uses the CNN model created by the sentiment module for prediction.
### NLP
* Lists common categories for open coding.
* Creates a coding dictionary with categories, properties and dimensions.
* Topic modelling.
* Arranges documents according to topics.
* Compares two documents/interviews.
* Sentiment analysis.
* Network analysis.
* Co-citation finder.

- cocite: Finds the co-citation frequency for biomedical literature using NCBI's EUtils. More to come (work in progress).
### ML
* Accuracy of a neural network model trained using the data.
* Confusion matrix from a support vector machine classifier.
* K nearest neighbours of a given record.
* K-Means clustering.
* Association rules.

## How to Install:
## How to use

Check out this repo and
* Download/clone this repository
* pip install -r requirements.txt
* python qrmine.py (--help displays all command-line options; see the usage sketch below)
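A hedged usage sketch (the file names here are hypothetical; the flags match the click options defined in qrmine.py below):

```
python qrmine.py -i interviews.txt -c data.csv -o output.txt
```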

```
python setup.py sdist
OR
python setup.py bdist
OR
python setup.py bdist_wheel
```

## Input file format

### NLP
Individual documents or interview transcripts in a single text file, separated by <break>Topic</break> tags. An example:

```
Text of the first interview
<break> First interview with student 1 </break>
Text of the second interview
<break> Second interview with tutor 1 </break>
```

Multiple files are supported, each having only one break tag at the bottom with the topic.
(The tag may be renamed in the future.)
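A minimal sketch of how such a file might be split into documents and topics (the helper function and regex are assumptions, not QRMine's API):

```
import re

def split_interviews(text):
    """Split QRMine-style input into (topic, document) pairs.

    Assumes each document is followed by a <break> Topic </break> tag.
    """
    pattern = re.compile(r'(.*?)<break>(.*?)</break>', re.DOTALL)
    return [(topic.strip(), doc.strip()) for doc, topic in pattern.findall(text)]

with open('interviews.txt') as f:
    for topic, document in split_interviews(f.read()):
        print(topic, '-', len(document), 'characters')
```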

## How to use:
### ML

[Read](https://stackoverflow.com/questions/6292652/what-is-the-difference-between-an-sdist-tar-gz-distribution-and-an-python-egg)
A single CSV file with the following generic structure:

## Using Docker
* Column 1 contains the identifier. If the record relates to a text document as above, include the title.
* The last column holds the dependent variable (DV). (NLP outputs such as topic assignments may be able to create the DV.)
* All independent variables (numerical) go in between (see the illustrative example below).
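A hypothetical example (column names and values are illustrative only):

```
index,age,bmi,glucose,outcome
1,34,28.1,102,0
2,51,31.4,148,1
3,45,24.9,89,0
```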

TBD

## Author

@@ -63,3 +72,5 @@ is an example BibTeX entry:
}
```

A publication describing the theoretical foundations of this tool is in preparation. QRMine is inspired by [this work](https://github.com/lknelson/computational-grounded-theory) and the associated [paper](https://journals.sagepub.com/doi/abs/10.1177/0049124117729703).
19 changes: 0 additions & 19 deletions README.rst

This file was deleted.

12 changes: 11 additions & 1 deletion notes/notes.md
@@ -53,4 +53,14 @@ setup(
# ... other keys like project name, version, etc ...
options = dict(egg_info = dict(tag_build = "dev_" + GIT_HEAD_REV)),
)
```

## Command line

* https://pymbook.readthedocs.io/en/latest/click.html
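A minimal click sketch in the spirit of that tutorial (the command, option, and message are illustrative, not QRMine's actual interface):

```
import click

@click.command()
@click.option('--name', '-n', default='World', help='Who to greet.')
def hello(name):
    """Greet NAME on the command line."""
    click.echo('Hello, {}!'.format(name))

if __name__ == '__main__':
    hello()
```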


## Getters and setters

In a class like `testDec(object)`:
* One thing that is not completely easy to spot at first is the order: the getter must be defined first (see the sketch below).
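A minimal sketch of the pattern (class and attribute names are illustrative):

```
class testDec(object):
    @property
    def x(self):
        """Getter: must be defined first so that @x.setter can refer to it."""
        return self._x

    @x.setter
    def x(self, value):
        """Setter: decorated with the getter's name."""
        self._x = value

t = testDec()
t.x = 42      # invokes the setter
print(t.x)    # invokes the getter, prints 42
```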
111 changes: 111 additions & 0 deletions qrmine.py
@@ -0,0 +1,111 @@
import sys

import click
import textacy
from textacy.vsm.vectorizers import Vectorizer

from src.nlp_qrmine import Content
from src.nlp_qrmine import Network
from src.nlp_qrmine import Qrmine
from src.nlp_qrmine import ReadData
from src.nlp_qrmine import Sentiment
from src.ml_qrmine import MLQRMine

@click.command()
@click.option('--verbose', '-v', is_flag=True, help="Will print verbose messages.")
@click.option('--inp', '-i', multiple=True, default='',
help='Input file in the text format with <break> Topic </break>')
@click.option('--out', '-o', multiple=False, default='',
help='Output file name')
@click.option('--csv', '-c', multiple=False, default='',
help='csv file name')
@click.option('--doc', '-d', multiple=True, default='',
help='Document(s) to analyze/compare')
def cli(verbose, inp, out, csv, doc):
if verbose:
click.echo("We are in the verbose mode.")
if out:
sys.stdout = open(out, 'w')
if inp:
main(inp)


def main(input_file):
# ML
ml = MLQRMine()
ml.csvfile = "src/ml_qrmine/diabetes-risk.csv"
ml.prepare_data()
print(ml.get_nnet_predictions())
print("\n%s: %.2f%%" % (ml.model.metrics_names[1], ml.get_nnet_scores()[1] * 100))

print(ml.svm_confusion_matrix())

print(ml.knn_search(3))

    # The content property returns the entire text; the documents property returns the list of documents
data = ReadData()
data.read_file(input_file)

q = Qrmine()
all_interviews = Content(data.content)

## Summary
print(" ".join(all_interviews.generate_summary(2)))
print("_________________________________________")

doc = textacy.Doc(all_interviews.doc)

## Sentiment
s = Sentiment()
x = []
for sentence in doc.sents:
if len(sentence) > 3:
x.append(sentence.text)
sent = s.sentiment_analyzer_scores(sentence.text)
print("{:-<40} {}\n".format(sent["sentence"], str(sent["score"])))
print("{:-<40} {}\n".format(sentence.text, str(s.similarity(sentence.text, "Dummy sentence"))))

## Network
n = Network()
print(n.sents_to_network(x))
# n.draw_graph(True)
print(n.draw_graph(False))

# create an empty corpus
en = textacy.load_spacy('en_core_web_sm', disable=('parser',))
corpus = textacy.Corpus(lang=en)

ct = 0
for document in data.documents:
metadata = {}
try:
metadata['title'] = data.titles[ct]
except IndexError:
metadata['title'] = 'Empty'
corpus.add_text(textacy.preprocess_text(document, lowercase=True, no_punct=True, no_numbers=True),
metadata=metadata)
ct += 1
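    # Vectorize the corpus: tf-idf weighted document-term matrix over 1- to 3-grams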
vectorizer = Vectorizer(tf_type='linear', apply_idf=True, idf_type='smooth',
norm='l2', min_df=3, max_df=0.95, max_n_terms=100000)
    doc_term_matrix = vectorizer.fit_transform(
        (document.to_terms_list(ngrams=(1, 2, 3), named_entities=True,
                                as_strings=True, filter_stops=True,
                                filter_punct=True, filter_nums=True,
                                min_freq=1)
         for document in corpus))
number_docs, terms = doc_term_matrix.shape
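    # Fit an NMF topic model, with as many topics as there are documents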
model = textacy.TopicModel('nmf', n_topics=number_docs)
model.fit(doc_term_matrix)

doc_topic_matrix = model.transform(doc_term_matrix)

_, number_topics = doc_topic_matrix.shape

print("_________________________________________")
print("QRMine(TM) Qualitative Research Miner. v" + q.get_git_revision_short_hash)
q.print_categories(doc)
q.print_topics(model, vectorizer, number_topics)
q.print_documents(model, corpus, doc_topic_matrix, number_topics)
q.print_dict(all_interviews)


if __name__ == '__main__':
    cli()  # invoke the click command-line interface
11 changes: 10 additions & 1 deletion requirements.txt
@@ -10,4 +10,13 @@ preggy
Keras
keras-text
spacy
https://github.com/explosion/spacy-models/releases/download/en_core_web_md-1.2.1/en_core_web_md-1.2.1.tar.gz
textacy
click
vaderSentiment
imbalanced-learn==0.4.3
numpy
matplotlib
pandas
xgboost
#https://github.com/explosion/spacy-models/releases/download/en_core_web_md-1.2.1/en_core_web_md-1.2.1.tar.gz
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz
Empty file added src/__init__.py
Empty file.
Empty file added src/misc_qrmine/__init__.py
Empty file.
4 changes: 1 addition & 3 deletions src/nlp_qrmine/cocite.py → src/misc_qrmine/cocite.py
@@ -5,7 +5,6 @@


def main():

print("Finding Articles citing the given reference.....")

for var in sys.argv:
@@ -21,7 +20,7 @@ def main():
articles.append(record['PMID'])

# Remove duplicates: SO 7961363
    articles = list(set(articles))
article_no = len(articles)

print("Finding Co-citations. This may take several hours...........")
Expand All @@ -37,6 +36,5 @@ def main():
print("-----------------------------------------")



if __name__ == '__main__': # if we're running file directly and not importing it
main() # run the main function
@@ -10,7 +10,7 @@
from keras.models import load_model
from keras.preprocessing import sequence

from src.nlp_qrmine.sentiment import process_msg
from src.misc_qrmine.sentiment import process_msg

model = load_model('data/classifier.h5')
vocab = pickle.load(open('data/vocab.pkl', 'rb'))
