## Extract text from a Word document

In [1]:
# Location of the .docx file that the text-extraction cell reads.
# NOTE(review): this is an absolute, machine-specific path — adjust it before running elsewhere.
path = "/Users/eunji/Desktop/rename/nonadia.docx"

## 1. Using textract

In [7]:
import textract

# Pull the document contents out as bytes, then decode them into a str.
text = (
    textract.process(path, language='eng')
    .decode('utf-8')
)
# Preview the first 200 characters as a quick sanity check.
print(text[:200])

Efficient and % = Accurate Nonadiabatic accurately Molecular Dynamics with Spin-Flip TDDFT

We propose an efficient and accurate nonadiabatic molecular dynamics (NAMD) method using spin-flip time-depe


## 2. Select words

In [3]:
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Clean the extracted text step by step, printing the first 20 tokens after
# each stage so the effect of every step is visible. The final result is
# left in `words`.

# Split the raw text into word and punctuation tokens.
words = word_tokenize(text)
print(words[:20])

# Convert to lower case.
tokens = [w.lower() for w in words]
print("=====Convert to lowercase=====")
print(tokens[:20])

# Remove punctuation: every character in string.punctuation is deleted, so a
# hyphenated token such as 'spin-flip' becomes 'spinflip' and a token made
# only of punctuation becomes an empty string.
re_punc = re.compile('[%s]' % re.escape(string.punctuation))
stripped = [re_punc.sub('', w) for w in tokens]
print("=====Remove punctuations=====")
print(stripped[:20])

# Remove remaining tokens that are not purely alphabetic (this also drops the
# empty strings left behind by the punctuation step).
words = [word for word in stripped if word.isalpha()]
print("=====Keep alphabetic=====")
print(words[:20])

# Remove English stopwords. The NLTK stopwords fileid is lowercase
# ("english"); the capitalised form only resolves on case-insensitive
# filesystems.
stop_words = set(stopwords.words("english"))
words = [w for w in words if w not in stop_words]
print("=====Remove 'English' stopwords=====")
print(words[:20])

['Efficient', 'and', '%', '=', 'Accurate', 'Nonadiabatic', 'accurately', 'Molecular', 'Dynamics', 'with', 'Spin-Flip', 'TDDFT', 'We', 'propose', 'an', 'efficient', 'and', 'accurate', 'nonadiabatic', 'molecular']
=====Convert to lowercase=====
['efficient', 'and', '%', '=', 'accurate', 'nonadiabatic', 'accurately', 'molecular', 'dynamics', 'with', 'spin-flip', 'tddft', 'we', 'propose', 'an', 'efficient', 'and', 'accurate', 'nonadiabatic', 'molecular']
=====Remove punctuations=====
['efficient', 'and', '', '', 'accurate', 'nonadiabatic', 'accurately', 'molecular', 'dynamics', 'with', 'spinflip', 'tddft', 'we', 'propose', 'an', 'efficient', 'and', 'accurate', 'nonadiabatic', 'molecular']
=====Keep alphabetic=====
['efficient', 'and', 'accurate', 'nonadiabatic', 'accurately', 'molecular', 'dynamics', 'with', 'spinflip', 'tddft', 'we', 'propose', 'an', 'efficient', 'and', 'accurate', 'nonadiabatic', 'molecular', 'dynamics', 'namd']
=====Remove 'English' stopwords=====
['efficient', 'accurat

## 3. Feature extraction

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the vectorizer and learn its vocabulary from the cleaned tokens in a
# single expression; fit() hands back the fitted vectorizer itself.
vectorizer = CountVectorizer().fit(words)
# Summarize: show the learned term -> index mapping.
print(vectorizer.vocabulary_)


{'efficient': 423, 'accurate': 10, 'nonadiabatic': 885, 'accurately': 11, 'molecular': 835, 'dynamics': 412, 'spinflip': 1273, 'tddft': 1358, 'propose': 1068, 'namd': 865, 'method': 816, 'using': 1457, 'timedependent': 1388, 'density': 329, 'functional': 570, 'theory': 1371, 'sftddft': 1210, 'new': 879, 'state': 1281, 'tracking': 1399, 'algorithm': 28, 'calculating': 136, 'coupling': 293, 'terms': 1365, 'nact': 862, 'provides': 1071, 'potential': 1024, 'energy': 447, 'curves': 305, 'near': 871, 'conical': 242, 'intersection': 689, 'linearresponse': 756, 'test': 1366, 'simulation': 1232, 'geometrical': 580, 'changes': 160, 'time': 1385, 'typical': 1434, 'geometries': 581, 'around': 71, 'lifetime': 748, 'good': 590, 'agreement': 23, 'results': 1162, 'expensive': 490, 'ab': 0, 'initio': 674, 'multiple': 855, 'spawning': 1263, 'improved': 641, 'accuracy': 9, 'dramatically': 403, 'following': 536, 'phase': 985, 'alignments': 32, 'orbitals': 932, 'well': 1494, 'ordering': 935, 'improvement':