In [10]:
import docx
import os

Loading the data: getting full text from all the .docx interview files

In [33]:
#function to get all text from a file
def getText(filename):
    """Return the text of every paragraph in a .docx file as one string.

    Parameters
    ----------
    filename : str or path-like
        Location of the Word document; passed straight to ``docx.Document``.

    Returns
    -------
    str
        The text of each entry in ``doc.paragraphs`` followed by a single
        space, concatenated in document order. A document with no
        paragraphs yields the empty string.
    """
    doc = docx.Document(filename)
    # NOTE(review): only doc.paragraphs is read; text held elsewhere in the
    # file (e.g. tables) is presumably not included - confirm against the
    # python-docx documentation if the interview notes use tables.
    # str.join builds the result in one pass instead of re-copying the
    # accumulated string on every paragraph.
    return "".join(para.text + " " for para in doc.paragraphs)

In [35]:
# Folder holding the interview notes. The location can be overridden with the
# INTERVIEW_FOLDER environment variable so the notebook is not tied to a
# single machine; the original hard-coded path remains the default.
interview_folder = os.environ.get(
    "INTERVIEW_FOLDER",
    "C:\\Users\\605665\\Documents\\Student2Student\\Interview Notes",
)
#change working directory
os.chdir(interview_folder)
# last expression of the cell: displays the directory now in effect
os.getcwd()

'C:\\Users\\605665\\Documents\\Student2Student\\Interview Notes'

In [36]:
allText = list()  # collects the full text of each interview, one string per document

In [39]:
# Read the text of every interview document in the folder.
# - The list is rebuilt from scratch (not appended to), so re-running this
#   cell does not duplicate the interviews.
# - Each name is joined with interview_folder, so loading does not depend on
#   the current working directory.
# - sorted() gives a reproducible document order; os.listdir makes no
#   ordering guarantee.
allText = [
    getText(os.path.join(interview_folder, filename))
    for filename in sorted(os.listdir(interview_folder))
    # keep real Word documents only; names starting with "~$" are the
    # temporary lock files Word creates while a document is open
    if filename.lower().endswith(".docx") and not filename.startswith("~$")
]

Pre-processing:
- Tokenization: split text, lowercase, remove punctuation 
- removing words with 3 or fewer characters
- removing stopwords 
- lemmatized: grouping different inflections of words together 
- stemmed: words reduced to root form 

In [41]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\605665\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [45]:
# Lemmatization demo: with pos='v' the word is looked up as a verb,
# so the past-tense form 'went' is printed as its base form.
demo_lemmatizer = WordNetLemmatizer()
print(demo_lemmatizer.lemmatize('went', pos='v'))

go


In [47]:
# Shared English Snowball stemmer, reused by the preprocessing helpers below
stemmer = SnowballStemmer(language='english')

In [48]:
# lemmatization and stemming
def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then stem the result.

    Parameters
    ----------
    text : str
        One token (word) to normalise.

    Returns
    -------
    str
        Output of the module-level ``stemmer`` applied to the WordNet
        verb lemma of ``text``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Turn raw text into a list of normalised tokens.

    The text is tokenized with ``gensim.utils.simple_preprocess``. Tokens
    that are gensim stopwords, or that have 3 or fewer characters, are
    dropped. Each remaining token is passed through ``lemmatize_stemming``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list
        The processed tokens, in their original order.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        # length is measured on the raw token, before lemmatizing/stemming
        if len(token) > 3 and token not in stopwords
    ]

In [49]:
# Sanity check of the preprocessing on the first interview: print the raw
# space-separated words, then the output of preprocess() for the same text.
doc_sample = allText[0]
words = doc_sample.split(' ')
print('original document: ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['', 'since', 'he', 'can', 'remember,', 'first', 'was', 'DC', 'and', 'then', 'China.', 'From', 'China', '(4', 'years)', 'moved', 'to', 'Malaysia', '(3', '½', 'years)', 'moved', 'to', 'Colorado.', '9', 'when', 'he', 'moved', 'to', 'Malaysia.', 'Older', 'siblings', 'had', 'the', 'worst', 'of', 'it?', '(think', 'this', 'is', 'what', 'he', 'said)', '', 'Academically', 'the', 'problem', 'was', 'that', 'every', 'different', 'school', 'had', 'a', 'way', 'of', 'teaching', 'it.', 'Learned', 'the', 'same', 'stuff', 'over', 'and', 'over', 'again', '(earth', 'science', '5', 'times', 'lol)', 'The', 'only', 'science', 'he', 'ever', 'had', 'in', 'elementary', 'and', 'middle', 'school.', 'Your', 'family', 'is', 'your', 'friends', 'when', 'you’re', 'in', 'the', 'military,', 'moving', 'to', 'a', 'new', 'school', 'every', '3', 'years', 'was', 'really', 'hard', 'because', 'you', 'make', 'good', 'friends', 'and', 'then', 'you', 'leave', 'them.', 'Sort', 'of', 'build', 'a', 'social', 'wa

In [51]:
# Apply preprocess() to every interview: one token list per document
processed_docs = list(map(preprocess, allText))
# preview the first three processed documents
processed_docs[:3]

[['rememb',
  'china',
  'china',
  'year',
  'move',
  'malaysia',
  'year',
  'move',
  'colorado',
  'move',
  'malaysia',
  'older',
  'sibl',
  'worst',
  'think',
  'say',
  'academ',
  'problem',
  'differ',
  'school',
  'teach',
  'learn',
  'stuff',
  'earth',
  'scienc',
  'time',
  'scienc',
  'elementari',
  'middl',
  'school',
  'famili',
  'friend',
  'militari',
  'move',
  'school',
  'year',
  'hard',
  'good',
  'friend',
  'leav',
  'sort',
  'build',
  'social',
  'wall',
  'peopl',
  'struggl',
  'older',
  'brother',
  'date',
  'leav',
  'girlfriend',
  'sibl',
  'extrem',
  'social',
  'anxieti',
  'part',
  'autist',
  'stay',
  'contact',
  'peopl',
  'hard',
  'militari',
  'sanction',
  'hous',
  'neighborhood',
  'think',
  'experi',
  'famili',
  'know',
  'introduc',
  'area',
  'necessarili',
  'friend',
  'introduc',
  'peopl',
  'end',
  'friend',
  'build',
  'relationship',
  'standard',
  'educ',
  'teacher',
  'awar',
  'militari',
  'child',
  '