# Spacy Basics
Devin J. Cornell, June 2022

In [1]:
import pandas as pd

# Data Structure Basics

In [2]:
mylist = list(range(1000000))
myset = set(range(1000000))
%timeit 35983048 in mylist
%timeit 35983048 in myset

11 ms ± 60.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
45.5 ns ± 0.137 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [3]:
d = {1:2, 3:4}
d[1]

2

In [4]:
import collections

toks = ['a', 'a', 'b', 'c']
ctr = collections.Counter(toks)
ctr['d'] += 1
ctr

Counter({'a': 2, 'b': 1, 'c': 1, 'd': 1})

# Text Analysis
## Load Dataset

In [14]:
df = pd.read_csv('data/exit_survey.csv', skiprows=[1,2], usecols=['Q73', 'Q74', 'Q75', 'Q76.1'])
df = df.fillna('')
df.head(2)
good_things = list(df['Q73'])
bad_things = list(df['Q74'])
slowed_things = list(df['Q75'])
rec_things = list(df['Q76.1'])

In [15]:
good_things[0]

'The type of research I was working on and the flexibility I had on selecting which problems to focus on.'

In [7]:
import spacy # import the package
nlp = spacy.load('en_core_web_trf') # load a language model
nlp

<spacy.lang.en.English at 0x7faf9f0cc580>

In [31]:
for text in good_things[:3]:
    doc = nlp(text)
    print(doc)
    print([t for t in doc])

The type of research I was working on and the flexibility I had on selecting which problems to focus on.
[The, type, of, research, I, was, working, on, and, the, flexibility, I, had, on, selecting, which, problems, to, focus, on, .]
All. Great advisor, atmosphere, social acitivities, research opportunities. It was a great time. 
[All, ., Great, advisor, ,, atmosphere, ,, social, acitivities, ,, research, opportunities, ., It, was, a, great, time, .]
Rich discussions and interactions with faculty members at the very top of their field.
[Rich, discussions, and, interactions, with, faculty, members, at, the, very, top, of, their, field, .]


In [32]:
good_docs = [nlp(text) for text in good_things]

In [34]:
good_docs[:2]

[The type of research I was working on and the flexibility I had on selecting which problems to focus on.,
 All. Great advisor, atmosphere, social acitivities, research opportunities. It was a great time. ]

## Create Token Sequences

Check out [Token](https://spacy.io/api/token) documentation.

In [55]:
for doc in good_docs[:3]:
    print([tok.lower_ for tok in doc])
    
    # filter stopwords, punctuation, and numbers, get lower-case
    print([tok.lower_ for tok in doc if not tok.is_stop and not tok.is_punct and not tok.like_num])
    print('-----------------')

['the', 'type', 'of', 'research', 'i', 'was', 'working', 'on', 'and', 'the', 'flexibility', 'i', 'had', 'on', 'selecting', 'which', 'problems', 'to', 'focus', 'on', '.']
['', '', '', '', '', '', '']
-----------------
['all', '.', 'great', 'advisor', ',', 'atmosphere', ',', 'social', 'acitivities', ',', 'research', 'opportunities', '.', 'it', 'was', 'a', 'great', 'time', '.']
['', '', '', '', '', '', '', '', '']
-----------------
['rich', 'discussions', 'and', 'interactions', 'with', 'faculty', 'members', 'at', 'the', 'very', 'top', 'of', 'their', 'field', '.']
['', '', '', '', '', '']
-----------------


In [54]:
good_toks = list()
for doc in good_docs:
    good_toks.append([tok.lower_ for tok in doc if not tok.like_num])
print(len(good_toks))
good_toks[0][:5]

3242


['the', 'type', 'of', 'research', 'i']

## Named Entity Recognition

In [61]:
all_ents = set()
for doc in good_docs:
    for tok in doc:
        if tok.ent_type_ != '' and not tok.like_num and not tok.is_punct:
            all_ents.add((tok.text, tok.ent_type_))
list(all_ents)[:3]

[('Literature', 'ORG'), ('Scholar', 'ORG'), ('Complex', 'ORG')]

# Text as Matrices

In [65]:
good_docs = [[tok.lower_ for tok in doc if not tok.like_num] for doc in nlp.pipe(good_things)]
bad_docs = [[tok.lower_ for tok in doc if not tok.like_num] for doc in nlp.pipe(bad_things)]
slowed_docs = [[tok.lower_ for tok in doc if not tok.like_num] for doc in nlp.pipe(slowed_things)]
rec_docs = [[tok.lower_ for tok in doc if not tok.like_num] for doc in nlp.pipe(rec_things)]

#good_docs = list(nlp.pipe(good_things))
#good_docs = list(nlp.pipe(good_things))
#good_docs = list(nlp.pipe(good_things))

#good_toks = list()
#for doc in good_docs:
#    good_toks.append([tok.lower_ for tok in a if not tok.like_num])
#print(len(good_toks))
#good_toks[0][:5]

In [64]:
#good_things = list(df['Q73'])
#bad_things = list(df['Q74'])
#slowed_things = list(df['Q75'])
#rec_things = list(df['Q76.1'])

good_docs = list(nlp.pipe(good_things))
good_docs = list(nlp.pipe(good_things))
good_docs = list(nlp.pipe(good_things))
good_docs = list(nlp.pipe(good_things))

[The type of research I was working on and the flexibility I had on selecting which problems to focus on.,
 All. Great advisor, atmosphere, social acitivities, research opportunities. It was a great time. ,
 Rich discussions and interactions with faculty members at the very top of their field.,
 The grant writing class was fantstic.  The student activities and seminars are well done.  I'm VERY glad they didn't make us take other courses since there were so few that were offered that were relevant to my research. ,
 Social aspects: no issues with other students, great opportunities to talk to others about science and non-science.,
 Faculty were always very kind and ready to provide advice and answer questions. The experience of working with the library system was wonderful. I met some good friends here.,
 ,
 Academic training,
 My program underwent significant changes during my time here. What pleased me the most has been seeing the department improve in all respects. I also found the o