In [12]:
#setup
import warnings; warnings.simplefilter('ignore')
%matplotlib notebook
# set this to your working directory
WORKING_DIR = '/home/elliott/Dropbox/_Ash_Teaching/2018-09 - Bocconi - Text Data and ML/code'
import os
os.chdir(WORKING_DIR)
import pandas as pd
df1 = pd.read_csv('death-penalty-cases.csv')

text = "Prof. Milano hailed from Milano. She got 3 M.A.'s from Bocconi."

In [13]:
# --- Splitting into sentences (NLTK) ---
# The tokenizer is asked for English explicitly; this is also its default.
# The printed result shows that it breaks "M.A.'s" apart.

from nltk import sent_tokenize
sentences = sent_tokenize(text, language='english')
print(sentences)

['Prof. Milano hailed from Milano.', 'She got 3 M.A.', "'s from Bocconi."]


In [14]:
# Splitting into sentences (spaCy)
import spacy
# Load the English pipeline by its package name. The bare 'en' shortcut only
# resolves through a shortcut link, which newer spaCy releases no longer support.
# Requires the model to be installed: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)
# doc.sents is a generator of sentence spans; materialise it so it can be
# printed and measured with len() later on
sentences = list(doc.sents)
print(sentences)

[Prof. Milano hailed from Milano., She got 3 M.A.'s from Bocconi.]


In [15]:
# Capitalization
# fold the whole document to lower-case so that case variants of a word match
text_lower = str.lower(text)

In [16]:
# --- Punctuation ---

# Fast punctuation removal: build a translation table that maps every
# character in string.punctuation to None (i.e. "delete"), then apply it
# to the lower-cased text in a single pass.
from string import punctuation
translator = str.maketrans(dict.fromkeys(punctuation))
text_nopunc = text_lower.translate(translator)
print(text_nopunc)

prof milano hailed from milano she got 3 mas from bocconi


In [17]:
# Tokens
# sep=None splits on any run of white space and drops empty strings
tokens = text_nopunc.split(sep=None)
print(tokens)

['prof', 'milano', 'hailed', 'from', 'milano', 'she', 'got', '3', 'mas', 'from', 'bocconi']


In [18]:
# Numbers
# option 1: drop every token that consists only of digits
no_numbers = list(filter(lambda tok: not tok.isdigit(), tokens))
# option 2: keep the position but replace all-digit tokens with the "#" placeholder
norm_numbers = ['#' if tok.isdigit() else tok for tok in tokens]
print(no_numbers)
print(norm_numbers)

['prof', 'milano', 'hailed', 'from', 'milano', 'she', 'got', 'mas', 'from', 'bocconi']
['prof', 'milano', 'hailed', 'from', 'milano', 'she', 'got', '#', 'mas', 'from', 'bocconi']


In [19]:
# Stopwords
from nltk.corpus import stopwords
stoplist = stopwords.words('english')
# Membership tests against a list scan it element by element for every token;
# a frozenset gives constant-time lookups. `stoplist` itself stays a list.
stopword_set = frozenset(stoplist)
# keep if not a stopword
nostop = [t for t in norm_numbers if t not in stopword_set]
print(nostop)

['prof', 'milano', 'hailed', 'milano', 'got', '#', 'mas', 'bocconi']


In [20]:
# Stemming
from nltk.stem import SnowballStemmer
# one-off demonstration of the German snowball stemmer
print(SnowballStemmer('german').stem("Autobahnen"))
# English snowball stemmer, kept in `stemmer`
stemmer = SnowballStemmer('english')
# rebuild the token list with every token replaced by its stem
tokens_stemmed = list(map(stemmer.stem, tokens))
print(tokens_stemmed)

autobahn
['prof', 'milano', 'hail', 'from', 'milano', 'she', 'got', '3', 'mas', 'from', 'bocconi']


In [21]:
# Corpus statistics
# df1 was already read from 'death-penalty-cases.csv' in the setup cell, so it
# is reused here rather than parsing the file a second time.
docs = df1['snippet']

# NOTE: the figures below are computed from `sentences` and `tokens`, i.e. from
# the example `text`, not from `docs`.
n_sentences = len(sentences)
n_words = len(tokens)
print(n_sentences,'sentences in corpus.')
print(n_words,'words in corpus.')
# avoid a ZeroDivisionError when no sentences were found
words_per_sent = n_words / n_sentences if n_sentences else 0.0
print(words_per_sent,'words per sentence.')

2 sentences in corpus.
11 words in corpus.
5.5 words per sentence.


In [22]:
# Bag of words representation
from collections import Counter
# token -> number of occurrences
freqs = Counter(tokens)
# The 20 most frequent tokens. Passing the limit to most_common() avoids
# sorting the full vocabulary only to slice off the head.
freqs.most_common(20)

[('milano', 2),
 ('from', 2),
 ('prof', 1),
 ('hailed', 1),
 ('she', 1),
 ('got', 1),
 ('3', 1),
 ('mas', 1),
 ('bocconi', 1)]

In [1]:
from nltk.stem import WordNetLemmatizer

In [2]:
# one lemmatizer instance, reused by the lemmatize() calls in the following cells
wnl = WordNetLemmatizer()

In [3]:
# singular input: the displayed result is the same word, unchanged
wnl.lemmatize('corporation')

'corporation'

In [4]:
# plural input: the displayed result is the singular form 'corporation'
wnl.lemmatize('corporations')

'corporation'