In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import sent_tokenize

Reading the text `churchill.txt`

In [4]:
# Read the whole speech and lower-case it so that later word counts are
# case-insensitive. The context manager closes the file handle, which the
# bare open() call never did.
with open('churchill.txt') as f:
    doc = f.read().lower()

In [5]:
# Manual clean-up of the text, sentence by sentence: drop English stop words,
# then drop tokens that are not purely alphanumeric (punctuation).
# `doc` was already lower-cased when it was read, so it is not lower-cased again.

# Build the stop-word set once. Calling stopwords.words('english') inside the
# comprehension re-evaluated it for every single token, and membership tests
# on a list are linear while on a set they are constant time.
english_stop_words = set(stopwords.words('english'))

# Collect the cleaned tokens of every sentence; `filtered_words` alone only
# ever holds the tokens of the sentence processed last.
filtered_sentences = []
for sent in sent_tokenize(doc):
    # remove stop words
    filtered_words = [word for word in word_tokenize(sent) if word not in english_stop_words]

    # remove punctuation
    filtered_words = [word for word in filtered_words if word.isalnum()]

    filtered_sentences.append(filtered_words)

Using the count vectorizer

In [7]:
# Bag-of-words over the whole speech, passed in as a single document.
# The vectorizer removes English stop words itself.
count_vec = CountVectorizer(
    stop_words='english',
)
count_occurs = count_vec.fit_transform([doc])

# Dense count array plus the matching vocabulary, in column order.
counts = count_occurs.toarray()
words = np.asarray(count_vec.get_feature_names_out())


In [8]:
# Show the vocabulary extracted by the vectorizer (English stop words were
# removed via stop_words='english').
words

array(['aiding', 'air', 'apparatus', 'armed', 'beaches', 'believe',
       'british', 'carry', 'cause', 'comrades', 'confidence', 'cost',
       'death', 'defend', 'empire', 'end', 'europe', 'fail', 'fall',
       'fallen', 'famous', 'fields', 'fight', 'flag', 'fleet', 'forth',
       'france', 'french', 'gestapo', 'god', 'good', 'grip', 'grounds',
       'growing', 'guarded', 'hills', 'island', 'landing', 'large',
       'liberation', 'like', 'linked', 'moment', 'native', 'nazi', 'need',
       'new', 'oceans', 'odious', 'old', 'power', 'republic', 'rescue',
       'rule', 'seas', 'shall', 'soil', 'starving', 'states', 'steps',
       'streets', 'strength', 'struggle', 'subjugated', 'surrender',
       'time', 'tracts', 'utmost', 'world'], dtype=object)

In [9]:
# Word-frequency table: one row per vocabulary word, a single 'count' column,
# ordered from the most to the least frequent word.
df = pd.DataFrame(counts.T, index=words, columns=['count'])
df = df.sort_values(by='count', ascending=False)
df.head()

Unnamed: 0,count
shall,11
fight,7
island,2
old,2
growing,2


Using n-grams

In [11]:
# Same single-document setup as before, but ngram_range=(2,2) makes the
# vectorizer count word pairs (bigrams) instead of single words.
count_vec = CountVectorizer(
    stop_words='english',
    ngram_range=(2,2),
)
count_occurs = count_vec.fit_transform([doc])

# Dense count array plus the matching bigram vocabulary, in column order.
counts = count_occurs.toarray()
words = np.asarray(count_vec.get_feature_names_out())


In [12]:
# Bigram-frequency table: one row per bigram, a single 'count' column,
# ordered from the most to the least frequent bigram.
df = pd.DataFrame(counts.T, index=words, columns=['count'])
df = df.sort_values(by='count', ascending=False)
df.head()

Unnamed: 0,count
shall fight,7
aiding like,1
rescue liberation,1
power steps,1
old famous,1


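Reading the complete text sentence by sentence and organizing the word counts in a matrix (one row per sentence, one column per word)<br>
The `max_features` parameter controls the size of the matrix: only the that many most frequent words are kept as columns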

In [13]:
# Split the text into sentences so that each sentence can be treated as its
# own document. sent_tokenize already yields the sentences in order, so the
# result is wrapped directly instead of being copied element by element.
doc_split = np.array(sent_tokenize(doc))

In [14]:
# Vectorize the sentences individually; max_features=4 caps the vocabulary
# (and therefore the number of matrix columns) at four words.
count_vec = CountVectorizer(
    stop_words='english',
    max_features=4,
)
count_occurs = count_vec.fit_transform(doc_split)

# Dense sentence-by-word count matrix plus the matching vocabulary.
counts_matrix = count_occurs.toarray()
words = np.asarray(count_vec.get_feature_names_out())

In [15]:
# Sentence-by-word count table: rows labelled 'sent. 1', 'sent. 2', ...,
# columns labelled with the vocabulary words.
n_sentences = counts_matrix.shape[0]
sentence_labels = ['sent. ' + str(i) for i in range(1, n_sentences + 1)]
df_matrix = pd.DataFrame(counts_matrix, index=sentence_labels, columns=words)

# Append a final 'total' row holding the column sums over all sentences.
df_total = df_matrix.sum(axis=0).to_frame(name='total').T
df_matrix = pd.concat([df_matrix, df_total], axis=0)


In [16]:
# Reorder the word columns by their value in the 'total' row (axis=1 sorts
# columns rather than rows), highest total first. The result is only
# displayed; df_matrix itself is not modified.
df_matrix.sort_values(by='total',axis=1,ascending=False)

Unnamed: 0,shall,fight,island,large
sent. 1,0,0,0,0
sent. 2,1,0,0,1
sent. 3,5,3,1,0
sent. 4,5,4,1,1
total,11,7,2,2
