In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
speech_df = pd.read_csv('./inaugural_speeches.csv')

In [4]:
speech_df['text'].head()

0    Fellow-Citizens of the Senate and of the House...
1    Fellow Citizens:  I AM again called upon by th...
2    WHEN it was first perceived, in early times, t...
3    Friends and Fellow-Citizens:  CALLED upon to u...
4    PROCEEDING, fellow-citizens, to that qualifica...
Name: text, dtype: object

In [5]:
# Replace all non letter characters with a whitespace
speech_df['text_clean'] = speech_df['text'].str.replace('[^a-zA-Z]', ' ')

# Change to lower case
speech_df['text_clean'] = speech_df['text_clean'].str.lower()

# Print the first 5 rows of the text_clean column
print(speech_df['text_clean'].head())

0    fellow-citizens of the senate and of the house...
1    fellow citizens:  i am again called upon by th...
2    when it was first perceived, in early times, t...
3    friends and fellow-citizens:  called upon to u...
4    proceeding, fellow-citizens, to that qualifica...
Name: text_clean, dtype: object


In [8]:

# Find the length of each text
speech_df['char_cnt'] = speech_df['text_clean'].str.len()

# Count the number of words in each text
speech_df['word_cnt'] = speech_df['text_clean'].str.split().str.len()

# Find the average length of word
speech_df['avg_word_length'] = speech_df['char_cnt'] / speech_df['word_cnt']

# Print the first 5 rows of these columns
(speech_df[['text_clean', 'char_cnt', 'word_cnt', 'avg_word_length']].head())


Unnamed: 0,text_clean,char_cnt,word_cnt,avg_word_length
0,fellow-citizens of the senate and of the house...,8616,1427,6.037842
1,fellow citizens: i am again called upon by th...,787,135,5.82963
2,"when it was first perceived, in early times, t...",13871,2317,5.986621
3,friends and fellow-citizens: called upon to u...,10144,1717,5.907979
4,"proceeding, fellow-citizens, to that qualifica...",12902,2157,5.981456


In [28]:
# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate CountVectorizer
cv = CountVectorizer()

# Fit the vectorizer
cv.fit(speech_df['text_clean'])

# Print feature names
print(cv.get_feature_names_out()[:23]) # nethod no more available 

['0085' '0092' '0093' '0094' '0097' 'abandon' 'abandoned' 'abandonment'
 'abate' 'abdicated' 'abeyance' 'abhorring' 'abide' 'abiding' 'abilities'
 'ability' 'abject' 'able' 'ably' 'abnormal' 'abode' 'abolish' 'abolished']


In [15]:
# Import CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer

# Instantiate CountVectorizer
cv = CountVectorizer()

# Fit and transform the vectorizer
cv_transformed = cv.fit_transform(speech_df['text_clean'])

# Print feature names
print(cv.vocabulary_)


{'fellow': 3257, 'citizens': 1297, 'of': 5508, 'the': 8060, 'senate': 7203, 'and': 381, 'house': 3959, 'representatives': 6765, 'among': 370, 'vicissitudes': 8700, 'incident': 4161, 'to': 8164, 'life': 4799, 'no': 5391, 'event': 2942, 'could': 1824, 'have': 3812, 'filled': 3284, 'me': 5045, 'with': 8944, 'greater': 3683, 'anxieties': 428, 'than': 8055, 'that': 8059, 'which': 8881, 'notification': 5420, 'was': 8806, 'transmitted': 8254, 'by': 1076, 'your': 9038, 'order': 5592, 'received': 6502, 'on': 5541, 'th': 8054, 'day': 1970, 'present': 6109, 'month': 5230, 'one': 5543, 'hand': 3754, 'summoned': 7827, 'my': 5298, 'country': 1839, 'whose': 8898, 'voice': 8750, 'can': 1105, 'never': 5378, 'hear': 3833, 'but': 1074, 'veneration': 8682, 'love': 4891, 'from': 3514, 'retreat': 6886, 'had': 3742, 'chosen': 1275, 'fondest': 3368, 'predilection': 6075, 'in': 4138, 'flattering': 3330, 'hopes': 3943, 'an': 376, 'immutable': 4056, 'decision': 2011, 'as': 568, 'asylum': 639, 'declining': 2022, 

In [19]:
#The vectorizer to you fit in the last exercise (cv) is available
#in your workspace.

# Apply the vectorizer
cv_transformed = cv.transform(speech_df['text_clean'])

# Print the full array
cv_array = cv_transformed.toarray()
print(cv_array)
cv_array.shape

[[ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 ...
 [ 3 12  1 ...  0  0  0]
 [ 0 12  1 ...  0  0  0]
 [ 0 10  0 ...  0  0  0]]


(58, 9048)

In [20]:
# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Specify arguments to limit the number of features generated
cv = CountVectorizer(min_df=0.2, max_df=0.8)

# Fit, transform, and convert into array
cv_transformed = cv.fit_transform(speech_df['text_clean'])
cv_array = cv_transformed.toarray()

# Print the array shape
print(cv_array.shape)

(58, 818)


In [23]:
#The numpy array (cv_array) and the vectorizer (cv) you fit in the

#last exercise are available in your workspace.

# Create a DataFrame with these features
cv_df = pd.DataFrame(cv_array,
                    columns=cv.get_feature_names_out()).add_prefix('Counts_')

# Add the new columns to the original DataFrame
speech_df_new = pd.concat([speech_df, cv_df], axis=1, sort=False)
(speech_df_new.head())

Unnamed: 0,Name,Inaugural Address,Date,text,text_clean,char_cnt,word_cnt,avg_word_length,Counts_abiding,Counts_ability,...,Counts_women,Counts_words,Counts_work,Counts_wrong,Counts_year,Counts_years,Counts_yet,Counts_you,Counts_young,Counts_your
0,George Washington,First Inaugural Address,"Thursday, April 30, 1789",Fellow-Citizens of the Senate and of the House...,fellow-citizens of the senate and of the house...,8616,1427,6.037842,0,0,...,0,0,0,0,0,1,0,5,0,9
1,George Washington,Second Inaugural Address,"Monday, March 4, 1793",Fellow Citizens: I AM again called upon by th...,fellow citizens: i am again called upon by th...,787,135,5.82963,0,0,...,0,0,0,0,0,0,0,0,0,1
2,John Adams,Inaugural Address,"Saturday, March 4, 1797","WHEN it was first perceived, in early times, t...","when it was first perceived, in early times, t...",13871,2317,5.986621,0,0,...,0,0,0,0,2,3,0,0,0,1
3,Thomas Jefferson,First Inaugural Address,"Wednesday, March 4, 1801",Friends and Fellow-Citizens: CALLED upon to u...,friends and fellow-citizens: called upon to u...,10144,1717,5.907979,0,0,...,0,0,1,2,0,0,2,7,0,7
4,Thomas Jefferson,Second Inaugural Address,"Monday, March 4, 1805","PROCEEDING, fellow-citizens, to that qualifica...","proceeding, fellow-citizens, to that qualifica...",12902,2157,5.981456,0,0,...,0,0,0,0,2,2,2,4,0,4


In [32]:
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate TfidfVectorizer
tv = TfidfVectorizer(max_features=100, stop_words='english')

# Fit the vectorizer and transform the data
tv_transformed = tv.fit_transform(speech_df['text_clean'])

# Create a DataFrame with these features
tv_df = pd.DataFrame(tv_transformed.toarray(),
                    columns=tv.get_feature_names_out()).add_prefix('TFIDF_')
(tv_df.head())

Unnamed: 0,TFIDF_0092,TFIDF_0097,TFIDF_action,TFIDF_administration,TFIDF_america,TFIDF_american,TFIDF_americans,TFIDF_believe,TFIDF_best,TFIDF_better,...,TFIDF_things,TFIDF_time,TFIDF_today,TFIDF_union,TFIDF_united,TFIDF_war,TFIDF_way,TFIDF_work,TFIDF_world,TFIDF_years
0,0.0,0.047468,0.0,0.133265,0.0,0.105269,0.0,0.0,0.0,0.0,...,0.0,0.045877,0.0,0.135859,0.203364,0.0,0.060687,0.0,0.045877,0.052635
1,0.0,0.0,0.0,0.261016,0.266097,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.199157,0.0,0.0,0.0,0.0,0.0
2,0.0,0.021955,0.0,0.092456,0.157092,0.073033,0.0,0.0,0.026118,0.060473,...,0.032037,0.021219,0.0,0.062837,0.070544,0.024344,0.0,0.0,0.063657,0.073033
3,0.0,0.131111,0.0,0.092023,0.0,0.0,0.0,0.090286,0.11698,0.045143,...,0.047831,0.0,0.0,0.093814,0.0,0.036346,0.0,0.038993,0.095038,0.0
4,0.0,0.028455,0.041523,0.039943,0.0,0.031552,0.0,0.0,0.067701,0.039189,...,0.083046,0.165008,0.0,0.122162,0.030477,0.094657,0.0,0.0,0.055003,0.063105


In [33]:
#The DataFrame from the last exercise (tv_df) is available in your
#workspace.

# Isolate the row to be examined
sample_row = tv_df.iloc[0]

# Print the top 5 words of the sorted output
print(sample_row.sort_values(ascending=False).head())


TFIDF_government    0.367016
TFIDF_public        0.332862
TFIDF_present       0.314827
TFIDF_duty          0.238368
TFIDF_country       0.229385
Name: 0, dtype: float64


In [37]:
#train_speech_df: The training set consisting of the first 45 speeches.
#test_speech_df: The test set consisting of the remaining speeches.
from sklearn.model_selection import train_test_split
train_speech_df , test_speech_df = train_test_split(speech_df, test_size=0.2, random_state=42) # 20% test set
# Instantiate TfidfVectorizer
tv = TfidfVectorizer(max_features=100, stop_words='english')

# Fit the vectorizer and transform the data
tv_transformed = tv.fit_transform(train_speech_df['text_clean'])

# Transform test data
test_tv_transformed = tv.transform(test_speech_df['text_clean'])

# Create new features for the test set
test_tv_df = pd.DataFrame(test_tv_transformed.toarray(),
                         columns=tv.get_feature_names_out()).add_prefix('TFIDF_')
(test_tv_df.head())


Unnamed: 0,TFIDF_0092,TFIDF_0097,TFIDF_administration,TFIDF_america,TFIDF_american,TFIDF_americans,TFIDF_believe,TFIDF_best,TFIDF_business,TFIDF_citizens,...,TFIDF_things,TFIDF_time,TFIDF_today,TFIDF_union,TFIDF_united,TFIDF_war,TFIDF_way,TFIDF_work,TFIDF_world,TFIDF_years
0,0.0,0.04784,0.130846,0.0,0.106628,0.0,0.0,0.0,0.0,0.234128,...,0.0,0.045835,0.0,0.140761,0.191359,0.0,0.060988,0.0,0.046826,0.053314
1,0.0,0.110395,0.0,0.0,0.0,0.0,0.0,0.192868,0.0,0.054027,...,0.0,0.052884,0.0,0.162409,0.165592,0.062881,0.0,0.0,0.108054,0.0
2,0.0,0.0,0.071805,0.200961,0.058515,0.0,0.023375,0.081541,0.065973,0.034262,...,0.025749,0.050306,0.0,0.0,0.035004,0.059816,0.044625,0.042637,0.222705,0.03901
3,0.0,0.143926,0.028118,0.0,0.068741,0.0,0.04119,0.035921,0.0,0.100625,...,0.030248,0.019699,0.0,0.090745,0.092524,0.023423,0.013106,0.012522,0.010062,0.045827
4,0.0,0.392213,0.0,0.120627,0.187325,0.048306,0.149662,0.065259,0.0,0.027421,...,0.0,0.107365,0.08672,0.370936,0.0,0.031915,0.142859,0.102372,0.191949,0.093663


In [40]:
#bigrams: Sequences of two consecutive words
#trigrams: Sequences of two consecutive words

#These can be automatically created in your dataset by specifying
#the ngram_range argument as a tuple (n1, n2) where all n-grams in
#the n1 to n2 range are included.

# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate a trigram vectorizer
cv_trigram_vec = CountVectorizer(max_features=100,
                                stop_words='english',
                                ngram_range = (3,3))

# Fit and apply trigram vectorizer
cv_trigram = cv_trigram_vec.fit_transform(speech_df['text_clean'])

# Print the trigram features
print(cv_trigram_vec.get_feature_names_out()[:20])

['0092 ideal freedom' 'ability preserve protect'
 'agriculture commerce manufactures' 'america 0092 ideal'
 'best ability preserve' 'best interests country' 'bless god bless'
 'bless united states' 'chief justice mr' 'children children children'
 'citizens united states' 'civil religious liberty'
 'commerce united states' 'concern thank god' 'confidence fellow citizens'
 'congress extraordinary session' 'constitution does expressly'
 'constitution united states' 'coordinate branches government'
 'day task people']


In [42]:

# Create a DataFrame of the features
cv_tri_df = pd.DataFrame(cv_trigram.toarray(),
                        columns=cv_trigram_vec.get_feature_names_out()).add_prefix('Counts_')

# Print the top 5 words in the sorted output
print(cv_tri_df.sum().sort_values(ascending=False).head())


Counts_constitution united states    20
Counts_people united states          13
Counts_mr chief justice              10
Counts_preserve protect defend       10
Counts_president united states        8
dtype: int64
