In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('data/inaugural_speeches.csv')

In [4]:
df.head()

Unnamed: 0,Name,Inaugural Address,Date,text
0,George Washington,First Inaugural Address,"Thursday, April 30, 1789",Fellow-Citizens of the Senate and of the House...
1,George Washington,Second Inaugural Address,"Monday, March 4, 1793",Fellow Citizens: I AM again called upon by th...
2,John Adams,Inaugural Address,"Saturday, March 4, 1797","WHEN it was first perceived, in early times, t..."
3,Thomas Jefferson,First Inaugural Address,"Wednesday, March 4, 1801",Friends and Fellow-Citizens: CALLED upon to u...
4,Thomas Jefferson,Second Inaugural Address,"Monday, March 4, 1805","PROCEEDING, fellow-citizens, to that qualifica..."


In [6]:
df['text'][0][:200]

'Fellow-Citizens of the Senate and of the House of Representatives:  AMONG the vicissitudes incident to life no event could have filled me with greater anxieties than that of which the notification was'

In [7]:
df['text_clean'] = df['text'].str.replace('[^a-zA-Z]', ' ')

In [9]:
df['text_clean'][0][:200]

'Fellow Citizens of the Senate and of the House of Representatives   AMONG the vicissitudes incident to life no event could have filled me with greater anxieties than that of which the notification was'

In [10]:
df['text_clean'] = df['text_clean'].str.lower()

In [11]:
df['text_clean'][0][:200]

'fellow citizens of the senate and of the house of representatives   among the vicissitudes incident to life no event could have filled me with greater anxieties than that of which the notification was'

In [12]:
# replace('[\s+]',' ')

In [13]:
df['char_count'] = df['text_clean'].str.len()
df['word_count'] = df['text_clean'].str.split().str.len()
df['avg_word_length'] = df['char_count'] / df['word_count']

In [14]:
df[['text_clean', 'char_count', 'word_count', 'avg_word_length']].head()

Unnamed: 0,text_clean,char_count,word_count,avg_word_length
0,fellow citizens of the senate and of the house...,8616,1432,6.01676
1,fellow citizens i am again called upon by th...,787,135,5.82963
2,when it was first perceived in early times t...,13871,2323,5.971158
3,friends and fellow citizens called upon to u...,10144,1736,5.843318
4,proceeding fellow citizens to that qualifica...,12902,2169,5.948363


## CountVectorizer

In [15]:
# sử dụng thư viện CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
# khởi tạo
cv = CountVectorizer()

In [17]:
# cv.fit(df['text_clean'])
# cv_transformed =  cv.transform(df['text_clean'])

cv_transformed = cv.fit_transform(df['text_clean'])

In [19]:
print(cv.get_feature_names())

['abandon', 'abandoned', 'abandonment', 'abate', 'abdicated', 'abeyance', 'abhorring', 'abide', 'abiding', 'abilities', 'ability', 'abject', 'able', 'ably', 'abnormal', 'abode', 'abolish', 'abolished', 'abolishing', 'aboriginal', 'aborigines', 'abound', 'abounding', 'abounds', 'about', 'above', 'abraham', 'abreast', 'abridging', 'abroad', 'absence', 'absent', 'absolute', 'absolutely', 'absolutism', 'absorb', 'absorbed', 'absorbing', 'absorbs', 'abstain', 'abstaining', 'abstract', 'abstractions', 'absurd', 'abundance', 'abundant', 'abundantly', 'abuse', 'abused', 'abuses', 'academies', 'accept', 'acceptance', 'accepted', 'accepting', 'accepts', 'access', 'accessible', 'accession', 'accident', 'accidental', 'accidents', 'acclaim', 'accommodation', 'accommodations', 'accompanied', 'accompany', 'accomplish', 'accomplished', 'accomplishing', 'accomplishment', 'accomplishments', 'accord', 'accordance', 'accorded', 'according', 'accordingly', 'accords', 'account', 'accountability', 'accountab

In [21]:
cv_transformed = cv_transformed.toarray()

In [22]:
cv_transformed

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [23]:
cv_transformed.shape

(58, 9043)

In [24]:
cv_df = pd.DataFrame(cv_transformed, columns=cv.get_feature_names()).add_prefix('cv_')
cv_df.head()

Unnamed: 0,cv_abandon,cv_abandoned,cv_abandonment,cv_abate,cv_abdicated,cv_abeyance,cv_abhorring,cv_abide,cv_abiding,cv_abilities,...,cv_your,cv_yours,cv_yourself,cv_yourselves,cv_youth,cv_youthful,cv_zeal,cv_zealous,cv_zealously,cv_zone
0,0,0,0,0,0,0,0,0,0,0,...,9,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,7,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,4,0,0,0,0,0,3,0,0,0


## CountVectorizer với min_df và max_df

In [26]:
cv2 = CountVectorizer(min_df=0.2, max_df=0.8)

In [27]:
cv_transformed2 = cv2.fit_transform(df['text_clean'])

In [28]:
cv_transformed2 = cv_transformed2.toarray()

In [29]:
cv_transformed2.shape

(58, 818)

In [30]:
print(cv2.get_feature_names())

['abiding', 'ability', 'able', 'about', 'above', 'abroad', 'accept', 'accomplished', 'achieve', 'across', 'act', 'action', 'acts', 'add', 'adequate', 'administration', 'adopted', 'advance', 'advantage', 'affairs', 'afford', 'after', 'again', 'against', 'age', 'ago', 'agriculture', 'aid', 'alike', 'almighty', 'almost', 'alone', 'along', 'already', 'also', 'always', 'am', 'america', 'american', 'americans', 'among', 'ancient', 'another', 'appear', 'armed', 'arms', 'around', 'ask', 'assume', 'attempt', 'attention', 'authority', 'avoid', 'away', 'back', 'balance', 'basis', 'bear', 'because', 'become', 'beginning', 'being', 'belief', 'believe', 'believed', 'belongs', 'benefit', 'benefits', 'best', 'better', 'between', 'beyond', 'birth', 'bless', 'blessings', 'blood', 'body', 'bonds', 'born', 'both', 'bound', 'branches', 'bring', 'brought', 'build', 'burden', 'burdens', 'business', 'call', 'called', 'came', 'cannot', 'capacity', 'capital', 'care', 'carry', 'cause', 'centuries', 'century', 'c

In [31]:
cv_df2 = pd.DataFrame(cv_transformed2, columns=cv2.get_feature_names()).add_prefix('cv_')
cv_df2.head()

Unnamed: 0,cv_abiding,cv_ability,cv_able,cv_about,cv_above,cv_abroad,cv_accept,cv_accomplished,cv_achieve,cv_across,...,cv_women,cv_words,cv_work,cv_wrong,cv_year,cv_years,cv_yet,cv_you,cv_young,cv_your
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,5,0,9
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,2,3,0,0,0,1
3,0,0,0,1,1,1,0,0,0,0,...,0,0,1,2,0,0,2,7,0,7
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,2,2,2,4,0,4


In [32]:
df_new = pd.concat([df, cv_df2], axis=1)
df_new.shape

(58, 826)

## TF-IDF

In [33]:
#khai bao thư viện
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
tf_idf = TfidfVectorizer(max_features=200, stop_words='english')

In [37]:
f = open('data/vietnamese-stopwords.txt', 'r', encoding='utf-8')

In [38]:
contents = f.read()

In [40]:
my_stopwords = contents.split('\n')
my_stopwords

['a lô',
 'a ha',
 'ai',
 'ai ai',
 'ai nấy',
 'ai đó',
 'alô',
 'amen',
 'anh',
 'anh ấy',
 'ba',
 'ba ba',
 'ba bản',
 'ba cùng',
 'ba họ',
 'ba ngày',
 'ba ngôi',
 'ba tăng',
 'bao giờ',
 'bao lâu',
 'bao nhiêu',
 'bao nả',
 'bay biến',
 'biết',
 'biết bao',
 'biết bao nhiêu',
 'biết chắc',
 'biết chừng nào',
 'biết mình',
 'biết mấy',
 'biết thế',
 'biết trước',
 'biết việc',
 'biết đâu',
 'biết đâu chừng',
 'biết đâu đấy',
 'biết được',
 'buổi',
 'buổi làm',
 'buổi mới',
 'buổi ngày',
 'buổi sớm',
 'bà',
 'bà ấy',
 'bài',
 'bài bác',
 'bài bỏ',
 'bài cái',
 'bác',
 'bán',
 'bán cấp',
 'bán dạ',
 'bán thế',
 'bây bẩy',
 'bây chừ',
 'bây giờ',
 'bây nhiêu',
 'bèn',
 'béng',
 'bên',
 'bên bị',
 'bên có',
 'bên cạnh',
 'bông',
 'bước',
 'bước khỏi',
 'bước tới',
 'bước đi',
 'bạn',
 'bản',
 'bản bộ',
 'bản riêng',
 'bản thân',
 'bản ý',
 'bất chợt',
 'bất cứ',
 'bất giác',
 'bất kì',
 'bất kể',
 'bất kỳ',
 'bất luận',
 'bất ngờ',
 'bất nhược',
 'bất quá',
 'bất quá chỉ',
 'bất thình l

In [41]:
my_stopwords = my_stopwords.append('a há')

In [42]:
#tf_idf = TfidfVectorizer(max_features=200, stop_words=my_stopwords)

In [48]:
tf_idf = TfidfVectorizer(max_features=100, stop_words='english')

In [49]:
tf_transformed = tf_idf.fit_transform(df['text_clean'])

In [50]:
print(tf_idf.get_feature_names())

['action', 'administration', 'america', 'american', 'americans', 'believe', 'best', 'better', 'change', 'citizens', 'come', 'common', 'confidence', 'congress', 'constitution', 'country', 'day', 'duties', 'duty', 'equal', 'executive', 'faith', 'far', 'federal', 'fellow', 'force', 'foreign', 'free', 'freedom', 'future', 'general', 'god', 'good', 'government', 'great', 'high', 'history', 'home', 'hope', 'human', 'institutions', 'interests', 'just', 'justice', 'know', 'land', 'law', 'laws', 'let', 'liberty', 'life', 'long', 'make', 'man', 'means', 'men', 'nation', 'national', 'nations', 'necessary', 'need', 'new', 'office', 'old', 'order', 'party', 'peace', 'people', 'place', 'policy', 'political', 'power', 'powers', 'present', 'president', 'principles', 'progress', 'prosperity', 'public', 'purpose', 'right', 'rights', 'secure', 'service', 'shall', 'spirit', 'state', 'states', 'strength', 'support', 'things', 'time', 'today', 'union', 'united', 'war', 'way', 'work', 'world', 'years']


In [51]:
tf_transformed = tf_transformed.toarray()
tf_transformed

array([[0.        , 0.13341519, 0.        , ..., 0.        , 0.04592879,
        0.05269404],
       [0.        , 0.2610165 , 0.26609667, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.09243595, 0.1570584 , ..., 0.        , 0.06364299,
        0.07301753],
       ...,
       [0.03761807, 0.        , 0.36891392, ..., 0.18400376, 0.17440593,
        0.0285851 ],
       [0.03897977, 0.        , 0.30581431, ..., 0.12710956, 0.07745104,
        0.14809914],
       [0.06921321, 0.03329018, 0.67876212, ..., 0.02821227, 0.13752353,
        0.05259353]])

In [52]:
tf_df = pd.DataFrame(tf_transformed, columns=tf_idf.get_feature_names()).add_prefix('tf_')
tf_df.head()

Unnamed: 0,tf_action,tf_administration,tf_america,tf_american,tf_americans,tf_believe,tf_best,tf_better,tf_change,tf_citizens,...,tf_things,tf_time,tf_today,tf_union,tf_united,tf_war,tf_way,tf_work,tf_world,tf_years
0,0.0,0.133415,0.0,0.105388,0.0,0.0,0.0,0.0,0.0,0.229644,...,0.0,0.045929,0.0,0.136012,0.203593,0.0,0.060755,0.0,0.045929,0.052694
1,0.0,0.261016,0.266097,0.0,0.0,0.0,0.0,0.0,0.0,0.179712,...,0.0,0.0,0.0,0.0,0.199157,0.0,0.0,0.0,0.0,0.0
2,0.0,0.092436,0.157058,0.073018,0.0,0.0,0.026112,0.06046,0.0,0.106072,...,0.03203,0.021214,0.0,0.062823,0.070529,0.024339,0.0,0.0,0.063643,0.073018
3,0.0,0.092693,0.0,0.0,0.0,0.090942,0.117831,0.045471,0.053335,0.223369,...,0.048179,0.0,0.0,0.094497,0.0,0.03661,0.0,0.039277,0.095729,0.0
4,0.041334,0.039761,0.0,0.031408,0.0,0.0,0.067393,0.039011,0.091514,0.27376,...,0.082667,0.164256,0.0,0.121605,0.030338,0.094225,0.0,0.0,0.054752,0.062817


In [53]:
tf_df.iloc[0].sort_values(ascending=False).head(10)

tf_government    0.367430
tf_public        0.333237
tf_present       0.315182
tf_duty          0.238637
tf_citizens      0.229644
tf_country       0.229644
tf_united        0.203593
tf_far           0.178978
tf_people        0.174590
tf_good          0.147528
Name: 0, dtype: float64

In [54]:
df_new2 = pd.concat([df, tf_df], axis=1)
df_new2.shape

(58, 108)

## TF_IDF với ngram_range

In [55]:
tf_idf2 = TfidfVectorizer(max_features=100, stop_words='english', ngram_range=(2,2))

In [56]:
tf_transformed2 = tf_idf2.fit_transform(df['text_clean'])

In [57]:
print(tf_idf2.get_feature_names())

['administration government', 'almighty god', 'american people', 'beloved country', 'best ability', 'best interests', 'blessings liberty', 'body people', 'branch government', 'branches government', 'carry effect', 'chief justice', 'chief magistrate', 'children children', 'civil religious', 'civil service', 'civil war', 'common country', 'common good', 'constitution laws', 'constitution united', 'declaration independence', 'defend constitution', 'discharge duties', 'executive department', 'federal government', 'fellow americans', 'fellow citizens', 'foreign affairs', 'foreign nations', 'form government', 'free government', 'free men', 'free nations', 'free people', 'future generations', 'general government', 'god bless', 'good faith', 'good government', 'government people', 'government shall', 'government union', 'great nation', 'great people', 'half century', 'human dignity', 'human freedom', 'know america', 'lasting peace', 'law abiding', 'men women', 'mr chief', 'national government'

In [60]:
tf_transformed2 = tf_transformed2.toarray()

AttributeError: 'numpy.ndarray' object has no attribute 'toarray'

In [61]:
type(tf_transformed2)

numpy.ndarray

In [62]:
tf_df2 = pd.DataFrame(tf_transformed2, columns=tf_idf2.get_feature_names()).add_prefix('tf2_')
tf_df2.head()

Unnamed: 0,tf2_administration government,tf2_almighty god,tf2_american people,tf2_beloved country,tf2_best ability,tf2_best interests,tf2_blessings liberty,tf2_body people,tf2_branch government,tf2_branches government,...,tf2_thank god,tf2_time come,tf2_time history,tf2_time peace,tf2_union states,tf2_united nations,tf2_united states,tf2_vice president,tf2_world peace,tf2_years ago
0,0.0,0.0,0.295828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.222467,0.0,0.0,0.0
1,0.446613,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.267432,0.0,0.0,0.0,0.164397,0.164397,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.201113,0.0,0.0,0.0
3,0.0,0.0,0.0,0.251205,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.207089,0.0,0.0,0.092549,0.0,0.0,0.0


In [63]:
tf_df2.iloc[0].sort_values(ascending=False).head(10)

tf2_executive department    0.612882
tf2_free government         0.478238
tf2_fellow citizens         0.322115
tf2_american people         0.295828
tf2_public private          0.259983
tf2_united states           0.222467
tf2_form government         0.215236
tf2_people united           0.202439
tf2_years ago               0.000000
tf2_free people             0.000000
Name: 0, dtype: float64

In [64]:
tf_df2.sum().sort_values(ascending=False).head(10)

tf2_united states         11.193142
tf2_fellow citizens        8.648092
tf2_men women              4.695099
tf2_american people        4.413159
tf2_federal government     4.081120
tf2_years ago              4.007657
tf2_self government        3.607154
tf2_general government     3.119509
tf2_god bless              2.960714
tf2_vice president         2.957044
dtype: float64