In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('data/inaugural_speeches_clean.csv')

In [3]:
df[['text_clean', 'char_count', 'word_count', 'avg_word_length']].head()

Unnamed: 0,text_clean,char_count,word_count,avg_word_length
0,fellow citizens of the senate and of the house...,8503,1432,5.937849
1,fellow citizens i am again called upon by the ...,775,135,5.740741
2,when it was first perceived in early times tha...,13605,2323,5.856651
3,friends and fellow citizens called upon to und...,9931,1736,5.720622
4,proceeding fellow citizens to that qualificati...,12680,2169,5.846012


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# khởi tạo tfidf
tf_idf = TfidfVectorizer()

In [6]:
# phân rã văn bản
df_transform = tf_idf.fit_transform(df['text_clean']).toarray()
df_transform

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.01074176, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.01191863, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [7]:
print(tf_idf.get_feature_names())

['abandon', 'abandoned', 'abandonment', 'abate', 'abdicated', 'abeyance', 'abhorring', 'abide', 'abiding', 'abilities', 'ability', 'abject', 'able', 'ably', 'abnormal', 'abode', 'abolish', 'abolished', 'abolishing', 'aboriginal', 'aborigines', 'abound', 'abounding', 'abounds', 'about', 'above', 'abraham', 'abreast', 'abridging', 'abroad', 'absence', 'absent', 'absolute', 'absolutely', 'absolutism', 'absorb', 'absorbed', 'absorbing', 'absorbs', 'abstain', 'abstaining', 'abstract', 'abstractions', 'absurd', 'abundance', 'abundant', 'abundantly', 'abuse', 'abused', 'abuses', 'academies', 'accept', 'acceptance', 'accepted', 'accepting', 'accepts', 'access', 'accessible', 'accession', 'accident', 'accidental', 'accidents', 'acclaim', 'accommodation', 'accommodations', 'accompanied', 'accompany', 'accomplish', 'accomplished', 'accomplishing', 'accomplishment', 'accomplishments', 'accord', 'accordance', 'accorded', 'according', 'accordingly', 'accords', 'account', 'accountability', 'accountab

In [8]:
len(tf_idf.get_feature_names())

9043

In [10]:
# chuyển về DataFrame
df_transform = pd.DataFrame(df_transform, columns=tf_idf.get_feature_names()).add_prefix('tf_')
df_transform.head()

Unnamed: 0,tf_abandon,tf_abandoned,tf_abandonment,tf_abate,tf_abdicated,tf_abeyance,tf_abhorring,tf_abide,tf_abiding,tf_abilities,...,tf_your,tf_yours,tf_yourself,tf_yourselves,tf_youth,tf_youthful,tf_zeal,tf_zealous,tf_zealously,tf_zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.076056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.059043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.010742,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.005269,0.0,0.0,0.0,0.0,0.0,0.009416,0.0,0.0,0.0
3,0.013103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.049308,0.0,0.0,0.0,0.0,0.0,0.012588,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.024301,0.0,0.0,0.0,0.0,0.0,0.03257,0.0,0.0,0.0


In [11]:
df_new = pd.concat([df, df_transform], axis=1)
df_new.drop(['text','text_clean'], axis=1, inplace=True)
df_new.head()

Unnamed: 0,Name,Inaugural Address,Date,char_count,word_count,avg_word_length,tf_abandon,tf_abandoned,tf_abandonment,tf_abate,...,tf_your,tf_yours,tf_yourself,tf_yourselves,tf_youth,tf_youthful,tf_zeal,tf_zealous,tf_zealously,tf_zone
0,George Washington,First Inaugural Address,"Thursday, April 30, 1789",8503,1432,5.937849,0.0,0.0,0.0,0.0,...,0.076056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,George Washington,Second Inaugural Address,"Monday, March 4, 1793",775,135,5.740741,0.0,0.0,0.0,0.0,...,0.059043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,John Adams,Inaugural Address,"Saturday, March 4, 1797",13605,2323,5.856651,0.0,0.010742,0.0,0.0,...,0.005269,0.0,0.0,0.0,0.0,0.0,0.009416,0.0,0.0,0.0
3,Thomas Jefferson,First Inaugural Address,"Wednesday, March 4, 1801",9931,1736,5.720622,0.013103,0.0,0.0,0.0,...,0.049308,0.0,0.0,0.0,0.0,0.0,0.012588,0.0,0.0,0.0
4,Thomas Jefferson,Second Inaugural Address,"Monday, March 4, 1805",12680,2169,5.846012,0.0,0.0,0.0,0.0,...,0.024301,0.0,0.0,0.0,0.0,0.0,0.03257,0.0,0.0,0.0


## bổ sung min_df và max_df

In [12]:
# khởi tạo tfidf
tf_idf2 = TfidfVectorizer(min_df=0.2, max_df=0.8)

In [13]:
# phân rã văn bản
df_transform2 = tf_idf2.fit_transform(df['text_clean']).toarray()
df_transform2

array([[0.        , 0.        , 0.        , ..., 0.15118555, 0.        ,
        0.31020755],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.14233171],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.02426728],
       ...,
       [0.        , 0.0294054 , 0.        , ..., 0.26614324, 0.03017387,
        0.06500971],
       [0.        , 0.        , 0.        , ..., 0.10314077, 0.03274191,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.28303419, 0.03208887,
        0.25349718]])

In [14]:
len(tf_idf2.get_feature_names())

818

In [15]:
# chuyển về DataFrame
df_transform2 = pd.DataFrame(df_transform2, columns=tf_idf2.get_feature_names()).add_prefix('tf_')
df_transform2.head()

Unnamed: 0,tf_abiding,tf_ability,tf_able,tf_about,tf_above,tf_abroad,tf_accept,tf_accomplished,tf_achieve,tf_across,...,tf_women,tf_words,tf_work,tf_wrong,tf_year,tf_years,tf_yet,tf_you,tf_young,tf_your
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05067,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.026707,0.0,0.151186,0.0,0.310208
1,0.0,0.0,0.0,0.139614,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142332
2,0.0,0.0,0.0,0.0,0.0,0.032116,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.058479,0.05641,0.0,0.0,0.0,0.024267
3,0.0,0.0,0.0,0.026902,0.036296,0.036296,0.0,0.0,0.0,0.0,...,0.0,0.0,0.022799,0.069187,0.0,0.0,0.051802,0.168419,0.0,0.191982
4,0.0,0.0,0.035184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.057674,0.037089,0.045205,0.083983,0.0,0.095733


In [16]:
df_new2 = pd.concat([df, df_transform2], axis=1)
df_new2.drop(['text','text_clean'], axis=1, inplace=True)
df_new2.head()

Unnamed: 0,Name,Inaugural Address,Date,char_count,word_count,avg_word_length,tf_abiding,tf_ability,tf_able,tf_about,...,tf_women,tf_words,tf_work,tf_wrong,tf_year,tf_years,tf_yet,tf_you,tf_young,tf_your
0,George Washington,First Inaugural Address,"Thursday, April 30, 1789",8503,1432,5.937849,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.026707,0.0,0.151186,0.0,0.310208
1,George Washington,Second Inaugural Address,"Monday, March 4, 1793",775,135,5.740741,0.0,0.0,0.0,0.139614,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142332
2,John Adams,Inaugural Address,"Saturday, March 4, 1797",13605,2323,5.856651,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.058479,0.05641,0.0,0.0,0.0,0.024267
3,Thomas Jefferson,First Inaugural Address,"Wednesday, March 4, 1801",9931,1736,5.720622,0.0,0.0,0.0,0.026902,...,0.0,0.0,0.022799,0.069187,0.0,0.0,0.051802,0.168419,0.0,0.191982
4,Thomas Jefferson,Second Inaugural Address,"Monday, March 4, 1805",12680,2169,5.846012,0.0,0.0,0.035184,0.0,...,0.0,0.0,0.0,0.0,0.057674,0.037089,0.045205,0.083983,0.0,0.095733


In [17]:
df_transform2.iloc[0].sort_values(ascending=False)

tf_your        0.310208
tf_me          0.213656
tf_myself      0.178073
tf_ought       0.169920
tf_public      0.168895
                 ...   
tf_officers    0.000000
tf_office      0.000000
tf_offer       0.000000
tf_off         0.000000
tf_abiding     0.000000
Name: 0, Length: 818, dtype: float64

In [18]:
df_transform2.sum().sort_values(ascending=False)

tf_should         4.998478
tf_america        4.719730
tf_states         4.509869
tf_peace          4.151948
tf_public         4.055186
                    ...   
tf_recognize      0.377615
tf_stronger       0.376334
tf_disposition    0.375929
tf_enjoyed        0.370392
tf_declare        0.364118
Length: 818, dtype: float64