In [1]:
import os
import pandas as pd
import numpy as np

from nltk.corpus import stopwords # Import the stop word list
pd.set_option('display.max_colwidth', None)

In [2]:
stop_words = stopwords.words('english')

<br>
Read the final split csv...

In [3]:
df = pd.read_csv('../Splitting/final-update/final-splits/final_splits.csv', index_col = 0)

In [4]:
df['year'] = df.index.str.split("_").str[0]

In [5]:
df.head()

Unnamed: 0_level_0,law_type,state,sentence,length,start_page,end_page,act,section,path,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1873-1874_0000,Act,SOUTH CAROLINA,"AN ACT TO REPEAT SECTION FOUR (4) OR AN ACT ENTITLED “AN ACT TO RELIEVE THE STATE OF SOUTH CAROLINA OF ALL LIABILITY FOR ITS GUARANTY OF THE BONPS OF THE BLUE RIPCE RAILROAP COMPANY, BY PROVIDING FOR THE SECURING AND DESTRUCTION OF THE SAME,” APPROVED MARCU 2, 1872; AND TO REPEAL SO MUCH OF SECTION 72, CHAPRER XII, TITLE III, OR THE GENERAL STATUTES, AS AUTHORIZES THE STATE AUDITOR TO GIVE NOTICE ANNUALLY TO EACU COUNTY AVUPITOR OF THE RATES PER CENTUM TO BE LEVIED FOR VARIOUS SRATE PURPPOSES.",503,31,31,1,0,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1873-1874/images.zip/00000031.jpg,1873-1874
1873-1874_0001,Act,SOUTH CAROLINA,"| Section 1. Be it enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That Section four (4) of an Act entitled “An Act to relieve the State of South Carolina of all liability for its guaranty of the bonds of the Blue Ridge Railroad Company, by providing for the securing and destruction of the same, approved March 2, 1872, providing for an annual tax of three mills on the dollar, for the redemption of the revenue bond scrip, be, and the same is hereby, repealed.",577,31,32,1,0,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1873-1874/images.zip/00000031.jpg,1873-1874
1873-1874_0002,Act,SOUTH CAROLINA,"Sec. 2. That so much of Section seventytwo (72), Chapter XII, Title III, as directs that the State Auditor shall, on or before November fifteenth, annually, give notice to each County Auditor of the rates per centum authorized by law to be levied for various State purposes, be, and the same is hereby, repealed, and the Comptroller General is hereby forbidden to levy any tax for any purpose whatever, unless expressly hereafter authorized so to do by statute. Approved October 22, 1873.",493,32,32,1,2,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1873-1874/images.zip/00000032.jpg,1873-1874
1873-1874_0003,Act,SOUTH CAROLINA,"AN ACT to Revive, RENEw AND AMEND AN ACT ENTITLED “An Acr to Incorporate THE Home InsurANcE ComPANY, OF CHARLESTON.” Be it enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That the Act of the General Assembly of the said State, entitled “ An Act to incorporate the Home Insurance Company, of Charleston,” be, and the same is hereby, revived, renewed and extended, from the passage hereof, with the following alterations and amendments, viz: Section 5, strike out the words “one year” and insert in lieu thereof the words “ two years.” Section 8, after the word “ respondentia,” insert the words “or other securities.” Section 14, strike out the words “one year” and insert in lieu thereof “ two years.”",817,32,32,2,0,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1873-1874/images.zip/00000032.jpg,1873-1874
1873-1874_0004,Act,SOUTH CAROLINA,"AN ACT To IncorPorRaTE THE REFORM Apotto Soclery, OF CHARLESTON, SouTH CAROLINA.",85,32,32,3,0,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1873-1874/images.zip/00000032.jpg,1873-1874


<br>

Make a dataframe that contains most frequent words (along with their counts) and the top tf-idf words (along with their scores) for each year.

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words)

# Fit the vectorizer on all sentences
tfidf_vectorizer.fit(df['sentence'])

# Transform the sentences into TF-IDF vectors
tfidf_matrix = tfidf_vectorizer.transform(df['sentence'])

# Define the number of top terms to retrieve
top_n = 5

# Create an empty list to store dictionaries for each year's data
data_to_concat = []

# Group the DataFrame by 'year'
grouped = df.groupby('year')

# Iterate through each group (year)
for year, group in grouped:
    print('Working on', year)
    
    # Concatenate all sentences for the current year into one document
    year_text = ' '.join(group['sentence'])
    
    # Transform the concatenated text into a TF-IDF vector
    year_tfidf_vector = tfidf_vectorizer.transform([year_text])
    
    # Get the TF-IDF scores for the year
    year_tfidf_scores = year_tfidf_vector.toarray()[0]
    
    # Get the indices of the top N terms for the year
    ## row.argsort() calculates the indices that would sort the elements in ascending order
    ## [-top_n:] takes the last top_n elements from the sorted indices array
    ## [::-1] reverses the order of the selected indices to make the indices in descending order.
    top_indices = year_tfidf_scores.argsort()[-top_n:][::-1]
    
    # Get the top terms and their TF-IDF scores
    top_terms = [tfidf_vectorizer.get_feature_names_out()[i] for i in top_indices]
    top_scores = [year_tfidf_scores[i] for i in top_indices]
    
    # Round top_scores to 3 decimal places
    top_scores = list(np.around(np.array(top_scores),3))

    # Get the top n frequent words
    words_lower = pd.Series(year_text.lower().split())
    top_frequent = dict(words_lower[~words_lower.isin(stop_words)].value_counts()[:top_n])
    
    # Create a dictionary for the current year's data
    year_data = {'year': year, 'top frequent': top_frequent, 'top tf-idf': dict(zip(top_terms, top_scores))}
    
    # Append the dictionary to the list
    data_to_concat.append(year_data)
    
# Concatenate all dictionaries into a single DataFrame
df_analysis = pd.DataFrame(data_to_concat)
df_analysis.set_index('year', inplace=True)

Working on 1873-1874
Working on 1892
Working on 1893
Working on 1894
Working on 1901
Working on 1918
Working on 1921
Working on 1928
Working on 1948
Working on 1956


In [7]:
df_analysis

Unnamed: 0_level_0,top frequent,top tf-idf
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1873-1874,"{'shall': 2705, 'said': 1549, 'may': 807, 'county': 741, 'act': 708}","{'shall': 0.334, 'said': 0.239, 'may': 0.177, 'sec': 0.174, 'state': 0.165}"
1892,"{'said': 2682, 'shall': 2486, 'act': 1071, 'county': 1041, 'state': 765}","{'said': 0.382, 'shall': 0.278, 'county': 0.191, 'act': 0.19, 'state': 0.163}"
1893,"{'shall': 2068, 'said': 1963, 'county': 1036, 'act': 875, 'state': 653}","{'said': 0.352, 'shall': 0.292, 'county': 0.224, 'act': 0.192, 'dollars': 0.181}"
1894,"{'shall': 3066, 'said': 2532, 'county': 1000, 'act': 919, 'may': 771}","{'said': 0.349, 'shall': 0.331, 'town': 0.18, 'sec': 0.177, 'county': 0.175}"
1901,"{'shall': 1741, 'said': 1474, 'county': 940, 'act': 715, 'section': 536}","{'said': 0.32, 'shall': 0.291, 'county': 0.246, 'dollars': 0.244, 'act': 0.188}"
1918,"{'shall': 2615, 'said': 2187, 'county': 1897, '00': 1887, 'act': 1144}","{'00': 0.41, 'county': 0.289, 'said': 0.257, 'shall': 0.239, 'item': 0.181}"
1921,"{'shall': 4414, 'said': 3949, 'county': 3823, 'act': 1754, 'school': 1194}","{'00': 0.352, 'county': 0.35, 'said': 0.287, 'shall': 0.248, 'act': 0.153}"
1928,"{'shall': 4690, 'county': 3820, 'said': 3347, 'act': 1699, 'school': 1419}","{'00': 0.491, 'county': 0.299, 'shall': 0.227, 'said': 0.21, 'school': 0.141}"
1948,"{'shall': 3523, 'said': 1928, 'section': 1910, 'county': 1188, 'act': 877}","{'shall': 0.332, 'section': 0.294, 'said': 0.237, 'county': 0.211, 'act': 0.14}"
1956,"{'shall': 3274, 'section': 2787, 'state': 1216, 'south': 1103, 'act': 1054}","{'section': 0.377, 'shall': 0.278, '00': 0.261, '1952': 0.203, 'state': 0.178}"


<br>