In [7]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from collections import defaultdict
from collections import Counter
import re

# Load the two input tables; both files are decoded as ISO-8859-1.
CSV_ENCODING = "ISO-8859-1"
legislation_data = pd.read_csv("legislation_data.csv", encoding=CSV_ENCODING)
q2 = pd.read_csv("q2_work_together.csv", encoding=CSV_ENCODING)

# Preview the first rows of each table.
for preview_frame in (legislation_data, q2):
    print(preview_frame.head())

                                           file_path bill_number  \
0  /Users/johanchua/Desktop/Stats 141XP Desktop/d...      H.4147   
1  /Users/johanchua/Desktop/Stats 141XP Desktop/d...      S.1263   
2  /Users/johanchua/Desktop/Stats 141XP Desktop/d...      H.3682   
3  /Users/johanchua/Desktop/Stats 141XP Desktop/d...     SD.3464   
4  /Users/johanchua/Desktop/Stats 141XP Desktop/d...      H.2990   

                                          bill_title  \
0  An Act providing technical education and econo...   
1                  An Act relative to recovery homes   
2               An Act amending the Medal of Liberty   
3      DVS Veteran Advisory Committee Interim Report   
4        An Act to encourage Massachusetts residency   

                                     bill_url           sponsor_name  \
0   https://malegislature.gov/Bills/192/H4147  Harrington, Sheila C.   
1   https://malegislature.gov/Bills/192/S1263          Collins, Nick   
2   https://malegislature.gov/Bills/19

In [9]:
# Keep only the columns used by the analysis and drop bills that have no
# sponsor or no cosponsor list.
df = legislation_data[['bill_number', 'bill_title', "sponsor_name", "cosponsors", "num_cosponsors", "history"]]
df = df.dropna(subset=['sponsor_name', 'cosponsors'])
# Fold generational suffixes into the preceding name ("Jones, Jr." -> "Jones Jr.")
# so the comma is not later mistaken for a name separator.
# The patterns are regular expressions, so the literal dot in "Jr." must be
# escaped; unescaped it matches any character (e.g. ", Jr," would be rewritten).
# ", III" is handled before ", II" so the shorter pattern cannot pre-empt it.
df = df.replace({r", Jr\.": " Jr."}, regex=True)
df = df.replace({", III": " III"}, regex=True)
df = df.replace({", II": " II"}, regex=True)

In [11]:
# Unify sponsor_name format from "Last, First Middle" to "First Middle Last"
def fix_sponsor_name(name):
    """Reorder a "Last, First Middle" name as "First Middle Last".

    The name is split on commas. Only when that yields exactly two pieces are
    they stripped and swapped; any other input is returned unchanged.
    """
    pieces = name.split(",")
    if len(pieces) != 2:
        return name
    surname, given = (piece.strip() for piece in pieces)
    return f"{given} {surname}"

# Normalise every sponsor name in place, one value at a time.
df["sponsor_name"] = df["sponsor_name"].map(fix_sponsor_name)

# Remove stop words in bill_title
def extract_keywords(title):
    """Return the keywords of a bill title, in order of appearance.

    The title is converted to a string and lower-cased, then split into runs
    of ASCII letters. A word is kept only if it is longer than three
    characters and is not one of the stop words listed below.
    """
    ignored = {"relative", "authorizing", "authority", "from", "commission", "massachusetts",
               "establishing", "certain", "providing", "with"}
    kept = []
    for token in re.findall(r'\b[a-zA-Z]+\b', str(title).lower()):
        if len(token) <= 3 or token in ignored:
            continue
        kept.append(token)
    return kept

# Split names of cosponsors
def parse_cosponsors(cosponsors):
    """Split a comma-separated cosponsor string into a list of names.

    Parameters
    ----------
    cosponsors : str or missing value
        Comma-separated names; a missing value (NaN/None) is accepted.

    Returns
    -------
    list of str
        The names with surrounding whitespace removed, in their original
        order. Missing input yields an empty list. Empty fragments (from a
        blank string, trailing commas or doubled commas) are dropped so they
        never show up as a legislator named "".
    """
    if pd.isna(cosponsors):  # missing value -> no cosponsors
        return []
    stripped = (piece.strip() for piece in cosponsors.split(","))
    return [name for name in stripped if name]

# Text analysis for each legislator:
# tally the bill-title keywords of every bill a legislator sponsored or cosponsored.
sponsor_keywords = {}

for sponsor, cosponsor_field, title in zip(df["sponsor_name"], df["cosponsors"], df["bill_title"]):
    keywords = extract_keywords(title)

    # dict.fromkeys de-duplicates while keeping first-seen order, so a legislator
    # listed as both sponsor and cosponsor (or listed twice) is credited with
    # this bill's keywords only once.
    participants = dict.fromkeys([sponsor, *parse_cosponsors(cosponsor_field)])

    for legislator in participants:
        if not legislator:
            # Skip blank names produced by stray commas in the cosponsor field.
            continue
        sponsor_keywords.setdefault(legislator, Counter()).update(keywords)

# Build one row per legislator: their name followed by their ten most
# frequent keywords (keyword_1 is the most frequent).
top_keyword_rows = []
for legislator_name, keyword_counts in sponsor_keywords.items():
    top_row = {"sponsor_name": legislator_name}
    for rank, (word, _count) in enumerate(keyword_counts.most_common(10), start=1):
        top_row[f"keyword_{rank}"] = word
    top_keyword_rows.append(top_row)
sponsor_keywords_df = pd.DataFrame(top_keyword_rows)

# result
print(sponsor_keywords_df.head())

import csv

# Export the table as a comma-separated UTF-8 file without the index column.
# NOTE(review): this is an absolute, machine-specific Windows path — confirm
# it is still the intended destination before re-running elsewhere.
csv_filename = r"C:\Users\admin\Desktop\sponsor_keywords_expanded.csv"
sponsor_keywords_df.to_csv(
    csv_filename,
    index=False,
    sep=",",
    quoting=csv.QUOTE_MINIMAL,
    encoding='utf-8',
)

           sponsor_name keyword_1 keyword_2     keyword_3     keyword_4  \
0  Sheila C. Harrington    public  children  commonwealth     insurance   
1   Lindsay N. Sabadosa    health    public        access          care   
2    Mathew J. Muratore      care    health        public  commonwealth   
3      Michael J. Soter    public    safety      veterans       service   
4       Brian W. Murray    health    public          care     education   

      keyword_5     keyword_6   keyword_7  keyword_8   keyword_9    keyword_10  
0       housing      veterans  protecting     school       state     education  
1  commonwealth     education       covid   services     promote        safety  
2        access     education      safety   services     program         state  
3  commonwealth        school    benefits  education  protecting     emergency  
4        access  commonwealth    services  insurance     program  disabilities  


In [13]:
# Group memberships, embedded as text. Each non-blank line has the form
#   Group N (size K): [<Python list literal of member names>]
# and is parsed into a dictionary further below.
# NOTE(review): presumably pasted from the printed output of an earlier
# grouping step — confirm the source and that it is still current.
group_text = """
Group 2 (size 16): ['Jason M. Lewis', 'Kay Khan', 'Denise Provost', 'Marjorie C. Decker', 'Ruth B. Balser', 'Jonathan Hecht', 'Jennifer E. Benson', 'Kenneth I. Gordon', 'Carolyn C. Dykema', 'Louis L. Kafka', 'Lori A. Ehrlich', 'Frank I. Smizik', 'Peter V. Kocot', 'Sarah K. Peake', 'John W. Scibak', 'Kate Hogan']

Group 1 (size 10): ['James B. Eldridge', 'Sal N. DiDomenico', 'Patricia D. Jehlen', 'Thomas M. Stanley', 'Michael O. Moore', "Patrick M. O'Connor", 'Michael J. Barrett', 'Walter F. Timilty', 'Anne M. Gobi', 'Brendan P. Crighton']

Group 3 (size 22): ['Lindsay N. Sabadosa', 'Natalie M. Higgins', 'Jack Patrick Lewis', 'Mike Connolly', 'Rebecca L. Rausch', 'Tommy Vitolo', 'Carmine Lawrence Gentile', 'James K. Hawkins', 'David Henry Argosky LeBoeuf', 'Carlos González', 'Joanne M. Comerford', 'Tram T. Nguyen', 'Brian W. Murray', 'Adrian C. Madaro', 'Maria Duaime Robinson', 'Michelle L. Ciccolo', 'Tami L. Gouveia', 'Mindy Domb', 'Peter Capano', 'David Allen Robertson', 'Natalie M. Blais', 'Marcos A. Devers']

Group 4 (size 2): ['Mary S. Keefe', 'Julian Cyr']

Group 5 (size 2): ['David M. Rogers', 'Elizabeth A. Malia']

Group 6 (size 1): ['Sean Garballey']

Group 7 (size 16): ['Susan Williams Gifford', 'Steven S. Howitt', 'Todd M. Smola', 'Elizabeth A. Poirier', 'Bruce E. Tarr', 'Kimberly N. Ferguson', 'Brian M. Ashe', 'Angelo J. Puppolo Jr.', 'Hannah Kane', 'Mathew J. Muratore', "Angelo L. D'Emilia", 'Colleen M. Garry', 'Josh S. Cutler', 'Bradley H. Jones Jr.', 'James J. Dwyer', 'Carole A. Fiola']

Group 8 (size 11): ['Chris Walsh', 'Paul R. Heroux', 'Michelle M. DuBois', "Barbara A. L'Italien", 'José F. Tosado', 'Daniel J. Ryan', 'Aaron Vega', 'Alice Hanlon Peisch', 'Danielle W. Gregoire', 'John H. Rogers', 'John J. Lawn Jr.']

Group 9 (size 4): ['Michael D. Brady', 'Diana DiZoglio', 'Tackey Chan', 'Frank A. Moran']

Group 10 (size 3): ["James J. O'Day", 'Daniel M. Donahue', 'Antonio F. D. Cabral']

Group 11 (size 4): ['Paul McMurtry', 'Denise C. Garlick', 'James Arciero', 'William C. Galvin']

Group 12 (size 3): ['Tricia Farley-Bouvier', 'Christine P. Barber', 'David Paul Linsky']

Group 13 (size 3): ['Steven Ultrino', 'Paul W. Mark', 'Joseph W. McGonagle Jr.']

Group 0 (size 3): ['Jay D. Livingstone', 'Kevin G. Honan', 'Edward F. Coppinger']
"""

# Parse the group listing into {group label: list of member names}.
import ast

groups = {}
for raw_line in group_text.strip().split("\n"):
    # Only lines that carry a member list are of interest; blank separators are skipped.
    if ": [" not in raw_line:
        continue
    label, _, member_list = raw_line.partition(": ")
    label = label.partition(" (")[0]  # drop the trailing " (size X)"
    groups[label] = ast.literal_eval(member_list)

# result
groups

{'Group 2': ['Jason M. Lewis',
  'Kay Khan',
  'Denise Provost',
  'Marjorie C. Decker',
  'Ruth B. Balser',
  'Jonathan Hecht',
  'Jennifer E. Benson',
  'Kenneth I. Gordon',
  'Carolyn C. Dykema',
  'Louis L. Kafka',
  'Lori A. Ehrlich',
  'Frank I. Smizik',
  'Peter V. Kocot',
  'Sarah K. Peake',
  'John W. Scibak',
  'Kate Hogan'],
 'Group 1': ['James B. Eldridge',
  'Sal N. DiDomenico',
  'Patricia D. Jehlen',
  'Thomas M. Stanley',
  'Michael O. Moore',
  "Patrick M. O'Connor",
  'Michael J. Barrett',
  'Walter F. Timilty',
  'Anne M. Gobi',
  'Brendan P. Crighton'],
 'Group 3': ['Lindsay N. Sabadosa',
  'Natalie M. Higgins',
  'Jack Patrick Lewis',
  'Mike Connolly',
  'Rebecca L. Rausch',
  'Tommy Vitolo',
  'Carmine Lawrence Gentile',
  'James K. Hawkins',
  'David Henry Argosky LeBoeuf',
  'Carlos González',
  'Joanne M. Comerford',
  'Tram T. Nguyen',
  'Brian W. Murray',
  'Adrian C. Madaro',
  'Maria Duaime Robinson',
  'Michelle L. Ciccolo',
  'Tami L. Gouveia',
  'Mindy 

In [15]:
# Maybe not used

# Load the per-legislator keyword table from disk.
keywords_df = pd.read_csv("keyword.csv", encoding="ISO-8859-1")

# Count, for every group, how often each keyword appears among its members' rows.
group_keywords = {}
for group, members in groups.items():
    tally = Counter()
    for member in members:
        member_rows = keywords_df.loc[keywords_df["sponsor_name"] == member]
        # Every column after the first is treated as a keyword column
        # (the first one is skipped as the sponsor_name column).
        for column in member_rows.columns[1:]:
            for word in member_rows[column].dropna():
                tally[word] += 1
    group_keywords[group] = tally

# One row per group with its ten most frequent keywords.
group_keyword_rows = []
for group, words in group_keywords.items():
    group_row = {"group": group}
    for rank, (word, _count) in enumerate(words.most_common(10), start=1):
        group_row[f"keyword_{rank}"] = word
    group_keyword_rows.append(group_row)
group_keywords_df = pd.DataFrame(group_keyword_rows)

# result
print(group_keywords_df)

       group keyword_1     keyword_2     keyword_3     keyword_4  \
0    Group 2    health        access        public          care   
1    Group 1    health        public        access  commonwealth   
2    Group 3    health        public        access          care   
3    Group 4    health        public  commonwealth     education   
4    Group 5    health        public  commonwealth        access   
5    Group 6    health        public          care     education   
6    Group 7    public  commonwealth        health     education   
7    Group 8    health          care  commonwealth        public   
8    Group 9    health        public          care  commonwealth   
9   Group 10    health          care        public  commonwealth   
10  Group 11    public        health          care        access   
11  Group 12    public        health  commonwealth     education   
12  Group 13    health        public        access          care   
13   Group 0    health        public  commonweal

In [17]:
# Text analysis for each group

# Parse every bill once: the set of people attached to it (sponsor plus
# cosponsors) and the keywords of its title. Names are matched exactly, using
# the same parser as the per-legislator analysis, so one legislator's name can
# never match as a substring of another's.
bills = [
    ({sponsor, *parse_cosponsors(cosponsor_field)}, extract_keywords(title))
    for sponsor, cosponsor_field, title in zip(df["sponsor_name"], df["cosponsors"], df["bill_title"])
]

group_text_analysis = {}
for group, members in groups.items():
    counts = Counter()
    for member in members:
        # Each bill contributes once per member involved in it, whether the
        # member is its sponsor, a cosponsor, or both.
        for participants, keywords in bills:
            if member in participants:
                counts.update(keywords)
    group_text_analysis[group] = counts

# Convert to a DataFrame, showing the Top 10 keywords for each group
group_text_analysis_df = pd.DataFrame([
    {"group": group, **{f"keyword_{i+1}": kw[0] for i, kw in enumerate(words.most_common(10))}}
    for group, words in group_text_analysis.items()
])

# result
print(group_text_analysis_df)

       group keyword_1 keyword_2     keyword_3     keyword_4     keyword_5  \
0    Group 2    health    access        public          care  commonwealth   
1    Group 1    health    public          care        access  commonwealth   
2    Group 3    public    health        access  commonwealth          care   
3    Group 4    health    public        access  commonwealth     education   
4    Group 5    health    access        public  commonwealth          care   
5    Group 6    health    public          care     education        access   
6    Group 7    public    health  commonwealth          care        safety   
7    Group 8    health    public          care        access  commonwealth   
8    Group 9    health    public          care     education  commonwealth   
9   Group 10    health    public          care     education        access   
10  Group 11    health    public          care  commonwealth        access   
11  Group 12    health    public        access  commonwealth    

In [None]:
To identify legislators who act outside their associated groups (potential special interests, corruption, etc.)

In [18]:
# Calculate the Jaccard similarity of each legislator to its group

# Map every legislator's name to the label of the group that lists them.
legislator_to_group = {}
for group_label, group_members in groups.items():
    for group_member in group_members:
        legislator_to_group[group_member] = group_label

# Calculate the Jaccard similarity
def jaccard_similarity(set1, set2):
    """Return |set1 ∩ set2| / |set1 ∪ set2|, or 0 when both sets are empty."""
    union_size = len(set1.union(set2))
    if union_size == 0:
        return 0
    shared_size = len(set1.intersection(set2))
    return shared_size / union_size

# Generate a similarity DataFrame

# Pre-compute each group's keyword set once instead of once per legislator.
# A group with fewer than ten keywords has NaN (not None) in the unused
# keyword cells, so dropna() is what actually removes the placeholders.
group_keyword_sets = {
    group_row["group"]: set(group_row.drop("group").dropna())
    for _, group_row in group_text_analysis_df.iterrows()
}

similarity_results = []

for _, row in sponsor_keywords_df.iterrows():
    legislator = row["sponsor_name"]

    # Find the group to which the legislator belongs; legislators that are not
    # in any group (or whose group has no keyword row) are skipped.
    group = legislator_to_group.get(legislator)
    if group not in group_keyword_sets:
        continue

    # The legislator's own keywords, ignoring unused keyword cells.
    legislator_keywords = set(row.drop("sponsor_name").dropna())

    similarity_results.append({
        "legislator": legislator,
        "group": group,
        "jaccard_similarity": jaccard_similarity(legislator_keywords, group_keyword_sets[group]),
    })

# Turn into DataFrame. The columns are named explicitly so that an empty result
# still has the expected columns for the cells that filter on them.
jaccard_similarity_df = pd.DataFrame(
    similarity_results, columns=["legislator", "group", "jaccard_similarity"]
)

# result
print(jaccard_similarity_df.head())

            legislator    group  jaccard_similarity
0  Lindsay N. Sabadosa  Group 3            0.818182
1   Mathew J. Muratore  Group 7            0.818182
2      Brian W. Murray  Group 3            0.538462
3     Colleen M. Garry  Group 7            0.666667
4        Brian M. Ashe  Group 7            0.666667


In [21]:
# Select the legislators whose Jaccard similarity to their own group is
# strictly below the threshold.
SIMILARITY_THRESHOLD = 0.5
low_similarity_legislators_df = jaccard_similarity_df[
    jaccard_similarity_df["jaccard_similarity"] < SIMILARITY_THRESHOLD
]

# result
print(low_similarity_legislators_df)

        legislator    group  jaccard_similarity
62  John H. Rogers  Group 8            0.428571
82   Todd M. Smola  Group 7            0.333333


In [25]:
# Show the top-keyword rows of the two legislators named below.
legislators_of_interest = ["John H. Rogers", "Todd M. Smola"]
is_selected = sponsor_keywords_df["sponsor_name"].isin(legislators_of_interest)
selected_legislators_df = sponsor_keywords_df[is_selected]
print(selected_legislators_df)

       sponsor_name keyword_1     keyword_2  keyword_3 keyword_4 keyword_5  \
120  John H. Rogers    public        health  education      care  services   
191   Todd M. Smola    public  commonwealth     safety   service  services   

    keyword_6     keyword_7  keyword_8 keyword_9 keyword_10  
120     covid  commonwealth  emergency   program       town  
191      fund     education       town  benefits      state  


In [None]:
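Group 8 top keywords: health, public, care, access, commonwealth, education, safety, services, protect, promote
Group 7 top keywords: public, health, commonwealth, care, safety, education, access, school, state, program
John H. Rogers (Group 8) is missing these group keywords from his own top 10: access, safety, protect, promote. His top keywords that are not in the group's list: covid, emergency, program, town.
Todd M. Smola (Group 7) is missing these group keywords from his own top 10: health, care, access, school, program. His top keywords that are not in the group's list: service, fund, town, benefits.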