In [2]:
% load_ext autoreload
% autoreload 2

In [20]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

In [4]:
from shared.schema import DatasetSchema

DATASET = DatasetSchema.load_schema('house-of-representatives-congress-116')
DATASET.save_schema()

In [5]:
spark = (SparkSession.builder
         .appName(f'{DATASET}')
         .config('spark.sql.legacy.timeParserPolicy', 'LEGACY')
         .config("spark.executor.memory", "8g")
         .config("spark.driver.memory", "8g")
         .config("spark.memory.offHeap.enabled", True)
         .config("spark.memory.offHeap.size", "16g")
         .getOrCreate())

22/01/25 19:05:49 WARN Utils: Your hostname, megatron resolves to a loopback address: 127.0.1.1; using 192.168.1.89 instead (on interface enp7s0)
22/01/25 19:05:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/25 19:05:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [30]:
udf_eval_array = F.udf(lambda x: eval(x), T.ArrayType(T.StringType()))

In [34]:
df_legislation = (
    spark.read.csv(DATASET.raw_str('house_legislation_116.csv'), header=True, inferSchema=True, quote='"', escape='"')
        .withColumn('date_introduced', F.to_timestamp(F.col('date_introduced'), 'yyyy-MM-dd'))
        .withColumn('cosponsors', udf_eval_array(F.col('cosponsors')))
        .withColumn('related_bills', udf_eval_array(F.col('related_bills')))
        .withColumn('subjects', udf_eval_array(F.col('subjects')))
        .withColumn('committees', udf_eval_array(F.col('committees')))
)
df_legislation.head(5)

[Row(bill_id='H.R.1', title="To expand Americans' access to the ballot box, reduce the influence of big money in politics, and strengthen ethics rules for public servants, and for other purposes.", sponsor='S001168', cosponsors=['P000197', 'A000370', 'A000376', 'B001300', 'B001281', 'B001292', 'B000490', 'B000574', 'B001278', 'B001296', 'B001304', 'B001285', 'B001286', 'B001251', 'C001112', 'C001097', 'C001090', 'C001055', 'C001117', 'C001066', 'C001080', 'C001084', 'C001123', 'C001101', 'C001067', 'C001049', 'C000537', 'C001068', 'C001078', 'C000754', 'C001110', 'C001069', 'C001124', 'C001119', 'C001111', 'C001121', 'C000984', 'D000629', 'D000096', 'D000598', 'D000631', 'D000191', 'D000197', 'D000216', 'D000617', 'D000630', 'D000627', 'D000623', 'D000610', 'D000624', 'D000482', 'E000179', 'E000299', 'E000215', 'E000297', 'E000296', 'F000468', 'F000462', 'G000571', 'G000574', 'G000559', 'G000587', 'G000586', 'G000592', 'G000585', 'G000553', 'G000551', 'H001080', 'H001090', 'H000324', '

In [37]:
df_members = (
    spark.read.csv(DATASET.raw_str('house_members_116.csv'), header=True, inferSchema=True, quote='"', escape='"')
        .withColumn('committee_assignments', udf_eval_array(F.col('committee_assignments')))
)
df_members.head(5)

[Row(name_id='A000374', name='ralph-abraham', state='Louisiana', url='https://www.congress.gov/member/ralph-abraham/A000374', chamber='House', current_party='Republican', committee_assignments=['Agriculture', 'Armed Services']),
 Row(name_id='A000370', name='alma-adams', state='North Carolina', url='https://www.congress.gov/member/alma-adams/A000370', chamber='House', current_party='Democratic', committee_assignments=['Agriculture', 'Education and Labor', 'Financial Services']),
 Row(name_id='A000055', name='robert-aderholt', state='Alabama', url='https://www.congress.gov/member/robert-aderholt/A000055', chamber='House', current_party='Republican', committee_assignments=['Appropriations']),
 Row(name_id='A000371', name='pete-aguilar', state='California', url='https://www.congress.gov/member/pete-aguilar/A000371', chamber='House', current_party='Democratic', committee_assignments=['Appropriations', 'House Administration']),
 Row(name_id='A000372', name='rick-allen', state='Georgia', url

In [38]:
df_rollcall_info = (
    spark.read.csv(DATASET.raw_str('house_rollcall_info_116.csv'), header=True, inferSchema=True)
        .withColumn('date', F.concat(F.col('date'), F.lit(' '), F.col('year')))
        .withColumn('date', F.to_timestamp(F.col('date'), 'd-MMM yyyy'))
        .drop('year')
)
df_rollcall_info.head(5)

[Row(rollcall_id='2019:001', roll_num=1, date=datetime.datetime(2019, 1, 3, 0, 0), bill_id='QUORUM', question='Call by States', result='P', description=None, congress=116, session=1),
 Row(rollcall_id='2019:002', roll_num=2, date=datetime.datetime(2019, 1, 3, 0, 0), bill_id=None, question='Election of the Speaker', result='P', description='\xa0', congress=116, session=1),
 Row(rollcall_id='2019:003', roll_num=3, date=datetime.datetime(2019, 1, 3, 0, 0), bill_id='H.RES.5', question='On Motion to Table the Motion to Refer', result='P', description='Providing for consideration of H.Res. 6, adopting the Rules of the House of Representatives for the 116th Congress; providing for consideration of H.R. 21, making appropriations for the FY ending September 30, 2019, and for other purposes', congress=116, session=1),
 Row(rollcall_id='2019:004', roll_num=4, date=datetime.datetime(2019, 1, 3, 0, 0), bill_id='H.RES.5', question='On Ordering the Previous Question', result='P', description='Providi

In [39]:
def to_explode(df, by):
    # Filter dtypes and split into column names and type description
    cols, dtypes = zip(*((c, t) for (c, t) in df.dtypes if c not in by))
    # Spark SQL supports only homogeneous columns
    assert len(set(dtypes)) == 1, "All columns have to be of the same type"

    # Create and explode an array of (column_name, column_value) structs
    kvs = F.explode(F.array([
        F.struct(F.lit(c).alias("rollcall_id"), F.col(c).alias("vote")) for c in cols
    ])).alias("kvs")

    return df.select(by + [kvs]).select(by + ["kvs.rollcall_id", "kvs.vote"])

In [63]:
udf_vote_val = F.udf(lambda x: (1 if x in ["Aye", "Yea"] else -1 if x in ["No", "Nay"] else 0), T.IntegerType())

In [64]:
df_rollcall_votes = to_explode(
    spark.read.csv(DATASET.raw_str('house_rollcall_votes_116.csv'), header=True, inferSchema=True),
    ['name_id'],
).cache().withColumn('vote', udf_vote_val(F.col('vote')))
df_rollcall_votes.head(5)

22/01/25 21:00:22 WARN CacheManager: Asked to cache already cached data.


[Row(name_id='A000374', rollcall_id='2019:641', vote=-1),
 Row(name_id='A000374', rollcall_id='2019:639', vote=1),
 Row(name_id='A000374', rollcall_id='2019:640', vote=-1),
 Row(name_id='A000374', rollcall_id='2019:635', vote=1),
 Row(name_id='A000374', rollcall_id='2019:636', vote=1)]

In [44]:
df_nodes_members = (
    df_members
        .withColumnRenamed('name_id', 'id')
        .dropDuplicates(['id'])
)
print(df_nodes_members.count())
df_nodes_members.head(5)

443


[Row(id='A000055', name='robert-aderholt', state='Alabama', url='https://www.congress.gov/member/robert-aderholt/A000055', chamber='House', current_party='Republican', committee_assignments=['Appropriations']),
 Row(id='A000367', name='justin-amash', state='Michigan', url='https://www.congress.gov/member/justin-amash/A000367', chamber='House', current_party='Independent', committee_assignments=[]),
 Row(id='A000369', name='mark-amodei', state='Nevada', url='https://www.congress.gov/member/mark-amodei/A000369', chamber='House', current_party='Republican', committee_assignments=['Appropriations']),
 Row(id='A000370', name='alma-adams', state='North Carolina', url='https://www.congress.gov/member/alma-adams/A000370', chamber='House', current_party='Democratic', committee_assignments=['Agriculture', 'Education and Labor', 'Financial Services']),
 Row(id='A000371', name='pete-aguilar', state='California', url='https://www.congress.gov/member/pete-aguilar/A000371', chamber='House', current_p

In [43]:
df_nodes_commitee = (
    df_nodes_members.select(F.explode('committee_assignments').alias('id'))
        .union(df_legislation.select(F.explode('committees').alias('id')))
        .dropDuplicates(['id'])
)
print(df_nodes_commitee.count())
df_nodes_commitee.head(5)

                                                                                

95


                                                                                

[Row(id='Energy and Commerce'),
 Row(id='Oversight and Reform'),
 Row(id='Science, Space, and Technology'),
 Row(id='Transportation and Infrastructure'),
 Row(id='Homeland Security')]

In [67]:
df_nodes_bills = (
    df_legislation
        .withColumnRenamed('bill_id', 'id')
        .dropDuplicates(['id'])
)
print(df_nodes_bills.count())
df_nodes_bills.head(5)

5806


[Row(id='H.Con.Res.10', title='Authorizing the use of the rotunda of the Capitol to honor the last surviving Medal of Honor recipient of the Second World War upon death.', sponsor='M001205', cosponsors=['M001195', 'M001187', 'G000578', 'M001180', 'W000806', 'R000611', 'C001114', 'B001305', 'C001118', 'H001053', 'R000582', 'H001088', 'B001298', 'C001092', 'B001306', 'P000258', 'F000459', 'L000578', 'A000378', 'M001184', 'B000490', 'S001165', 'B001248', 'B001302', 'H001082', 'S001214'], related_bills=['H.R.2500', 'S.1790'], policy_area='Congress', subjects=['Conflicts and wars', 'Congressional tributes', 'Military history'], committees=['House Administration'], bill_progress='Introduced', summary='This concurrent resolution authorizes the use of the rotunda of the Capitol to honor, upon death, the last surviving Medal of Honor recipient for acts performed during World War II, at the election of the individual or their next of kin. This concurrent resolution authorizes the use of the rotu

In [49]:
df_nodes_rolecalls = (
    df_rollcall_info
        .withColumnRenamed('rollcall_id', 'id')
        .drop('congress', 'session')
        .dropDuplicates(['id'])
)
print(df_nodes_rolecalls.count())
df_nodes_rolecalls.head(5)

642


[Row(id='2019:001', roll_num=1, date=datetime.datetime(2019, 1, 3, 0, 0), bill_id='QUORUM', question='Call by States', result='P', description=None),
 Row(id='2019:002', roll_num=2, date=datetime.datetime(2019, 1, 3, 0, 0), bill_id=None, question='Election of the Speaker', result='P', description='\xa0'),
 Row(id='2019:003', roll_num=3, date=datetime.datetime(2019, 1, 3, 0, 0), bill_id='H.RES.5', question='On Motion to Table the Motion to Refer', result='P', description='Providing for consideration of H.Res. 6, adopting the Rules of the House of Representatives for the 116th Congress; providing for consideration of H.R. 21, making appropriations for the FY ending September 30, 2019, and for other purposes'),
 Row(id='2019:004', roll_num=4, date=datetime.datetime(2019, 1, 3, 0, 0), bill_id='H.RES.5', question='On Ordering the Previous Question', result='P', description='Providing for consideration of H.Res. 6, adopting the Rules of the House of Representatives for the 116th Congress; H.

In [72]:
df_node_ids = (
    df_nodes_members.select('id')
        .union(df_nodes_commitee.select('id'))
        .union(df_nodes_bills.select('id'))
        .union(df_nodes_rolecalls.select('id'))
        .distinct()
)
print(df_node_ids.count())

def filter_node_ids(df):
    return df.join(
        df_node_ids,
        F.col('src') == F.col('id'),
        'inner'
    ).drop(
        'id'
    ).join(
        df_node_ids,
        F.col('dst') == F.col('id'),
        'inner'
    ).drop('id')



6986


                                                                                

In [73]:
df_edges_sponsor = filter_node_ids(
    df_legislation
        .withColumnRenamed('sponsor', 'src')
        .withColumnRenamed('bill_id', 'dst')
        .dropDuplicates(['src', 'dst'])
)
print(df_edges_sponsor.count())
df_edges_sponsor.head(5)

                                                                                

5806


                                                                                

[Row(dst='H.J.Res.56', title='Directing the President to terminate the use of the United States Armed Forces with respect to the military intervention led by Saudi Arabia in the Republic of Yemen.', src='A000367', cosponsors=[], related_bills=[], policy_area='International Affairs', subjects=['Conflicts and wars', 'Congressional oversight', 'Middle East', 'Military operations and strategy', 'Saudi Arabia', 'War and emergency powers', 'Yemen'], committees=['House Foreign Affairs'], bill_progress='Introduced', summary=None, date_introduced=datetime.datetime(2019, 4, 10, 0, 0), number=56, bill_type='H.J.Res.'),
 Row(dst='H.R.1910', title='To abolish the Export-Import Bank of the United States, and for other purposes.', src='A000367', cosponsors=['J000289', 'M001184', 'D000616', 'M001177', 'B001297', 'R000614', 'G000590'], related_bills=[], policy_area='Foreign Trade and International Finance', subjects=['Competitiveness, trade promotion, trade deficits', 'Department of the Treasury', 'Exe

In [74]:
df_edges_cosponsor = filter_node_ids(
    df_legislation
        .select(F.explode('cosponsors').alias('cosponsor'), 'bill_id')
        .withColumnRenamed('cosponsor', 'src')
        .withColumnRenamed('bill_id', 'dst')
        .dropDuplicates(['src', 'dst'])
)
print(df_edges_cosponsor.count())
df_edges_cosponsor.head(5)

                                                                                

102964


                                                                                

[Row(src='R000613', dst='H.R.1'),
 Row(src='H001064', dst='H.R.6'),
 Row(src='P000197', dst='H.R.7'),
 Row(src='P000613', dst='H.R.8'),
 Row(src='L000397', dst='H.R.8')]

In [75]:
df_edges_commitee_bill = filter_node_ids(
    df_legislation
        .select(F.explode('committees').alias('commitee'), 'bill_id')
        .withColumnRenamed('commitee', 'src')
        .withColumnRenamed('bill_id', 'dst')
        .dropDuplicates(['src', 'dst'])
)
print(df_edges_commitee_bill.count())
df_edges_commitee_bill.head(5)

                                                                                

8145


                                                                                

[Row(src='House Energy and Commerce', dst='H.R.148'),
 Row(src='House Small Business', dst='H.R.539'),
 Row(src='Senate Banking, Housing, and Urban Affairs', dst='H.R.550'),
 Row(src='House Judiciary', dst='H.R.1324'),
 Row(src='House Rules', dst='H.R.1332')]

In [76]:
df_edges_commitee_member = filter_node_ids(
    df_members
        .select(F.explode('committee_assignments').alias('commitee'), 'name_id')
        .withColumnRenamed('name_id', 'src')
        .withColumnRenamed('commitee', 'dst')
        .dropDuplicates(['src', 'dst'])
)
print(df_edges_commitee_member.count())
df_edges_commitee_member.head(5)

                                                                                

851


                                                                                

[Row(dst='Oversight and Reform', src='C001115'),
 Row(dst='Science, Space, and Technology', src='C001115'),
 Row(dst='Energy and Commerce', src='M001163'),
 Row(dst='Homeland Security', src='P000604'),
 Row(dst='Transportation and Infrastructure', src='P000604')]

In [77]:
df_edges_related_bill = filter_node_ids(
    df_legislation
        .select(F.explode('related_bills').alias('related_bill'), 'bill_id')
        .withColumnRenamed('related_bill', 'src')
        .withColumnRenamed('bill_id', 'dst')
        .dropDuplicates(['src', 'dst'])
)
print(df_edges_related_bill.count())
df_edges_related_bill.head(5)

                                                                                

1885


                                                                                

[Row(src='H.R.289', dst='H.R.1401'),
 Row(src='H.R.1373', dst='H.Res.656'),
 Row(src='H.R.1849', dst='H.R.731'),
 Row(src='H.R.3456', dst='H.R.1411'),
 Row(src='H.R.4108', dst='H.R.4674')]

In [78]:
df_edges_rollcall_bill = filter_node_ids(
    df_rollcall_info
        .withColumnRenamed('rollcall_id', 'src')
        .withColumnRenamed('bill_id', 'dst')
        .select([F.col('src'), F.col('dst')])
        .dropDuplicates(['src', 'dst'])
)
print(df_edges_rollcall_bill.count())
df_edges_rollcall_bill.head(5)

                                                                                

471


                                                                                

[Row(src='2019:399', dst='H.R.3055'),
 Row(src='2019:496', dst='H.R.582'),
 Row(src='2019:585', dst='H.R.2440'),
 Row(src='2019:431', dst='H.R.2515'),
 Row(src='2019:502', dst='H.R.3375')]

In [79]:
df_edges_rollcall_vote = filter_node_ids(
    df_rollcall_votes
        .withColumnRenamed('name_id', 'src')
        .withColumnRenamed('rollcall_id', 'dst')
        .withColumnRenamed('vote', 'vote')
        .select(['src', 'dst', 'vote'])
        .dropDuplicates(['src', 'dst'])
)
print(df_edges_rollcall_vote.count())
df_edges_rollcall_vote.head(5)

                                                                                

284406


                                                                                

[Row(src='A000374', dst='2019:151', vote=1),
 Row(src='A000374', dst='2019:221', vote=0),
 Row(src='A000374', dst='2019:580', vote=-1),
 Row(src='A000370', dst='2019:317', vote=1),
 Row(src='A000370', dst='2019:219', vote=1)]

In [80]:
df_nodes_members.write.parquet(DATASET.processed_str('nodes_Member'), mode='overwrite')
df_nodes_commitee.write.parquet(DATASET.processed_str('nodes_Commitee'), mode='overwrite')
df_nodes_bills.write.parquet(DATASET.processed_str('nodes_Bill'), mode='overwrite')
df_nodes_rolecalls.write.parquet(DATASET.processed_str('nodes_Rollcall'), mode='overwrite')

df_edges_sponsor.write.parquet(DATASET.processed_str('edges_SPONSORS'), mode='overwrite')
df_edges_cosponsor.write.parquet(DATASET.processed_str('edges_COSPONSORS'), mode='overwrite')
df_edges_commitee_bill.write.parquet(DATASET.processed_str('edges_COMMITEES_BILL'), mode='overwrite')
df_edges_commitee_member.write.parquet(DATASET.processed_str('edges_MEMBER_OF_COMMITEE'), mode='overwrite')
df_edges_related_bill.write.parquet(DATASET.processed_str('edges_RELATED_TO'), mode='overwrite')
df_edges_rollcall_bill.write.parquet(DATASET.processed_str('edges_ROLLCALL_FOR'), mode='overwrite')
df_edges_rollcall_vote.write.parquet(DATASET.processed_str('edges_VOTED_FOR'), mode='overwrite')

                                                                                

In [81]:
from shared.schema.graph import GraphSchema, NodeSchema, EdgeSchema

(
    GraphSchema()
        .add_node_schema('Member', NodeSchema.from_spark(df_nodes_members.schema, label='name'))
        .add_node_schema('Commitee', NodeSchema.from_spark(df_nodes_commitee.schema, label='id'))
        .add_node_schema('Bill', NodeSchema.from_spark(df_nodes_bills.schema, label='title', timestamp='date_introduced', interaction=False))
        .add_node_schema('Rollcall', NodeSchema.from_spark(df_nodes_rolecalls.schema, label='question', timestamp='date', interaction=True))

        .add_edge_schema('SPONSORS', EdgeSchema.from_spark(df_edges_sponsor.schema, source_type='Member', target_type='Bill', directed=True))
        .add_edge_schema('COSPONSORS', EdgeSchema.from_spark(df_edges_cosponsor.schema, source_type='Member', target_type='Bill', directed=True))
        .add_edge_schema('COMMITEES_BILL', EdgeSchema.from_spark(df_edges_commitee_bill.schema, source_type='Commitee', target_type='Bill', directed=True))
        .add_edge_schema('MEMBER_OF_COMMITEE', EdgeSchema.from_spark(df_edges_commitee_member.schema, source_type='Member', target_type='Commitee', directed=True))
        .add_edge_schema('RELATED_TO', EdgeSchema.from_spark(df_edges_related_bill.schema, source_type='Bill', target_type='Bill', directed=True))
        .add_edge_schema('ROLLCALL_FOR', EdgeSchema.from_spark(df_edges_rollcall_bill.schema, source_type='Rollcall', target_type='Bill', directed=True))
        .add_edge_schema('VOTED_FOR', EdgeSchema.from_spark(df_edges_rollcall_vote.schema, source_type='Member', target_type='Rollcall', directed=True))
        .save_schema(DATASET.processed())
)

GraphSchema(_path=PosixPath('/dd_volume/Development/Python/Thesis/code/storage/datasets/processed/house-of-representatives-congress-116'), nodes={'Member': NodeSchema(_type='Member', _schema=..., label='name', properties={'id': GraphProperty(_name='id', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'name': GraphProperty(_name='name', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'state': GraphProperty(_name='state', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'url': GraphProperty(_name='url', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'chamber': GraphProperty(_name='chamber', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'current_party': GraphProperty(_name='current_party', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=False)), 'committee_assignments': GraphProperty(_name='committee_assignments', dtype=DType(atomic=<DTypeAtomic.STRING: 'string'>, array=True))}, dynamic=No