In [None]:
import numpy as np
import scipy
import pandas as pd
import glob
import yaml
import json
from pandas import json_normalize

In [None]:
#clones tweet data repo
!git clone https://github.com/alexlitel/congresstweets.git

#change directory
%cd congresstweets/data

#creates a list of all the json files
json_files = glob.glob('2018*.json') + glob.glob('2019*.json') + glob.glob('2020*.json') + glob.glob('2021*.json') + glob.glob('2022*.json')

#empty list that we will populate with the json dfs
dataframes = []

#loops through all the files and creates a dataframe of dataframes
for file in json_files:
    df = pd.read_json(file)

    columns_to_keep = ['screen_name', 'text', 'time']
    df = df[columns_to_keep]

    dataframes.append(df)

#concatenates all the dfs
combined_df = pd.concat(dataframes, ignore_index=True)

#groups tweets by the twitter account
grouped_df = combined_df.groupby('screen_name')

#empty list to hold dataframes
sampled_dataframes = []

#filters out accounts with fewer than 250 tweets and samples up to 1000 tweets from each remaining account
for account, tweets in grouped_df:
    if len(tweets) >= 250:
        if len(tweets) > 1000:
            tweets = tweets.sample(n=1000, random_state=1)
        sampled_dataframes.append(tweets)

#concatenates all sampled dataframes
sampled_combined_df = pd.concat(sampled_dataframes, ignore_index=True)

print(f"Total number of tweets after sampling and filtering: {len(sampled_combined_df)}")
print(f"Number of tweets per account (should be <= 1000 and >= 250):")
print(sampled_combined_df.groupby('screen_name').size().describe())
print(sampled_combined_df)

In [None]:
!curl -O https://raw.githubusercontent.com/alexlitel/congresstweets-automator/master/data/historical-users-filtered.json

In [None]:
litel = '/content/historical-users-filtered.json'

#parses the downloaded file into plain Python objects
with open(litel, 'r') as file:
    json_data = json.load(file)


def _first_screen_name(accounts):
    """Return the screen_name of the first entry in `accounts`, or None when it is not a non-empty list."""
    if not isinstance(accounts, list) or len(accounts) == 0:
        return None
    return accounts[0]["screen_name"]


#flattens the nested records into dotted column names (e.g. 'id.bioguide')
flat_users = json_normalize(json_data)

#adds the two fields needed later under plain names, then removes the columns that are not kept
df_histusers = flat_users.assign(
    bioguide=flat_users['id.bioguide'],
    screen_name=flat_users['accounts'].apply(_first_screen_name),
).drop(columns=['chamber','type','id.tag','id.thomas_id','id.senate_committee_id','prev_props','state','id.house_committee_id','accounts','id.bioguide' ])

print(df_histusers)

In [None]:
!curl -O https://raw.githubusercontent.com/unitedstates/congress-legislators/main/legislators-social-media.yaml

In [None]:
legis = '/content/legislators-social-media.yaml'

#parses the downloaded YAML into a list of nested records
with open(legis, 'r') as file:
    yaml_data = yaml.safe_load(file)
leg_media = pd.DataFrame(yaml_data)

#one list per extracted field, each built in a single pass over the records
#'bioguide' is read directly from the 'id' field, so a record without it raises KeyError
bioguide_list = [entry['id']['bioguide'] for entry in yaml_data]
#.get() yields None for records that have no 'govtrack' id
govtrack_list = [entry['id'].get('govtrack') for entry in yaml_data]
#.get() yields None for records that have no 'twitter' handle under 'social'
twitter_list = [entry['social'].get('twitter') for entry in yaml_data]

#assembles the three lists into one dataframe, one row per record
df_leg_media = pd.DataFrame({
    'bioguide': bioguide_list,
    'govtrack': govtrack_list,
    'twitter': twitter_list
})

print(df_leg_media)

In [None]:
#Colab-only module; this import is not available outside Google Colab
from google.colab import drive
#mounts Google Drive at /content/drive; the voteview CSV loaded next is read from /content/drive/MyDrive/...
drive.mount('/content/drive')

In [None]:
file_path = '/content/drive/MyDrive/Applied Deep Learning Poli Sci/Foley and Dorner Repo/Data/voteview.csv'

#reads the vv data from Drive and removes the nominate_dim2 and icpsr columns in one step
voteview = pd.read_csv(file_path).drop(columns=['nominate_dim2','icpsr'])

In [None]:
#merges the voteview rows with the account table on bioguide (left join: every voteview row is kept)
#rows of df_histusers that have no bioguide are left out first: pandas matches null keys to each
#other, so a voteview row with a missing bioguide would otherwise be paired with every one of them
vv_merg_hist = pd.merge(
    voteview,
    df_histusers.dropna(subset=['bioguide']),
    on='bioguide',
    how='left',
)
print(vv_merg_hist)

In [None]:
#left-joins the social-media table onto the previous merge, then cleans up in one chain
all_merged = (
    pd.merge(vv_merg_hist, df_leg_media, on='bioguide', how='left')
    #keeps every row whose bioname is not exactly 'TRUMP, Donald John'
    #NOTE(review): the earlier note on this step said it "removed all NAs" - the filter only looks at bioname; confirm that is sufficient
    .loc[lambda frame: frame['bioname'] != 'TRUMP, Donald John']
    #renumbers the rows from 0
    .reset_index(drop=True)
    #removes the id columns and party_code; this also removes the 'govtrack' and 'twitter' columns the join just added
    .drop(columns=['id.govtrack','govtrack','twitter','party_code'])
)

print(all_merged)

In [None]:
#keeps the first all_merged row per screen_name before joining: if a handle appears on several rows,
#joining directly copies every tweet once per row and only the first copy survives the de-duplication
#below anyway, so the outcome is the same without the intermediate blow-up
#NOTE(review): whether all_merged really repeats handles depends on voteview.csv - confirm
members_by_handle = all_merged.drop_duplicates(subset=['screen_name'], keep='first')

#attaches the member data to each sampled tweet; tweets from handles with no match are dropped (inner join)
final_merged = pd.merge(sampled_combined_df, members_by_handle, on='screen_name', how='inner')

#gets rid of duplicate tweets, keeping the first occurrence of each text
#NOTE(review): this compares 'text' only, so identical wording posted by different accounts is collapsed to one row - confirm this is intended
final_merged = final_merged.drop_duplicates(subset=['text'], keep='first')
final_merged = final_merged.reset_index(drop=True) #resets row numbers

print(final_merged)

In [None]:
#optional checkpoint, left disabled: uncommenting these two lines writes final_merged to Drive as a CSV (index not written)
#the path is the same one the later read_csv cell loads final_merged from
#fm_path = '/content/drive/MyDrive/Applied Deep Learning Poli Sci/Foley and Dorner Repo/Data/final_merged.csv'
#final_merged.to_csv(fm_path, index=False)

In [None]:
#same import and mount as earlier in the notebook, repeated ahead of the cell that reads final_merged.csv from Drive
#presumably so the notebook can be picked up from this point without re-running the cells above - confirm
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#location of the saved merge result on Drive
final_merged_path = '/content/drive/MyDrive/Applied Deep Learning Poli Sci/Foley and Dorner Repo/Data/final_merged.csv'
#replaces any in-memory final_merged with the copy stored on Drive
#the to_csv calls in this notebook are commented out, so this file has to exist already
final_merged = pd.read_csv(final_merged_path)

In [None]:
#discards every row that has a missing value in any column (done here rather than before the save)
final_merged = final_merged.dropna(axis=0, how='any')

#previews ten randomly chosen distinct rows, then prints the frame itself
print(final_merged.sample(n=10, replace=False))
print(final_merged)

In [None]:
#optional save, left disabled: uncommenting these two lines writes the dropna'd final_merged to Drive (index not written)
#the path is the same file the read_csv cell above loads, so enabling this overwrites that file
#fm_path = '/content/drive/MyDrive/Applied Deep Learning Poli Sci/Foley and Dorner Repo/Data/final_merged.csv'
#final_merged.to_csv(fm_path, index=False)