# Master Thesis Script
## Group word labelling

Labelling in- and out-group words in tweets from our dataset, using dictionaries from Rathje et al. (2021), where dictionaries with a list of politicians were extended by the author to represent the current Congress.

In [69]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import regex as re
import json

In [2]:
# Opening JSON file
f = open(r'Merged_Data_20230501_2.json')
  
# returns JSON object as 
# a dictionary
data = pd.read_json(f)
  
# Closing file
f.close()

In [3]:
data_OG = data[(data['referenced_tweets.retweeted.id'] == "None")]

In [4]:
# identity terms list
LiberalIdentity = pd.read_csv(r'LiberalIdentity.csv')
ConservativeIdentity = pd.read_csv(r'ConservativeIdentity.csv')

In [5]:
LiberalIdentity.iloc[0] = LiberalIdentity.columns
LiberalIdentity["term"] = LiberalIdentity["socialist"]
LiberalIdentity_lst = LiberalIdentity.term.values.tolist()
ConservativeIdentity.iloc[0] = ConservativeIdentity.columns
ConservativeIdentity["term"] = ConservativeIdentity["conservative"]
ConservativeIdentity_lst = ConservativeIdentity.term.values.tolist()

In [6]:
# most famous politicians list

MostFamousDem = pd.read_csv(r'MostFamousDemocrats.csv')
MostFamousRep = pd.read_csv(r'MostFamousRepublicans.csv')
MostFamousRep.iloc[0] = MostFamousRep.columns
MostFamousRep["Name"] = MostFamousRep["Trump"]
MostFamousDem_lst = MostFamousDem.Name.values.tolist()
MostFamousRep_lst= MostFamousRep.Name.values.tolist()
MostFamousDem_lst = [item.lower() for item in MostFamousDem_lst]
MostFamousRep_lst = [item.lower() for item in MostFamousRep_lst]

In [7]:
# Congress Member list

CongressMembers_Dem = pd.read_csv(r'Congress_Democrat_Final_Outgroup_2.csv')
CongressMembers_Rep = pd.read_csv(r'Congress_Republican_Final_Outgroup_2.csv')

In [8]:
CongressMembers_Dem_names = CongressMembers_Dem.Name.values.tolist()
CongressMembers_Rep_names = CongressMembers_Rep.Name.values.tolist()
CongressMembers_Dem_twitter = CongressMembers_Dem.Twitter.values.tolist()
CongressMembers_Rep_twitter = CongressMembers_Rep.Twitter.values.tolist()

In [9]:
CongressMembers_Dem_twitter_2 = []
pattern = r"(@\w+)'"
#print(CongressMembers_Dem_twitter)
for item in CongressMembers_Dem_twitter:
    match = re.search(pattern, item, re.IGNORECASE)
    if match != None:
        item_new = match.group(1)
        CongressMembers_Dem_twitter_2.append(item_new)
        shortened_item = re.sub(item_new, "", item)
        #second round
        match2 = re.search(pattern, shortened_item, re.IGNORECASE)
        if match2 != None:
            item_new2 = match2.group(1)
            CongressMembers_Dem_twitter_2.append(item_new2)
            shortened_item2 = re.sub(item_new2, "", shortened_item)
        # third round
            match3 = re.search(pattern, shortened_item2, re.IGNORECASE)
            if match3 != None:
                item_new3 = match3.group(1)
                CongressMembers_Dem_twitter_2.append(item_new3)
    

In [10]:
CongressMembers_Rep_twitter_2 = []
pattern = r"(@\w+)'"

for item in CongressMembers_Rep_twitter:
    match = re.search(pattern, item, re.IGNORECASE)
    if match != None:
        item_new = match.group(1)
        CongressMembers_Rep_twitter_2.append(item_new)
        shortened_item = re.sub(item_new, "", item)
        #second round
        match2 = re.search(pattern, shortened_item, re.IGNORECASE)
        if match2 != None:
            item_new2 = match2.group(1)
            CongressMembers_Rep_twitter_2.append(item_new2)
            shortened_item2 = re.sub(item_new2, "", shortened_item)
        # third round
            match3 = re.search(pattern, shortened_item2, re.IGNORECASE)
            if match3 != None:
                item_new3 = match3.group(1)
                CongressMembers_Rep_twitter_2.append(item_new3)

In [11]:
Congress_Members_Dem_Use = CongressMembers_Dem_names + CongressMembers_Dem_twitter_2
Congress_Members_Rep_Use = CongressMembers_Rep_names + CongressMembers_Rep_twitter_2

In [12]:
# remove emojis
def deEmojify(inputString):
    return inputString.encode('ascii', 'ignore').decode('ascii')

In [13]:
tweet_text = data_OG.text.values.tolist()
tweet_text_lst_clean = []
for item in tweet_text:
    item_new = re.sub("&amp;", "&", item)
    item_new = re.sub("https.*", "", item_new)
    #item_new = re.sub("@\w+", "", item_new)
    item_new = deEmojify(item_new)
    item_new = item_new.replace('\\n', ' ')
    item_new = item_new.replace('\\', '')
    tweet_text_lst_clean.append(item_new)

In [14]:
tweet_party = data_OG.Party.values.tolist()

In [15]:
tweet_text_lst_clean = [tweet.lower() for tweet in tweet_text_lst_clean]
Congress_Members_Dem_Use = [item.lower() for item in Congress_Members_Dem_Use]
Congress_Members_Rep_Use = [item.lower() for item in Congress_Members_Rep_Use]

In [44]:
# check all words from all our lists
data_liberal_group_words = []
data_conservative_group_words = []

for tweet in tweet_text_lst_clean:
    count_lib_words = 0
    count_cons_words = 0
    
    # check famous liberals
    for word in MostFamousDem_lst:
        if word in tweet:
            count_lib_words+=1
    # check famous conservatives
    for word in MostFamousRep_lst:
        if word in tweet:
            count_cons_words+=1
    # check liberal terms
    for word in LiberalIdentity_lst:
        if word in tweet:
            count_lib_words+=1
    # check conservative terms
    for word in ConservativeIdentity_lst:
        if word in tweet:
            count_cons_words+=1
    # check liberal congress members
    for word in Congress_Members_Dem_Use:
        if word in tweet:
            count_lib_words+=1
    # check conservative congress members
    for word in Congress_Members_Rep_Use:
        if word in tweet:
            count_cons_words+=1
            
    data_liberal_group_words.append(count_lib_words)
    data_conservative_group_words.append(count_cons_words)

In [55]:
print(len(data_liberal_group_words), len(data_conservative_group_words), len(tweet_party))

542104 542104 542104


In [45]:
data_outgroup_words = []
data_ingroup_words = []

for libword, consword, party in zip(data_liberal_group_words, data_conservative_group_words, tweet_party):
    if party == "Democrat":
        data_outgroup_words.append(consword)
        data_ingroup_words.append(libword)
    elif party == "Republican":
        data_outgroup_words.append(libword)
        data_ingroup_words.append(consword)
        

In [46]:
#print(data_outgroup_words[5000:5100])

In [59]:
data_OG["Number_Liberal_Words"] = data_liberal_group_words
data_OG["Number_Conservative_Words"] = data_conservative_group_words

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_OG["Number_Liberal_Words"] = data_liberal_group_words
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_OG["Number_Conservative_Words"] = data_conservative_group_words


In [60]:
data_OG

Unnamed: 0,id,author_id,created_at,referenced_tweets.retweeted.id,retweeted_user_id,text,lang,source,public_metrics.retweet_count,public_metrics.quote_count,...,fixed_urls_final2,matched_NG_domain_new,matched_Ind_domain_new,NG_score,NG_rating,Ind_acc,Ind_transp,Ind_type,Number_Liberal_Words,Number_Conservative_Words
1,1621230645610414080,211530910,2023-02-02 19:34:41,,,"Today, I introduced the JOBS Act with @RepLBR,...",en,,2,1,...,https://billjohnson.house.gov,none,house.gov,none,none,5,3,reliable,0,0
2,1619006992689004544,211530910,2023-01-27 16:18:41,,,Proud to announce that local high school stude...,en,,0,0,...,https://billjohnson.house.gov,none,house.gov,none,none,5,3,reliable,0,0
3,1618731454506369024,211530910,2023-01-26 22:03:47,,,Rural communities across the country are dedic...,en,,1,0,...,https://westerncaucus.house.gov,none,house.gov,none,none,5,3,reliable,1,0
5,1616520819672334336,211530910,2023-01-20 19:39:31,,,"Yesterday, I met with the Mahoning Valley Path...",en,,1,0,...,https://www.wfmj.com,wfmj.com,none,82.5,T,none,none,none,0,0
7,1611776007500881920,211530910,2023-01-07 17:25:20,,,I’m extremely honored and humbled to have been...,en,,1,0,...,https://billjohnson.house.gov,none,house.gov,none,none,5,3,reliable,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1417066,114804909123178496,210926192,2011-09-16 20:56:37,,,Forbes ranks San Antonio #1 best city for jobs...,en,,1,0,...,http://www.forbes.com,forbes.com,forbes.com,100.0,T,4,3,reliable,0,0
1417071,108676425019691008,210926192,2011-08-30 23:04:13,,,August 24 report from CBO shows recovery act m...,en,,1,0,...,http://cboblog.cbo.gov,none,cbo.gov,none,none,5,3,reliable,0,0
1417073,106716497430319104,210926192,2011-08-25 13:16:10,,,CBO estimates this year deficit is $116B less ...,en,,3,0,...,https://www.cbo.gov,none,cbo.gov,none,none,5,3,reliable,0,0
1417079,96577247087034368,210926192,2011-07-28 13:46:24,,,Watch me online at http://t.co/WJc17yL with ho...,en,,2,0,...,http://live.foxnews.com,foxnews.com,foxnews.com,69.5,T,3,2,reliable,0,0


In [65]:
data_OG.to_json("Merged_Data_20230503_1.json")

Making some final fixes in the DF

In [70]:
# Opening JSON file
f = open(r'Merged_Data_20230503_1.json')
  
# returns JSON object as 
# a dictionary
data = pd.read_json(f)
  
# Closing file
f.close()

In [71]:
print(len(data))

542104


In [72]:
data_nodup = data.drop_duplicates(subset=['author_id'])

In [73]:
twitter_author_lst = data_nodup.Twitter_author_name.values.tolist()
member_name_lst = data_nodup.Member_Name_y.values.tolist()

In [75]:
HS_Total = pd.read_csv(r'HS_Total_new.csv')

In [76]:
HS_Total_nodup = HS_Total.drop_duplicates(subset=['Member_Name'])

In [77]:
member_name_lst = HS_Total_nodup.Member_Name.values.tolist()
member_chamber_lst = HS_Total_nodup.chamber.values.tolist()
member_state_lst = HS_Total_nodup.state_abbrev.values.tolist()
member_bioname_lst = HS_Total_nodup.bioname.values.tolist()
member_nom1_lst = HS_Total_nodup.nominate_dim1.values.tolist()
member_nom2_lst = HS_Total_nodup.nominate_dim2.values.tolist()
member_nommean_lst = HS_Total_nodup.nominate_geo_mean_probability.values.tolist()
member_congress_lst = HS_Total_nodup.Congress.values.tolist()
member_party_lst = HS_Total_nodup.Party.values.tolist()

In [78]:
idx_lst = [["adrian smith", 239 ],
          ["joe kennedy iii", 197],
          ["joe wilson", 351],
           ["mac thornberry press", 396],
           ["hal rogers", 170],
           ["doug collins", 123],
           ["sean patrick maloney", 275],
           ["roger williams", 388],
            ["robin kelly", 144],
           ["norma torres", 59],
           ["don young", 8],
            ["sherrod brown", 511],
          ["senator tina smith", 615],
            ["sen. grassley press", 470],
            ["tina smith", 615],
            ["jason smith", 236],
            ["pete king", 283],
            ["kathleen rice", 278],
            ["max rose", 679],
            ["mike thompson", 73],
            ["mike turner", 305],
            ["tim ryan", 306],
            ["mike levin", 625],
            ["andy kim", 670],
            ["archived: tom o'halleran", 544],
            ["vincente gonzalez", 601],
            ["bonnie watson coleman", 254],
            ["archived: u.s. rep kathleen rice", 278],
            ["mike johnson", 572],
            ["mark e. green, md", 698],
            ["dusty johnson", 696],
            ["gwen s. moore", 434],
            ["kendra horn", 686],
            ["john carter", 374],
            ["judge carter", 374],
            ["carol miller", 717],
            ["mike collins", 824],
            ["monica de la cruz", 874],
            ["anthony d'esposito", 845],
            ["congressman chuck edwards", 855],
            ["wesley hunt press office", 875],
            ["jonathan jackson", 830],
            ["rich mccormick", 825],
            ["dale w. strong", 805],
            ["tom udall press", 503],
            ["bobby scott" , 418],
            ["doug jones", 611],
            ["josh hawley", 724],
            ["Michael f.q. san nicolas", 640],
            ["mariannette miller-meeks, m.d.", 754],
            ["andy barr", 173],
            ["sanford bishop, jr.", 128],
            ["sheila jackson lee", 397],
            ["gwen moore", 434],
            ["john lewis", 115],
            ["sam graves", 229],
            ["dr. robin kelly", 144],
            ["gregory meeks", 287]]

In [79]:
# create unique lists of all these variables
for item in idx_lst:
    for i in data.index:
        if data.at[i, 'Twitter_author_name'] == item[0]:
            index = item[1]
            data.at[i, 'Member_Name_y'] = member_name_lst[index]
            data.at[i, 'chamber'] = member_chamber_lst[index]
            data.at[i, 'Party'] = member_party_lst[index]
            data.at[i, 'state_abbrev'] = member_state_lst[index]
            data.at[i, 'bioname'] = member_bioname_lst[index]
            data.at[i, 'nominate_dim1'] = member_nom1_lst[index]
            data.at[i, 'nominate_dim2'] = member_nom2_lst[index]
            data.at[i, 'nominate_geo_mean_probability'] = member_nommean_lst[index]
            data.at[i, 'Congress'] = member_congress_lst[index]
   

In [80]:
# drop party code
data_new = data.drop(["party_code"], axis=1)

In [91]:
data_new2 = data_new.drop(["Member_Name_x"], axis=1)

In [94]:
data_new3 = data_new2.drop(["HS_member_name"], axis=1)

In [98]:
data_new3.columns

Index(['id', 'author_id', 'created_at', 'referenced_tweets.retweeted.id',
       'retweeted_user_id', 'text', 'lang', 'source',
       'public_metrics.retweet_count', 'public_metrics.quote_count',
       'public_metrics.reply_count', 'entities.hashtags', 'entities.urls',
       'author.username', 'author.name',
       'author.public_metrics.followers_count',
       'author.public_metrics.following_count',
       'author.public_metrics.tweet_count', 'retweeted', 'author_name',
       'Author_Name_new_x', 'Twitter_author_name', 'chamber', 'state_abbrev',
       'bioname', 'nominate_dim1', 'nominate_dim2',
       'nominate_geo_mean_probability', 'Congress', 'Party',
       'Congress_Member_Name', 'Author_Name_new_y', 'Retweeted_author_party',
       'Retweeted_author_nominate_dim1', 'Democrat_retweets',
       'Republican_retweets', 'url', 'expanded_url', 'text_clean',
       'fixed_url_thurs', 'fixed_url_fri', 'fixed_urls_final',
       'fixed_urls_final2', 'matched_NG_domain_new', 'matc

In [97]:
data_new3 = data_new3.rename(columns={'Member_Name_y': 'Congress_Member_Name'})

In [102]:
data_new4 = data_new3[['id', 'author_id', 'created_at', 'referenced_tweets.retweeted.id',
       'retweeted_user_id', 'text', 'lang', 'source',
       'public_metrics.retweet_count', 'public_metrics.quote_count',
       'public_metrics.reply_count', 'entities.hashtags', 'entities.urls',
       'author.username', 'author.name',
       'author.public_metrics.followers_count',
       'author.public_metrics.following_count',
       'author.public_metrics.tweet_count', 'retweeted', 'author_name',
        'Twitter_author_name', 'chamber', 'state_abbrev',
       'bioname', 'nominate_dim1', 'nominate_dim2',
       'nominate_geo_mean_probability', 'Congress', 'Party',
       'Congress_Member_Name', 'Retweeted_author_party',
       'Retweeted_author_nominate_dim1', 'Democrat_retweets',
       'Republican_retweets', 'url', 'expanded_url', 'text_clean',
       'fixed_urls_final2', 'matched_NG_domain_new', 'matched_Ind_domain_new',
       'NG_score', 'NG_rating', 'Ind_acc', 'Ind_transp', 'Ind_type',
       'Number_Liberal_Words', 'Number_Conservative_Words']]

In [103]:
data_new4

Unnamed: 0,id,author_id,created_at,referenced_tweets.retweeted.id,retweeted_user_id,text,lang,source,public_metrics.retweet_count,public_metrics.quote_count,...,fixed_urls_final2,matched_NG_domain_new,matched_Ind_domain_new,NG_score,NG_rating,Ind_acc,Ind_transp,Ind_type,Number_Liberal_Words,Number_Conservative_Words
1,1621230645610414080,211530910,2023-02-02 19:34:41,,,"Today, I introduced the JOBS Act with @RepLBR,...",en,,2,1,...,https://billjohnson.house.gov,none,house.gov,none,none,5,3,reliable,0,0
2,1619006992689004544,211530910,2023-01-27 16:18:41,,,Proud to announce that local high school stude...,en,,0,0,...,https://billjohnson.house.gov,none,house.gov,none,none,5,3,reliable,0,0
3,1618731454506369024,211530910,2023-01-26 22:03:47,,,Rural communities across the country are dedic...,en,,1,0,...,https://westerncaucus.house.gov,none,house.gov,none,none,5,3,reliable,1,0
5,1616520819672334336,211530910,2023-01-20 19:39:31,,,"Yesterday, I met with the Mahoning Valley Path...",en,,1,0,...,https://www.wfmj.com,wfmj.com,none,82.5,T,none,none,none,0,0
7,1611776007500881920,211530910,2023-01-07 17:25:20,,,I’m extremely honored and humbled to have been...,en,,1,0,...,https://billjohnson.house.gov,none,house.gov,none,none,5,3,reliable,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1417066,114804909123178496,210926192,2011-09-16 20:56:37,,,Forbes ranks San Antonio #1 best city for jobs...,en,,1,0,...,http://www.forbes.com,forbes.com,forbes.com,100.0,T,4,3,reliable,0,0
1417071,108676425019691008,210926192,2011-08-30 23:04:13,,,August 24 report from CBO shows recovery act m...,en,,1,0,...,http://cboblog.cbo.gov,none,cbo.gov,none,none,5,3,reliable,0,0
1417073,106716497430319104,210926192,2011-08-25 13:16:10,,,CBO estimates this year deficit is $116B less ...,en,,3,0,...,https://www.cbo.gov,none,cbo.gov,none,none,5,3,reliable,0,0
1417079,96577247087034368,210926192,2011-07-28 13:46:24,,,Watch me online at http://t.co/WJc17yL with ho...,en,,2,0,...,http://live.foxnews.com,foxnews.com,foxnews.com,69.5,T,3,2,reliable,0,0


In [105]:
data_new4.to_json("Merged_Data_20230504_2.json")