In [None]:
# Mount Google Drive so the dataset paths under /content/drive resolve (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
pd.set_option('display.max_columns', 60)
from sklearn.model_selection import train_test_split

In [None]:
# Load the annotated tweet table. Despite the ".h5" suffix, the file is read as an
# xz-compressed pickle. Unpickling can execute arbitrary code, so only load trusted files.
df = pd.read_pickle('/content/drive/MyDrive/QE/Study2/TruthSeeker2023/truth_seeker_NER_emotion_stance.h5',compression='xz')

In [None]:
# Lookup from the raw `flan_emotion_pred` string to an emotion name.
# Keys '0'..'6' are the seven emotion codes; the remaining keys are other
# prediction strings that are explicitly folded into 'neutral'.
reverse_emotion_mapping = {
    **{
        str(code): name
        for code, name in enumerate(
            ['anger', 'fear', 'disgust', 'sadness', 'neutral', 'surprise', 'joy']
        )
    },
    **dict.fromkeys(
        ['Con', 'IV', 'gross', '16', 'gasp', 'A', 'dumb', 'irony', 'gasps'],
        'neutral',
    ),
}

# Predictions missing from the table map to NaN and are then defaulted to 'neutral'.
df['emotion'] = df['flan_emotion_pred'].map(reverse_emotion_mapping).fillna('neutral')

In [None]:
# Class balance of the mapped emotion labels (NaN would be counted too, via dropna=False).
df.emotion.value_counts(dropna=False)

emotion
neutral     261191
anger        11824
disgust       8832
sadness       5066
joy           4036
fear          1646
surprise      1280
Name: count, dtype: int64

In [None]:
# Class balance of the stance labels (NaN would be counted too, via dropna=False).
df.stance.value_counts(dropna=False)

stance
NEUTRAL    156921
AGAINST    126706
FAVOR       10248
Name: count, dtype: int64

In [None]:
def concat_aspect_emotion(data):
    """Describe the emotion expressed towards each target as one ';'-joined string.

    Args:
        data: Iterable of dicts, each providing at least the 'target' and
            'emotion' keys.

    Returns:
        str: One "<emotion> emotion towards <target>" phrase per entry, in
        input order, separated by ';' (no surrounding spaces). Empty input
        yields an empty string.

    Raises:
        KeyError: If an entry lacks 'target' or 'emotion'.
    """
    # Only the keys that appear in the phrase are read, so entries that carry
    # no 'stance' key are accepted as well.
    return ';'.join(
        f"{entry['emotion']} emotion towards {entry['target']}" for entry in data
    )

def concat_aspect_stance(data):
    """Describe the stance taken towards each target as one ';'-joined string.

    Args:
        data: Iterable of dicts, each providing at least the 'target' and
            'stance' keys.

    Returns:
        str: One "<stance> stance towards <target>" phrase per entry, in
        input order, separated by ';' (no surrounding spaces). Empty input
        yields an empty string.

    Raises:
        KeyError: If an entry lacks 'target' or 'stance'.
    """
    # Only the keys that appear in the phrase are read, so entries that carry
    # no 'emotion' key are accepted as well.
    return ';'.join(
        f"{entry['stance']} stance towards {entry['target']}" for entry in data
    )

def concat_aspect_emotion_stance(data):
    """Describe emotion and stance towards each target as one ';'-joined string.

    Args:
        data: Iterable of dicts with the keys 'target', 'stance' and 'emotion'.

    Returns:
        str: One "<emotion> emotion & <stance> stance towards <target>" phrase
        per entry, in input order, separated by ';'. The values are inserted
        exactly as given (no case conversion is applied).
    """
    phrases = []
    for item in data:
        aspect, position, feeling = item['target'], item['stance'], item['emotion']
        phrases.append(f"{feeling} emotion & {position} stance towards {aspect}")
    return ';'.join(phrases)

In [None]:
# List the available columns before deriving new ones.
df.columns

Index(['author', 'statement', 'BinaryNumTarget', 'tweet', 'timestamp',
       'PARENT_ID', 'TWEET_ID', 'NER', 'target_type', 'target',
       'target_filter', 'source_text', 'flan_emotion_pred', 'emotion',
       'flan_stance', 'stance'],
      dtype='object')

In [None]:
# Combined label of the form "<stance>_<emotion>" (both parts cast to str first).
df['stance_emotion'] = df['stance'].astype(str) + '_' + df['emotion'].astype(str)

import re

#filter out targets that are just urls or emojis
def remove_urls(text):
    """Strip URLs from ``text``, then drop every character outside a whitelist.

    Two passes are applied in order:
      1. anything starting with ``http://``, ``https://`` or ``www.`` up to the
         next whitespace is removed;
      2. every character that is not an ASCII letter or digit, ``@``, ``_``,
         whitespace, or one of ``. , ; : ! ? ( ) -`` is removed (this is what
         discards emojis, ``#`` and quotes).

    Args:
        text: The string to clean.

    Returns:
        str: The cleaned string. Surrounding whitespace is not trimmed.
    """
    without_links = re.sub(r'https?://\S+|www\.\S+', '', text)
    return re.sub(r'[^@a-zA-Z0-9_\s.,;:!?()-]', '', without_links)

# Length of the raw target string, measured BEFORE the clean-up on the next line.
df['target_len'] = df['target'].apply(lambda x: len(x))
# Strip URLs / non-whitelisted characters from every target, then trim whitespace.
df['target'] =  df['target'].apply(lambda x: remove_urls(x).strip())
# NOTE(review): this expression is neither assigned nor the cell's last line, so its
# result is discarded and no rows are removed from df - confirm whether a filter was intended.
df[(df['target'] != '')&(df['target_len'] >1)].target.value_counts()

# Aliases read later by group_targets: 'text' (tweet body) and an integer 'label'.
df['text'] =  df.tweet
df['label'] =  df.BinaryNumTarget.apply(lambda x: int(x))
df



Unnamed: 0,author,statement,BinaryNumTarget,tweet,timestamp,PARENT_ID,TWEET_ID,NER,target_type,target,target_filter,source_text,flan_emotion_pred,emotion,flan_stance,stance,stance_emotion,target_len,text,label
0,Jon Greenberg,"Says Donald Trump has ""changed his mind"" on ab...",1.0,"Says Donald Trump has ""changed his mind"" on ab...",Fri Apr 01 00:28:51 +0000 2016,358,358,"(Donald Trump, PERSON)",PERSON,Donald Trump,True,Classify stance towards the target:Donald Trum...,4,neutral,0,AGAINST,AGAINST_neutral,12,"Says Donald Trump has ""changed his mind"" on ab...",1
1,Jon Greenberg,"Says Donald Trump has ""changed his mind"" on ab...",1.0,@realDonaldTrump donald trump I so wanted to ...,Fri Apr 01 00:28:51 +0000 2016,358,45030,"(@realDonaldTrump donald trump , PERSON)",PERSON,@realDonaldTrump donald trump,True,Classify stance towards the target:@realDonald...,4,neutral,0,AGAINST,AGAINST_neutral,31,@realDonaldTrump donald trump I so wanted to ...,1
2,Lauren Carroll,"Says Donald Trump ""supports eminent domain"" an...",1.0,"@hilljobee Ah, so eminent domain that Trump su...",Fri Apr 01 00:52:15 +0000 2016,341,42068,"(Trump, PERSON)",PERSON,Trump,True,Classify stance towards the target:Trump in th...,4,neutral,0,AGAINST,AGAINST_neutral,5,"@hilljobee Ah, so eminent domain that Trump su...",1
3,Lauren Carroll,"Says Donald Trump ""supports eminent domain"" an...",1.0,"@hilljobee Ah, so eminent domain that Trump su...",Fri Apr 01 00:52:15 +0000 2016,341,42068,"(Heller, PERSON)",PERSON,Heller,True,Classify stance towards the target:Heller in t...,4,neutral,0,AGAINST,AGAINST_neutral,6,"@hilljobee Ah, so eminent domain that Trump su...",1
4,Lauren Carroll,"Says Donald Trump ""supports eminent domain"" an...",1.0,"@hilljobee Ah, so eminent domain that Trump su...",Fri Apr 01 00:52:15 +0000 2016,341,42068,"(Cruz, PERSON)",PERSON,Cruz,True,Classify stance towards the target:Cruz in the...,4,neutral,1,NEUTRAL,NEUTRAL_neutral,4,"@hilljobee Ah, so eminent domain that Trump su...",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293870,Sean Gorman,"Says President Barack Obama ""will not utter th...",1.0,Remember when Republicans hammered Obama becau...,Wed Sep 30 23:16:49 +0000 2020,330,39776,"(Trump, PERSON)",PERSON,Trump,True,Classify stance towards the target:Trump in th...,4,neutral,0,AGAINST,AGAINST_neutral,5,Remember when Republicans hammered Obama becau...,1
293871,Sue Owen,"""Texas has the highest rate of uninsured in th...",1.0,@JohnCornyn Texas has the highest rate of unin...,Wed Sep 30 23:44:29 +0000 2020,539,70635,"(Texas, GPE)",GPE,Texas,True,Classify stance towards the target:Texas in th...,4,neutral,0,AGAINST,AGAINST_neutral,5,@JohnCornyn Texas has the highest rate of unin...,1
293872,Jon Greenberg,Says President Barack Obama spied on my campai...,0.0,"Corrupt Joe Biden and Obama, who spied on my c...",Wed Sep 30 23:55:01 +0000 2020,988,125850,"(Joe Biden, PERSON)",PERSON,Joe Biden,True,Classify stance towards the target:Joe Biden i...,4,neutral,0,AGAINST,AGAINST_neutral,9,"Corrupt Joe Biden and Obama, who spied on my c...",0
293873,Jon Greenberg,Says President Barack Obama spied on my campai...,0.0,"Corrupt Joe Biden and Obama, who spied on my c...",Wed Sep 30 23:55:01 +0000 2020,988,125850,"(Obama, PERSON)",PERSON,Obama,True,Classify stance towards the target:Obama in th...,4,neutral,0,AGAINST,AGAINST_neutral,5,"Corrupt Joe Biden and Obama, who spied on my c...",0


In [None]:
def group_targets(data):
    """Collapse the per-target rows of ``data`` into one row per tweet.

    Args:
        data: DataFrame with one row per (tweet, target) pair. The columns
            read are 'TWEET_ID', 'text', 'label', 'timestamp', 'PARENT_ID',
            'target', 'emotion' and 'stance'. The frame is not modified.

    Returns:
        pandas.DataFrame: One row per distinct TWEET_ID, in order of first
        appearance, with the columns

        * 'text', 'label', 'tweet_id', 'create_date', 'parent_id' - taken from
          the first row of the tweet ('create_date' comes from 'timestamp');
        * 'dict_target_emotion_stance' - list of
          ``{'target', 'emotion', 'stance'}`` dicts, one per source row
          (missing targets become ``''``);
        * 'target_emotion_stance', 'target_emotion', 'target_stance' - the
          ';'-joined descriptions built by the ``concat_aspect_*`` helpers.
    """
    # Missing targets are represented by '' in the output; computed on a derived
    # Series so the caller's frame is left untouched.
    targets = data['target'].fillna('')

    # Single pass over the rows, bucketing by tweet id. A dict keeps insertion
    # order, so tweets come out in order of first appearance. This replaces
    # re-filtering the whole frame once per tweet.
    records = {}
    rows = zip(
        data['TWEET_ID'], data['text'], data['label'], data['timestamp'],
        data['PARENT_ID'], targets, data['emotion'], data['stance'],
    )
    for tweet_id, text, label, timestamp, parent_id, target, emotion, stance in rows:
        entry = {'target': target, 'emotion': emotion, 'stance': stance}
        record = records.get(tweet_id)
        if record is None:
            # Tweet-level fields come from the first row seen for this tweet.
            records[tweet_id] = {
                'text': text,
                'label': label,
                'tweet_id': tweet_id,
                'create_date': timestamp,
                'parent_id': parent_id,
                'dict_target_emotion_stance': [entry],
            }
        else:
            record['dict_target_emotion_stance'].append(entry)

    # Explicit columns keep the schema stable even when ``data`` has no rows.
    df = pd.DataFrame(
        list(records.values()),
        columns=['text', 'label', 'tweet_id', 'create_date', 'parent_id',
                 'dict_target_emotion_stance'],
    )

    def process_target_list(target_list):
        """Drop empty-target entries from tweets that have several entries."""
        if len(target_list) > 1:
            # NOTE: if every entry has an empty target this returns [], whereas
            # a tweet with a single empty-target entry keeps that entry.
            return [entry for entry in target_list if entry['target'] != '']
        # Zero or one entry: leave as is.
        return target_list

    df['dict_target_emotion_stance'] = df['dict_target_emotion_stance'].apply(process_target_list)

    df['target_emotion_stance'] = df['dict_target_emotion_stance'].apply(concat_aspect_emotion_stance)
    df['target_emotion'] = df['dict_target_emotion_stance'].apply(concat_aspect_emotion)
    df['target_stance'] = df['dict_target_emotion_stance'].apply(concat_aspect_stance)
    return df



# Collapse the per-target rows of df into one row per TWEET_ID and preview the result.
df2 = group_targets(df)
df2

100%|██████████| 122786/122786 [02:32<00:00, 803.39it/s]


Unnamed: 0,text,label,tweet_id,create_date,parent_id,dict_target_emotion_stance,target_emotion_stance,target_emotion,target_stance
0,"Says Donald Trump has ""changed his mind"" on ab...",1,358,Fri Apr 01 00:28:51 +0000 2016,358,"[{'target': 'Donald Trump', 'emotion': 'neutra...",neutral emotion & AGAINST stance towards Donal...,neutral emotion towards Donald Trump,AGAINST stance towards Donald Trump
1,@realDonaldTrump donald trump I so wanted to ...,1,45030,Fri Apr 01 00:28:51 +0000 2016,358,"[{'target': '@realDonaldTrump donald trump', '...",neutral emotion & AGAINST stance towards @real...,neutral emotion towards @realDonaldTrump donal...,AGAINST stance towards @realDonaldTrump donald...
2,"@hilljobee Ah, so eminent domain that Trump su...",1,42068,Fri Apr 01 00:52:15 +0000 2016,341,"[{'target': 'Trump', 'emotion': 'neutral', 'st...",neutral emotion & AGAINST stance towards Trump...,neutral emotion towards Trump;neutral emotion ...,AGAINST stance towards Trump;AGAINST stance to...
3,"Says Donald Trump ""supports eminent domain"" an...",1,341,Fri Apr 01 00:52:15 +0000 2016,341,"[{'target': 'Donald Trump', 'emotion': 'neutra...",neutral emotion & NEUTRAL stance towards Donal...,neutral emotion towards Donald Trump;neutral e...,NEUTRAL stance towards Donald Trump;NEUTRAL st...
4,Paid family leave for New York kicks in in 2018.,1,16026,Fri Apr 01 01:36:18 +0000 2016,122,"[{'target': 'New York', 'emotion': 'neutral', ...",neutral emotion & NEUTRAL stance towards New York,neutral emotion towards New York,NEUTRAL stance towards New York
...,...,...,...,...,...,...,...,...,...
122781,@Kokomothegreat TWO VOTES PER PRECINCT....I HA...,1,6642,Wed Sep 30 23:16:07 +0000 2020,26,"[{'target': 'TWEETS', 'emotion': 'neutral', 's...",neutral emotion & NEUTRAL stance towards TWEETS,neutral emotion towards TWEETS,NEUTRAL stance towards TWEETS
122782,Remember when Republicans hammered Obama becau...,1,39776,Wed Sep 30 23:16:49 +0000 2020,330,"[{'target': 'Obama', 'emotion': 'neutral', 'st...",neutral emotion & AGAINST stance towards Obama...,neutral emotion towards Obama;neutral emotion ...,AGAINST stance towards Obama;AGAINST stance to...
122783,@JohnCornyn Texas has the highest rate of unin...,1,70635,Wed Sep 30 23:44:29 +0000 2020,539,"[{'target': 'Texas', 'emotion': 'neutral', 'st...",neutral emotion & AGAINST stance towards Texas,neutral emotion towards Texas,AGAINST stance towards Texas
122784,"Corrupt Joe Biden and Obama, who spied on my c...",0,125850,Wed Sep 30 23:55:01 +0000 2020,988,"[{'target': 'Joe Biden', 'emotion': 'neutral',...",neutral emotion & AGAINST stance towards Joe B...,neutral emotion towards Joe Biden;neutral emot...,AGAINST stance towards Joe Biden;AGAINST stanc...


In [None]:
def collapse_duplicates(data):
    """Remove duplicate target dictionaries from every row, keeping first occurrences.

    Two dictionaries count as duplicates when they hold the same key/value
    pairs in the same key order.

    Args:
        data: A pandas Series (any index) or a mutable sequence whose elements
            are lists of dictionaries.

    Returns:
        For a Series: a new Series with the same index, holding the
        de-duplicated lists; the input Series is not modified.
        For any other sequence: the same object, updated in place.

        Rows that cannot be processed (not iterable, elements without
        ``.items()``, or unhashable dictionary values) are passed through
        unchanged.
    """
    def _unique_entries(entries):
        """Return ``entries`` without repeated dicts, or ``entries`` itself if unprocessable."""
        seen = set()
        kept = []
        try:
            for entry in entries:
                # Dicts are unhashable; their item tuple serves as the identity.
                key = tuple(entry.items())
                if key not in seen:
                    seen.add(key)
                    kept.append(entry)
        except (AttributeError, TypeError):
            # Deliberate best effort: e.g. a NaN cell or a dict holding a list
            # value is left exactly as it was rather than aborting the run.
            return entries
        return kept

    if isinstance(data, pd.Series):
        # Element-wise map is independent of the index labels and avoids
        # per-element assignment into the Series (the source of the
        # SettingWithCopyWarning flood).
        return data.map(_unique_entries)

    for position in range(len(data)):
        data[position] = _unique_entries(data[position])
    return data

# De-duplicate the target dictionaries of every tweet in df2.
df2['dict_target_emotion_stance'] = collapse_duplicates(df2['dict_target_emotion_stance'])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[i] = unique_list
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[i] = unique_list
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[i] = unique_list
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[i] = unique_list
A value is tryi

In [None]:
# 'create_date' holds Twitter-style strings (e.g. 'Fri Apr 01 00:28:51 +0000 2016'),
# so sorting the raw column orders rows alphabetically by weekday name. Sort on the
# parsed timestamps instead; values that fail to parse become NaT and are placed last.
df2 = df2.sort_values(
    by='create_date',
    key=lambda col: pd.to_datetime(col, format='%a %b %d %H:%M:%S %z %Y', errors='coerce'),
)

In [None]:
# Preview df2 after de-duplication and sorting.
df2

Unnamed: 0,text,label,tweet_id,create_date,parent_id,dict_target_emotion_stance,target_emotion_stance,target_emotion,target_stance
0,"Says Donald Trump has ""changed his mind"" on ab...",1,358,Fri Apr 01 00:28:51 +0000 2016,358,"[{'target': 'Donald Trump', 'emotion': 'neutra...",neutral emotion & AGAINST stance towards Donal...,neutral emotion towards Donald Trump,AGAINST stance towards Donald Trump
1,@realDonaldTrump donald trump I so wanted to ...,1,45030,Fri Apr 01 00:28:51 +0000 2016,358,"[{'target': '@realDonaldTrump donald trump', '...",neutral emotion & AGAINST stance towards @real...,neutral emotion towards @realDonaldTrump donal...,AGAINST stance towards @realDonaldTrump donald...
2,"@hilljobee Ah, so eminent domain that Trump su...",1,42068,Fri Apr 01 00:52:15 +0000 2016,341,"[{'target': 'Trump', 'emotion': 'neutral', 'st...",neutral emotion & AGAINST stance towards Trump...,neutral emotion towards Trump;neutral emotion ...,AGAINST stance towards Trump;AGAINST stance to...
3,"Says Donald Trump ""supports eminent domain"" an...",1,341,Fri Apr 01 00:52:15 +0000 2016,341,"[{'target': 'Donald Trump', 'emotion': 'neutra...",neutral emotion & NEUTRAL stance towards Donal...,neutral emotion towards Donald Trump;neutral e...,NEUTRAL stance towards Donald Trump;NEUTRAL st...
4,Paid family leave for New York kicks in in 2018.,1,16026,Fri Apr 01 01:36:18 +0000 2016,122,"[{'target': 'New York', 'emotion': 'neutral', ...",neutral emotion & NEUTRAL stance towards New York,neutral emotion towards New York,NEUTRAL stance towards New York
...,...,...,...,...,...,...,...,...,...
122781,@Kokomothegreat TWO VOTES PER PRECINCT....I HA...,1,6642,Wed Sep 30 23:16:07 +0000 2020,26,"[{'target': 'TWEETS', 'emotion': 'neutral', 's...",neutral emotion & NEUTRAL stance towards TWEETS,neutral emotion towards TWEETS,NEUTRAL stance towards TWEETS
122782,Remember when Republicans hammered Obama becau...,1,39776,Wed Sep 30 23:16:49 +0000 2020,330,"[{'target': 'Obama', 'emotion': 'neutral', 'st...",neutral emotion & AGAINST stance towards Obama...,neutral emotion towards Obama;neutral emotion ...,AGAINST stance towards Obama;AGAINST stance to...
122783,@JohnCornyn Texas has the highest rate of unin...,1,70635,Wed Sep 30 23:44:29 +0000 2020,539,"[{'target': 'Texas', 'emotion': 'neutral', 'st...",neutral emotion & AGAINST stance towards Texas,neutral emotion towards Texas,AGAINST stance towards Texas
122784,"Corrupt Joe Biden and Obama, who spied on my c...",0,125850,Wed Sep 30 23:55:01 +0000 2020,988,"[{'target': 'Joe Biden', 'emotion': 'neutral',...",neutral emotion & AGAINST stance towards Joe B...,neutral emotion towards Joe Biden;neutral emot...,AGAINST stance towards Joe Biden;AGAINST stanc...


In [None]:
# Load the raw TruthSeeker spreadsheet and check the label balance, including missing labels.
_df = pd.read_excel('/content/drive/MyDrive/QE/Study2/TruthSeeker2023/Truth_Seeker_Model_Dataset_With_TimeStamps 1.xlsx')
_df.BinaryNumTarget.value_counts(dropna=False)

BinaryNumTarget
1.0    68930
0.0    65268
NaN        5
Name: count, dtype: int64

In [None]:
# Inspect the rows whose BinaryNumTarget is missing.
_df[_df.BinaryNumTarget.isna()]

Unnamed: 0.1,Unnamed: 0,author,statement,target,BinaryNumTarget,manual_keywords,tweet,5_label_majority_answer,3_label_majority_answer,timestamp
22779,Trump admin hypocritical on States' Rights vis...,Agree,Mon Feb 27 17:12:53 +0000 2017,,,,,,,
33370,Our New Hampshire poll is finding real anger t...,Agree,Sun Apr 21 16:34:25 +0000 2013,,,,,,,
83724,because of gun control: 56,,,,,,,,,
83725,million.,Disagree,Mon Sep 21 01:21:40 +0000 2009,,,,,,,
93242,#FLDebate,Agree,Tue Jan 24 03:14:34 +0000 2012,,,,,,,


In [None]:
# Compare one missing-label row (position 93242) with the row just before it.
# NOTE(review): such rows look like a tweet fragment that spilled onto its own spreadsheet row - confirm.
_df.iloc[93241:93243]

Unnamed: 0.1,Unnamed: 0,author,statement,target,BinaryNumTarget,manual_keywords,tweet,5_label_majority_answer,3_label_majority_answer,timestamp
93241,93237,W. Gardner,Says Barack Obama has played over 90 rounds of...,1.0,1.0,"Obama, 90 rounds of golf",RT @GOPrincess: Romney: Obama plays 90 rounds...,,,
93242,#FLDebate,Agree,Tue Jan 24 03:14:34 +0000 2012,,,,,,,


In [None]:
# Re-read the spreadsheet and drop every row that has a NaN in ANY column
# (dropna() with default arguments), which removes the malformed rows inspected above.
_df = pd.read_excel('/content/drive/MyDrive/QE/Study2/TruthSeeker2023/Truth_Seeker_Model_Dataset_With_TimeStamps 1.xlsx').dropna()
_df

Unnamed: 0.1,Unnamed: 0,author,statement,target,BinaryNumTarget,manual_keywords,tweet,5_label_majority_answer,3_label_majority_answer,timestamp
0,0,D.L. Davis,End of eviction moratorium means millions of A...,1.0,1.0,"Americans, eviction moratorium",@POTUS Biden Blunders - 6 Month Update\n\nInfl...,Mostly Agree,Agree,Thu Sep 09 23:58:53 +0000 2021
1,1,D.L. Davis,End of eviction moratorium means millions of A...,1.0,1.0,"Americans, eviction moratorium",@S0SickRick @Stairmaster_ @6d6f636869 Not as m...,NO MAJORITY,Agree,Mon Aug 30 18:58:09 +0000 2021
2,2,D.L. Davis,End of eviction moratorium means millions of A...,1.0,1.0,"Americans, eviction moratorium",THE SUPREME COURT is siding with super rich pr...,Agree,Agree,Fri Aug 27 09:53:44 +0000 2021
3,3,D.L. Davis,End of eviction moratorium means millions of A...,1.0,1.0,"Americans, eviction moratorium",@POTUS Biden Blunders\n\nBroken campaign promi...,Mostly Agree,Agree,Tue Oct 05 20:37:14 +0000 2021
4,4,D.L. Davis,End of eviction moratorium means millions of A...,1.0,1.0,"Americans, eviction moratorium",@OhComfy I agree. The confluence of events rig...,Agree,Agree,Fri Aug 27 10:58:24 +0000 2021
...,...,...,...,...,...,...,...,...,...,...
134198,134193,Tom Kertscher,Joe Bidens great-grandfather Joseph J. Biden w...,0.0,0.0,"Biden, great grandfather, slave owner",Joe Biden's family owned African slaves....\n\...,Mostly Agree,Agree,Mon Jun 22 15:02:31 +0000 2020
134199,134194,Tom Kertscher,Joe Bidens great-grandfather Joseph J. Biden w...,0.0,0.0,"Biden, great grandfather, slave owner","Joe Bidens great, great grandfather was a slav...",Agree,Agree,Mon Oct 12 15:52:02 +0000 2020
134200,134195,Tom Kertscher,Joe Bidens great-grandfather Joseph J. Biden w...,0.0,0.0,"Biden, great grandfather, slave owner","@ChevyChaseToGo ""Joe Bidens great-grandfather ...",Mostly Agree,Agree,Fri Oct 16 21:02:49 +0000 2020
134201,134196,Tom Kertscher,Joe Bidens great-grandfather Joseph J. Biden w...,0.0,0.0,"Biden, great grandfather, slave owner",@JoeBiden Facts are Bidens VP Kamala Harris Gr...,NO MAJORITY,Agree,Thu Jun 17 20:30:22 +0000 2021


In [None]:
# Balance of the spreadsheet's own 'target' column after dropna().
# NOTE(review): this is a numeric 0/1 column here, unlike the entity-string 'target' column of df.
_df.target.value_counts()

target
1.0    68926
0.0    65267
Name: count, dtype: int64

In [None]:
# Rebuild the statement -> tweet thread table from the raw spreadsheet
# (rows with a NaN in any column are dropped).
_df = pd.read_excel('/content/drive/MyDrive/QE/Study2/TruthSeeker2023/Truth_Seeker_Model_Dataset_With_TimeStamps 1.xlsx').dropna()

# One integer id per distinct (author, statement) pair
author_statement = _df['author'] + "|" + _df['statement']
_df['PARENT_ID'] = pd.factorize(author_statement)[0]

# factorize() codes start at 0; shift to 1-based ids because 0 is used further
# down as the PARENT_ID of the statement ("origin") rows.
_df['PARENT_ID'] = _df['PARENT_ID'] + 1

# Start tweet ids just above the largest statement id so the two id ranges never overlap
tweet_id_start = _df['PARENT_ID'].max() + 1

# Consecutive tweet ids, one per remaining row (NaN rows were already removed by dropna() above)
_df['TWEET_ID'] = [i + tweet_id_start for i in range(len(_df))]
# Every tweet's thread root is its statement
_df['root_node']= _df['PARENT_ID']

_df = _df[['author', 'statement', 'BinaryNumTarget', 'tweet', 'timestamp', 'PARENT_ID', 'TWEET_ID','root_node']]
display(_df)

# Parse Twitter-style timestamps (e.g. 'Thu Sep 09 23:58:53 +0000 2021') and drop
# the timezone information, leaving naive datetimes.
_df['timestamp'] = pd.to_datetime(_df['timestamp'], format='%a %b %d %H:%M:%S %z %Y').dt.tz_localize(None)

# The data form a star-like graph without retweets: every tweet hangs directly off
# its statement. Build one "origin" row per (author, statement) standing for the
# statement itself; its tweet id is the statement id and its text is the statement.
origin = _df[['author', 'statement',  'BinaryNumTarget', 'PARENT_ID']].drop_duplicates(['author','statement'])
origin['TWEET_ID']= origin['PARENT_ID']
origin['tweet']= origin['statement']

# Assumption: the earliest tweet timestamp under each PARENT_ID is used as the statement's own timestamp
earliest_timestamps = _df.groupby('PARENT_ID')['timestamp'].min().reset_index()

# Attach that earliest timestamp to each origin row
origin = origin.merge(earliest_timestamps, on='PARENT_ID', how='left')
origin['root_node']= origin['PARENT_ID']
# Origin rows are thread roots: mark them with PARENT_ID 0
origin['PARENT_ID']= 0
# (no visible effect: only a cell's last expression is displayed)
origin

# Append the origin rows to the tweet rows, drop exact duplicate rows, order by time
_df = pd.concat([_df, origin], ignore_index=True).drop_duplicates()
_df = _df.sort_values(by='timestamp')

# 'text' is the key used to join with the annotated tweets
_df['text'] = _df['tweet']

Unnamed: 0,author,statement,BinaryNumTarget,tweet,timestamp,PARENT_ID,TWEET_ID,root_node
0,D.L. Davis,End of eviction moratorium means millions of A...,1.0,@POTUS Biden Blunders - 6 Month Update\n\nInfl...,Thu Sep 09 23:58:53 +0000 2021,1,1059,1
1,D.L. Davis,End of eviction moratorium means millions of A...,1.0,@S0SickRick @Stairmaster_ @6d6f636869 Not as m...,Mon Aug 30 18:58:09 +0000 2021,1,1060,1
2,D.L. Davis,End of eviction moratorium means millions of A...,1.0,THE SUPREME COURT is siding with super rich pr...,Fri Aug 27 09:53:44 +0000 2021,1,1061,1
3,D.L. Davis,End of eviction moratorium means millions of A...,1.0,@POTUS Biden Blunders\n\nBroken campaign promi...,Tue Oct 05 20:37:14 +0000 2021,1,1062,1
4,D.L. Davis,End of eviction moratorium means millions of A...,1.0,@OhComfy I agree. The confluence of events rig...,Fri Aug 27 10:58:24 +0000 2021,1,1063,1
...,...,...,...,...,...,...,...,...
134198,Tom Kertscher,Joe Bidens great-grandfather Joseph J. Biden w...,0.0,Joe Biden's family owned African slaves....\n\...,Mon Jun 22 15:02:31 +0000 2020,1058,135247,1058
134199,Tom Kertscher,Joe Bidens great-grandfather Joseph J. Biden w...,0.0,"Joe Bidens great, great grandfather was a slav...",Mon Oct 12 15:52:02 +0000 2020,1058,135248,1058
134200,Tom Kertscher,Joe Bidens great-grandfather Joseph J. Biden w...,0.0,"@ChevyChaseToGo ""Joe Bidens great-grandfather ...",Fri Oct 16 21:02:49 +0000 2020,1058,135249,1058
134201,Tom Kertscher,Joe Bidens great-grandfather Joseph J. Biden w...,0.0,@JoeBiden Facts are Bidens VP Kamala Harris Gr...,Thu Jun 17 20:30:22 +0000 2021,1058,135250,1058


In [None]:
# Earliest tweet timestamp per statement id (used as the statement's timestamp).
earliest_timestamps

Unnamed: 0,PARENT_ID,timestamp
0,1,2021-08-03 01:02:45
1,2,2021-08-16 18:40:57
2,3,2009-06-06 02:26:50
3,4,2020-04-23 22:37:02
4,5,2020-01-05 18:20:26
...,...,...
1053,1054,2020-06-26 18:54:55
1054,1055,2021-01-10 18:12:05
1055,1056,2020-05-21 20:55:57
1056,1057,2011-04-08 02:09:01


In [None]:
# Label balance at statement level (one origin row per author/statement pair).
origin.BinaryNumTarget.value_counts()

BinaryNumTarget
1.0    579
0.0    479
Name: count, dtype: int64

In [None]:
# Attach the per-tweet target annotations (df2) to the thread table (_df).
# NOTE(review): the join key is the raw tweet text; if a text occurs more than once
# on either side the merge multiplies rows - confirm that 'text' is unique enough.
df3 = df2.loc[:, ['text', 'dict_target_emotion_stance', 'target_emotion_stance']]
df4 = (
    _df.merge(df3, on='text', how='left')
    .assign(
        # lower-case aliases of the id / time columns plus an integer label
        parent_id=lambda frame: frame['PARENT_ID'],
        tweet_id=lambda frame: frame['TWEET_ID'],
        create_date=lambda frame: frame['timestamp'],
        label=lambda frame: frame['BinaryNumTarget'].apply(int),
    )
    # rows without a matching annotation carry NaN and are removed here
    .dropna()
)
df4

Unnamed: 0,author,statement,BinaryNumTarget,tweet,timestamp,PARENT_ID,TWEET_ID,root_node,text,dict_target_emotion_stance,target_emotion_stance,parent_id,tweet_id,create_date,label
2,Katie Sanders,Says Mike Huckabee appeared in diabetes infome...,1.0,Says Mike Huckabee appeared in diabetes infome...,2008-05-05 15:20:48,0,378,378,Says Mike Huckabee appeared in diabetes infome...,"[{'target': 'Mike Huckabee', 'emotion': 'neutr...",neutral emotion & AGAINST stance towards Mike ...,0,378,2008-05-05 15:20:48,1
3,Katie Sanders,Says Mike Huckabee appeared in diabetes infome...,1.0,I might get an interview with Mike Huckabee ab...,2008-05-05 15:20:48,378,46516,378,I might get an interview with Mike Huckabee ab...,"[{'target': 'Mike Huckabee', 'emotion': 'neutr...",neutral emotion & FAVOR stance towards Mike Hu...,378,46516,2008-05-05 15:20:48,1
4,Katie Sanders,Says Mike Huckabee appeared in diabetes infome...,1.0,@dporter THANKS! I may get to interview Mike H...,2008-05-06 12:27:06,378,46541,378,@dporter THANKS! I may get to interview Mike H...,"[{'target': '@dporter', 'emotion': 'neutral', ...",neutral emotion & FAVOR stance towards @dporte...,378,46541,2008-05-06 12:27:06,1
7,W. Gardner,"""The majority of Austinites rent"" the places t...",1.0,austinites - we're coming to austin at the end...,2008-09-18 17:06:07,498,61417,498,austinites - we're coming to austin at the end...,"[{'target': 'austin', 'emotion': 'neutral', 's...",neutral emotion & NEUTRAL stance towards austin,498,61417,2008-09-18 17:06:07,1
9,Mica Soellner,"""The law says that mental health must be treat...",1.0,pages upon pages of provision to require insur...,2008-10-01 17:11:31,88,13397,88,pages upon pages of provision to require insur...,"[{'target': '', 'emotion': 'neutral', 'stance'...",neutral emotion & NEUTRAL stance towards,88,13397,2008-10-01 17:11:31,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135246,Patrick Orsagos,The United Nations new world order agenda will...,0.0,Countries should be wanting to leave the Unite...,2022-02-28 22:17:26,1033,133014,1033,Countries should be wanting to leave the Unite...,"[{'target': 'the United Nations', 'emotion': '...",neutral emotion & AGAINST stance towards the U...,1033,133014,2022-02-28 22:17:26,0
135247,Ciara O'Rourke,Herd immunity has been reached.,0.0,@ragedcan @LBJ20Trey @duncTTV @KDTreeey5 @TheN...,2022-02-28 22:22:27,1005,127838,1005,@ragedcan @LBJ20Trey @duncTTV @KDTreeey5 @TheN...,"[{'target': '@LBJ20Trey', 'emotion': 'anger', ...",anger emotion & NEUTRAL stance towards @LBJ20T...,1005,127838,2022-02-28 22:22:27,0
135248,Patrick Orsagos,The United Nations new world order agenda will...,0.0,"... avoided, and prevented, and you spare the ...",2022-02-28 22:32:47,1033,132719,1033,"... avoided, and prevented, and you spare the ...","[{'target': 'the united nations', 'emotion': '...",neutral emotion & AGAINST stance towards the u...,1033,132719,2022-02-28 22:32:47,0
135249,Patrick Orsagos,The United Nations new world order agenda will...,0.0,"... control. \n\nAt that point, KNOW AND UNDER...",2022-02-28 22:34:36,1033,133156,1033,"... control. \n\nAt that point, KNOW AND UNDER...","[{'target': 'the united nations', 'emotion': '...",neutral emotion & AGAINST stance towards the u...,1033,133156,2022-02-28 22:34:36,0


In [None]:
# Label balance of the merged table.
df4.label.value_counts()

label
1    62150
0    60654
Name: count, dtype: int64

In [None]:
# Same counts after an extra dropna(); df4 was already passed through dropna()
# when it was built, so this is a sanity check that nothing else is missing.
df4.dropna().label.value_counts()

label
1    62150
0    60654
Name: count, dtype: int64

In [None]:
# Thread roots are the statement rows (parent_id == 0); each distinct root_node
# value among them identifies one thread that still has its statement row.
root_nodes = list(set(df4.loc[df4['parent_id'] == 0, 'root_node']))

_df = []
for root in tqdm(root_nodes):
    thread = df4[df4['root_node'] == root].copy()
    # creation time of the thread's statement row (the first one if several match)
    root_created = thread.loc[thread['parent_id'] == 0, 'create_date'].values[0]
    # minutes elapsed since the statement, floored at zero
    elapsed_minutes = (thread['create_date'] - root_created).dt.total_seconds() / 60
    thread['time_elapsed'] = elapsed_minutes.clip(lower=0)
    _df.append(thread)
df4 = pd.concat(_df)
df4

100%|██████████| 877/877 [00:06<00:00, 143.56it/s]


Unnamed: 0,author,statement,BinaryNumTarget,tweet,timestamp,PARENT_ID,TWEET_ID,root_node,text,dict_target_emotion_stance,target_emotion_stance,parent_id,tweet_id,create_date,label,time_elapsed
103844,Miriam Valverde,"The Trump administration worked to free 5,000 ...",1.0,@mikepompeo No Mike Pompeo you and the Trump A...,2021-08-16 18:40:57,2,1720,2,@mikepompeo No Mike Pompeo you and the Trump A...,"[{'target': 'Mike Pompeo', 'emotion': 'neutral...",neutral emotion & AGAINST stance towards Mike ...,2,1720,2021-08-16 18:40:57,1,0.000000e+00
103845,Miriam Valverde,"The Trump administration worked to free 5,000 ...",1.0,"The Trump administration worked to free 5,000 ...",2021-08-16 18:40:57,0,2,2,"The Trump administration worked to free 5,000 ...","[{'target': 'Trump', 'emotion': 'neutral', 'st...",neutral emotion & FAVOR stance towards Trump;n...,0,2,2021-08-16 18:40:57,1,0.000000e+00
103851,Miriam Valverde,"The Trump administration worked to free 5,000 ...",1.0,"@danzu72 @chipfranklin I agree, but I would ad...",2021-08-16 19:08:49,2,1872,2,"@danzu72 @chipfranklin I agree, but I would ad...","[{'target': 'the Trump Administration', 'emoti...",neutral emotion & AGAINST stance towards the T...,2,1872,2021-08-16 19:08:49,1,2.786667e+01
103852,Miriam Valverde,"The Trump administration worked to free 5,000 ...",1.0,@LindseyGrahamSC @FoxNews So you can lie about...,2021-08-16 19:20:51,2,1571,2,@LindseyGrahamSC @FoxNews So you can lie about...,"[{'target': '@LindseyGrahamSC @FoxNews', 'emot...",neutral emotion & NEUTRAL stance towards @Lind...,2,1571,2021-08-16 19:20:51,1,3.990000e+01
103855,Miriam Valverde,"The Trump administration worked to free 5,000 ...",1.0,@EliseStefanik A disgrace.\n\nWhy did the Trum...,2021-08-16 19:26:28,2,1566,2,@EliseStefanik A disgrace.\n\nWhy did the Trum...,"[{'target': 'Trump Administration', 'emotion':...",neutral emotion & AGAINST stance towards Trump...,2,1566,2021-08-16 19:26:28,1,4.551667e+01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99368,Tom Kertscher,Joe Bidens great-grandfather Joseph J. Biden w...,0.0,@LionelMedia #Democrats want to bring back old...,2021-07-18 10:53:02,1058,135197,1058,@LionelMedia #Democrats want to bring back old...,"[{'target': 'Biden', 'emotion': 'neutral', 'st...",neutral emotion & AGAINST stance towards Biden...,1058,135197,2021-07-18 10:53:02,0,1.075596e+06
113462,Tom Kertscher,Joe Bidens great-grandfather Joseph J. Biden w...,0.0,Did you know that one of Harris's Great Grandf...,2021-10-10 12:55:21,1058,135201,1058,Did you know that one of Harris's Great Grandf...,"[{'target': 'Harris', 'emotion': 'neutral', 's...",neutral emotion & AGAINST stance towards Harri...,1058,135201,2021-10-10 12:55:21,0,1.196678e+06
129702,Tom Kertscher,Joe Bidens great-grandfather Joseph J. Biden w...,0.0,@ElieNYC Is the great granddaughter of freed s...,2022-01-14 20:21:52,1058,135215,1058,@ElieNYC Is the great granddaughter of freed s...,"[{'target': '@ElieNYC', 'emotion': 'neutral', ...",neutral emotion & NEUTRAL stance towards @Elie...,1058,135215,2022-01-14 20:21:52,0,1.335364e+06
129765,Tom Kertscher,Joe Bidens great-grandfather Joseph J. Biden w...,0.0,@Federalist_10 @brianstelter @POTUS @SpeakerPe...,2022-01-15 05:50:07,1058,135164,1058,@Federalist_10 @brianstelter @POTUS @SpeakerPe...,"[{'target': 'the White House', 'emotion': 'neu...",neutral emotion & NEUTRAL stance towards the W...,1058,135164,2022-01-15 05:50:07,0,1.335933e+06


In [None]:
# Frequency table of time_elapsed restricted to rows whose parent_id equals 0.
df4.loc[df4['parent_id'] == 0, 'time_elapsed'].value_counts()

time_elapsed
0.0    877
Name: count, dtype: int64

In [None]:
# Show rows with parent_id == 0 whose time_elapsed is exactly 34774.5.
# NOTE(review): this is an exact float comparison; it only matches if the
# stored value is bit-for-bit 34774.5 — consider a tolerance if reused.
df4.loc[(df4['parent_id'] == 0) & (df4['time_elapsed'] == 34774.5)]

Unnamed: 0,author,statement,BinaryNumTarget,tweet,timestamp,PARENT_ID,TWEET_ID,root_node,text,dict_target_emotion_stance,target_emotion_stance,parent_id,tweet_id,create_date,label,time_elapsed
105839,D.L. Davis,End of eviction moratorium means millions of A...,1.0,The Trump=elected United States Supreme Court ...,2021-08-27 04:37:15,0,1445,0,The Trump=elected United States Supreme Court ...,"[{'target': 'Trump', 'emotion': 'sadness', 'st...",sadness emotion & NEUTRAL stance towards Trump...,0,1445,2021-08-27 04:37:15,1,34774.5


In [None]:
# Re-check: rows with time_elapsed == 34774.5 and parent_id == 0.
# NOTE(review): this repeats the filter of the preceding cell; consider
# dropping one of the two once the check is no longer needed.
df4[(df4['time_elapsed'] == 34774.5) & (df4['parent_id'] == 0)]

In [None]:
# All rows whose root_node equals 0, ordered ascending by create_date.
df4.loc[df4['root_node'] == 0].sort_values(by='create_date')

Unnamed: 0,author,statement,BinaryNumTarget,tweet,timestamp,PARENT_ID,TWEET_ID,root_node,text,dict_target_emotion_stance,target_emotion_stance,parent_id,tweet_id,create_date,label,time_elapsed
101747,D.L. Davis,End of eviction moratorium means millions of A...,1.0,@CoriBush You are a leader. I just wanted to s...,2021-08-03 01:02:45,0,1176,0,@CoriBush You are a leader. I just wanted to s...,"[{'target': '@CoriBush', 'emotion': 'neutral',...",neutral emotion & FAVOR stance towards @CoriBu...,0,1176,2021-08-03 01:02:45,1,0.000000
101748,D.L. Davis,End of eviction moratorium means millions of A...,1.0,Why does @JoeBiden hate Americans? He wont act...,2021-08-03 01:02:56,0,1259,0,Why does @JoeBiden hate Americans? He wont act...,"[{'target': '@JoeBiden', 'emotion': 'anger', '...",anger emotion & AGAINST stance towards @JoeBid...,0,1259,2021-08-03 01:02:56,1,0.183333
101749,D.L. Davis,End of eviction moratorium means millions of A...,1.0,@gregkellyusa @CoriBush She's sleeping on the ...,2021-08-03 01:03:47,0,1193,0,@gregkellyusa @CoriBush She's sleeping on the ...,"[{'target': '@CoriBush', 'emotion': 'disgust',...",disgust emotion & AGAINST stance towards @Cori...,0,1193,2021-08-03 01:03:47,1,1.033333
101754,D.L. Davis,End of eviction moratorium means millions of A...,1.0,@aplemkseriously @CoriBush @CoriBush knew July...,2021-08-03 01:58:38,0,1069,0,@aplemkseriously @CoriBush @CoriBush knew July...,"[{'target': '@CoriBush', 'emotion': 'neutral',...",neutral emotion & FAVOR stance towards @CoriBu...,0,1069,2021-08-03 01:58:38,1,55.883333
101759,D.L. Davis,End of eviction moratorium means millions of A...,1.0,@BillFOXLA @FoxNews I love how the only issue ...,2021-08-03 02:30:31,0,1524,0,@BillFOXLA @FoxNews I love how the only issue ...,"[{'target': '@BillFOXLA @FoxNews', 'emotion': ...",neutral emotion & NEUTRAL stance towards @Bill...,0,1524,2021-08-03 02:30:31,1,87.766667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119477,D.L. Davis,End of eviction moratorium means millions of A...,1.0,@anthonyzenkus Eviction moratorium destroyed s...,2021-11-16 12:56:56,0,1077,0,@anthonyzenkus Eviction moratorium destroyed s...,"[{'target': '@anthonyzenkus Eviction', 'emotio...",neutral emotion & AGAINST stance towards @anth...,0,1077,2021-11-16 12:56:56,1,151914.183333
119585,D.L. Davis,End of eviction moratorium means millions of A...,1.0,@Politics_Polls @ABC @washingtonpost Oh ffs w ...,2021-11-17 01:52:50,0,1271,0,@Politics_Polls @ABC @washingtonpost Oh ffs w ...,[{'target': '@Politics_Polls @ABC @washingtonp...,anger emotion & AGAINST stance towards @Politi...,0,1271,2021-11-17 01:52:50,1,152690.083333
120413,D.L. Davis,End of eviction moratorium means millions of A...,1.0,@Josiahhaken @hardlynormal @CityRelief_ Unthin...,2021-11-21 02:33:00,0,1153,0,@Josiahhaken @hardlynormal @CityRelief_ Unthin...,"[{'target': '@Josiahhaken @hardlynormal', 'emo...",neutral emotion & NEUTRAL stance towards @Josi...,0,1153,2021-11-21 02:33:00,1,158490.250000
120550,D.L. Davis,End of eviction moratorium means millions of A...,1.0,@TheDetailConsp1 @Skrrt__Vonnegut @JeremyWard3...,2021-11-22 00:34:18,0,1340,0,@TheDetailConsp1 @Skrrt__Vonnegut @JeremyWard3...,"[{'target': 'Afghanistan', 'emotion': 'neutral...",neutral emotion & NEUTRAL stance towards Afgha...,0,1340,2021-11-22 00:34:18,1,159811.550000


In [None]:
# Sanity check: list rows where dict_target_emotion_stance is missing.
# An empty result means every row has a value in that column.
df4.loc[df4['dict_target_emotion_stance'].isna()]

Unnamed: 0,author,statement,BinaryNumTarget,tweet,timestamp,PARENT_ID,TWEET_ID,root_node,text,dict_target_emotion_stance,target_emotion_stance,parent_id,tweet_id,create_date,label


In [None]:
# Count of each label value among rows whose PARENT_ID equals 0.
# NOTE(review): this filters on upper-case PARENT_ID, while nearby cells use
# lower-case parent_id — confirm which column is intended here.
df4.loc[df4['PARENT_ID'] == 0, 'label'].value_counts()

label
1    831
0    431
Name: count, dtype: int64

In [None]:
# Load truth_seeker_NER_clean for inspection only (the result is displayed,
# not assigned). Despite the .h5 suffix the file is read as an xz-compressed
# pickle, not as HDF5.
pd.read_pickle(
    filepath_or_buffer='/content/drive/MyDrive/QE/Study2/TruthSeeker2023/truth_seeker_NER_clean.h5',
    compression='xz',
)

Unnamed: 0,text,dict_target_emotion_stance,target_emotion_stance,create_date,parent_id,tweet_id,root_node,label
0,"Says Donald Trump has ""changed his mind"" on ab...","[{'target': 'Donald Trump', 'emotion': 'neutra...",neutral emotion & AGAINST stance towards Donal...,Fri Apr 01 00:28:51 +0000 2016,0,359,359,1
1,@realDonaldTrump donald trump I so wanted to ...,"[{'target': '@realDonaldTrump donald trump', '...",neutral emotion & AGAINST stance towards @real...,Fri Apr 01 00:28:51 +0000 2016,359,45031,359,1
2,"@hilljobee Ah, so eminent domain that Trump su...","[{'target': 'Trump', 'emotion': 'neutral', 'st...",neutral emotion & AGAINST stance towards Trump...,Fri Apr 01 00:52:15 +0000 2016,342,42069,342,1
3,"Says Donald Trump ""supports eminent domain"" an...","[{'target': 'Donald Trump', 'emotion': 'neutra...",neutral emotion & NEUTRAL stance towards Donal...,Fri Apr 01 00:52:15 +0000 2016,0,342,342,1
4,Paid family leave for New York kicks in in 2018.,"[{'target': 'New York', 'emotion': 'neutral', ...",neutral emotion & NEUTRAL stance towards New York,Fri Apr 01 01:36:18 +0000 2016,123,16027,123,1
...,...,...,...,...,...,...,...,...
122799,@Kokomothegreat TWO VOTES PER PRECINCT....I HA...,"[{'target': 'TWEETS', 'emotion': 'neutral', 's...",neutral emotion & NEUTRAL stance towards TWEETS,Wed Sep 30 23:16:07 +0000 2020,27,6643,27,1
122800,Remember when Republicans hammered Obama becau...,"[{'target': 'Obama', 'emotion': 'neutral', 'st...",neutral emotion & AGAINST stance towards Obama...,Wed Sep 30 23:16:49 +0000 2020,331,39777,331,1
122801,@JohnCornyn Texas has the highest rate of unin...,"[{'target': 'Texas', 'emotion': 'neutral', 'st...",neutral emotion & AGAINST stance towards Texas,Wed Sep 30 23:44:29 +0000 2020,540,70636,540,1
122802,"Corrupt Joe Biden and Obama, who spied on my c...","[{'target': 'Joe Biden', 'emotion': 'neutral',...",neutral emotion & AGAINST stance towards Joe B...,Wed Sep 30 23:55:01 +0000 2020,989,125851,989,0


In [None]:
# Persist df4 to Drive as an xz-compressed pickle (the .h5 suffix is only a
# file name; the format written here is pickle, not HDF5).
df4.to_pickle(
    path='/content/drive/MyDrive/QE/Study2/TruthSeeker2023/truth_seeker_NER_clean2.h5',
    compression='xz',
)

In [None]:
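# Disabled save, kept for reference: uncommenting would write df2 to
# truth_seeker_NER_clean.h5 as an xz-compressed pickle.
# df2.to_pickle('/content/drive/MyDrive/QE/Study2/TruthSeeker2023/truth_seeker_NER_clean.h5',compression='xz')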

In [None]:
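# Disabled save, kept for reference: uncommenting would write df to
# truth_seeker_NER_emotion_stance.h5 as an xz-compressed pickle.
#df.to_pickle('/content/drive/MyDrive/QE/Study2/TruthSeeker2023/truth_seeker_NER_emotion_stance.h5',compression='xz')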