## Imports

In [1]:
import json, itertools, re, os, glob
import pandas as pd
import numpy as np
import networkx as nx
from collections import Counter, defaultdict
from statistics import mode

## Functions

### Utilities/Helpers

In [2]:
def check_inputs(dat, typ, cd_top):
  DATASETS = ['euro', 'timme', 'cd', 'conref']
  TYPES = ['full', 'pc']
  TOPICS = ['all', 'abortion', 'marijuana', 'gayRights', 'obama']

  if dat not in DATASETS:
    dat = 'euro'
  if typ not in TYPES:
    typ = 'full'
  if cd_top not in TOPICS:
    cd_top = 'all'
  
  return dat, typ, cd_top

In [3]:
def set_mapping(dat):
  if dat == 'euro':
    mapping = None
  elif dat == 'timme':
    mapping = None
  elif dat == 'cd':
    mapping = {-1 : 1, 1 : 2, 0 : 0}
  elif dat == 'conref':
    mapping = {'AGAINST': 1, 'FAVOR': 2, 'NONE': 0}
  return mapping

In [4]:
def set_cols(dat):
  if dat == 'euro':
    mapping = {'parent_user_screen_name': 'parent'}
  elif dat == 'timme':
    mapping = {'root': 'parent'}
  elif dat == 'cd':
    mapping = {'text': 'rawTweet', 'author': 'name', 'parent_author': 'parent'}
  elif dat == 'conref':
    mapping = None
  return mapping

In [5]:
def update_dict_app(k, v, d):
  if k not in d.keys():
    d[k] = [v]
  else:
    temp = d[k]
    temp.append(v)
    d[k] = temp

In [9]:
def filter_kws(word):
  return len(word) >= 2

In [10]:
def tokenize(text, stopwords): # adapted from InfoVGAE
  original_text = str(text).lower()
  tok = original_text.split(' ')
  text = u''
  for x in tok:
    if len(x) == 0:
        continue
    elif x[0:4] == 'http' or x[0:5] == 'https':
        continue
    elif x[0] == '@':
        continue
    elif x in stopwords:
        continue
    text = text + ' ' + x
  translate_to = u' '

  word_sep = u" ,.?:;'\"/<>`!$%^&*()-=+~[]\\|{}()\n\t" \
              + u"©℗®℠™،、⟨⟩‒–—―…„“”–――»«›‹‘’：（）！？=【】　・" \
              + u"⁄·† ‡°″¡¿÷№ºª‰¶′″‴§|‖¦⁂❧☞‽⸮◊※⁀「」﹁﹂『』﹃﹄《》―—" \
              + u"“”‘’、，一。►…¿«「」ー⋘▕▕▔▏┈⋙一ー।;!؟"
  word_sep = u'#' + word_sep
  translate_table = dict((ord(char), translate_to) for char in word_sep)
  tokens = text.translate(translate_table).split(' ')
  filtered = [word for word in tokens if filter_kws(word)]
  return ' '.join(sorted(filtered))

In [11]:
def processTweets(df, stopwords):
  new_list = []
  tweets = list(df.rawTweet)
  for tweet in tweets:
    cleaned = tokenize(tweet, stopwords)
    new_list.append(cleaned)
  df['postTweet'] = new_list
  df['n_key'] = df.postTweet.apply(lambda x: len(x.split()))
  return df

In [None]:
def filterDat(df):
    df = df[df.n_key >= 5]
    userDict = dict()
    for u in df['name'].values:
        try:
            userDict[u] += 1
        except:
            userDict[u] = 1

    pickedUsers = np.array(list(userDict.keys()))[np.where(np.array(list(userDict.values())) >= 3)]
    df = df[df['name'].isin(pickedUsers)]
    df.reset_index(drop=True, inplace=True)
    return df

In [14]:
def combine_opp(edge_list):
    edges = []
    for edge in edge_list:
        opp = (edge[1], edge[0])
        if (opp not in edges) and (edge not in edges):
            edges.append((edge[0], edge[1]))
    return edges

### Data Loaders/Builders

In [6]:
def load_cd_as_df(cd_path):
  # get names of topics
  topics = [ f.name for f in os.scandir(cd_path + 'topics/') if f.is_dir() ]
  dfs = []
  cols_m = ['topic', 'CID', 'ID', 'PID', 'stance', 'rebuttal', 'author', 'parent_author']
  cols_d = ['topic', 'CID', 'ID', 'text']
  for topic in topics:
    # get names of each file group
    topic_metas = [ file for file in glob.glob(cd_path + 'topics/' + topic + '/*.meta') ]
    topic_datas = [ file for file in glob.glob(cd_path + 'topics/' + topic + '/*.data') ]
    topic_authors = [ file for file in glob.glob(cd_path + 'authors/' + topic + '/*.author') ]
    # build a nested dictionary for the conversations, ids to author names
    author_dict = {}
    for author_list in topic_authors:
      f = open(author_list, "r", encoding='utf8')
      cid = re.search(r"(\w)(?:.author$)", author_list).group(1)
      author_dict[cid] = {}
      for line in f:
        id = re.search(r"(?:\w)(\d+)", line).group(1)
        author = re.search(r"(?:^\w\d+\s)(.+)", line).group(1)
        author_dict[cid][id] = author
    # build a df with the meta information, adding in the author and parent author names
    dfm = []
    for meta in topic_metas:
      cid = re.search(r"([A-Z])(?:\d+.meta$)", meta).group(1)
      f = open(meta, "r", encoding='utf8')
      id = f.readline()
      pid = f.readline()
      stance = f.readline()
      rebuttal = f.readline()
      id = re.search(r"(?:ID=)(.+)", id).group(1)
      pid = re.search(r"(?:PID=)(.+)", pid).group(1)
      if re.search(r"(?:Stance=)(.+)", stance):
        stance = re.search(r"(?:Stance=)(.+)", stance).group(1)
      else:
        stance = '0'
      rebuttal = re.search(r"(?:rebuttal=)(.+)", rebuttal).group(1)
      if id in author_dict[cid].keys():
        author = author_dict[cid][id]
      else:
        author = 'unknown'
      if pid != '-1':
        if pid in author_dict[cid].keys():
          parent_author = author_dict[cid][pid]
        else:
          parent_author = 'unknown'
      else:
        parent_author = 'null'
      df = pd.DataFrame([[str(topic), str(cid), int(id), int(pid), int(stance), str(rebuttal), str(author), str(parent_author)]], columns = cols_m)
      dfm.append(df)
    df1 = pd.concat(dfm)
    # build a df with the text information
    dfd = []
    for data in topic_datas:
      cid = re.search(r"([A-Z])(?:\d+.data$)", data).group(1)
      id = int(re.search(r"(?:\w)(\d+)(?:.data)", data).group(1))
      f = open(data, "r", encoding='utf8')
      text = str(f.read())
      df = pd.DataFrame([[str(topic), str(cid), int(id), str(text)]], columns = cols_d)
      dfd.append(df)
    df2 = pd.concat(dfd)
    df2.set_index(['topic', 'CID', 'ID'], inplace=True)
    # join using the shared columns of topic, cid, and id
    df_n = df1.join(df2, on=['topic', 'CID', 'ID'])
    dfs.append(df_n)
  df = pd.concat(dfs)
  df.reset_index(drop=True,inplace=True)
  return df

In [7]:
def filter_cd(df):
    replied_to = []
    for entry in df.iloc:
        topic = entry.topic
        cid = entry.CID
        pid = str(entry.PID)
        replied_to.append(topic+cid+pid)
    keep = []
    for i in range(len(df)):
        topic = df.loc[i, 'topic']
        cid = df.loc[i, 'CID']
        id = str(df.loc[i, 'ID'])
        info = topic + cid + id
        if info in replied_to:
            keep.append(i)
    df = df[df.index.isin(keep)]
    df.reset_index(inplace=True, drop=True)
    return df

In [8]:
def split_conref(df):
  data = []
  for entry in df.iloc:
    user = entry.user
    stance = entry.stance
    if not np.isnan(entry.tweet_id):
      tweet_id = entry.tweet_id
      parent = None
      text = entry.tweet_text
      data.append([user, stance, tweet_id, parent, text, 'tweet'])
    if not np.isnan(entry.tweet_quote_id):
      tweet_id = entry.tweet_quote_id
      parent = entry.tweet_quote_user
      text = entry.tweet_quote_text
      data.append([user, stance, tweet_id, parent, text, 'quote'])
    if not np.isnan(entry.rt_id):
      tweet_id = entry.rt_id
      text = entry.rt_text
      parent = re.search(r"RT @(\w+)", text).group(1)
      data.append([user, stance, tweet_id, parent, text, 'retweet'])
    if not np.isnan(entry.rt_quote_id):
      tweet_id = entry.rt_quote_id
      parent = entry.rt_quote_user
      text = entry.rt_quote_text
      data.append([user, stance, tweet_id, parent, text, 'quote'])
    if not np.isnan(entry.reply_id):
      tweet_id = entry.reply_id
      parent = None
      text = entry.reply_text
      data.append([user, stance, tweet_id, parent, text, 'reply'])
    if not np.isnan(entry.reply_to_id):
      tweet_id = entry.reply_to_id
      parent = entry.reply_to_user
      text = entry.reply_to_text
      data.append([user, stance, tweet_id, parent, text, 'reply'])
    if not np.isnan(entry.reply_to_quote_id):
      tweet_id = entry.reply_to_quote_id
      parent = entry.reply_to_quote_user
      text = entry.reply_to_quote_text
      data.append([user, stance, tweet_id, parent, text, 'reply'])
  df1 = pd.DataFrame(data, columns=['name', 'label', 'tweet_id', 'parent', 'rawTweet', 'action_type'])
  return df1

In [22]:
def load_all_tweets(file_path, wanted):
  # get names of topics
  user_tweets = [ file for file in glob.glob(file_path + 'tweets/*.txt') ]
  dfs = []
  for user in user_tweets:
    uid = int(user[18:-4])
    if uid in wanted:
      f = open(user, "r", encoding='utf8')
      tweets = []
      for line in f:
          tweets.append(line)
      df1 = pd.DataFrame(tweets, columns=['rawTweet'])
      df1['name'] = [uid] * len(df1)
      dfs.append(df1)
  df = pd.concat(dfs)
  df.reset_index(drop=True,inplace=True)
  return df

In [23]:
def add_timme_extra(df, file_path, orig_names):
    labs = pd.read_csv(file_path + 'new_dict_cleaned.csv', sep='\t')
    users = list(labs['twitter_id'])
    df1 = load_all_tweets(file_path, users)
    rt = []
    for i in range(len(df1)):
        text = df1.iloc[i,0]
        if re.search(r'RT @\w+:', text):
            rt.append(i)
    rt_only = df1[df1.index.isin(rt)]
    rt_only.reset_index(inplace=True, drop=True)
    dict_labs = {}
    mapping = {'D': 1, 'R': 2}
    for entry in labs.iloc:
        dict_labs[entry.twitter_id] = entry.party
    lab_list = []
    for entry in rt_only.iloc:
        l = dict_labs[entry['name']]
        lab_list.append(l)
    rt_only['label'] = lab_list
    rt_only['label'] = rt_only['label'].map(mapping)
    name_dict = {}
    names = rt_only['name'].unique()
    for i in range(len(names)):
        name_dict[names[i]] = 'user_' + str(i)
    rt_only['name'] = rt_only['name'].map(name_dict)
    roots = []
    for entry in rt_only.iloc:
        text = entry.rawTweet
        if re.search(r"RT @(\S+):", str(text)):
            name = re.search(r"RT @(\S+):", str(text)).group(1)
        roots.append((entry.name, name))
    rootsy = [None for i in range(len(rt_only))]
    for id, name in roots:
        rootsy[id] = name
    rt_only['root'] = rootsy
    rt = []
    for i in range(len(rt_only)):
        root = rt_only.iloc[i,3]
        if root in orig_names:
            rt.append(i)
    rt_only = rt_only[rt_only.index.isin(rt)]
    rt_only.reset_index(inplace=True, drop=True)
    return pd.concat([df, rt_only], ignore_index=True)

### Graph Build

In [13]:
def internal_build_base(df, name_dict):
  edge_list = []
  y_dict = {}
  interactions = {}
  for entry in df.iloc:
    user = name_dict[entry['name']]
    if entry.parent in name_dict.keys():
      avec = name_dict[entry.parent]
      if user != avec:
        edge_list.append((user, avec))
        if (user, avec) in interactions.keys():
          interactions[(user, avec)] += 1
        elif (avec, user) in interactions.keys():
          interactions[(avec, user)] += 1
        else:
          interactions[(user, avec)] = 1
    y = entry.label
    update_dict_app(user, y, y_dict)
  return edge_list, y_dict, interactions

In [16]:
def edges_build_base(edge_list, interactions):
  fin_edges = []
  edge_list = set(edge_list)
  edge_list = combine_opp(edge_list)
  for edge in edge_list:
    count = 0
    if (edge[0], edge[1]) in interactions.keys():
      count += interactions[(edge[0], edge[1])]
    elif (edge[1], edge[0]) in interactions.keys():
      count += interactions[(edge[1], edge[0])]
    fin_edges.append((edge[0], edge[1], {'weight' : count}))
  return fin_edges

In [18]:
def build_mapping_df_base(df, name_dict, G):
  mentions = []
  hashtags = []
  for entry in df.iloc:
      text = entry.rawTweet
      m = re.findall(r'(@\w+)', text)
      h = re.findall(r'#\w+', text)
      mentions.append(m)
      hashtags.append(h)
  df['mentions'] = mentions
  df['hashtags'] = hashtags
  keep_cols = ['name', 'rawTweet', 'postTweet', 'label', 'mentions', 'hashtags']
  mapping = df[keep_cols]
  mapping['id'] = mapping['name'].map(name_dict)
  cols = ['id', 'name', 'rawTweet', 'postTweet', 'label', 'mentions', 'hashtags']
  mapping = mapping[cols]
  nodes = list(G.nodes())
  mapping = mapping[mapping.id.isin(nodes)]
  return mapping

In [19]:
def build_graph_normal(df):
  # build child name to id dict
  names = list(df['name'].unique())
  name_dict = {}
  for i in range(len(names)):
    name_dict[names[i]] = i

  edge_list, y_dict, interactions = internal_build_base(df, name_dict)

  # build labels
  for key in sorted(y_dict.keys()):
    vals = y_dict[key]
    if len(vals) == 1:
      y_dict[key] = int(vals[0])
    elif len(set(vals)) == 1:
      y_dict[key] = int(vals[0])
    else:
      y_dict[key] = int(mode(vals))

  fin_edges = edges_build_base(edge_list, interactions)

  # build graph
  G = nx.Graph()
  G.add_edges_from(fin_edges)

  df1 = build_mapping_df_base(df, name_dict, G)
  return G, df, df1, y_dict

In [21]:
def build_bhin(df):
  # make user to index dict
  names = list(df['name'].unique())
  name_dict = {}
  for i in range(len(names)):
    name_dict[names[i]] = i

  # make tweet to index dict (starting from last idx of users)
  tweets = list(df.postTweet.unique())
  tweet_dict = {}
  last_idx = len(names) - 1
  for i in range(len(tweets)):
    tweet_dict[tweets[i]] = last_idx + 1
    last_idx += 1

  # build the edge list
  edge_list = []
  for entry in df.iloc:
    postTweet = entry["postTweet"]
    tweet_idx = tweet_dict[postTweet]
    user_name = entry["name"]
    user_idx = name_dict[user_name]
    edge_list.append((user_idx, tweet_idx))

  # make the bipartite graph from it
  H = nx.Graph()
  H.add_nodes_from(name_dict.values(), bipartite = 0)
  H.add_nodes_from(tweet_dict.values(), bipartite = 1)
  H.add_edges_from(edge_list)

  return H

### Main Code

In [24]:
def build_dataset(dat = 'euro', typ = 'pc', cd_top = 'all', filter = True, t_all=False,
                  output_path = './output', folder_path = './Datasets/'):
  """
  PARAMETERS : 
    dat : str; the dataset to be used; can be 'euro', 'timme', 'cd', or 'conref'; defaults to 'euro' if left empty or invalid string entered
    typ : str; the label set to be used; can be 'full' or 'pc'; defaults to 'pc' if left empty or invalid string entered
    cd_top : str; the topic to use if using 'cd' dataset; can be 'all', 'abortion', 'marijuana', 'gayRights', or 'obama'; defaults to 'all' if left empty or invalid string entered
    filter : boolean; whether or not to filter the users and tweets like InfoVGAE; defaults to True
    t_all : boolean; whether or not to build TIMME as P_all; defaults to False
    output_path : str; the location to output files to; defaults to './output'
    folder_path : str; the folder in which the files have been placed; defaults to './Datasets/'
  """
  # verify inputs, set mapping for labels, set mapping for column names, and make sure output path exists
  dat, typ, cd_top, act = check_inputs(dat, typ, cd_top, act)
  mapping = set_mapping(dat)
  col_change = set_cols(dat)
  if output_path == './output':
    os.makedirs(output_path, exist_ok=True)

  # load stopwords if format is graph
  if dat in ['euro', 'timme', 'cd']:
    stopword_path = './Stopwords/stopwords_en.txt'
  else:
    stopword_path = './Stopwords/stopwords_it.txt'
  stopwords = []
  with open(stopword_path, 'r') as infile:
    for word in infile.readlines():
        stopwords.append(word[:-1])

  # load dataset as a dataframe
  if dat != 'cd':
    path = folder_path + 'data_' + dat + '.csv'
    if dat in ['euro', 'timme']:
      sep = '\t'
    else:
      sep = ','
    df = pd.read_csv(path, sep=sep)
  else:
    path = folder_path + 'CreateDebate/'
    df = load_cd_as_df(path)
    if cd_top != 'all':
      df = df[df.topic == cd_top]
  
  # if conref data, split the lines properly
  if dat == 'conref':
    df = split_conref(df)

  # if using timme, load the user dictionary and add to the dataframe, then build roots
  if dat == 'timme':
    dict_file = pd.read_csv(folder_path + 'dict_timme.csv', sep='\t')
    dict_t = {}
    for entry in dict_file.iloc:
      dict_t[entry.twitter_id] = entry.twitter_name
    df['name'] = df['name'].map(dict_t)
    roots = []
    for entry in df.iloc:
      text = entry.rawTweet
      if re.search(r"RT @(\S+):", str(text)):
        name = re.search(r"RT @(\S+):", str(text)).group(1)
        roots.append((entry.name, name))
    rootsy = [None for i in range(len(df))]
    for id, name in roots:
      rootsy[id] = name
    df['root'] = rootsy
  
  # if using timme all, load the additional info and combine the dataframes
  if t_all and dat == 'timme':
    df = add_timme_extra(df, folder_path, list(df['name'].unique()))
  
  # remap labels
  if dat == 'cd':
    label_name = 'stance'
  else:
    label_name = 'label'
  if mapping is not None:
    df['label'] = df[label_name].map(mapping)
  
  # rename columns and add tweet_id column if doesn't exist
  if col_change is not None:
    df.rename(columns=col_change, inplace=True)
  if 'tweet_id' not in df.columns:
    df['tweet_id'] = df.index

  # drop any entries without labels or names or text
  df = df.dropna(axis = 0, how ='any', subset=['label', 'name', 'rawTweet'])

  # filter df if only looking at pro/con, otherwise just typecast the labels to ints
  if typ == 'pc':
    df = df.astype({'label':'int'})
    df = df[df.label != 0]
  else:
    df = df.astype({'label':'int'})
  
  df = processTweets(df, stopwords)
  df = df.dropna(axis = 0, how ='any', subset=['postTweet'])
  if filter:
    df = filterDat(df)
  
  # time to build the graph
  # build the graph based on interactions, plus the bhin for bipartite potential
  H = build_bhin(df)
  G, df, df1, y_dict = build_graph_normal(df)

  # if using createdebate, add the topic to dat for naming of files
  if dat == 'cd':
    dat = dat + cd_top
  
  # if building t_all, adjust the filename
  if dat == 'timme':
    if t_all:
      dat = dat + '_all'

  # write out files
  df.to_csv(output_path+'/'+dat+'_data.csv', sep='\t')
  df1.to_csv(output_path+'/'+dat+'_mapping.csv', sep='\t', index=False)
  nx.write_edgelist(H, output_path+'/'+dat+'_bipartite.txt')
  nx.write_weighted_edgelist(G, output_path+'/'+dat+'_graph.txt')
  with open(output_path+'/'+dat+'_ydict.json', "w") as fp:
      json.dump(y_dict,fp)

## Running

In [26]:
# options : 'euro', 'timme', 'cd', 'conref'
dat = 'euro'
# options : 'all', 'abortion', 'marijuana', 'gayRights', or 'obama'
cd_top = 'all'
# options : True or False
t_all = False
# options : True or False
filter = True
file_path = './Processed/'

build_dataset(dat=dat, cd_top=cd_top, filter=filter, t_all=t_all, output_path=file_path)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mapping['id'] = mapping['name'].map(name_dict)
