In [1]:
import sys
import glob
import numpy as np
import pandas as pd
import random
from collections import *
from sklearn.model_selection import train_test_split
#from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO

In [3]:
sys.path.append('/home/kalkiek/projects/reddit-political-affiliation/')
from src.features.interactions.political_comment import PoliticalComment
#from src.data.date_helper import read_submissions
from transformers import BertTokenizer,BertForSequenceClassification, BertConfig, AdamW
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

In [6]:
config = BertConfig.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification(config)
dv = 'cuda:6'
model.load_state_dict(torch.load("best_bert.pt", map_location=torch.device(dv)))
device = torch.device(dv)
torch.cuda.set_device(int(dv[-1]))
model.cuda()
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [7]:
def grab_bot_accounts():
    fname = '/shared/0/projects/prosocial/known-bots.tsv'
    bots = []

    with open(fname, 'rt') as f:
        lines = f.readlines()

        for line in lines:
            bots.append(line.split('\t')[1])

    print("Known bots: %d" % len(bots))
    return bots


In [8]:
def read_in_comments(in_file, count=-1):
    comments = []
    with open(in_file, 'r', encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            try:
                comment_id, parent_id, username, subreddit, created, politics, text = line.split('\t')
                political_comment = PoliticalComment(comment_id, parent_id, username, subreddit, created, politics,
                                                     text)
                comments.append(political_comment.to_dict())
                if count > 0 and len(comments) >= count:
                    return comments
            except Exception:
                pass

    print("Total number of political comments: {}".format(len(comments)))
    return comments

In [9]:
def get_interactions(from_party, to_party):
    from_comment_ids = set(df_comments[df_comments['politics'] == from_party]['comment_id'].tolist())
    to_comment_ids = set(df_comments[df_comments['politics'] == to_party]['comment_id'].tolist())
    interactions = df_comments[(df_comments['comment_id'].isin(from_comment_ids) & df_comments['parent_id'].isin(to_comment_ids))]
    dyad=[from_party+'to'+to_party]*len(interactions)
    interactions['dyad']=dyad
    return interactions

In [10]:
def prepare_transformer_features(seq_1, max_seq_length=128, zero_pad=True, include_CLS_token=True, include_SEP_token=True):
    # Tokenize Input
    tokens = tokenizer.tokenize(seq_1)[0:max_seq_length - 2]
    # Initialize Tokens
    if include_CLS_token:
        tokens.insert(0, tokenizer.cls_token)

    if include_SEP_token:
        tokens.append(tokenizer.sep_token)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    pad_id = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]
    # Input Mask
    input_mask = [1] * len(input_ids)
    # Zero-pad sequence length
    if zero_pad:
        while len(input_ids) < max_seq_length:
            input_ids.append(pad_id)
            input_mask.append(0)
    # print(torch.tensor(input_ids).shape)
    return torch.tensor(input_ids), torch.tensor(input_mask)

In [11]:
def prepare_sentence_features(interactions):#,cats,subrredit_to_id):
    affiliation_list=['trump','bernie','biden','democrat','republic','maga','liberal','conservative']
    sensitive_list=['fuck','shit','suck','bitch','ass','pussy','piss','dick']
    #sensitive_to_id=dict(zip(sensitive_list,range(len(sensitive_list))))
    phar=0.8
    X=[]
    Y=[]
    sensitive_word_label = []
    affiliation_label=[]
    response_toxic=[]
    for idx, (comment_id, line) in tqdm(enumerate(interactions.iterrows()),total=len(interactions)):
        # print(line['subreddit'],subreddit)

        
        # subreddit= line['subreddit'] if line['subreddit'] in subreddit_to_id else 'UNK'
        # line['subreddit']=subreddit_to_id[subreddit]
        # line['dyad']=dyad2id[line['dyad']]
        line['subreddit'] = line['subreddit'] if line['subreddit'] in subreddit_to_id else 'UNK'

        # cats_dummy=[0]*8
        # cats_dummy[cats]=1
        #
        # sensitive_dummy=[0]*len(sensitive_list)

        # for sensitive_word in sensitive_list:
        #     if sensitive_word in line['username'].lower():
        #         sensitive_dummy[sensitive_to_id[sensitive_word]]=1
        #
        # subreddit_dummy.extend(cats_dummy)
        # subreddit_dummy.extend(sensitive_dummy)
        # X.append(subreddit_dummy)

        label=0
        for sensitive_word in sensitive_list:
            if sensitive_word in line['username'].lower():
                label=1
        sensitive_word_label.append(label)

        affi = 0
        for affiliation_word in affiliation_list:
            if affiliation_word in line['username'].lower():
                affi = 1
        affiliation_label.append(affi)


        text_id, mask = prepare_transformer_features(line['text'])
        text_id = text_id.unsqueeze(0).cuda()
        mask = mask.unsqueeze(0).cuda()
        output = model.forward(text_id, attention_mask=mask)[0]

        soft_output = F.softmax(output, dim=1)
        #print(soft_output,soft_output[0][1].item())
        Y.append(soft_output[0][1].item())
        #Y.append(torch.max(soft_output).item())
#         weighted = torch.tensor([phar, 1 - phar]).cuda().repeat(soft_output.shape[0]).view(soft_output.shape[0], -1)
#         weighted_output = soft_output * weighted
#         _, labels = torch.max(weighted_output, 1)
#         Y.append(labels.item())

        response_text=df_comments[df_comments['comment_id']==line['parent_id']]['text'].tolist()[0]
        #print(response_text)
        response_id, mask = prepare_transformer_features(response_text)
        response_id = response_id.unsqueeze(0).cuda()
        mask = mask.unsqueeze(0).cuda()
        output = model.forward(response_id, attention_mask=mask)[0]
        soft_output = F.softmax(output, dim=1)
        response_toxic.append(soft_output[0][1].item())

    print(sum(sensitive_word_label), sum(Y),sum(affiliation_label))
    interactions['if_have_sensitive']=sensitive_word_label
    interactions['if_have_affiliation']=affiliation_label
    interactions['parent_toxicity']=response_toxic
    interactions['toxicity']=Y

In [13]:
bots = grab_bot_accounts()
bots.extend('[deleted]')
in_file = '/shared/0/projects/reddit-political-affiliation/data/interactions/all_comments_filtered.tsv'
comments = read_in_comments(in_file)
df_comments = pd.DataFrame(comments)
#print(df_comments.head(10))
top_sub=499
subreddit_to_id=dict(zip(df_comments['subreddit'].value_counts()[:top_sub].to_dict().keys(),range(top_sub)))
subreddit_to_id['UNK']=top_sub
#print (subreddit_to_id)
print("Prepareing interactions...")
dem_to_dem = get_interactions('Democrat', 'Democrat')
rep_to_rep = get_interactions('Republican', 'Republican')
dem_to_rep = get_interactions('Democrat', 'Republican')
rep_to_dem = get_interactions('Republican', 'Democrat')
dem_to_unknown = get_interactions('Democrat', 'Unknown')
rep_to_unknown = get_interactions('Republican', 'Unknown')
unknown_to_dem = get_interactions('Unknown', 'Democrat')
unknown_to_rep = get_interactions('Unknown', 'Republican')

print("Dem to dem interactions: {}".format(len(dem_to_dem)))
print("Rep to rep interactions: {}".format(len(rep_to_rep)))
print("Dem to rep interactions: {}".format(len(dem_to_rep)))
print("Rep to dem interactions: {}".format(len(rep_to_dem)))
print("Dem to unknown interactions: {}".format(len(dem_to_unknown)))
print("Rep to unknown interactions: {}".format(len(rep_to_unknown)))
print("Unknown to dem interactions: {}".format(len(unknown_to_dem)))
print("Unknown to rep interactions: {}".format(len(unknown_to_rep)))

#print(rep_to_rep)
comment_lists = [dem_to_dem, rep_to_rep,
                 dem_to_rep, rep_to_dem,
                 dem_to_unknown, rep_to_unknown,
                 unknown_to_dem, unknown_to_rep]

dyad2id={'DemocrattoDemocrat':0,'RepublicantoRepublican':1,'DemocrattoRepublican':2,
         'RepublicantoDemocrat':3,'DemocrattoUnknown':4,'RepublicantoUnknown':5,
         'UnknowntoDemocrat':6,'UnknowntoRepublican':7}


all_data=pd.concat(comment_lists)
all_data=all_data.sample(frac=1)

Known bots: 393
Total number of political comments: 137629803
Prepareing interactions...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/opt/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-13-490f8efe4f8b>", line 13, in <module>
    rep_to_rep = get_interactions('Republican', 'Republican')
  File "<ipython-input-10-eeeb00355f52>", line 3, in get_interactions
    to_comment_ids = set(df_comments[df_comments['politics'] == to_party]['comment_id'].tolist())
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2033, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda/lib/python3.6/site-packages/I

KeyboardInterrupt: 

In [75]:
prepare_sentence_features(all_data)

100%|██████████| 130643/130643 [2:08:51<00:00, 16.90it/s] 


2078 45473.389921963215 402


In [4]:
saved_path='/shared/0/projects/reddit-political-affiliation/data/interactions_features/real_interactions_feature.3.3.tsv'

In [44]:
all_data.to_csv(saved_path, sep = '\t')

In [5]:
all_data = pd.read_csv(saved_path,sep='\t')

In [9]:
feature_data=all_data[['toxicity', 'dyad', 'subreddit','if_have_sensitive','if_have_affiliation','parent_toxicity','text']]

In [10]:
train, test = train_test_split(feature_data, test_size=0.2)
test.head()

Unnamed: 0,toxicity,dyad,subreddit,if_have_sensitive,if_have_affiliation,parent_toxicity,text
1329,0.903556,UnknowntoDemocrat,90DayFiance,0,0,0.129833,Ok wow i didnt realize that was an uninteresti...
210115,0.756215,UnknowntoDemocrat,Firearms,0,0,0.504405,"No, I agree with you, not all would, but then ..."
8624,0.108985,DemocrattoUnknown,memes,0,0,0.492057,"Thank you for not being judgmental, I hope you..."
232733,0.578679,UnknowntoDemocrat,AskConservatives,0,0,0.059407,Not political but raised near upper class indi...
242107,0.762481,UnknowntoDemocrat,oaklandraiders,0,0,0.477639,"Dude I hear you, and I agree about Brown shoul..."


In [19]:
logitreg = smf.ols('toxicity ~  parent_toxicity + C(if_have_affiliation) + C(dyad) + C(subreddit) + C(if_have_sensitive)',data=feature_data).fit()
print(logitreg.summary())

                            OLS Regression Results                            
Dep. Variable:               toxicity   R-squared:                       0.055
Model:                            OLS   Adj. R-squared:                  0.054
Method:                 Least Squares   F-statistic:                     30.48
Date:                Thu, 18 Feb 2021   Prob (F-statistic):               0.00
Time:                        10:29:24   Log-Likelihood:                -13601.
No. Observations:              263596   AIC:                         2.822e+04
Df Residuals:                  263088   BIC:                         3.354e+04
Df Model:                         507                                         
Covariance Type:            nonrobust                                         
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------

In [20]:
result_table=logitreg.summary().tables[1]
TESTDATA = StringIO(result_table.as_csv())
df_result=pd.read_csv(TESTDATA,sep=',')

In [21]:
pd.set_option('display.max_rows', None)

In [20]:
df_result.sort_values(by=['   coef   ']).head(50)

Unnamed: 0,Unnamed: 1,coef,std err,t,P>|t|,[0.025,0.975]
141,C(subreddit)[T.LibertyRSA],-0.213,0.257,-0.83,0.406,-0.716,0.29
332,C(subreddit)[T.feckingbirds],-0.149,0.032,-4.668,0.0,-0.212,-0.086
350,C(subreddit)[T.gonewild],-0.0789,0.063,-1.246,0.213,-0.203,0.045
32,C(subreddit)[T.AskOuija],-0.0579,0.035,-1.664,0.096,-0.126,0.01
17,C(subreddit)[T.Advice],-0.0412,0.037,-1.126,0.26,-0.113,0.031
86,C(subreddit)[T.DenverBroncos],-0.0401,0.045,-0.894,0.371,-0.128,0.048
166,C(subreddit)[T.NoMansSkyTheGame],-0.0372,0.04,-0.923,0.356,-0.116,0.042
40,C(subreddit)[T.Astros],-0.0351,0.04,-0.87,0.384,-0.114,0.044
93,C(subreddit)[T.DunderMifflin],-0.0306,0.038,-0.799,0.424,-0.106,0.044
240,C(subreddit)[T.Vaping],-0.0304,0.044,-0.688,0.491,-0.117,0.056


In [22]:
df_result.sort_values(by=['   coef   '],ascending=False).head(50)

Unnamed: 0,Unnamed: 1,coef,std err,t,P>|t|,[0.025,0.975]
400,C(subreddit)[T.mistyfront],0.3007,0.256,1.173,0.241,-0.202,0.803
0,Intercept,0.2412,0.026,9.458,0.0,0.191,0.291
310,C(subreddit)[T.de],0.2248,0.036,6.262,0.0,0.154,0.295
9,C(subreddit)[T.2ALiberals],0.2074,0.038,5.47,0.0,0.133,0.282
26,C(subreddit)[T.Apustaja],0.1864,0.085,2.204,0.028,0.021,0.352
277,C(subreddit)[T.bestof],0.183,0.031,5.936,0.0,0.123,0.243
316,C(subreddit)[T.dogelore],0.1785,0.038,4.74,0.0,0.105,0.252
507,parent_toxicity,0.1676,0.002,89.626,0.0,0.164,0.171
232,C(subreddit)[T.TrueReddit],0.165,0.032,5.077,0.0,0.101,0.229
267,C(subreddit)[T.averageredditor],0.16,0.042,3.767,0.0,0.077,0.243


In [43]:
def read_in_comments(in_file, count=-1):
    comments = []
    with open(in_file, 'r', encoding="utf-8") as f:
        for line in tqdm(f,total=count if count>0 else 137629803):
            line = line.strip()
            try:
                comment_id, parent_id, username, subreddit, created, politics, text = line.split('\t')
                political_comment = PoliticalComment(comment_id, parent_id, username, subreddit, created, politics,
                                                     text)
                comments.append(political_comment.to_dict())
                if count > 0 and len(comments) >= count:
                    print("Total number of political comments: {}".format(len(comments)))
                    return comments
            except Exception:
                pass

    print("Total number of political comments: {}".format(len(comments)))
    return comments

In [44]:
in_file = '/shared/0/projects/reddit-political-affiliation/data/interactions/all_comments_filtered.tsv'
comments = read_in_comments(in_file,count=10000000)
df_comments = pd.DataFrame(comments)

20000130it [00:57, 346982.88it/s]                             


Total number of political comments: 10000000


# make users in train/dev/test distinct

In [6]:
cong_dir='/shared/0/projects/reddit-political-affiliation/data/conglomerate-affiliations/'

In [7]:
train_cong = pd.read_csv(cong_dir+'train.tsv',sep='\t')
dev_cong=pd.read_csv(cong_dir+'dev.tsv',sep='\t')
test_cong=pd.read_csv(cong_dir+'test.tsv',sep='\t')

In [8]:
train_user=set(train_cong['username'])
dev_user=set(dev_cong['username'])
test_user=set(test_cong['username'])

In [9]:
sorted_train_cong=train_cong.sort_values(["username","source"])
distinct_train_cong=sorted_train_cong.drop_duplicates(subset="username",keep="first")
distinct_train_cong=distinct_train_cong.sample(frac=1)

In [10]:
sorted_test_cong=test_cong.sort_values(["username","source"])
distinct_test_cong=sorted_test_cong.drop_duplicates(subset="username",keep="first")
distinct_test_cong=distinct_test_cong.sample(frac=1)

In [11]:
sorted_dev_cong=dev_cong.sort_values(["username","source"])
distinct_dev_cong=sorted_dev_cong.drop_duplicates(subset="username",keep="first")
distinct_dev_cong=distinct_dev_cong.sample(frac=1)

In [12]:
all_cong=pd.concat([distinct_train_cong,distinct_test_cong,distinct_dev_cong])
all_cong.head()

Unnamed: 0.1,Unnamed: 0,username,source,politics,subreddit,created
118885,136131,Zoltrahn,gold,Democrat,PublicFreakout,1465705458
129984,148714,DaddyUltra,silver,Democrat,worldnews,1563898082
23699,27274,rbest99,gold,Republican,facepalm,1558573923
89001,101912,iron_and_carbon,gold,Democrat,PoliticalDiscussion,1465982199
64749,74167,Rooshba,silver,Democrat,atheism,1573012692


In [12]:
train_df_comments=df_comments[df_comments['username'].isin(train_user)]

NameError: name 'df_comments' is not defined

In [37]:
merged_train=pd.merge(train_df_comments, distinct_train_cong, on='username')

# separate gold/silver/flair interactions 

In [14]:
all_data.head()

Unnamed: 0.1,Unnamed: 0,comment_id,parent_id,username,subreddit,created,politics,text,FromPolitics,ToPolitics,toxicity,parent_toxicity,if_have_sensitive,if_have_affiliation,Actor
0,6117334,djy8vjb,djy8sr3,snewk,sanfrancisco,1499531534,Unknown,"but they are beautiful, they clean particulate...",Unknown,Republican,0.374072,0.130251,0,0,Normal
1,4997189,e7sxrai,e7sx99l,prontoon,worldnews,1539607200,Unknown,This movement started like a year or so ago. T...,Unknown,Republican,0.880511,0.49133,0,0,Normal
2,6583847,del6oiw,del4gqr,beachexec,WayOfTheBern,1488831850,Unknown,"I love how they're trying so hard to make ""alt...",Unknown,Democrat,0.109001,0.313587,0,0,Normal
3,5688143,e80plhm,e80omui,ken_in_nm,UNK,1539893035,Democrat,I don't think it is truncating. New imgur with...,Democrat,Unknown,0.214898,0.318056,0,0,Genunine_Actor
4,2108543,eq0o3l8,eq0nac8,tdonovanj,homeowners,1559689683,Democrat,That was my point. You can almost always find ...,Democrat,Unknown,0.071792,0.150896,0,0,Normal


In [15]:
all_data=all_data.drop(columns=['Unnamed: 0'])#,'Unnamed: 0.1.1'])
merged_all_data=pd.merge(all_data,all_cong,on='username')
merged_all_data=merged_all_data.sample(frac=1)
merged_all_data.head()

Unnamed: 0.1,comment_id,parent_id,username,subreddit_x,created_x,politics_x,text,FromPolitics,ToPolitics,toxicity,parent_toxicity,if_have_sensitive,if_have_affiliation,Actor,Unnamed: 0,source,politics_y,subreddit_y,created_y
3473927,ethfqkd,ethfl7z,domestic_omnom,PoliticalHumor,1562814242,Unknown,Sure guy attack my rounding of 49 to 50 withou...,Unknown,Republican,0.233879,0.205255,0,0,Genunine_Actor,46525,silver,Republican,politics,1374785967
974673,e6inpvi,e6imtnh,Janders2124,DenverBroncos,1537743575,Democrat,&gt; if I made some retarded comments Too late,Democrat,Unknown,0.241354,0.262173,0,0,Normal,119993,silver,Democrat,television,1499279845
3268221,c5i8uc9,c5i8q0y,illogicalexplanation,Music,1343072421,Democrat,I'm sure the community would love a chance to ...,Democrat,Unknown,0.576421,0.057318,0,0,Normal,195055,silver,Democrat,politics,1342034283
2603814,div7dm3,diuxnuq,mastercraftsportstar,ShitPoliticsSays,1497393839,Republican,Let me tut-tut your lifestyle and look down on...,Republican,Republican,0.646002,0.235528,0,0,Normal,69518,flair,Republican,Libertarian,1501055850
3219509,d3ke3ne,d3k3yim,mathiastodd21,Marvel,1464269989,Republican,Honestly I started reading Spider-man at my lo...,Republican,Unknown,0.279571,0.355014,0,0,Genunine_Actor,159292,silver,Republican,politics,1383508917


In [16]:
gold_merged_data=merged_all_data[merged_all_data['source']=='gold']
silver_merged_data=merged_all_data[merged_all_data['source']=='silver']
flair_merged_data=merged_all_data[merged_all_data['source']=='flair']

In [20]:
flair_saved_path='/shared/0/projects/reddit-political-affiliation/data/interactions_features/new_real_flair_interactions_feature.no_actor.tsv'
gold_saved_path='/shared/0/projects/reddit-political-affiliation/data/interactions_features/new_real_gold_interactions_feature.no_actor.tsv'
silver_saved_path='/shared/0/projects/reddit-political-affiliation/data/interactions_features/new_real_silver_interactions_feature.no_actor.tsv'

In [21]:
flair_merged_data.to_csv(flair_saved_path, sep = '\t')
gold_merged_data.to_csv(gold_saved_path, sep = '\t')
silver_merged_data.to_csv(silver_saved_path, sep = '\t')

In [5]:
gold_merged_data=pd.read_csv(gold_saved_path,sep='\t')
silver_merged_data=pd.read_csv(silver_saved_path,sep='\t')
flair_merged_data=pd.read_csv(flair_saved_path,sep='\t')

# generate interactions features (not useful at this notebook)

In [45]:
saved_path = '/shared/0/projects/reddit-political-affiliation/data/interactions_features/more_interactions_feature.tsv'
all_inter_data=pd.read_csv(saved_path, sep='\t')

In [63]:
all_inter_data

Unnamed: 0.1,Unnamed: 0,index,comment_id,parent_id,username,subreddit,created,politics,text,FromPolitics,ToPolitics,toxicity
0,0,934390,f0p6dad,f0ovupf,viciousrana,insanepeoplefacebook,1568828154,Unknown,Upvoting for the dickishness.,Unknown,Democrat,0.834900
1,1,1125084,evu1tba,evu1hk9,The_Real_Hank_HiII,instantkarma,1564783559,Unknown,I agree. No smart person would support a vile ...,Unknown,Democrat,0.510646
2,2,728217,eyvgloj,eyvg35h,wilstreak,indonesia,1567493426,Unknown,i want to hear more about this,Unknown,Democrat,0.061260
3,3,1382168,exlu9zr,exlh4of,USCAVsuperduperhooah,ar15,1566419431,Unknown,From my research I’ve heard they’re solid scop...,Unknown,Democrat,0.334476
4,4,1103788,evoyjub,evoyaya,spring45,Columbus,1564661685,Unknown,"""It's a New Albany expression.""",Unknown,Democrat,0.040241
...,...,...,...,...,...,...,...,...,...,...,...,...
2748359,2748359,1952507,f6hlxw0,f6hb2u2,RektzRuiz,leagueoflegends,1572854872,Unknown,You got that right,Unknown,Democrat,0.097716
2748360,2748360,1073563,f1rjifz,f1rj6ei,fearu,CFB,1569716407,Unknown,It would be 50 but they started with the ball,Unknown,Democrat,0.191371
2748361,2748361,2517081,et77mla,et751o8,throwawayifyoureugly,gundeals,1562526482,Unknown,That's why you should use [Thermal Clips](http...,Unknown,Republican,0.260440
2748362,2748362,1278578,ewtkm0w,ewsjgru,LaserToy,trump,1565747373,Unknown,"Yep, trump border wall. So huge, I’m hitting i...",Unknown,Democrat,0.862676


In [64]:
sub_data=all_inter_data[['comment_id','toxicity']]
sub_data

Unnamed: 0,comment_id,toxicity
0,f0p6dad,0.834900
1,evu1tba,0.510646
2,eyvgloj,0.061260
3,exlu9zr,0.334476
4,evoyjub,0.040241
...,...,...
2748359,f6hlxw0,0.097716
2748360,f1rjifz,0.191371
2748361,et77mla,0.260440
2748362,ewtkm0w,0.862676


In [70]:
aug_all_data=all_inter_data.merge(sub_data,left_on="parent_id",right_on="comment_id")

In [72]:
renamed_data=aug_all_data.rename(columns={'comment_id_x':'comment_id','toxicity_x':'toxicity','toxicity_y':'parent_toxicity'}).drop(columns=['Unnamed: 0','comment_id_y'])

In [73]:
renamed_data

Unnamed: 0,index,comment_id,parent_id,username,subreddit,created,politics,text,FromPolitics,ToPolitics,toxicity,parent_toxicity
0,331401,evy65dh,evu8tzt,ROBLOXBROS18293748,EmojiPolice,1564923778,Democrat,"Yep, and they say ""STOP IT FUCKING NORMIE R/EM...",Democrat,Unknown,0.931184,0.436960
1,1991836,f6t2m74,f6t25r3,SeaSickSpartan1,memes,1573143127,Unknown,"Thx, I'm not mad either, I was just kinda hungry",Unknown,Democrat,0.067160,0.060147
2,1121022,evt8qho,evt8hsc,ohaimike,girlsfrontline,1564765198,Unknown,Jill brings it up when talking about them. I f...,Unknown,Democrat,0.373153,0.088180
3,308672,f1en5j7,f1elyjz,spideyjiri,marvelstudios,1569425970,Democrat,&gt;I’m still waiting for a Superman movie to ...,Democrat,Unknown,0.484897,0.231856
4,308951,f1evy40,f1elyjz,GreyCrowDownTheLane,marvelstudios,1569431478,Democrat,&gt; He also didn’t like Spider-Man very much ...,Democrat,Unknown,0.291255,0.231856
...,...,...,...,...,...,...,...,...,...,...,...,...
658199,515717,evfuj6g,evftjap,Z0di,agedlikemilk,1564501503,Democrat,"yes, notice that actual vitiligo doesn't chang...",Democrat,Unknown,0.606897,0.151970
658200,422848,esuxdfk,esuw9nc,speakeasy518,justneckbeardthings,1562290408,Democrat,r/selfawarewolves,Democrat,Unknown,0.171966,0.708118
658201,293738,f0tio87,f0tii61,funkygirljulia,JustUnsubbed,1568922809,Democrat,You’d get *SOOOOOO* downvoted!,Democrat,Unknown,0.252154,0.224503
658202,518941,evl9bpm,evl74ss,ScarthMoonblane,AdviceAnimals,1564591906,Democrat,You're taking about taking more than 28 millio...,Democrat,Unknown,0.301472,0.404860
