In [1]:
import pandas as pd
import re
import requests
from tqdm import tqdm
from urllib.parse import urlparse


In [2]:
# Read the Constraint train and test splits from the mounted Drive folder.
train_csv_path = '/content/drive/MyDrive/Constraint/data/Constraint_Train.csv'
test_csv_path = '/content/drive/MyDrive/Constraint/data/Constraint_Test.csv'

train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [3]:
# Preview the first two training rows.
train.head(n=2)

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real


In [4]:
# Row-aligned Python lists of the training tweet texts and their labels.
tweets, labels = (list(train[column]) for column in ('tweet', 'label'))

In [5]:
def get_domain(row, timeout=180, verbose=True):
  """Return the domain that the first link in a tweet resolves to.

  The first http(s) URL found in ``row["tweet"]`` is fetched with
  ``requests.get`` and the network location of the response's final URL
  (``r.url``) is returned.

  Args:
    row: Mapping-like object (e.g. a DataFrame row) with a "tweet" entry.
    timeout: Timeout value passed to ``requests.get``.
    verbose: If true, print every successfully resolved domain.

  Returns:
    The ``netloc`` of the final URL, or '' when the tweet is not a string,
    contains no URL, or the request / URL parsing fails.
  """
  tweet = row["tweet"]
  # Missing tweets (e.g. NaN) cannot contain a URL.
  if not isinstance(tweet, str):
    return ''

  # Raw string so that \s is a regex escape rather than an invalid string escape.
  match = re.search(r"(?P<url>https?://[^\s]+)", tweet)
  if match is None:
    return ''

  try:
    r = requests.get(match.group("url"), timeout=timeout)
    domain = urlparse(r.url).netloc
  except Exception:
    # Best effort: any request or parsing failure yields an empty domain.
    # Unlike a bare `except:`, this lets KeyboardInterrupt / SystemExit
    # through, so a long-running scrape can still be stopped.
    return ''

  if verbose:
    print(domain)
  return domain

# Resolve the linked domain for every row of both splits (train first, then test).
for frame in (train, test):
  frame["domain"] = frame.apply(get_domain, axis=1)


twitter.com
www.thespoof.com
twitter.com
twitter.com
www.wandtv.com
t.co
www.thespoof.com
www.thelancet.com
t.co
www.politifact.com
investors.modernatx.com
t.co
twitter.com
investors.modernatx.com
www.abc.net.au
www.medscape.com
www.thespoof.com
www.medscape.com
www.politifact.com
t.co
www.thespoof.com
twitter.com
twitter.com
www.who.int
news.sky.com
twitter.com
twitter.com
twitter.com
twitter.com
www.cdc.gov
twitter.com
waterfordwhispersnews.com
www.thespoof.com
www.thespoof.com
news.sky.com
www.icmr.gov.in
news.sky.com
www.thespoof.com
twitter.com
gisgmda.maps.arcgis.com
twitter.com
news.sky.com
twitter.com
twitter.com
twitter.com
www.medscape.com
twitter.com
pib.gov.in
twitter.com
www.thespoof.com
arogya.maharashtra.gov.in
news.sky.com
www.cdc.gov
twitter.com
www.medscape.com
news.sky.com
twitter.com
inbministry.blogspot.com
www.medscape.com
twitter.com
covidtracking.com
twitter.com
www.medrxiv.org
covidactnow.org
twitter.com
twitter.com
www.factchecker.in
twitter.com
t.co
waterford

# Getting Unique URL Domains

In [11]:
# Distinct non-empty domains per split; '' (no domain resolved) is excluded.
unique_train_domains = [dom for dom in set(train["domain"]) if dom != '']
unique_test_domains = [dom for dom in set(test["domain"]) if dom != '']

# Row-aligned domain lists; these keep the '' placeholders.
train_domains = train["domain"].tolist()
test_domains = test["domain"].tolist()


In [12]:
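# NOTE: superseded draft of get_domain that took the tweet text directly;
# the active, row-based definition is in cell In [5]. Kept for reference only.
# def get_domain(tweet):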
#   try:
#     shorturl = re.search("(?P<url>https?://[^\s]+)", tweet).group("url")
#     r = requests.get(shorturl, timeout=180)
#     expanded_url = r.url 
#     # temp_domain = expanded_url.split('/')[2]
#     domain = urlparse(expanded_url).netloc
#     print(domain)
#   except:
#     domain=''
#   return domain


In [18]:
# Preview the first five training rows, now including the "domain" column.
train.head(5)

Unnamed: 0,id,tweet,label,domain
0,1,The CDC currently reports 99031 deaths. In gen...,real,
1,2,States reported 1121 deaths a small rise from ...,real,twitter.com
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake,www.thespoof.com
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real,twitter.com
4,5,Populous states can generate large case counts...,real,twitter.com


# Computing "Fake" and "Real" probability values for URL domains

In [19]:
# Per-domain counts of "real" and "fake" training tweets.
label_freq_dic = {dom: {"real": 0, "fake": 0} for dom in unique_train_domains}

for dom, lab in zip(train_domains, labels):
  counts = label_freq_dic.get(dom)
  # Skip rows whose domain has no entry (e.g. the '' placeholder) and labels
  # other than "real"/"fake", instead of hiding every error behind a bare except.
  if counts is not None and lab in counts:
    counts[lab] += 1

In [26]:
def calculate_prob(row, key):
  """Return the fraction of a domain's counted mentions that fall under ``key``.

  Args:
    row: Mapping-like object with numeric "real" and "fake" counts.
    key: Name of the count to use as the numerator ("real" or "fake").

  Returns:
    ``row[key] / (row["real"] + row["fake"])``, or 0.0 when both counts are
    zero (avoids dividing by zero for a domain with no counted mentions).
  """
  total = row["real"] + row["fake"]
  if total == 0:
    return 0.0
  return row[key] / total

def calculate_total(row):
  """Return the combined number of "real" and "fake" mentions in ``row``."""
  real_count, fake_count = row["real"], row["fake"]
  return real_count + fake_count
  

In [32]:
# One row per domain, starting from the raw "real"/"fake" counts.
label_freq_df = (
    pd.DataFrame(label_freq_dic).T
    .reset_index()
    .rename(columns={"index": "domain"})
)

# Derive per-domain label probabilities and the total number of mentions.
label_freq_df["real_probability"] = label_freq_df.apply(lambda row: calculate_prob(row, "real"), axis=1)
label_freq_df["fake_probability"] = label_freq_df.apply(lambda row: calculate_prob(row, "fake"), axis=1)
label_freq_df["total_mentions"] = label_freq_df.apply(calculate_total, axis=1)

# Most-mentioned domains first; the raw counts are no longer needed.
# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# pandas >= 2.0 rejects with a TypeError.
label_freq_df = (
    label_freq_df
    .sort_values("total_mentions", ascending=False)
    .drop(columns=["real", "fake"])
)
label_freq_df.head()

Unnamed: 0,domain,real_probability,fake_probability,total_mentions
15,twitter.com,0.838903,0.161097,1167
139,news.sky.com,1.0,0.0,274
56,www.medscape.com,1.0,0.0,258
104,www.thespoof.com,0.0,1.0,253
14,t.co,0.994819,0.005181,193


In [38]:
# Nested dict keyed by domain: {domain: {column name: value}} for the remaining columns.
per_domain_columns = label_freq_df.set_index('domain').T
dic = per_domain_columns.to_dict('dict')

In [39]:
# dic

{'accounts.google.com': {'fake_probability': 0.0,
  'real_probability': 1.0,
  'total_mentions': 3.0},
 'amp.cnn.com': {'fake_probability': 0.0,
  'real_probability': 1.0,
  'total_mentions': 2.0},
 'arogya.maharashtra.gov.in': {'fake_probability': 0.0,
  'real_probability': 1.0,
  'total_mentions': 6.0},
 'blog.covidactnow.org': {'fake_probability': 0.0,
  'real_probability': 1.0,
  'total_mentions': 6.0},
 'blog.rootclaim.com': {'fake_probability': 1.0,
  'real_probability': 0.0,
  'total_mentions': 1.0},
 'bmjopen.bmj.com': {'fake_probability': 0.0,
  'real_probability': 1.0,
  'total_mentions': 1.0},
 'brazilian.report': {'fake_probability': 1.0,
  'real_probability': 0.0,
  'total_mentions': 1.0},
 'businessinthenews.co.uk': {'fake_probability': 1.0,
  'real_probability': 0.0,
  'total_mentions': 1.0},
 'ccc19.org': {'fake_probability': 0.0,
  'real_probability': 1.0,
  'total_mentions': 1.0},
 'consultqd.clevelandclinic.org': {'fake_probability': 1.0,
  'real_probability': 0.0,
 

In [40]:
import json

# Write the per-domain probability dict to Drive as JSON.
prob_vectors_path = '/content/drive/MyDrive/Constraint/Analysis/train_prob_vectors_domain.json'
with open(prob_vectors_path, 'w') as out_file:
    json.dump(dic, fp=out_file)

In [None]:
# NOTE(review): `domains` is not defined in any cell visible here and this cell
# was never executed — presumably a leftover from an earlier draft; TODO confirm or remove.
domains

In [None]:
# Occurrence count per domain.
# NOTE(review): `unique_domains` and `domains` are not defined in the cells
# visible here — TODO confirm where they are meant to come from.
freq_dic = dict.fromkeys(unique_domains, 0)

for un in domains:
  freq_dic[un] += 1
  

In [None]:
# Per-domain label probabilities and mention totals derived from the raw counts.
prob_dic = {}

for domain, counts in label_freq_dic.items():
  total = counts["real"] + counts["fake"]
  # A domain with no counted mentions gets probability 0 for both labels;
  # the explicit check replaces the bare `except:` guards around the division.
  fake_prob = counts["fake"] / total if total else 0
  real_prob = counts["real"] / total if total else 0
  prob_dic[domain] = {"fake_probability": fake_prob, "real_probability": real_prob, "total_mentions": total}

# prob_dic = {k: v for k, v in sorted(x.items(), key=lambda item: item[1])}

In [None]:
import collections

# Domains ordered from most to least mentioned.
ranked_items = sorted(
    prob_dic.items(),
    key=lambda item: item[1]["total_mentions"],
    reverse=True,
)
final_dic = collections.OrderedDict(ranked_items)


In [None]:
# Display the ordered domain -> probability mapping built in the previous cell.
final_dic