# _Experimentation_

**TL:DR** --> Experiment with regex and `re` library. Hoping to build a multitude of functions that we can utilize to narrow down the observations that contain misinformation.

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# import libraries
import fundamentals
import pandas as pd
pd.options.display.max_columns = None
import numpy as np
import random
import string
import os
import re
from tqdm.autonotebook import tqdm
tqdm.pandas()

# Matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

  
  from pandas import Panel


## _Load Data_

In [2]:
# strings of file paths and file name for data
origpath = "/notebooks/CovidDisinfo-Detect/experiments"
datapath = "/notebooks/CovidDisinfo-Detect/data/interim"
filename = "covid19_20200324.pkl"

# load data into pandas dataframe
df = fundamentals.load_data(origpath, datapath, filename)

## _Regex & `re`_

In [3]:
# search for newline characters
def newline_search(text):
    """
    Searches for newline characters in 
    """
    regex = re.compile(r"\n+", re.I)
    if regex.findall(text) == []:
        return False
    else:
        return True
    
def newline_sub(text):
    """
    Removes newline characters from text.
    """
    regex = re.compile(r"\n+", re.I)
    if regex.findall(text) == []:
        return text
    else:
        return regex.sub(r" ", text)

In [4]:
# create a new column without the newline characters
df["clean_tweet"] = df["tweet"].progress_apply(newline_sub)

HBox(children=(FloatProgress(value=0.0, max=1927244.0), HTML(value='')))




In [5]:
# check to make sure our cleaning worked
df["clean_tweet"].apply(newline_search).value_counts(dropna=False)

False    1927244
Name: clean_tweet, dtype: int64

### _`china_search`_

In [6]:
# create a function that searches for China or Chinese 
def china_search(text):
    """
    Searches for a given term within a text.
    """
    regex = re.compile(r"china|chinese", re.I)
    if regex.findall(text) == []:
        return "N/a"
    elif regex.findall(text) != []:
        return ",".join(regex.findall(text))

In [7]:
df["clean_tweet"][:20].apply(china_search)

created_at
2020-03-23 06:59:59+00:00    N/a
2020-03-23 06:59:59+00:00    N/a
2020-03-23 06:59:59+00:00    N/a
2020-03-23 06:59:59+00:00    N/a
2020-03-23 06:59:59+00:00    N/a
2020-03-23 06:59:59+00:00    N/a
2020-03-23 06:59:59+00:00    N/a
2020-03-23 06:59:58+00:00    N/a
2020-03-23 06:59:58+00:00    N/a
2020-03-23 06:59:58+00:00    N/a
2020-03-23 06:59:58+00:00    N/a
2020-03-23 06:59:58+00:00    N/a
2020-03-23 06:59:58+00:00    N/a
2020-03-23 06:59:58+00:00    N/a
2020-03-23 06:59:57+00:00    N/a
2020-03-23 06:59:57+00:00    N/a
2020-03-23 06:59:57+00:00    N/a
2020-03-23 06:59:57+00:00    N/a
2020-03-23 06:59:57+00:00    N/a
2020-03-23 06:59:57+00:00    N/a
Name: clean_tweet, dtype: object

In [8]:
df["chinese_search"] = df["clean_tweet"].progress_apply(lambda x: china_search(x))

HBox(children=(FloatProgress(value=0.0, max=1927244.0), HTML(value='')))




In [9]:
df["chinese_search"].value_counts()[1:30]

China                      29847
Chinese                    15691
China,China                 4291
china                       3182
China,china                 3118
Chinese,Chinese             1635
Chinese,China               1611
China,Chinese               1575
chinese                     1236
China,China,China            857
Chinese,chinese              707
CHINA                        622
CHINESE                      392
China,China,china            293
Chinese,China,China          289
China,Chinese,China          264
Chinese,china                256
China,China,Chinese          255
Chinese,Chinese,Chinese      236
china,china                  228
China,Chinese,Chinese        226
Chinese,China,Chinese        219
china,China                  186
China,China,China,China      167
Chinese,Chinese,China        154
China,china,china            141
China,chinese                132
china,chinese                 96
china,china,Chinese           95
Name: chinese_search, dtype: int64

### _`usa_search`_

In [10]:
# create a function that searches for US-related terms
def usa_search(text):
    """
    Searches for US-related terms in given text
    """
    regex = re.compile(r"usa|united states|america|american", re.I)
    if regex.findall(text) == []:
        return "N/a"
    elif regex.findall(text) != []:
        return ",".join(regex.findall(text))

In [11]:
df["usa_search"] = df["clean_tweet"].progress_apply(lambda x: usa_search(x))

HBox(children=(FloatProgress(value=0.0, max=1927244.0), HTML(value='')))




In [14]:
df["usa_search"].value_counts()[1:25]

America                        34912
usa                            16176
USA                            12145
United States                   3520
america                         2938
America,America                 2567
America,america                 1922
usa,usa                         1502
USA,USA                         1029
AMERICA                          771
USA,usa                          580
America,USA                      400
usA                              387
USA,America                      386
United States,America            331
America,usa                      284
usa,America                      258
usa,USA                          244
Usa                              236
America,America,America          231
america,america                  159
USA,USA,USA                      133
uSa                              133
United States,United States      115
Name: usa_search, dtype: int64

### _`bioweapon_search`_

In [15]:
def bioweapon_search(text):
    """
    Searches for 'bioweapon' or 'biological weapon' in text.
    """
    regex = re.compile(r"(bio[\s]?weapon[s]?)+|(biological weapon[s]?)", re.I)
    return ",".join([word.group() for word in regex.finditer(text)])
    #if regex.findall(text) == []:
    #    return "N/a"
    #elif regex.findall(text) != []:
    #    return ",".join(regex.findall(text))

In [16]:
df["bioweapon_search"] = df["clean_tweet"].progress_apply(bioweapon_search)

HBox(children=(FloatProgress(value=0.0, max=1927244.0), HTML(value='')))




In [53]:
# number of observations with each value, representing the number of times bioweapon-related term appeared in text
df["bioweapon_search"].value_counts()[:26]

                                1926175
bioweapon                           288
biological weapon                   196
bio weapon                          102
BioWeapon                            63
Bioweapon                            49
bioweapon,bioweapon                  48
bioweapons                           41
biological weapons                   37
Biological Weapon                    34
bio weapons                          22
Bioweapon,bioweapon                  18
BioWeapons                           17
Biological weapon                    15
Biological Weapons                   15
BIOWEAPON                            12
Bio weapon                           12
Bioweapons                           11
Bio Weapon                           10
BIOLOGICAL WEAPON                     7
Biological weapons                    7
BIO WEAPON                            6
biological weapon,bioweapons          4
bioweapons,bioweapons                 3
Bio weapons                           3


### _`us_bioweapon`_

In [18]:
text = "COVID-19 covid19 is usa  a american america bioweapons bio weapon biological weapon"
text2 = "What if the covid covid19 covid-19 coVID19 corona virus virus was an american-made bio weapon bioweapons?"
text3 = "What if this is an american bioweapon?"

In [19]:
def us_bioweapon(text):
    """
    Searches for combination of US-related terms & bioweapon-related terms.
    """
    re_text = r"(american|america|united states|usa)+|(bio[\s]?weapon[s]?)+|(biological weapon[s]?)"
    regex = re.compile(re_text, re.I)
    return ",".join([x.group() for x in regex.finditer(text)])

In [20]:
us_bioweapon(text)

'usa,american,america,bioweapons,bio weapon,biological weapon'

In [22]:
#%%timeit
#df["clean_tweet"][:1].str.extractall(r"(covid[-\s]?19)+|(covid)|(corona[\s]?virus)", re.I)

### _`qanon_search`_

In [50]:
text4 = "Is Jordan Sather Jorden Sathar and QAnon qanon q anon right about Covid19?"

In [51]:
# function that searches for instances of qanon and Jordan Sather (YouTouber linked to QAnon)
def qanon_search(text):
    """
    Searchs for terms 'qanon' & 'Jordan Sather' in given text
    """
    regex = re.compile(r"jord[ae]n sath[ae]r|(q(\s)?anon)+", re.I)
    return ",".join([x.group() for x in regex.finditer(text)])

In [52]:
qanon_search(text4)

'Jordan Sather,Jorden Sathar,QAnon,qanon,q anon'

In [54]:
# create new column that applies qanon_search function to clean_tweet column
df["qanon_search"] = df["clean_tweet"].progress_apply(qanon_search)

HBox(children=(FloatProgress(value=0.0, max=1927244.0), HTML(value='')))




In [55]:
df["qanon_search"].value_counts()[:26]

                     1926230
QAnon                    591
Qanon                    166
qanon                    118
QANON                     45
QAnon,QAnon               35
QANON,QANON               16
QAnon,QAnon,QAnon          7
QAnon,qanon                5
qanon,QAnon                3
Qanon,QAnon                3
Qanon,qanon                3
QAnon,Qanon                3
Q Anon                     2
q anon                     2
Qanon,Qanon                2
QAnon,QAnon,qanon          1
Qanon,QAnon,qanon          1
QANON,QAnon                1
Qanon,QANON                1
Q ANON                     1
Q anon                     1
QAnon,QAnon,Qanon          1
qAnon                      1
qanon,qanon                1
QAnon,QANON                1
Name: qanon_search, dtype: int64

### _`boiled_ginger_search`_

In [74]:
text5 = "Did you know that apparently boiled ginger ginger on an empty stomach can kill the coronavirus?"

In [87]:
def boiled_ginger_search(text):
    """
    Searches for potential instances espousing fake "cure" of consuming boiled ginger on an empty stomach.
    """
    regex = re.compile(r"boiled|(ginger)+|empty|(stomach)+", re.I)
    return ",".join([x.group() for x in regex.finditer(text)])

In [88]:
boiled_ginger_search(text5)

'boiled,ginger,ginger,empty,stomach'

In [89]:
df["boiled_ginger_search"] = df["clean_tweet"].progress_apply(boiled_ginger_search)

HBox(children=(FloatProgress(value=0.0, max=1927244.0), HTML(value='')))




In [90]:
df["boiled_ginger_search"].value_counts()

                                                   1922276
empty                                                 3129
Empty                                                  505
stomach                                                431
empty,empty                                            186
ginger                                                 185
Ginger                                                 100
Empty,empty                                             92
boiled                                                  67
EMPTY                                                   44
Stomach,stomach                                         38
Stomach                                                 34
empty,stomach                                           26
stomach,stomach                                         16
BOILED                                                  12
Boiled                                                   9
empty,Empty                                             