# <center> Process WhatsApp messages from NDOH </center>

## Read in WhatsApp messages

- Currently saved as **`.txt`** files in **```data/doh_whatsapp```**

In [1]:
# TODO: find out how these files are saved/stored - is it a manual process?

In [2]:
# create a list of dates that have been extracted.
# read in all text files
# check if text file has been processed
# if it has then move on to next file
# if it has not, process. 
# processing : extract date, province and infection count
# make sure data is correct
# append to canonical dataset (optionally, create a csv for every text file)

## Import Libraries

In [3]:
import pandas as pd
from pathlib import Path
import regex
import string

In [4]:
# Collect every .txt message dump found in the WhatsApp data folder.
data_path = Path("../data/doh_whatsapp/")
data_files = [*data_path.glob("*.txt")]

In [5]:
# Sort the file list so files are processed in a predictable order (easier to debug).
data_files.sort()
print(f"There are {len(data_files)} NDOH WhatsApp messages in .txt format")

There are 20 NDOH WhatsApp messages in .txt format


## Create Regex Queries

In [6]:
# Pattern naming used throughout this cell:
#   <prov>_query    : "<province> <count>", separator class built on \s
#   <prov>_query_1  : "<province> <count>", separator uses a literal space instead of \s
#                     because \s also matches \r and \n (a match could then span lines)
#   <prov>_query_2  : "<count> <province>" (count written before the province name)
# All patterns are intended to be used with regex.IGNORECASE.

# The hyphen inside the character class is escaped so it is unambiguously a literal
# '-' (next to the en dash '–') rather than the start of a character range.
kzn_query = r'(kwazulu[\s\-–]*natal|kzn)[\s:,]+(\d{1,5})'    # accepts hyphen or en dash (different code points)
kzn_query_1 = r'(kwazulu[\s\-–]*natal|kzn)[ :,]*(\d{1,5})'    # use an actual space instead of \s because \s includes \r and \n
# The optional separator applies to both spellings, so "12 KZN" matches as well as "12kzn".
kzn_query_2 = r'(\d{1,5})[ :,]*(kwazulu[\s\-–]*natal|kzn)'

gp_query = r'(gauteng)[\s:,]*(\d{1,5})'
gp_query_1 = r'(gauteng)[ :,]*(\d{1,5})'
gp_query_2 = r'(\d{1,5})[ :,]*(gauteng)'

wp_query = r'(Western Cape)[\s:,]*(\d{1,5})'
wp_query_1 = r'(Western Cape)[ :,]*(\d{1,5})'
# NOTE(review): this is the only count-first pattern that requires at least one
# separator ('+' instead of '*') - confirm whether that is intentional.
wp_query_2 = r'(\d{1,5})[ :,]+(Western Cape)'

fs_query = r'(Free State)[\s:,]*(\d{1,5})'
fs_query_1 = r'(Free State)[ :,]*(\d{1,5})'
fs_query_2 = r'(\d{1,5})[ :,]*(Free State)'

nw_query = r'(North West)[\s:,]*(\d{1,5})'
nw_query_1 = r'(North West)[ :,]*(\d{1,5})'
nw_query_2 = r'(\d{1,5})[ :,]*(North West)'

mp_query = r'(Mpumalanga)[\s:,]*(\d{1,5})'
# {s<=2:[a-z]} is fuzzy-matching syntax of the third-party `regex` module (not stdlib `re`).
mp_query_1 = r'(Mpumalanga){s<=2:[a-z]}[ :,]*(\d{1,5})'    # to guard against misspellings we allow at most 2 substitutions from [a-z]
mp_query_2 = r'(\d{1,5})[ :,]*(Mpumalanga){s<=2:[a-z]}'

limp_query = r'(Limpopo)[\s:,]*(\d{1,5})'
limp_query_1 = r'(Limpopo)[ :,]*(\d{1,5})'
limp_query_2 = r'(\d{1,5})[ :,]*(Limpopo)'

ec_query = r'(Eastern Cape)[\s:,]*(\d{1,5})'
ec_query_1 = r'(Eastern Cape)[ :,]*(\d{1,5})'
ec_query_2 = r'(\d{1,5})[ :,]*(Eastern Cape)'

nc_query = r'(Northern Cape)[\s:,]*(\d{1,5})'
nc_query_1 = r'(Northern Cape)[ :,]*(\d{1,5})'
nc_query_2 = r'(\d{1,5})[ :,]*(Northern Cape)'

unk_query = r'(UNALLOCATED|Unknown)[\s:,]*(\d{1,5})'
unk_query_1 = r'(UNALLOCATED|Unknown)[ :,]*(\d{1,5})'
unk_query_2 = r'(\d{1,5})[ :,]*(UNALLOCATED|Unknown)'

In [7]:
# TODO : limit search to just the number breakdown portion of message
# Province label -> list of patterns for that province, in the order they are tried
# ("name then count" first, "count then name" second).
query_dict = {
    'KZN': [kzn_query_1, kzn_query_2],
    'Gauteng': [gp_query_1, gp_query_2],
    'Western Cape': [wp_query_1, wp_query_2],
    'Free State': [fs_query_1, fs_query_2],
    'North West': [nw_query_1, nw_query_2],
    'Mpumalanga': [mp_query_1, mp_query_2],
    'Limpopo': [limp_query_1, limp_query_2],
    'Eastern Cape': [ec_query_1, ec_query_2],
    'Northern Cape': [nc_query_1, nc_query_2],
    'Unknown': [unk_query_1, unk_query_2],
}

In [8]:
# TODO : make this loopy code more efficient
# Parallel lists: entry k of each list describes one extracted record
# (date, province, infection count). They become DataFrame columns later on.
dates = []
provinces = []
counts = []

for i, data_file in enumerate(data_files):
    # The date is the part of the file name before "-whatsapp-cases"; this assumes
    # the naming convention for the stored text files is consistent.
    date = data_file.stem.split("-whatsapp-cases")[0]
    print(f"Processing data for {date} : {i+1} of {len(data_files)}")

    match_counter = 0    # number of provinces for which a count was found in this file

    # Decode explicitly instead of relying on the platform default: the KZN patterns
    # look for a non-ASCII en dash, which a wrong default codec would garble.
    # Undecodable bytes are replaced rather than aborting the whole run.
    with open(data_file, 'r', encoding='utf-8', errors='replace') as reader:
        data = reader.readlines()

    # Lines keep their trailing newline, so the joined text still has line breaks;
    # the patterns use a literal space (not \s) so matches do not span lines.
    data_string = " ".join(data)

    for province, province_query in query_dict.items():
        for query in province_query:
            match = regex.search(query, data_string, regex.IGNORECASE)
            if match:
                m = match.group()
                count = "".join(s for s in m if s.isdigit())   # keep only the digits of the matched text

                counts.append(int(count))
                provinces.append(province)
                dates.append(date)

                match_counter += 1
                # Later patterns are fallbacks for a different message layout. Stop at
                # the first hit so a province is recorded at most once per file
                # (otherwise a "count then name" pattern could also fire and add a
                # second, possibly wrong, row for the same province).
                break

    if match_counter == 0:
        # Make silent extraction failures visible instead of dropping the date.
        print(f"WARNING: no province counts found for {date}")

Processing data for 2020-03-16 : 1 of 20
Processing data for 2020-03-17 : 2 of 20
Processing data for 2020-03-20 : 3 of 20
Processing data for 2020-03-23 : 4 of 20
Processing data for 2020-03-24 : 5 of 20
Processing data for 2020-03-25 : 6 of 20
Processing data for 2020-03-26 : 7 of 20
Processing data for 2020-03-27 : 8 of 20
Processing data for 2020-03-28 : 9 of 20
Processing data for 2020-03-29 : 10 of 20
Processing data for 2020-03-30 : 11 of 20
Processing data for 2020-03-31 : 12 of 20
Processing data for 2020-04-02 : 13 of 20
Processing data for 2020-04-03 : 14 of 20
Processing data for 2020-04-04 : 15 of 20
Processing data for 2020-04-05 : 16 of 20
Processing data for 2020-04-08 : 17 of 20
Processing data for 2020-04-10 : 18 of 20
Processing data for 2020-04-12 : 19 of 20
Processing data for 2020-04-13 : 20 of 20


In [9]:
# Assemble the three parallel lists into one table with a row per extracted record.
df = pd.DataFrame(
    {
        'Date': dates,
        'Province': provinces,
        'Infection Count': counts,
    }
)

In [10]:
# Preview the first 20 extracted records.
df.head(20)

Unnamed: 0,Date,Province,Infection Count
0,2020-03-16,KZN,12
1,2020-03-16,Gauteng,31
2,2020-03-16,Western Cape,16
3,2020-03-16,Mpumalanga,2
4,2020-03-16,Limpopo,1
5,2020-03-17,KZN,16
6,2020-03-17,Gauteng,45
7,2020-03-17,Western Cape,21
8,2020-03-17,Mpumalanga,2
9,2020-03-17,Limpopo,1


In [11]:
# Preview the last 20 extracted records.
df.tail(20)

Unnamed: 0,Date,Province,Infection Count
165,2020-04-12,KZN,443
166,2020-04-12,Gauteng,865
167,2020-04-12,Western Cape,587
168,2020-04-12,Free State,96
169,2020-04-12,North West,19
170,2020-04-12,Mpumalanga,21
171,2020-04-12,Limpopo,23
172,2020-04-12,Eastern Cape,88
173,2020-04-12,Northern Cape,16
174,2020-04-12,Unknown,15


In [12]:
# Group the extracted records by province and by date for the look-ups below.
by_province = df.groupby('Province')
by_date = df.groupby('Date')

In [13]:
# Show all extracted rows for the 'KZN' province label.
by_province.get_group('KZN')

Unnamed: 0,Date,Province,Infection Count
0,2020-03-16,KZN,12
5,2020-03-17,KZN,16
10,2020-03-20,KZN,23
16,2020-03-23,KZN,60
26,2020-03-24,KZN,80
35,2020-03-25,KZN,91
45,2020-03-26,KZN,134
55,2020-03-27,KZN,134
65,2020-03-28,KZN,156
75,2020-03-29,KZN,167


In [14]:
# could change this to read two csvs instead - check with Vukosi on desired functionality 

def diff_by_dates(df, date1, date2):
    """Return the per-province change in infection count between two dates.

    The two dates may be passed in either order: the result is always the
    count on the greater date minus the count on the smaller one.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with 'Date', 'Province' and 'Infection Count' columns.
    date1, date2 : str
        Values of the 'Date' column to compare. They are ordered with ``>``,
        so string dates must sort lexicographically (e.g. YYYY-MM-DD).

    Returns
    -------
    pandas.DataFrame or None
        A frame with 'Province' and 'Infection Count' (the difference)
        columns. ``None`` is returned, after printing a message, when a
        ``KeyError`` is raised - for example when a date has no rows.
    """
    # Order the pair so that `later` holds the greater of the two dates.
    later, earlier = (date1, date2) if date1 > date2 else (date2, date1)

    grouped = df.groupby('Date')

    try:
        later_rows = grouped.get_group(later)
        earlier_rows = grouped.get_group(earlier)

        # Index by province so the subtraction aligns matching provinces.
        later_counts = later_rows.set_index(['Province'])['Infection Count']
        earlier_counts = earlier_rows.set_index(['Province'])['Infection Count']

        change = later_counts - earlier_counts
        return change.reset_index()

    except KeyError as e:
        print(f"Data for {e} not found")
        return None

In [15]:
# Change in infection count per province between 2020-04-10 and 2020-04-13.
diff_by_dates(df, '2020-04-13','2020-04-10')

Unnamed: 0,Province,Infection Count
0,KZN,53
1,Gauteng,89
2,Western Cape,76
3,Free State,22
4,North West,4
5,Mpumalanga,2
6,Limpopo,-1
7,Eastern Cape,36
8,Northern Cape,1
9,Unknown,7
