# Regex experiments

In [12]:
import re

In [35]:
# Sample text for the regex experiments. Adjacent string literals inside the
# parentheses are joined by the parser, so no line-continuation backslashes
# are needed. Each fragment ends with a number used by the digit patterns.
doc = (
    "Regular expressions can be concatenated to form new regular expressions; 08 "
    "if A and B are both regular expressions, then AB is also a regular expression. 12 "
    "In general, if a string p matches A and another string q matches B, the string pq will match AB. 09 "
    "This holds unless A or B contain low precedence operations; boundary conditions between A and B; 19 "
    "or have numbered group references. 22 Thus, complex expressions can easily be constructed 145 "
    "from simpler primitive expressions like the ones described here."
)

In [16]:
# First occurrence of the literal text 'AB'; evaluates to a Match or None.
re.compile('AB').search(doc)

<re.Match object; span=(122, 124), match='AB'>

In [17]:
# Slice the matched text out of `doc` using the match's own span. Hard-coded
# offsets (119:121) go stale as soon as `doc` is edited: with the current text
# 'AB' starts at index 122 and doc[119:121] is 'en'.
ab_start, ab_end = re.search('AB', doc).span()
doc[ab_start:ab_end]

'AB'

In [18]:
# A Match object is truthy and None is falsy, so the search result can drive
# the choice of message directly.
print('AB found' if re.search('AB', doc) else 'AB not found')

AB found


In [22]:
# First place where two ASCII digits appear back to back.
re.search(pattern='[0-9][0-9]', string=doc)

<re.Match object; span=(73, 75), match='08'>

In [28]:
# Every run of one or more ASCII digits in the text. '[0-9]+' says this
# directly; the earlier '[0-9]*[0-9]' matched the same substrings but made the
# engine backtrack one character on every run.
re.findall('[0-9]+', doc)

['08', '12', '09', '19', '22', '145']

In [37]:
# Does the text begin with 'S' or 'R'? Inside [...] every character is a
# literal alternative, so '[S|R]' also accepted a leading '|'. '[SR]' is the
# intended class. Raw string keeps regex syntax free of Python escapes.
re.search(r'^[SR]', doc)

<re.Match object; span=(0, 1), match='R'>

In [36]:
# Does the text end with a literal full stop? The pattern is a raw string
# because '\.' in an ordinary string literal is an invalid escape sequence
# that Python warns about; the regex engine needs to see the backslash itself.
re.search(r'\.$', doc)

<re.Match object; span=(515, 516), match='.'>

In [39]:
# A semicolon, then any single character ('.' is a wildcard here), then two
# ASCII digits.
re.compile(';.[0-9][0-9]').search(doc)

<re.Match object; span=(71, 75), match='; 08'>

In [42]:
# Candidate address to validate. It contains '#' where an address would have
# '@' -- presumably a deliberate negative example; confirm.
email = 'aaa#bbb.com'

In [43]:
# Simplified e-mail shape: alphanumeric local part, '@', alphanumeric domain,
# a dot, and a three-letter top-level domain.
# - Raw string: '\.' in an ordinary literal is an invalid escape sequence.
# - '+' rather than '*': the local part and domain must be non-empty, so
#   strings such as '@.com' are no longer accepted.
email_pattern = r'^[a-zA-Z0-9]+@[a-zA-Z0-9]+\.[a-zA-Z]{3}$'
re.search(email_pattern, email)

# NumPy experiments

In [69]:
import numpy as np

In [70]:
# Nine sample integers, deliberately unsorted, as a 1-D array.
values = np.array(
    [78, 45, 90, 16, 44, 60, 92, 46, 72]
)

In [71]:
# Arithmetic mean over all elements.
np.mean(values)

60.333333333333336

In [76]:
# Reinterpret the nine values as a 3x3 grid, filled row by row, and display it.
values2d = np.reshape(values, (3, 3))
values2d

array([[78, 45, 90],
       [16, 44, 60],
       [92, 46, 72]])

In [77]:
# axis=0 collapses the rows, giving the largest value in each column.
np.max(values2d, axis=0)

array([92, 46, 90])

In [80]:
# Same nine values as a 2-D array with a single row (shape (1, 9)).
np.reshape(values, (1, 9))

array([[78, 45, 90, 16, 44, 60, 92, 46, 72]])

In [81]:
# Largest element of the whole array.
np.max(values)

92

In [83]:
# np.sort returns a sorted copy in ascending order; `values` itself is not
# modified.
np.sort(a=values)

array([16, 44, 45, 46, 60, 72, 78, 90, 92])

# Pandas experiments

In [84]:
import pandas as pd

In [85]:
# Source data: download from
# https://www.kaggle.com/yamqwe/omicron-covid19-variant-daily-cases
# and place covid-variants.csv in the working directory before running.
odf = pd.read_csv(filepath_or_buffer='covid-variants.csv')

In [86]:
# (number of rows, number of columns) of the loaded frame.
odf.shape

(100416, 6)

In [87]:
# Peek at the first five rows (5 is also head()'s default).
odf.head(n=5)

Unnamed: 0,location,date,variant,num_sequences,perc_sequences,num_sequences_total
0,Angola,2020-07-06,Alpha,0,0.0,3
1,Angola,2020-07-06,B.1.1.277,0,0.0,3
2,Angola,2020-07-06,B.1.1.302,0,0.0,3
3,Angola,2020-07-06,B.1.1.519,0,0.0,3
4,Angola,2020-07-06,B.1.160,0,0.0,3


In [88]:
# Column names, non-null counts, dtypes and approximate memory use.
odf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100416 entries, 0 to 100415
Data columns (total 6 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   location             100416 non-null  object 
 1   date                 100416 non-null  object 
 2   variant              100416 non-null  object 
 3   num_sequences        100416 non-null  int64  
 4   perc_sequences       100416 non-null  float64
 5   num_sequences_total  100416 non-null  int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 4.6+ MB


In [89]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns.
# NOTE(review): the recorded output shows a minimum perc_sequences of -0.01,
# i.e. a negative percentage -- worth checking as a data-quality issue.
odf.describe()

Unnamed: 0,num_sequences,perc_sequences,num_sequences_total
count,100416.0,100416.0,100416.0
mean,72.171676,6.154355,1509.582457
std,1669.262169,21.898989,8445.291772
min,0.0,-0.01,1.0
25%,0.0,0.0,12.0
50%,0.0,0.0,59.0
75%,0.0,0.0,394.0
max,142280.0,100.0,146170.0


In [94]:
# Pull out the "location" column. Despite the `_df` suffix this is a
# one-dimensional Series, not a DataFrame.
country_df = odf.loc[:, "location"]

In [95]:
# Display the location column (pandas truncates long output to head and tail).
country_df

0           Angola
1           Angola
2           Angola
3           Angola
4           Angola
            ...   
100411    Zimbabwe
100412    Zimbabwe
100413    Zimbabwe
100414    Zimbabwe
100415    Zimbabwe
Name: location, Length: 100416, dtype: object

In [99]:
# One-element tuple: a single column has length only, no column count.
country_df.shape

(100416,)

In [100]:
# For a text column describe() reports count, number of unique values, the
# most frequent value (top) and how often it occurs (freq).
country_df.describe()

count         100416
unique           121
top       Bangladesh
freq            1080
Name: location, dtype: object

In [101]:
# Keep only the rows whose location is exactly "Austria" (boolean-mask
# selection; the original row labels are preserved).
odf.loc[odf["location"].eq("Austria")]

Unnamed: 0,location,date,variant,num_sequences,perc_sequences,num_sequences_total
3384,Austria,2020-05-11,Alpha,0,0.00,34
3385,Austria,2020-05-11,B.1.1.277,0,0.00,34
3386,Austria,2020-05-11,B.1.1.302,0,0.00,34
3387,Austria,2020-05-11,B.1.1.519,0,0.00,34
3388,Austria,2020-05-11,B.1.160,0,0.00,34
...,...,...,...,...,...,...
4411,Austria,2021-12-13,Omicron,27,14.75,183
4412,Austria,2021-12-13,S:677H.Robin1,0,0.00,183
4413,Austria,2021-12-13,S:677P.Pelican,0,0.00,183
4414,Austria,2021-12-13,others,0,0.00,183


In [102]:
# Per-country totals of the numeric columns. numeric_only=True makes the
# exclusion of the text columns (date, variant) explicit: older pandas dropped
# them silently, while pandas >= 2.0 would otherwise try to "sum" the strings.
# NOTE(review): num_sequences_total appears to repeat on every variant row of
# a given date (see the head() output), and perc_sequences is a percentage, so
# only the num_sequences total looks directly meaningful -- confirm before use.
odf.groupby("location").sum(numeric_only=True)

Unnamed: 0_level_0,num_sequences,perc_sequences,num_sequences_total
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Angola,1325,3507.65,25320
Argentina,10689,6837.93,201864
Aruba,3302,3002.21,72000
Australia,62668,6502.24,1132776
Austria,14627,6356.05,301920
...,...,...,...
United States,2420758,6738.15,49960248
Uruguay,1189,4592.69,16368
Vietnam,1811,2350.00,43320
Zambia,1565,4624.63,26808


In [103]:
# Number of non-null entries in each column, per country.
odf.groupby(by="location").count()

Unnamed: 0_level_0,date,variant,num_sequences,perc_sequences,num_sequences_total
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Angola,672,672,672,672,672
Argentina,1056,1056,1056,1056,1056
Aruba,600,600,600,600,600
Australia,1056,1056,1056,1056,1056
Austria,1032,1032,1032,1032,1032
...,...,...,...,...,...
United States,1080,1080,1080,1080,1080
Uruguay,576,576,576,576,576
Vietnam,504,504,504,504,504
Zambia,816,816,816,816,816
