# Datetime String Parsing/Formatting
## Research Questions
**What are the most common string formats used for datetimes?**
 * What are the most common formats for certain use cases? (logging, external storage, UI)

**How often do people follow standards vs. use their own formats?**
 * ISO format compliance
 * Built-in methods vs. custom formats

**How often do people use format-agnostic parsers?**
 * How often are there bugs related to unclear date formats?
 * Format-agnostic parsers are hard to detect because most of them are just named "parse"

**How consistent are people with formats within a single project?**
 * How many calls do people make to parse/format methods per project?
 * How many distinct formats are used with strftime and strptime in each project?

**How much intention are people putting into their formats?**
 * Not quantifiable at all
 * strftime vs builtins
 * "least resistance methods" like `str()` or format strings.
 
**How often do people store timezone information in their format strings?**
 * I'm guessing not often
 
## Issues with the current scripts
 * Hard to detect `from a import b; b()`. Most `parse` methods fall under this category.
 * Not currently distinguishing between `time.strftime` and `datetime.strftime`.
 * Can't properly handle multiline calls.
 * Can't properly handle format constants e.g. `FORMAT = '%Y%m%d'; dt.strftime(FORMAT)`.
 * False positives on comments mentioning method names.
 * Doesn't detect `str()` or format strings.

In [None]:
%pip install pandas
%pip install numpy
# %pip install seaborn

import pandas as pd
import math
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import re


# Read the data from the CSV file
# STRING_OPS_PATH = '/data/petervan/date-time/data/string_operations.csv'
STRING_OPS_PATH = 'string_operations.csv'
df = pd.read_csv(STRING_OPS_PATH)
# Report how many rows were loaded.
print(len(df))
# Bare expression so the notebook renders the table as the cell output.
df

# df[["owner", "repo", "path", "line", "operation", "text"]].to_csv(STRING_OPS_PATH, index=False)

In [None]:
def get_even_colors(cmap, n):
    """Sample ``n`` evenly spaced colors from a colormap.

    Args:
        cmap: Either the name of a colormap, which is looked up in
            ``matplotlib.colormaps``, or a callable that is invoked with a
            float and returns a color.
        n: Number of colors to sample.

    Returns:
        The list ``[cmap(0 / n), cmap(1 / n), ..., cmap((n - 1) / n)]``.
        The sample points start at 0 and never reach 1. The list is empty
        when ``n`` is 0.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses as colormap names.
    if isinstance(cmap, str):
        cmap = matplotlib.colormaps[cmap]
    return [cmap(i / n) for i in range(n)]

In [None]:
# Distribution of the number of recorded calls per repository

# Tally one call per row, keyed by the (owner, repo) pair.
counter = dict()
for _, record in df.iterrows():
    repo_key = (record["owner"], record["repo"])
    if repo_key in counter:
        counter[repo_key] += 1
    else:
        counter[repo_key] = 1

# Report every repository that reaches the largest call count.
maxcount = max(counter.values())
for repo, n_calls in counter.items():
    if n_calls == maxcount:
        print(repo, maxcount)

# count_dist[k] is the number of repositories with exactly k calls.
count_dist = [0] * (maxcount + 1)
for n_calls in counter.values():
    count_dist[n_calls] += 1

# plot: the same curve on linear axes (left) and on log-log axes (right)

fig = plt.figure(figsize=(12, 4))
panels = (
    (121, "Calls per repo", False),
    (122, "Calls per repo (log)", True),
)
for position, title, log_axes in panels:
    ax = fig.add_subplot(position)
    ax.set_title(title)
    ax.set_xlabel("Number of Calls")
    ax.set_ylabel("Number of Repos")
    ax.plot(range(maxcount + 1), count_dist)
    if log_axes:
        ax.set_xscale('log')
        ax.set_yscale('log')

In [None]:
# Share of each operation type among all recorded calls
categories, counts = np.unique(df["operation"], return_counts=True)

# Three evenly spaced colors from the "twilight" colormap.
colors = get_even_colors("twilight", 3)

fig = plt.figure()
ax = fig.add_subplot(111)
ax.set_title("Formatting vs Parsing")
# autopct prints each wedge's percentage with up to four significant digits.
ax.pie(counts, labels=categories, colors=colors, autopct="%.4g")
ax.legend()

In [None]:
# Share of each detected pattern, with wedges ordered from rarest to most common
pattern_names, pattern_totals = np.unique(df["pattern"], return_counts=True)
# Sort the (count, name) pairs, then split them back into parallel tuples.
ordered_pairs = sorted(zip(pattern_totals, pattern_names))
counts, categories = zip(*ordered_pairs)

# One color per pattern.
colors = get_even_colors("twilight", len(categories))

fig = plt.figure()
ax = fig.add_subplot(111)
ax.set_title("Patterns")
ax.pie(counts, labels=categories, colors=colors)
# Trailing literal so the cell output is not the return value of ax.pie.
0
# ax.legend(loc="upper right")

## Finding common string formats
For now I'm only going to consider the following patterns:
 * `strftime` (datetime and time)
 * `strptime` (datetime and time)
 * `ctime`
 * `isoformat`
 
These make up around 90% of the table and account for basically all of the patterns that use explicit formats (i.e., aren't format-agnostic parsers).

In [None]:
# Keep only the rows detected as one of the four patterns studied in this section.
format_patterns = ["strftime", "strptime", "ctime", "isoformat"]
uses_format_pattern = df["pattern"].isin(format_patterns)
filtered_df = df[uses_format_pattern]
# Fraction of all rows that survive the filter.
print(len(filtered_df) / len(df))

In [None]:
# first, strftime formats because they're easiest to parse
# The regex captures a quoted literal that is the sole argument of
# strftime(...); calls that pass a variable or a constant do not match and
# are skipped.
pattern = "strftime\\(\\s*[\"\'](.*?)[\"\']\\s*\\)"
print(pattern)

# strf_counts maps each extracted format string to the number of rows using it.
strf_counts = dict()
for i, row in df.iterrows():
    if row["pattern"] == "strftime":
        match = re.search(pattern, row["text"])
        if match is not None:
            strf_counts[match.group(1)] = strf_counts.get(match.group(1), 0) + 1

# (count, format) pairs, most frequent first.
sorted_strf_counts = sorted(zip(strf_counts.values(), strf_counts.keys()), reverse=True)
# Slice instead of indexing 0..9 so that fewer than ten distinct formats
# does not raise IndexError.
for entry in sorted_strf_counts[:10]:
    print(entry)

In [None]:
# strptime should be the same with a slightly longer regex
# The regex skips the first argument (everything up to a comma) and captures
# a quoted literal that is the last argument of strptime(...); calls that
# pass the format as a variable or a constant do not match and are skipped.
pattern = "strptime\\(.*?,\\s*[\"\'](.*?)[\"\']\\s*\\)"
print(pattern)

# strp_counts maps each extracted format string to the number of rows using it.
strp_counts = dict()
for i, row in df.iterrows():
    if row["pattern"] == "strptime":
        match = re.search(pattern, row["text"])
        if match is not None:
            strp_counts[match.group(1)] = strp_counts.get(match.group(1), 0) + 1

# (count, format) pairs, most frequent first.
sorted_strp_counts = sorted(zip(strp_counts.values(), strp_counts.keys()), reverse=True)
# Slice instead of indexing 0..9 so that fewer than ten distinct formats
# does not raise IndexError.
for entry in sorted_strp_counts[:10]:
    print(entry)

In [None]:
# how ISO compliant are these format strings?
# It's weirdly hard to find a summary of ISO 8601 online or even a full version that doesn't cost >$100
# this is what I'm going off of https://www.w3.org/TR/NOTE-datetime
# 
# Pieces of the accepted layout, in the order in which they must appear.
_DATE_PARTS = ("%Y", "-%m", "-%d")
_TIME_PARTS = ("%H", ":%M", ":%S", ".%f")
# Accepted endings: nothing, a literal "Z", or the "%:z" offset directive.
_TZ_SUFFIXES = ("", "Z", "%:z")


def _match_parts(fmt, parts):
    """Strip ``parts`` from the front of ``fmt``, one after another.

    Returns:
        A ``(verdict, rest)`` pair. ``verdict`` is True as soon as the text
        left after a part is one of ``_TZ_SUFFIXES``, False when the next
        part is missing, and None when every part was consumed but other
        text remains in ``rest``.
    """
    for part in parts:
        if not fmt.startswith(part):
            return False, fmt
        fmt = fmt[len(part):]
        if fmt in _TZ_SUFFIXES:
            return True, fmt
    return None, fmt


def _follows_iso_layout(fmt, separator):
    """Check ``fmt`` against the date / separator / time layout.

    ``separator`` is a regular expression for the text between the date and
    the time. A format may stop after any date or time part, but not
    directly after the separator: a date followed by a dangling separator is
    not a complete timestamp.
    """
    verdict, rest = _match_parts(fmt, _DATE_PARTS)
    if verdict is not None:
        return verdict
    sep = re.match(separator, rest)
    if sep is None:
        return False
    verdict, _ = _match_parts(rest[sep.end():], _TIME_PARTS)
    # None means trailing text remained after all time parts: not compliant.
    return bool(verdict)


def is_iso(fmt):
    """Return True if ``fmt`` is a prefix of ``%Y-%m-%dT%H:%M:%S.%f``.

    The prefix must end on a whole component and may be followed by ``Z``
    or ``%:z``. The date and the time must be separated by a literal ``T``.
    """
    return _follows_iso_layout(fmt, "T")


def is_almost_iso(fmt):
    """Return True if ``fmt`` has the ISO layout with any date/time separator.

    Same check as ``is_iso``, except that any single character (other than a
    newline) is accepted between the date and the time, e.g. a space.
    Formats accepted by ``is_iso`` are accepted here as well.
    """
    return _follows_iso_layout(fmt, ".")
    
# Sanity checks for the two classifiers.
assert is_iso("%Y-%m-%dT%H")
assert is_almost_iso("%Y-%m-%d %H")
# None of these may be classified as strictly ISO.
# NOTE(review): all four use a space instead of "T", so is_iso rejects each
# of them at the separator regardless of what follows it.
assert not any(
    is_iso(candidate)
    for candidate in (
        "%Y-%m-%d %H",
        "%Y-%m-%d %H%M%S",
        "%Y-%m-%d %H:%",
        "%Y-%m-%d %H:%M:%S.%fffffffffff",
    )
)

In [None]:
# Usage-weighted tallies; the index in each list lines up with `labels`.
labels = ["ISO Compliant", "Semi-compliant", "Noncompliant"]
strf_x = [0, 0, 0]
strp_x = [0, 0, 0]

# Each format's usage count goes to the first bucket whose test it passes.
for tallies, fmt_counts in ((strf_x, sorted_strf_counts), (strp_x, sorted_strp_counts)):
    for count, fmt in fmt_counts:
        if is_iso(fmt):
            bucket = 0
        elif is_almost_iso(fmt):
            bucket = 1
        else:
            bucket = 2
        tallies[bucket] += count


# One pie per method, side by side, sharing the same three colors.
fig = plt.figure(figsize=(12, 4))
colors = get_even_colors("twilight", 3)

for position, title, tallies in (
    (121, "ISO compliance in strftime", strf_x),
    (122, "ISO compliance in strptime", strp_x),
):
    ax = fig.add_subplot(position)
    ax.set_title(title)
    ax.pie(tallies, labels=labels, colors=colors)

# Trailing literal so the cell output is not the return value of ax.pie.
""

In [None]:
# how often do people use each format directive
real_directives = ['%a', '%A', '%w', '%d', '%b', '%B', '%m', '%y', '%Y', '%H', '%I', '%p', '%M', '%S', '%f', '%z', '%Z', '%j', '%U', '%W', '%c', '%x', '%X', '%G', '%u', '%V', '%:z', '%%']

# A directive is "%" plus one character, except for the three-character
# "%:z", which is therefore tried first. findall scans without overlap, so
# the second "%" of an escaped "%%" never starts a directive of its own
# ("%%Y" yields only "%%"). A lone trailing "%" matches nothing.
directive_pattern = re.compile(r"%(?::z|.)", re.DOTALL)

# NOTE(review): every distinct format string contributes 1 per occurrence of
# a directive; the tally is not weighted by how often the format is used
# (`count` is ignored) -- confirm that this is intended.
directives = {}
for (count, fmt) in sorted_strf_counts + sorted_strp_counts:
    for directive in directive_pattern.findall(fmt):
        directives[directive] = directives.get(directive, 0) + 1

# Most frequent directive first; anything outside real_directives is flagged.
sorted_directives = sorted(zip(directives.values(), directives.keys()), reverse=True)
for count, d in sorted_directives:
    if d in real_directives:
        print(f"{d}: {count}")
    else:
        print(f"{d}: {count:<10} (invalid)")

In [None]:
# How often do format strings carry timezone information?
# Looking for the letter "z" (in either case) is a decent proxy, because a hit is one of:
# * a literal Z, used because the datetime is always going to be UTC
# * %Z, %z or %:z, which records the timezone of the datetime
# * a word which happens to contain "z", which is very unlikely
all_formats = sorted_strf_counts + sorted_strp_counts
# Each distinct format counts once, regardless of how often it is used.
tzcount = sum(1 for _, fmt in all_formats if "z" in fmt.lower())
nontzcount = len(all_formats) - tzcount

print(f"{tzcount}, {100*tzcount/(nontzcount+tzcount):.4}%")