# Performing Sanity Checks on the Analyzed Data of Datetime Bugs.

In [103]:
import pandas as pd
import numpy as np

df = pd.read_csv('./data/bugs_analysis_base.tsv', sep='\t')

# Filter out rows that are not relevant
df = df[df['Skipped?'] != True]
df = df[df['False pos?'] != True]
df = df[(df['Training?'] == True) | (df['Testing?'] == True) | (df['Set 3?'] == True)]

# Temp skip
# df = df[df['Real Testing?'] == True]
df = df[~((df['Set 3?'] == True) & (df['Rater'] == "2: Shrey Serena"))]

# Drop irrelevant columns
df = df.drop(columns=['Fix Link', 'Make Benchmark?', 'Associated Benchmark', 'Rater', 'Training?', 'Testing?', 'Skipped?', 'False pos?', 'Set 3?', 'Real Testing?']) 

print("Total number of datapoints: ", len(df))
df.head()

Total number of datapoints:  145


Unnamed: 0,Owner,Project,Title,Link,Stars,TF-IDFs,Size,Datetime,Arrow,Pendulum,...,Affected Computation 2,Affected Computation 3,Obscurity,Impact/Severity,# LOC,Logic Needed,Comments,BPST1,BPST2,BPST3
0,python-poetry,tomlkit,datetime.utcnow and datetime.utcfromtimestamp ...,https://github.com/python-poetry/tomlkit/issue...,643.0,1.312623,1001.0,1.0,0.0,0.0,...,,,Low,Low,Low,Low,,Outdated/Deprecated APIs,,
1,frictionlessdata,frictionless-py,SpssParser ignores timezones,https://github.com/frictionlessdata/frictionle...,683.0,1.270077,1279.0,1.0,0.0,0.0,...,String Parsing/Formatting (data),,Low,Medium,Medium,High,,Dropping Timezones,,
2,sdispater,pendulum,Deepcopy of Month-based Duration produces a di...,https://github.com/sdispater/pendulum/issues/714,6109.0,1.191295,1014.0,1.0,0.0,1.0,...,,,Medium,Medium,Medium,Low,,Unintuitive Arithmetic,,
3,googleapis,python-storage,`DeprecationWarning` warning in build log for ...,https://github.com/googleapis/python-storage/i...,420.0,1.176351,10349.0,1.0,0.0,0.0,...,,,Low,Low,Low,Low,,Outdated/Deprecated APIs,,
4,googleapis,python-logging,`DeprecationWarning` warning in build log for ...,https://github.com/googleapis/python-logging/i...,119.0,1.167279,2673.0,1.0,0.0,0.0,...,,,Low,Low,Low,Low,,Outdated/Deprecated APIs,,


In [96]:
df.columns

Index(['Rater', 'False pos?', 'Owner', 'Project', 'Title', 'Link', 'Fix Link',
       'Stars', 'TF-IDFs', 'Size', 'Datetime', 'Arrow', 'Pendulum',
       'Description', 'Category 1', 'Category 2', 'Affected Computation 1',
       'Affected Computation 2', 'Affected Computation 3', 'Obscurity',
       'Impact/Severity', '# LOC', 'Logic Needed', 'Comments', 'BPST1',
       'BPST2', 'BPST3'],
      dtype='object')

In [97]:
# Sanity Check: Make sure that the stars are > 100

try:
    assert (df['Stars'] > 100).all()
except AssertionError:
    print("Some stars are less than 100")
    print(df[df['Stars'] < 100])

In [98]:
# Sanity Check: Make sure there are no duplicates

try:
    # Assert that there are no NaN values in the "Link" column
    assert df['Link'].notnull().all(), "There are NaN entries in the 'Link' column."

    # Assert that there are no duplicates
    assert not df['Link'].duplicated().any(), "There are duplicate entries in the 'Link' column"

except Exception as e:
    print("Error: ", e)

    duplicates = df[df.duplicated('Link', keep=False)]  # keep=False to mark all occurrences
    duplicate_count = duplicates['Link'].value_counts()
    print("Duplicate entries found in the 'Link' column:")
    print(duplicates["Title"].to_string(index=False))
    print(duplicates["Link"].to_string(index=False))

In [99]:
# Sanity Check: Make sure all the values in the "Category" column are in the specified list

categories = """Date
DST
Duration
String Representation
Timestamps
Timezone
Deprecated""".split("\n")

# print("Categories: ", categories)

try:
    # Assert that there are no NaN values in the "Category 1" column
    assert df['Category 1'].notnull().all(), "There are NaN entries in the 'Category 1' column."

    # Assert that all values in the "Category 1" column are in the specified list
    assert df['Category 1'].isin(categories).all(), "There are values in the 'Category 1' column that are not in the specified list."
except Exception as e:
    print("Error: ", e)

    invalid_categories = df[~df['Category 1'].isin(categories)]
    invalid_category_count = invalid_categories['Category 1'].value_counts()
    # print("Invalid categories found in the 'Category 1' column:")
    # print(invalid_categories["Title"].to_string(index=False))
    # print(invalid_categories["Category 1"].to_string(index=False))


categories.append(np.nan)
# print("Categories: ", categories)

try:
    # Assert that all values in the "Category 2" column are in the specified list
    assert df['Category 2'].isin(categories).all(), "There are values in the 'Category 2' column that are not in the specified list."
except Exception as e:
    print("Error: ", e)

    invalid_categories = df[~df['Category 2'].isin(categories)]
    invalid_category_count = invalid_categories['Category 2'].value_counts()
    # print("Invalid categories found in the 'Category 2' column:")
    # print(invalid_categories["Title"].to_string(index=False))
    # print(invalid_categories["Category 2"].to_string(index=False))


Error:  There are NaN entries in the 'Category 1' column.


In [100]:
# Sanity Check: Make sure all the values in the "Affected Computation" column are in the specified list

affected_computations = """Timestamp/Hash Construction
Datetime Arithmetic
Datetime Comparison
Datetime Construction
Datetime Equality
Delta Arithmetic
Delta Comparison
Delta Construction
Querying Datetime Components
Replacing/Rounding Datetime Components
String Parsing/Formatting (humanized)
String Parsing/Formatting (data)
Timezone Conversions
Timezone Equality
Library Conversions""".split("\n")

# print("Affected Computations: ", affected_computations)

try:
    # Assert that there are no NaN values in the "Affected Computation 1" column
    assert df['Affected Computation 1'].notnull().all(), "There are NaN entries in the 'Affected Computation 1' column."

    # Assert that all values in the "Affected Computation" column are in the specified list
    assert df['Affected Computation 1'].isin(affected_computations).all(), "There are values in the 'Affected Computation 1' column that are not in the specified list."
except Exception as e:
    print("Error: ", e)

    invalid_affected_computations = df[~df['Affected Computation 1'].isin(affected_computations)]
    invalid_affected_computation_count = invalid_affected_computations['Affected Computation 1'].value_counts()
    # print("Invalid affected computations found in the 'Affected Computation 1' column:")
    # print(invalid_affected_computations["Title"].to_string(index=False))
    # print(invalid_affected_computations["Affected Computation 1"].to_string(index=False))

affected_computations.append(np.nan)
# print("Affected Computations: ", affected_computations)

try:
    # Assert that all values in the "Affected Computation 2" column are in the specified list
    assert df['Affected Computation 2'].isin(affected_computations).all(), "There are values in the 'Affected Computation 2' column that are not in the specified list."
except Exception as e:
    print("Error: ", e)

    invalid_affected_computations = df[~df['Affected Computation 2'].isin(affected_computations)]
    invalid_affected_computation_count = invalid_affected_computations['Affected Computation 2'].value_counts()
    # print("Invalid affected computations found in the 'Affected Computation 2' column:")
    # print(invalid_affected_computations["Title"].to_string(index=False))
    # print(invalid_affected_computations["Affected Computation 2"].to_string(index=False))

try:
    # Assert that all values in the "Affected Computation 3" column are in the specified list
    assert df['Affected Computation 3'].isin(affected_computations).all(), "There are values in the 'Affected Computation 3' column that are not in the specified list."
except Exception as e:
    print("Error: ", e)

    invalid_affected_computations = df[~df['Affected Computation 3'].isin(affected_computations)]
    invalid_affected_computation_count = invalid_affected_computations['Affected Computation 3'].value_counts()
    # print("Invalid affected computations found in the 'Affected Computation 3' column:")
    # print(invalid_affected_computations["Title"].to_string(index=False))
    # print(invalid_affected_computations["Affected Computation 3"].to_string(index=False))

Error:  There are NaN entries in the 'Affected Computation 1' column.


In [101]:
# Sanity Check: Make sure all the values in the "BPST" column are in the specified list

bpst = """Incorrect API usage
Outdated/Deprecated APIs
Typo
Using Naïve Datetime Incorrectly
Dropping Timezones
Precision of Representation
Library Bug
OOB Datetimes or Timestamps
Creating time in unspecified fold
Comparing Timezones
Unintuitive Arithmetic
Compatibility Issues
Incorrect Format Assumptions""".split("\n")

# print("BPST: ", bpst)

try:
    # Assert that there are no NaN values in the "BPST1" column
    assert df['BPST1'].notnull().all(), "There are NaN entries in the 'BPST1' column."

    # Assert that all values in the "BPST1" column are in the specified list
    assert df['BPST1'].isin(bpst).all(), "There are values in the 'BPST1' column that are not in the specified list."
except Exception as e:
    print("Error: ", e)

    invalid_bpst = df[~df['BPST1'].isin(bpst)]
    invalid_bpst_count = invalid_bpst['BPST1'].value_counts()
    # print("Invalid BPST found in the 'BPST1' column:")
    # print(invalid_bpst["Title"].to_string(index=False))
    # print(invalid_bpst["BPST1"].to_string(index=False))

bpst.append(np.nan)
# print("BPST: ", bpst)

try:
    # Assert that all values in the "BPST2" column are in the specified list
    assert df['BPST2'].isin(bpst).all(), "There are values in the 'BPST2' column that are not in the specified list."
except Exception as e:
    print("Error: ", e)

    invalid_bpst = df[~df['BPST2'].isin(bpst)]
    invalid_bpst_count = invalid_bpst['BPST2'].value_counts()
    # print("Invalid BPST found in the 'BPST2' column:")
    # print(invalid_bpst["Title"].to_string(index=False))
    # print(invalid_bpst["BPST2"].to_string(index=False))

try:
    # Assert that all values in the "BPST3" column are in the specified list
    assert df['BPST3'].isin(bpst).all(), "There are values in the 'BPST3' column that are not in the specified list."
except Exception as e:
    print("Error: ", e)

    invalid_bpst = df[~df['BPST3'].isin(bpst)]
    invalid_bpst_count = invalid_bpst['BPST3'].value_counts()
    # print("Invalid BPST found in the 'BPST3' column:")
    # print(invalid_bpst["Title"].to_string(index=False))
    # print(invalid_bpst["BPST3"].to_string(index=False))

Error:  There are NaN entries in the 'BPST1' column.


In [102]:
# Sanity Check: Make sure all the values in the "Obscurity", "Impact/Severity", "Logic Needed", "# LOC" column are in the specified list

levels = """Low
Medium
High""".split("\n")

# print("Levels: ", levels)

try:
    # Assert that there are no NaN values in the "Obscurity" column
    assert df['Obscurity'].notnull().all(), "There are NaN entries in the 'Obscurity' column."

    # Assert that all values in the "Obscurity" column are in the specified list
    assert df['Obscurity'].isin(levels).all(), "There are values in the 'Obscurity' column that are not in the specified list."
except Exception as e:
    print("Error: ", e)

    invalid_levels = df[~df['Obscurity'].isin(levels)]
    invalid_levels_count = invalid_levels['Obscurity'].value_counts()
    # print("Invalid levels found in the 'Obscurity' column:")
    # print(invalid_levels["Title"].to_string(index=False))
    # print(invalid_levels["Obscurity"].to_string(index=False))

try:
    # Assert that there are no NaN values in the "Impact/Severity" column
    assert df['Impact/Severity'].notnull().all(), "There are NaN entries in the 'Impact/Severity' column."

    # Assert that all values in the "Impact/Severity" column are in the specified list
    assert df['Impact/Severity'].isin(levels).all(), "There are values in the 'Impact/Severity' column that are not in the specified list."
except Exception as e:
    print("Error: ", e)

    invalid_levels = df[~df['Impact/Severity'].isin(levels)]
    invalid_levels_count = invalid_levels['Impact/Severity'].value_counts()
    # print("Invalid levels found in the 'Impact/Severity' column:")
    # print(invalid_levels["Title"].to_string(index=False))
    # print(invalid_levels["Impact/Severity"].to_string(index=False))

try:
    # Assert that there are no NaN values in the "Logic Needed" column
    assert df['Logic Needed'].notnull().all(), "There are NaN entries in the 'Logic Needed' column."

    # Assert that all values in the "Logic Needed" column are in the specified list
    assert df['Logic Needed'].isin(levels).all(), "There are values in the 'Logic Needed' column that are not in the specified list."
except Exception as e:
    print("Error: ", e)

    invalid_levels = df[~df['Logic Needed'].isin(levels)]
    invalid_levels_count = invalid_levels['Logic Needed'].value_counts()
    # print("Invalid levels found in the 'Logic Needed' column:")
    # print(invalid_levels["Title"].to_string(index=False))
    # print(invalid_levels["Logic Needed"].to_string(index=False))

levels.append("Very High")
# print("Levels: ", levels)

try:
    # Assert that there are no NaN values in the "# LOC" column
    assert df['# LOC'].notnull().all(), "There are NaN entries in the '# LOC' column."
    
    # Assert that all values in the "# LOC" column are in the specified list
    assert df['# LOC'].isin(levels).all(), "There are values in the '# LOC' column that are not in the specified list."
except Exception as e:
    print("Error: ", e)

    invalid_levels = df[~df['# LOC'].isin(levels)]
    invalid_levels_count = invalid_levels['# LOC'].value_counts()
    # print("Invalid levels found in the '# LOC' column:")
    # print(invalid_levels["Title"].to_string(index=False))
    # print(invalid_levels["# LOC"].to_string(index=False))

Error:  There are NaN entries in the 'Obscurity' column.
Error:  There are NaN entries in the 'Impact/Severity' column.
Error:  There are NaN entries in the 'Logic Needed' column.
Error:  There are NaN entries in the '# LOC' column.
