# Post-processing the Data.

In [3]:
import pandas as pd
import numpy as np

df = pd.read_csv('./data/bugs_analysis_base.tsv', sep='\t')

# Filter out rows that are not relevant
df = df[(df['Training?'] == True) | (df['Testing?'] == True) | (df['Set 3?'] == True)]
print("Total number of datapoints: ", len(df))

df = df[df['Skipped?'] != True]
print("Total number of datapoints: ", len(df))

df = df[df['False pos?'] != True]
print("Total number of datapoints: ", len(df))

# Drop irrelevant columns
df = df.drop(columns=['Make Benchmark?', 'Associated Benchmark', 'Rater', 'Training?', 'Testing?', 'Set 3?', 'Real Testing?', 'Description', 'Comments', 'Skipped?', 'False pos?']) 
df.head()

Total number of datapoints:  261
Total number of datapoints:  252
Total number of datapoints:  151


Unnamed: 0,Owner,Project,Title,Link,Fix Link,Stars,TF-IDFs,Size,Datetime,Arrow,...,Erroneous Computation 1,Erroneous Computation 2,Erroneous Computation 3,Obscurity,Impact,# LOC,Logic Needed,Bug Pattern 1,Bug Pattern 2,Bug Pattern 3
0,python-poetry,tomlkit,datetime.utcnow and datetime.utcfromtimestamp ...,https://github.com/python-poetry/tomlkit/issue...,https://github.com/python-poetry/tomlkit/commi...,643.0,1.312623,1001.0,1.0,0.0,...,Datetime Construction,,,Low,Low,Low,Low,Outdated/Deprecated APIs,,
1,frictionlessdata,frictionless-py,SpssParser ignores timezones,https://github.com/frictionlessdata/frictionle...,https://github.com/frictionlessdata/frictionle...,683.0,1.270077,1279.0,1.0,0.0,...,Datetime Construction,String Parsing/Formatting (data),,Low,Medium,Medium,High,Dropping Timezones,,
2,sdispater,pendulum,Deepcopy of Month-based Duration produces a di...,https://github.com/sdispater/pendulum/issues/714,https://github.com/sdispater/pendulum/commit/9...,6109.0,1.191295,1014.0,1.0,0.0,...,Delta Construction,,,Medium,Medium,Medium,Low,Unintuitive Arithmetic,,
3,googleapis,python-storage,`DeprecationWarning` warning in build log for ...,https://github.com/googleapis/python-storage/i...,https://github.com/googleapis/python-storage/c...,420.0,1.176351,10349.0,1.0,0.0,...,Datetime Comparison,,,Low,Low,Low,Low,Outdated/Deprecated APIs,,
4,googleapis,python-logging,`DeprecationWarning` warning in build log for ...,https://github.com/googleapis/python-logging/i...,https://github.com/googleapis/python-logging/p...,119.0,1.167279,2673.0,1.0,0.0,...,Datetime Construction,,,Low,Low,Low,Low,Outdated/Deprecated APIs,,


In [4]:
#Write it to a file
df.to_csv('./data/bugs_analysis_base_processed.tsv', sep='\t', index=False)

In [None]:
# Code to merge the Erroneous computations

df = pd.read_csv('./data/bugs_analysis_base_processed.tsv', sep='\t')
print("Total number of datapoints: ", len(df))

affected_computations = """Timestamp/Hash Construction
Datetime Arithmetic
Datetime Comparison
Datetime Construction
Datetime Equality
Delta Arithmetic
Delta Comparison
Delta Construction
Querying Datetime Components
Replacing/Rounding Datetime Components
String Parsing/Formatting (humanized)
String Parsing/Formatting (data)
Timezone Conversions
Timezone Equality""".split("\n")

target_computations = """Timestamp/Hash Construction
Datetime Arithmetic
Datetime Construction
Delta Arithmetic
Delta Construction
Querying Datetime Components
Replacing/Rounding Datetime Components
String Parsing/Formatting
Timezone Comparison and Conversions""".split("\n")

for index, value in df['Category 1'].items():
    print(f"Index: {index}, Value: {value}")

    if (value == "Date"):
        df.at[index, 'Category 1'] = "Calendar Date"
    if (value == "Timezone"):
        df.at[index, 'Category 1'] = "Time Zone"
    if (value == "Deprecated"):
        df.at[index, 'Category 1'] = "Other"

for index, value in df['Category 2'].items():
    print(f"Index: {index}, Value: {value}")

    if (value == "Date"):
        df.at[index, 'Category 2'] = "Calendar Date"
    if (value == "Timezone"):
        df.at[index, 'Category 2'] = "Time Zone"
    if (value == "Deprecated"):
        df.at[index, 'Category 2'] = "Other"

for index, value in df['Erroneous Computation 1'].items():
    print(f"Index: {index}, Value: {value}")
    if (value == "Timestamp/Hash Construction"):
        df.at[index, 'Erroneous Computation 1'] = "Timestamp API"

    if (value == "String Parsing/Formatting (humanized)") or (value == "String Parsing/Formatting (data)"):
        df.at[index, 'Erroneous Computation 1'] = "Parsing/Formatting"

    if (value == "Datetime Comparison") or (value == "Datetime Equality") or (value == "Datetime Arithmetic"):
        df.at[index, 'Erroneous Computation 1'] = "Datetime (Arithmetic)"
    
    if (value == "Datetime Construction"):
        df.at[index, 'Erroneous Computation 1'] = "Datetime (Construction)"
    
    if (value == "Delta Comparison") or (value == "Delta Arithmetic") or (value == "Delta Construction"):
        df.at[index, 'Erroneous Computation 1'] = "Delta API"
    
    if (value == "Timezone Conversions") or (value == "Timezone Equality"):
        df.at[index, 'Erroneous Computation 1'] = "Time Zone Manipulation"

    if (value == "Querying Datetime Components") or (value == "Replacing/Rounding Datetime Components"):
        df.at[index, 'Erroneous Computation 1'] = "Datetime (Projection)"

for index, value in df['Erroneous Computation 2'].items():
    print(f"Index: {index}, Value: {value}")
    if (value == "Timestamp/Hash Construction"):
        df.at[index, 'Erroneous Computation 2'] = "Timestamp API"

    if (value == "String Parsing/Formatting (humanized)") or (value == "String Parsing/Formatting (data)"):
        df.at[index, 'Erroneous Computation 2'] = "Parsing/Formatting"

    if (value == "Datetime Comparison") or (value == "Datetime Equality") or (value == "Datetime Arithmetic"):
        df.at[index, 'Erroneous Computation 2'] = "Datetime (Arithmetic)"
    
    if (value == "Datetime Construction"):
        df.at[index, 'Erroneous Computation 2'] = "Datetime (Construction)"
    
    if (value == "Delta Comparison") or (value == "Delta Arithmetic") or (value == "Delta Construction"):
        df.at[index, 'Erroneous Computation 2'] = "Delta API"
    
    if (value == "Timezone Conversions") or (value == "Timezone Equality"):
        df.at[index, 'Erroneous Computation 2'] = "Time Zone Manipulation"

    if (value == "Querying Datetime Components") or (value == "Replacing/Rounding Datetime Components"):
        df.at[index, 'Erroneous Computation 2'] = "Datetime (Projection)"

for index, value in df['Erroneous Computation 3'].items():
    print(f"Index: {index}, Value: {value}")
    if (value == "Timestamp/Hash Construction"):
        df.at[index, 'Erroneous Computation 3'] = "Timestamp API"

    if (value == "String Parsing/Formatting (humanized)") or (value == "String Parsing/Formatting (data)"):
        df.at[index, 'Erroneous Computation 3'] = "Parsing/Formatting"

    if (value == "Datetime Comparison") or (value == "Datetime Equality") or (value == "Datetime Arithmetic"):
        df.at[index, 'Erroneous Computation 3'] = "Datetime (Arithmetic)"
    
    if (value == "Datetime Construction"):
        df.at[index, 'Erroneous Computation 3'] = "Datetime (Construction)"
    
    if (value == "Delta Comparison") or (value == "Delta Arithmetic") or (value == "Delta Construction"):
        df.at[index, 'Erroneous Computation 3'] = "Delta API"
    
    if (value == "Timezone Conversions") or (value == "Timezone Equality"):
        df.at[index, 'Erroneous Computation 3'] = "Time Zone Manipulation"

    if (value == "Querying Datetime Components") or (value == "Replacing/Rounding Datetime Components"):
        df.at[index, 'Erroneous Computation 3'] = "Datetime (Projection)"



for index, value in df['Bug Pattern 1'].items():
    print(f"Index: {index}, Value: {value}")
    if (value == "Compatibility Issues"):
        df.at[index, 'Bug Pattern 1'] = "General"
    if (value == "Incorrect API usage"):
        df.at[index, 'Bug Pattern 1'] = "Incorrect API Usage"
    if (value == "Comparing Timezones"):
        df.at[index, 'Bug Pattern 1'] = "Comparing Time Zones"
    if (value == "Creating time in unspecified fold"):
        df.at[index, 'Bug Pattern 1'] = "Unspecified Fold"
    if (value == "Dropping Timezones"):
        df.at[index, 'Bug Pattern 1'] = "Dropping Time Zone Information"

for index, value in df['Bug Pattern 2'].items():
    print(f"Index: {index}, Value: {value}")
    if (value == "Compatibility Issues"):
        df.at[index, 'Bug Pattern 2'] = "General"
    if (value == "Incorrect API usage"):
        df.at[index, 'Bug Pattern 2'] = "Incorrect API Usage"
    if (value == "Comparing Timezones"):
        df.at[index, 'Bug Pattern 2'] = "Comparing Time Zones"
    if (value == "Creating time in unspecified fold"):
        df.at[index, 'Bug Pattern 2'] = "Unspecified Fold"
    if (value == "Dropping Timezones"):
        df.at[index, 'Bug Pattern 2'] = "Dropping Time Zone Information"

for index, value in df['Bug Pattern 3'].items():
    print(f"Index: {index}, Value: {value}")
    if (value == "Compatibility Issues"):
        df.at[index, 'Bug Pattern 3'] = "General"
    if (value == "Incorrect API usage"):
        df.at[index, 'Bug Pattern 3'] = "Incorrect API Usage"
    if (value == "Comparing Timezones"):
        df.at[index, 'Bug Pattern 3'] = "Comparing Time Zones"
    if (value == "Creating time in unspecified fold"):
        df.at[index, 'Bug Pattern 3'] = "Unspecified Fold"
    if (value == "Dropping Timezones"):
        df.at[index, 'Bug Pattern 3'] = "Dropping Time Zone Information"

df.head()


Total number of datapoints:  151
Index: 0, Value: Deprecated
Index: 1, Value: Timezone
Index: 2, Value: Duration
Index: 3, Value: Deprecated
Index: 4, Value: Deprecated
Index: 5, Value: Timezone
Index: 6, Value: Deprecated
Index: 7, Value: Deprecated
Index: 8, Value: Deprecated
Index: 9, Value: Timezone
Index: 10, Value: Timezone
Index: 11, Value: Date
Index: 12, Value: Deprecated
Index: 13, Value: Date
Index: 14, Value: Timezone
Index: 15, Value: DST
Index: 16, Value: DST
Index: 17, Value: Deprecated
Index: 18, Value: Duration
Index: 19, Value: Deprecated
Index: 20, Value: Date
Index: 21, Value: Timezone
Index: 22, Value: Timezone
Index: 23, Value: Duration
Index: 24, Value: Timezone
Index: 25, Value: Timezone
Index: 26, Value: Duration
Index: 27, Value: Timezone
Index: 28, Value: Timestamps
Index: 29, Value: Deprecated
Index: 30, Value: Timezone
Index: 31, Value: Deprecated
Index: 32, Value: Deprecated
Index: 33, Value: String Representation
Index: 34, Value: Timezone
Index: 35, Valu

Unnamed: 0,Owner,Project,Title,Link,Fix Link,Stars,TF-IDFs,Size,Datetime,Arrow,...,Erroneous Computation 1,Erroneous Computation 2,Erroneous Computation 3,Obscurity,Impact,# LOC,Logic Needed,Bug Pattern 1,Bug Pattern 2,Bug Pattern 3
0,python-poetry,tomlkit,datetime.utcnow and datetime.utcfromtimestamp ...,https://github.com/python-poetry/tomlkit/issue...,https://github.com/python-poetry/tomlkit/commi...,643.0,1.312623,1001.0,1.0,0.0,...,Datetime (Construction),,,Low,Low,Low,Low,Outdated/Deprecated APIs,,
1,frictionlessdata,frictionless-py,SpssParser ignores timezones,https://github.com/frictionlessdata/frictionle...,https://github.com/frictionlessdata/frictionle...,683.0,1.270077,1279.0,1.0,0.0,...,Datetime (Construction),Parsing/Formatting,,Low,Medium,Medium,High,Dropping Time Zone Information,,
2,sdispater,pendulum,Deepcopy of Month-based Duration produces a di...,https://github.com/sdispater/pendulum/issues/714,https://github.com/sdispater/pendulum/commit/9...,6109.0,1.191295,1014.0,1.0,0.0,...,Delta API,,,Medium,Medium,Medium,Low,Unintuitive Arithmetic,,
3,googleapis,python-storage,`DeprecationWarning` warning in build log for ...,https://github.com/googleapis/python-storage/i...,https://github.com/googleapis/python-storage/c...,420.0,1.176351,10349.0,1.0,0.0,...,Datetime (Arithmetic),,,Low,Low,Low,Low,Outdated/Deprecated APIs,,
4,googleapis,python-logging,`DeprecationWarning` warning in build log for ...,https://github.com/googleapis/python-logging/i...,https://github.com/googleapis/python-logging/p...,119.0,1.167279,2673.0,1.0,0.0,...,Datetime (Construction),,,Low,Low,Low,Low,Outdated/Deprecated APIs,,


In [6]:
# Write it to a file
df.to_csv('./data/bugs_analysis_base_processed+grouped.tsv', sep='\t', index=False)

In [7]:
df = pd.read_csv('./data/bugs_analysis_rater_1.tsv', sep='\t')

# Filter out rows that are not relevant
df = df[(df['Testing?'] == True)]

# Drop irrelevant columns
df = df.drop(columns=['Make Benchmark?', 'Associated Benchmark', 'Training?', 'Testing?', 'Set 3?', 'Real Testing?', 'Description', 'Comments']) 

print("Total number of datapoints: ", len(df))
df.head()

Total number of datapoints:  60


Unnamed: 0,Rater,Skipped?,False pos?,Owner,Project,Title,Link,Fix Link,Stars,TF-IDFs,...,Erroneous Computation 1,Erroneous Computation 2,Erroneous Computation 3,Obscurity,Impact,# LOC,Logic Needed,Bug Pattern 1,Bug Pattern 2,Bug Pattern 3
1,5: Peter Sasha,False,False,frictionlessdata,frictionless-py,SpssParser ignores timezones,https://github.com/frictionlessdata/frictionle...,https://github.com/frictionlessdata/frictionle...,683.0,1.270077,...,Datetime Construction,String Parsing/Formatting (data),,Medium,Low,Medium,Medium,Dropping Timezones,,
3,2: Shrey Serena,False,False,googleapis,python-storage,`DeprecationWarning` warning in build log for ...,https://github.com/googleapis/python-storage/i...,https://github.com/googleapis/python-storage/c...,420.0,1.176351,...,Datetime Construction,,,Low,Low,Low,Low,Outdated/Deprecated APIs,,
10,3: Shrey Sasha,False,True,sdispater,pendulum,DateTime.add(duration),https://github.com/sdispater/pendulum/issues/323,https://github.com/gridsingularity/gsy-e/pull/...,6109.0,0.996023,...,,,,,,,,,,
12,2: Shrey Serena,False,False,Bears-R-Us,arkouda,datetime columns in dataframe display bug,https://github.com/Bears-R-Us/arkouda/issues/2596,https://github.com/stress-tess/arkouda/commit/...,228.0,0.947325,...,Datetime Construction,,,Medium,Low,Low,Low,Incorrect API usage,,
14,5: Peter Sasha,False,False,dateutil,dateutil,last day of the month is wrong??,https://github.com/dateutil/dateutil/issues/1167,https://github.com/dateutil/dateutil/pull/1168,2266.0,0.935691,...,Datetime Arithmetic,,,Low,Low,Medium,Medium,Unintuitive Arithmetic,,


In [8]:
# write it to a file
df.to_csv('./data/bugs_analysis_rater_1.tsv', sep='\t', index=False)

In [9]:
df = pd.read_csv('./data/bugs_analysis_rater_2.tsv', sep='\t')

# Filter out rows that are not relevant
df = df[(df['Testing?'] == True)]

# Drop irrelevant columns
df = df.drop(columns=['Make Benchmark?', 'Associated Benchmark', 'Training?', 'Testing?', 'Set 3?', 'Real Testing?', 'Description', 'Comments']) 

print("Total number of datapoints: ", len(df))
df.head()

Total number of datapoints:  60


Unnamed: 0,Rater,Skipped?,False pos?,Owner,Project,Title,Link,Fix Link,Stars,TF-IDFs,...,Erroneous Computation 1,Erroneous Computation 2,Erroneous Computation 3,Obscurity,Impact,# LOC,Logic Needed,Bug Pattern 1,Bug Pattern 2,Bug Pattern 3
1,5: Peter Sasha,False,False,frictionlessdata,frictionless-py,SpssParser ignores timezones,https://github.com/frictionlessdata/frictionle...,https://github.com/frictionlessdata/frictionle...,683.0,1.270077,...,Datetime Construction,Replacing/Rounding Datetime Components,Library Conversions,Low,Medium,Medium,Medium,Using Naïve Datetime Incorrectly,,
3,2: Shrey Serena,False,False,googleapis,python-storage,`DeprecationWarning` warning in build log for ...,https://github.com/googleapis/python-storage/i...,https://github.com/googleapis/python-storage/c...,420.0,1.176351,...,Datetime Construction,,,Low,Low,Low,Low,Outdated/Deprecated APIs,,
10,3: Shrey Sasha,False,True,sdispater,pendulum,DateTime.add(duration),https://github.com/sdispater/pendulum/issues/323,https://github.com/gridsingularity/gsy-e/pull/...,6109.0,0.996023,...,,,,,,,,,,
12,2: Shrey Serena,False,True,Bears-R-Us,arkouda,datetime columns in dataframe display bug,https://github.com/Bears-R-Us/arkouda/issues/2596,https://github.com/stress-tess/arkouda/commit/...,228.0,0.947325,...,,,,,,,,,,
14,5: Peter Sasha,False,False,dateutil,dateutil,last day of the month is wrong??,https://github.com/dateutil/dateutil/issues/1167,https://github.com/dateutil/dateutil/pull/1168,2266.0,0.935691,...,Datetime Arithmetic,,,Medium,Medium,Medium,Medium,Unintuitive Arithmetic,,


In [10]:
# write it to a file
df.to_csv('./data/bugs_analysis_rater_2.tsv', sep='\t', index=False)