In [346]:
import pandas as pd
import numpy as np
import re, math
from string import punctuation

In [347]:
df = pd.read_excel("./data/Eni_Shell_data.xlsx")

In [348]:
df.shape

(1586, 46)

In [349]:
df.columns

Index(['Oil Spill ID', 'Company', 'JIV Number', 'Date Reported (Shell, Eni)',
       'Year', 'JIV Date (Shell)',
       'Facility equipment/ Incident site (Shell, Eni)',
       'Area/ Terrain (Shell, Eni)', 'Cause (Shell, Eni)',
       'Volume (Shell, Eni) barrels\n', 'Clean-up Status (Shell)',
       'Comments (Shell)', 'LGA (Eni)', 'JIV Asset Id', 'Included', 'JIV Url',
       'JIV URL hyperlinked', 'JIV Cause Verified', 'Date Incident',
       'Date start investigation', 'Description of Leak Point',
       'Incident caused by', 'Incident caused by (Clean)',
       'Comments cause (amnesty classification)',
       'Cause (amnesty classification)', 'JIV Location Verified',
       'Location.Type', 'Location Unit', 'Lat/Northing', 'Long/Easting',
       'Tranformation notes', 'Latitude (normalised)',
       'Longitude (normalised)', 'Area', 'AreaUnit', 'JIV Comment Type',
       'JIV Comment', 'photo_lookup_id', 'Photo Asset Id', 'Included.1',
       'Photo Url', 'Photo url hyperlinked'

### I. Nornalize column names

In [350]:
column_names = [
    "oil_spill_id",
    "company", 
    "jiv_number",
    "date_reported",
    "year",
    "date_jiv_shell",
    "facility_equipment",
    "terrain",
    "cause",
    "barrels",
    "cleanup_status_text",
    "comments_shell",
    "lga_eni",
    "jiv_asset_id",
    "in_decoders",
    "jiv_url",
    "jiv_url_hyperlinked",
    "cause_jiv_verified",
    "date_incident",
    "date_investigation_start",
    "leak_point_text",
    "cause_incident_caused_by_dirty",
    "cause_incident_caused_by",
    "cause_amnesty_comment",
    "cause_amnesty",
    "location_jiv_verified",
    "location_type",
    "location_unit",
    "lat_northing",
    "long_eastling",
    "location_transformation_notes",
    "lat",
    "long",
    "area_decoders",
    "area_unit",
    "comment_type_jiv",
    "comment_jiv",
    "photo_lookup_id",
    "photo_asset_id",
    "in_decoders2",
    "photo_url",
    "photo_url_hyperlinked",
    "damage_photo",
    "damage_photo_followup",
    "comment_jiv_duplicate",
    "comment_jiv_text"
]

In [351]:
pd.DataFrame({"Original Column Names": df.columns, "Alias": column_names}).to_csv("columns.csv")

In [352]:
df.columns.shape, len(column_names)

((46,), 46)

In [353]:
df.columns = column_names

### II. Explore data

#### 1. Do "Included" and "Included.1" hold the same data?

In [354]:
df[["in_decoders", "in_decoders2"]].head()

Unnamed: 0,in_decoders,in_decoders2
0,Y,Y
1,Y,Y
2,N,Y
3,N,Y
4,Y,Y


In [355]:
df.in_decoders2.value_counts()

Y                                       1558
N (asset error on shell/eni wensite)      13
Missing photo                              8
N                                          5
N, photo missing                           2
Name: in_decoders2, dtype: int64

#### 2. explore Facility Equiptment

- '' inches
- pipeline name
- "at" location

In [356]:
df.facility_equipment.head(5)

0                            24'' Ogoda/Brass Pipeline
1    12'' Imo River 1 and 2 - Ogale Pipeline at komkom
2              Adibawa Well 8 S/L Wellhead at Edagberi
3           24'' Bomu - Bonny Pipeline at Okolo Launch
4       10'' Diebu Creek-Nun River Pipeline at Oporoma
Name: facility_equipment, dtype: object

In [357]:
# regexs
inch_single_quote_regex = re.compile(r"(\d+)''")
inch_double_quote_regex = re.compile(r'(\d+)"')
location_regex = re.compile(r"at\s(.*)")

inches = [np.nan] * df.shape[0]
facility_type_name = [np.nan] * df.shape[0]
facility_location = [np.nan] * df.shape[0]

no_inch_cnt = 0
missing_loc_count = 0
for i in range(df.shape[0]):
    
    facility_info = df.facility_equipment.iloc[i].lower()
    # a. exctract inches
    try:
        inches[i] = int(re.search(inch_single_quote_regex, facility_info).group(1))
    except:
        try:
            inches[i] = int(re.search(inch_double_quote_regex, facility_info).group(1))
        except:
            no_inch_cnt += 1
            
    # b. extract facility type
    type_found = False
    otherline_seen = "line" in facility_info
    flowlines = set(["flowline", "fl"])
    flowline_seen = flowlines.intersection(set(facility_info.split(" "))) or "flow line" in facility_info
    pipeline_seen = "pipeline" in facility_info
    well_seen = "well" in facility_info
    wellhead_seen = "wellhead" in facility_info or "well head" in facility_info
    manifold_seen = "manifold" in facility_info
    trunklines = set(["trunkline", "tl"])
    trunkline_seen = trunklines.intersection(set(facility_info.split(" "))) or "trunk line" in facility_info
    deliverylines = set(["deliveryline", "dl"])
    deliveryline_seen = deliverylines.intersection(set(facility_info.split(" "))) or "delivery line" in facility_info
    bulklines = set(["bulkline", "bl"])
    bulkline_seen = bulklines.intersection(set(facility_info.split(" "))) or "bulk line" in facility_info
    flowstation_seen = "flowstation" in facility_info or "flow station" in facility_info
 
    if otherline_seen:
        facility_type_name[i] = "other line"
        type_found = True
    
    if pipeline_seen and not well_seen:
        facility_type_name[i] = "pipeline"
        type_found = True
       
    if flowline_seen and "well" in facility_info:
        facility_type_name[i] = "flowline, well"
        type_found = True
        
    if flowline_seen and not well_seen:
        facility_type_name[i] = "flowline"
        type_found = True
        
    if well_seen and not wellhead_seen and not flowline_seen and not pipeline_seen:
        facility_type_name[i] = "well"
        type_found = True
        
    if wellhead_seen:
        facility_type_name[i] = "wellhead"
        type_found = True
        
    if manifold_seen:
        facility_type_name[i] = "manifold"
        type_found = True

    if trunkline_seen:
        facility_type_name[i] = "trunkline"
        type_found = True

    if deliveryline_seen:
        facility_type_name[i] = "deliveryline"
        type_found = True
        
    if bulkline_seen:
        facility_type_name[i] = "bulkline"
        type_found = True
    
    if flowstation_seen:
        facility_type_name[i] = "flowstation"
        type_found = True
    
    if not type_found:
        facility_type_name[i] = "other"
        
    # c. extract location
    try:
        facility_location[i] = str(re.search(location_regex, facility_info).group(1)).strip(punctuation)
    except:
        missing_loc_count +=1

df["inches"] = pd.Series(inches)
df["facility_type"] = pd.Series(facility_type_name)
df["facility_location"] = pd.Series(facility_location)

In [358]:
df.facility_type.value_counts()

pipeline          933
flowline, well    179
flowline          149
wellhead           64
well               52
other              46
trunkline          45
deliveryline       45
manifold           31
flowstation        21
other line         14
bulkline            7
Name: facility_type, dtype: int64

In [359]:
df.facility_location.isnull().sum()

761

In [360]:
df[["facility_location", "lga_eni"]].head()

Unnamed: 0,facility_location,lga_eni
0,,Abua/Odual
1,komkom,
2,edagberi,
3,okolo launch,
4,oporoma,


#### * check if when Shell location is given, Eni is missing and vice versa

In [361]:
# shell location -> facility_location
# eni location -> lga_eni
shell_loc = [0] * df.shape[0]
eni_loc = [0] * df.shape[0]

for i in range(df.shape[0]):
    shell_cur_loc = df.facility_location.iloc[i]
    eni_curr_loc = df.lga_eni.iloc[i]
    if type(shell_cur_loc) != float:
        shell_loc[i] = 1
    if type(eni_curr_loc) != float:
        eni_loc[i] = 1
        
shell_loc_series = pd.Series(shell_loc)
eni_loc_series = pd.Series(eni_loc)
loc_series = shell_loc_series.add(eni_loc_series)
loc_series.value_counts()

1    1522
0      59
2       5
dtype: int64

In [362]:
df["location_joint"] = pd.Series(loc_series)

In [363]:
df[["oil_spill_id", "company","facility_location", "lga_eni"]][df.location_joint == 2]

Unnamed: 0,oil_spill_id,company,facility_location,lga_eni
752,385,NAOC,the mini manifold,Southern Ijaw
1156,771,NAOC,olugbobiri,Southern Ijaw
1363,703,NAOC,obama 3 deep slot,Southern Ijaw
1372,1494,NAOC,oshie flow station,Oshie
1373,1202,NAOC,obiafu 14 well location,Ogba/Ndoni/Egbema


In [364]:
df[["oil_spill_id", "company","facility_location", "lga_eni"]][df.location_joint == 1].head()

Unnamed: 0,oil_spill_id,company,facility_location,lga_eni
0,1272,NAOC,,Abua/Odual
1,221,SPDC,komkom,
2,640,SPDC,edagberi,
3,1109,SPDC,okolo launch,
4,486,SPDC,oporoma,


#### * clean the facility_locations for ENI

In [365]:
df.facility_location[df.location_joint == 2] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [366]:
df[["oil_spill_id", "company","facility_location", "lga_eni"]][df.location_joint == 2]

Unnamed: 0,oil_spill_id,company,facility_location,lga_eni
752,385,NAOC,,Southern Ijaw
1156,771,NAOC,,Southern Ijaw
1363,703,NAOC,,Southern Ijaw
1372,1494,NAOC,,Oshie
1373,1202,NAOC,,Ogba/Ndoni/Egbema


In [367]:
del df["location_joint"]

#### * aggregate Shell and Eni locations in the same column

In [368]:
locations = [np.nan] * df.shape[0]
for i in range(df.shape[0]):
    shell_loc = df.facility_location.iloc[i]
    eni_loc = df.lga_eni.iloc[i]
    if type(eni_loc) == float:
        locations[i] = shell_loc
    else:
        locations[i] = eni_loc
        
df['location'] = pd.Series(locations).values

In [369]:
del df["lga_eni"]
del df["facility_location"]

#### * extract the pipeline name

In [370]:
# handles from and to loc in cases such as: "nun-river kolo creek" or "biedu - nun-river"
def adjust_splits(splitted_list):
    from_loc = ""
    to_loc = ""
    if splitted_list[1] == "river" or splitted_list[1] == "creek":
        from_loc = splitted_list[0] + " " + splitted_list[1]
        to_loc = " ".join(splitted_list[2:])
    else:
        from_loc = splitted_list[0]
        to_loc = " ".join(splitted_list[1:])
    return from_loc, to_loc

In [371]:
df["facility_equipment_lower"] = df.facility_equipment.str.lower()
facility_names = [np.nan] * df.shape[0]
facility_start = [np.nan] * df.shape[0]
facility_end = [np.nan] * df.shape[0]

missing_fac_name_cnt = 0
for i in range(df.shape[0]):
    facility = df.facility_equipment_lower.iloc[i]
    current_facility_type = df.facility_type.iloc[i]
    # string replacements: "kolocreek" for "kolo creek", and "imo river ii" for "imo river 2"
    facility = facility.replace("kolocreek", "kolo creek")
    facility = facility.replace("imo river ii", "imo river 2")
    facility = facility.replace("imo river i", "imo river 1")

        
    if current_facility_type != "other":
        current_inches = df.inches.iloc[i]
        curr_type__abbr = current_facility_type[:5]
        # find the start index of the substring
        if math.isnan(current_inches):
            # start extracting from index 0
            start_index = 0
        else:
            # start extracting from the index of the word after the first space
            start_index = facility.find(" ") + 1
        # find the end index of the substring
        end_index = facility.find(curr_type__abbr)
        if end_index != -1:
            current_facility_name = facility[start_index:end_index-1]
            current_facility_name = current_facility_name.replace("  ", " ") # replace double space with single
            facility_names[i] = current_facility_name.strip()
            
            if current_facility_type not in ["well", "wellhead", "flowline", "flowline, well"]:

                dash_split = current_facility_name.split("-")
                to_split = current_facility_name.split(" to ")
                baskslash_split = current_facility_name.split("/")

                from_loc = ""
                to_loc = ""
                if len(dash_split) > 1:
    #                 if current_facility_name == "nun-river - kolo creek":
    #                     print("yes", dash_split, list(map(str.strip, dash_split)))
                    from_loc, to_loc = adjust_splits(list(map(str.strip, dash_split)))

                if len(baskslash_split) > 1:
                    from_loc, to_loc = adjust_splits(list(map(str.strip, baskslash_split)))

                if len(to_split) > 1:
                    from_loc, to_loc = adjust_splits(list(map(str.strip, to_split)))


                facility_start[i] = from_loc
                facility_end[i] = to_loc
            
        else:
            missing_fac_name_cnt += 1
            
print("There are", missing_fac_name_cnt, "missing facility names.")
df["facility_name"] = pd.Series(facility_names).values
df["facility_start"] = pd.Series(facility_start).values
df["facility_end"] = pd.Series(facility_end).values

There are 61 missing facility names.


In [372]:
del df["facility_equipment_lower"]
df[["facility_equipment", "inches", "facility_name", "facility_start", "facility_end", "facility_type", "location"]].to_csv("./data/EniShell_transformed.csv")

### 3. Verify company

In [373]:
df.company.value_counts()

SPDC    865
NAOC    721
Name: company, dtype: int64

### 4. Inspect cause

#### * map Shell/Eni causes to 3 large categories

In [427]:
df.groupby( [ "company", "cause"] ).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,oil_spill_id,jiv_number,date_reported,year,date_jiv_shell,facility_equipment,terrain,barrels,cleanup_status_text,comments_shell,...,damage_photo_followup,comment_jiv_duplicate,comment_jiv_text,inches,facility_type,location,facility_name,facility_start,facility_end,cause_mapped
company,cause,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
NAOC,Corrosion,16,16,16,16,0,16,16,16,0,0,...,16,7,7,8,16,16,16,4,4,16
NAOC,Drilled hole,24,24,24,24,0,24,24,24,0,0,...,23,13,13,7,24,22,23,4,4,24
NAOC,Equipment failure,61,61,61,61,0,61,61,61,0,0,...,53,52,52,25,61,59,50,25,25,61
NAOC,Hacksaw cut,106,106,106,106,0,106,106,106,0,0,...,12,42,42,42,106,103,104,27,27,106
NAOC,Hacksaw cut & explosive,4,4,4,4,0,4,4,4,0,0,...,1,2,2,4,4,4,4,4,4,4
NAOC,Hacksaw cut & fire,1,1,1,1,0,1,1,1,0,0,...,1,1,1,1,1,1,1,1,1,1
NAOC,Induced corrosion,4,4,4,4,0,4,4,4,0,0,...,4,4,4,4,4,4,4,0,0,4
NAOC,Oil theft,409,409,409,409,0,409,409,409,0,0,...,19,270,270,405,409,403,408,398,398,409
NAOC,Operational error,2,2,2,2,0,2,2,2,0,0,...,2,2,2,1,2,2,2,0,0,2
NAOC,Operational error/Oil theft,1,1,1,1,0,1,1,1,0,0,...,0,1,1,1,1,1,1,1,1,1


In [426]:
#### * fit Shell/Eni cause to 3 categories
sabotage_theft = ["Sabotage", "Sabotage/ Theft", "Hacksaw cut", "Vandalization", "Use of explosive", "Hacksaw cut & explosive", "Hacksaw cut & fire", "Oil theft", "Drilled hole"]
company_fault = ["Operational", "Equipment failure", "Corrosion", "Induced corrosion", "Operational error/Oil theft", "Structure failure", "Operational error" ]
other = ["Other", "Mystery Spill", "Road Traffic Accident", "Unknown"]

mapped_cuased = [""] * df.shape[0]
for i in range(df.shape[0]):
    reported_cause = df.cause.iloc[i]
    if reported_cause in sabotage_theft:
        mapped_cuased[i] = "sabotage/theft"
    if reported_cause in company_fault:
        mapped_cuased[i] = "company's fault"
    if reported_cause in other:
        mapped_cuased[i] = "other"
        
df["cause_mapped"] = pd.Series(mapped_cuased).values
df[:].groupby(["cause_mapped", "company"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,oil_spill_id,jiv_number,date_reported,year,date_jiv_shell,facility_equipment,terrain,cause,barrels,cleanup_status_text,...,damage_photo,damage_photo_followup,comment_jiv_duplicate,comment_jiv_text,inches,facility_type,location,facility_name,facility_start,facility_end
cause_mapped,company,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
company's fault,NAOC,86,86,86,86,0,86,86,86,86,0,...,86,77,68,68,39,86,84,73,30,30
company's fault,SPDC,164,160,164,164,164,164,164,164,164,164,...,164,146,83,83,71,164,145,141,62,62
other,NAOC,4,4,4,4,0,4,4,4,4,0,...,4,3,3,3,3,4,4,2,2,2
other,SPDC,6,6,6,6,6,6,6,6,6,6,...,6,5,5,5,3,6,6,5,2,2
sabotage/theft,NAOC,631,631,631,631,0,631,631,631,631,0,...,631,107,384,384,508,631,619,616,477,477
sabotage/theft,SPDC,695,670,695,695,695,695,695,695,695,695,...,695,144,161,161,556,695,669,642,495,495


In [429]:
df[:].groupby(["company", "cause_mapped", "cause_amnesty"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,oil_spill_id,jiv_number,date_reported,year,date_jiv_shell,facility_equipment,terrain,cause,barrels,cleanup_status_text,...,damage_photo,damage_photo_followup,comment_jiv_duplicate,comment_jiv_text,inches,facility_type,location,facility_name,facility_start,facility_end
company,cause_mapped,cause_amnesty,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
NAOC,company's fault,Operational,76,76,76,76,0,76,76,76,76,0,...,76,71,61,61,31,76,75,63,26,26
NAOC,company's fault,Operational; Third party (theft),1,1,1,1,0,1,1,1,1,0,...,1,0,1,1,1,1,1,1,1,1
NAOC,company's fault,Third party (theft),2,2,2,2,0,2,2,2,2,0,...,2,0,2,2,2,2,2,2,1,1
NAOC,company's fault,Third party (undetermined),1,1,1,1,0,1,1,1,1,0,...,1,0,0,0,0,1,0,1,0,0
NAOC,company's fault,Undetermined,6,6,6,6,0,6,6,6,6,0,...,6,6,4,4,5,6,6,6,2,2
NAOC,other,Operational,1,1,1,1,0,1,1,1,1,0,...,1,1,1,1,1,1,1,1,1,1
NAOC,other,Third party (accident),1,1,1,1,0,1,1,1,1,0,...,1,1,1,1,1,1,1,0,0,0
NAOC,other,Third party (undetermined),1,1,1,1,0,1,1,1,1,0,...,1,0,0,0,1,1,1,1,1,1
NAOC,other,Undetermined,1,1,1,1,0,1,1,1,1,0,...,1,1,1,1,0,1,1,0,0,0
NAOC,sabotage/theft,Operational,66,66,66,66,0,66,66,66,66,0,...,66,65,49,49,28,66,66,56,27,27


#### * compare Decoder's cause to 3 large categories

In [433]:
df.cause_amnesty.value_counts()

Third party (theft)                 772
Third party (undetermined)          312
Operational                         303
Undetermined                        195
Third party (accident)                3
Operational; Third party (theft)      1
Name: cause_amnesty, dtype: int64

In [437]:
df[df.cause_amnesty == "Operational"].groupby(["company", "cause_mapped", "cause_amnesty"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,oil_spill_id,jiv_number,date_reported,year,date_jiv_shell,facility_equipment,terrain,cause,barrels,cleanup_status_text,...,damage_photo,damage_photo_followup,comment_jiv_duplicate,comment_jiv_text,inches,facility_type,location,facility_name,facility_start,facility_end
company,cause_mapped,cause_amnesty,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
NAOC,company's fault,Operational,76,76,76,76,0,76,76,76,76,0,...,76,71,61,61,31,76,75,63,26,26
NAOC,other,Operational,1,1,1,1,0,1,1,1,1,0,...,1,1,1,1,1,1,1,1,1,1
NAOC,sabotage/theft,Operational,66,66,66,66,0,66,66,66,66,0,...,66,65,49,49,28,66,66,56,27,27
SPDC,company's fault,Operational,152,152,152,152,152,152,152,152,152,152,...,152,138,76,76,67,152,134,132,59,59
SPDC,other,Operational,1,1,1,1,1,1,1,1,1,1,...,1,0,1,1,1,1,1,1,1,1
SPDC,sabotage/theft,Operational,7,7,7,7,7,7,7,7,7,7,...,7,6,3,3,1,7,7,4,1,1


In [434]:
df[df.cause_amnesty == "Undetermined"].groupby(["company", "cause_mapped", "cause_amnesty"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,oil_spill_id,jiv_number,date_reported,year,date_jiv_shell,facility_equipment,terrain,cause,barrels,cleanup_status_text,...,damage_photo,damage_photo_followup,comment_jiv_duplicate,comment_jiv_text,inches,facility_type,location,facility_name,facility_start,facility_end
company,cause_mapped,cause_amnesty,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
NAOC,company's fault,Undetermined,6,6,6,6,0,6,6,6,6,0,...,6,6,4,4,5,6,6,6,2,2
NAOC,other,Undetermined,1,1,1,1,0,1,1,1,1,0,...,1,1,1,1,0,1,1,0,0,0
NAOC,sabotage/theft,Undetermined,40,40,40,40,0,40,40,40,40,0,...,40,35,19,19,15,40,38,38,11,11
SPDC,company's fault,Undetermined,7,3,7,7,7,7,7,7,7,7,...,7,7,3,3,1,7,6,5,1,1
SPDC,other,Undetermined,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,1,4,4,4,1,1
SPDC,sabotage/theft,Undetermined,137,124,137,137,137,137,137,137,137,137,...,137,127,45,45,76,137,129,120,66,66


In [436]:
df[df.cause_amnesty == "Third party (undetermined)"].groupby(["company", "cause_mapped", "cause_amnesty"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,oil_spill_id,jiv_number,date_reported,year,date_jiv_shell,facility_equipment,terrain,cause,barrels,cleanup_status_text,...,damage_photo,damage_photo_followup,comment_jiv_duplicate,comment_jiv_text,inches,facility_type,location,facility_name,facility_start,facility_end
company,cause_mapped,cause_amnesty,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
NAOC,company's fault,Third party (undetermined),1,1,1,1,0,1,1,1,1,0,...,1,0,0,0,0,1,0,1,0,0
NAOC,other,Third party (undetermined),1,1,1,1,0,1,1,1,1,0,...,1,0,0,0,1,1,1,1,1,1
NAOC,sabotage/theft,Third party (undetermined),132,132,132,132,0,132,132,132,132,0,...,132,3,54,54,74,132,128,130,55,55
SPDC,company's fault,Third party (undetermined),4,4,4,4,4,4,4,4,4,4,...,4,0,3,3,3,4,4,3,2,2
SPDC,sabotage/theft,Third party (undetermined),174,167,174,174,174,174,174,174,174,174,...,174,4,40,40,124,174,169,158,92,92


In [435]:
df[df.cause_amnesty == "Third party (theft)"].groupby(["company", "cause_mapped", "cause_amnesty"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,oil_spill_id,jiv_number,date_reported,year,date_jiv_shell,facility_equipment,terrain,cause,barrels,cleanup_status_text,...,damage_photo,damage_photo_followup,comment_jiv_duplicate,comment_jiv_text,inches,facility_type,location,facility_name,facility_start,facility_end
company,cause_mapped,cause_amnesty,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
NAOC,company's fault,Third party (theft),2,2,2,2,0,2,2,2,2,0,...,2,0,2,2,2,2,2,2,1,1
NAOC,sabotage/theft,Third party (theft),393,393,393,393,0,393,393,393,393,0,...,393,4,262,262,391,393,387,392,384,384
SPDC,sabotage/theft,Third party (theft),377,372,377,377,377,377,377,377,377,377,...,377,7,73,73,355,377,364,360,336,336


#### * compare "Equipment failure" to cause reported by Shell/Eni

In [438]:
df.cause_incident_caused_by.value_counts().head(10)

Third party interference                  405
 Third party interference; Crude theft    217
Third party interference                  213
 Third party interference                 210
 Third party interference; Sabotage       171
 Operational                              134
Equipment failure                          54
Missing JIV                                21
Corrosion                                  14
 Third party interference; crude theft     12
Name: cause_incident_caused_by, dtype: int64

In [439]:
df[df.cause_incident_caused_by == "Equipment failure"].groupby(["company", "cause_mapped", "cause_amnesty"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,oil_spill_id,jiv_number,date_reported,year,date_jiv_shell,facility_equipment,terrain,cause,barrels,cleanup_status_text,...,damage_photo,damage_photo_followup,comment_jiv_duplicate,comment_jiv_text,inches,facility_type,location,facility_name,facility_start,facility_end
company,cause_mapped,cause_amnesty,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
NAOC,company's fault,Operational,51,51,51,51,0,51,51,51,51,0,...,51,48,46,46,20,51,50,41,20,20
NAOC,company's fault,Third party (theft),2,2,2,2,0,2,2,2,2,0,...,2,0,2,2,2,2,2,2,1,1
NAOC,sabotage/theft,Operational,1,1,1,1,0,1,1,1,1,0,...,1,1,1,1,0,1,1,0,0,0


#### * process "leak Point"'s text

In [444]:
leak_points = set()
for i in range(df.shape[0]):
    lp = df.leak_point_text.iloc[i]
    if type(lp) != float:
        for point in lp.split(";"):
            leak_points.add(point.strip())

leak_points

{'',
 'Accidental 3rd party equipment impact',
 'Acid',
 'Blasting',
 'Bulging outward',
 'Complete Rupture',
 'Corrosion',
 'Corrupt JIV',
 'Crude oil theft/ illegal bunkering',
 'Drilled hole',
 'Drilling',
 'Explosive tear',
 'Failed clamp',
 'Failed weld on illegal hot tap valve',
 'Hack saw cut',
 'Inward dent',
 'Missing JIV',
 'Missing pipeline/ flowline',
 'Missing pipeline/flowline',
 'Other',
 'Others',
 'Outward dent',
 'Photo',
 'Saver pit over flow',
 'Sawing',
 'Tear',
 'Third party tampering with clamp',
 'Third party tampering with flange',
 'Third party tampering with valve',
 'Valve failure',
 'Well head tampering'}

In [445]:
df[["leak_point_text", "photo_url"]].to_csv("./data/leak_points.csv")