In [1]:
# Dependencies
import pandas as pd
import numpy as np
import os

# Data Cleaning

In [38]:
# Locations of the input CSV files (relative to the notebook's working directory)
statefile = "data/stateabbreviations.csv"
csvfile = "data/masterfile.csv"

In [39]:
# Read both CSV files into pandas DataFrames
# dtype=object turns off per-column type inference, so non-missing fields stay as strings
df = pd.read_csv(csvfile, dtype=object)
states = pd.read_csv(statefile, dtype=object)

In [4]:
# Preview the first five rows of the DataFrame
# Rows with missing values, if any, are removed in the next cell
df.head()

Unnamed: 0,state,year,productionRateBracketBOEperday,classNumber,oilWells,percentageOilWells,oilWellsAnnualOilProductionMMbbl,oilWellsPercentageofOilProduction,oilWellsOilRatePerWellBBLperDay,oilWellsAnnualGasProdBCF,...,gasWellsPercentageofGasWells,gasWellsAnnualGasProdBCF,gasWellsPercentageofGasProduction,gasWellsGasRatePerWellMCFperDay,gasWellsAnnualOilProdMMbbl,gasWellsOilRatePerWellBBLperDay,totalNumberofWells,totalWellsAnnualOilProdMMbbl,totalWellsAnnualProduction,horizontalWellCount
0,AK,2016,A_ 0 - 1,1,10,0.48,0.001,0.0,0.267,0.0,...,5.01,0.01,0.0,2.175,0.0,0.0,27,0.001,0.01,0
1,AK,2016,B_ 1 - 2,2,6,0.29,0.002,0.0,1.215,0.003,...,0.88,0.008,0.0,10.949,0.0,0.0,9,0.002,0.011,0
2,AK,2016,C_ 2 - 4,3,10,0.48,0.006,0.0,2.785,0.006,...,2.65,0.043,0.01,17.449,0.001,0.359,19,0.007,0.049,0
3,AK,2016,D_ 4 - 6,4,11,0.53,0.013,0.01,3.998,0.018,...,2.06,0.061,0.02,27.434,0.001,0.283,18,0.013,0.08,1
4,AK,2016,E_ 6 - 8,5,12,0.57,0.02,0.01,5.756,0.023,...,0.88,0.044,0.01,39.673,0.001,0.47,15,0.021,0.067,0


In [5]:
# Use `dropna` to drop any rows where there is missing data
# `dropna` keeps the original index labels and the index is not reset here,
# so gaps would appear wherever rows were removed
df = df.dropna(axis=0)
df.head()

Unnamed: 0,state,year,productionRateBracketBOEperday,classNumber,oilWells,percentageOilWells,oilWellsAnnualOilProductionMMbbl,oilWellsPercentageofOilProduction,oilWellsOilRatePerWellBBLperDay,oilWellsAnnualGasProdBCF,...,gasWellsPercentageofGasWells,gasWellsAnnualGasProdBCF,gasWellsPercentageofGasProduction,gasWellsGasRatePerWellMCFperDay,gasWellsAnnualOilProdMMbbl,gasWellsOilRatePerWellBBLperDay,totalNumberofWells,totalWellsAnnualOilProdMMbbl,totalWellsAnnualProduction,horizontalWellCount
0,AK,2016,A_ 0 - 1,1,10,0.48,0.001,0.0,0.267,0.0,...,5.01,0.01,0.0,2.175,0.0,0.0,27,0.001,0.01,0
1,AK,2016,B_ 1 - 2,2,6,0.29,0.002,0.0,1.215,0.003,...,0.88,0.008,0.0,10.949,0.0,0.0,9,0.002,0.011,0
2,AK,2016,C_ 2 - 4,3,10,0.48,0.006,0.0,2.785,0.006,...,2.65,0.043,0.01,17.449,0.001,0.359,19,0.007,0.049,0
3,AK,2016,D_ 4 - 6,4,11,0.53,0.013,0.01,3.998,0.018,...,2.06,0.061,0.02,27.434,0.001,0.283,18,0.013,0.08,1
4,AK,2016,E_ 6 - 8,5,12,0.57,0.02,0.01,5.756,0.023,...,0.88,0.044,0.01,39.673,0.001,0.47,15,0.021,0.067,0


In [6]:
# Save the cleaned data to a file called `wells_cleaned.csv`
# index=False leaves the DataFrame index out of the file
new_csv = "wells_cleaned.csv"
df.to_csv(new_csv, index=False)

# Database Creation

In [1]:
# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy import create_engine, MetaData
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, Numeric, Text, Float

In [2]:
# Create an engine to a SQLite database file called `wells.sqlite`
# The three-slash URL form is a file path relative to the current working directory
engine = create_engine("sqlite:///wells.sqlite")

In [3]:
# Automap base: its mapped classes are generated by reflecting an existing database
Base = automap_base()

In [4]:
# Reflect the tables reachable through `engine` and generate mapped classes for them
Base.prepare(engine, reflect=True)

In [5]:
# List the names of the classes automap generated
Base.classes.keys()

[]

In [6]:
# Shell escape: list the SQLite files in the working directory
!ls *.sqlite

wells.sqlite


In [7]:
# Ask the engine for the names of the tables in the database
engine.table_names()

[]

In [8]:
# Fire up the inspector: an object bound to `engine` that is used below
# to list table names and column definitions
from sqlalchemy import inspect
inspector = inspect(engine)

In [9]:
# Table names as reported by the inspector
inspector.get_table_names()

[]

In [10]:
# NOTE(review): this lookup raises AttributeError when automap reflected no `wells`
# table (Base.classes.keys() above returned []); the table is only created by
# `Base.metadata.create_all(engine)` further down.
well = Base.classes.wells

AttributeError: wells

In [17]:
# Create a connection to the engine called `conn`
# (reused by the delete / insert / select cells below)
conn = engine.connect()

In [40]:
# Use `declarative_base` from SQLAlchemy to model the `wells` and `abbreviations` tables as ORM classes
# Make sure to specify types for each column, e.g. Integer, Text, etc.
# http://docs.sqlalchemy.org/en/latest/core/type_basics.html
# Note: this rebinds `Base`, replacing the automap base created earlier
Base = declarative_base()

class Well(Base):
    """Declarative model for the `wells` table.

    Each attribute below is one table column; `id` is the integer primary key.
    """

    __tablename__ = 'wells'

    # Primary key
    id = Column(Integer, primary_key=True)

    # Row identification
    state = Column(Text)
    year = Column(Integer)
    productionRateBracketBOEperday = Column(Text)
    classNumber = Column(Float)

    # Oil-well columns
    oilWells = Column(Integer)
    percentageOilWells = Column(Float)
    oilWellsAnnualOilProductionMMbbl = Column(Float)
    oilWellsPercentageofOilProduction = Column(Float)
    oilWellsOilRatePerWellBBLperDay = Column(Float)
    oilWellsAnnualGasProdBCF = Column(Float)
    oilWellsGasRatePerWellMCFperDay = Column(Float)

    # Gas-well columns
    numberofGasWells = Column(Integer)
    gasWellsPercentageofGasWells = Column(Float)
    gasWellsAnnualGasProdBCF = Column(Float)
    gasWellsPercentageofGasProduction = Column(Float)
    gasWellsGasRatePerWellMCFperDay = Column(Float)
    gasWellsAnnualOilProdMMbbl = Column(Float)
    gasWellsOilRatePerWellBBLperDay = Column(Float)

    # Totals
    totalNumberofWells = Column(Integer)
    totalWellsAnnualOilProdMMbbl = Column(Float)
    totalWellsAnnualProduction = Column(Float)
    horizontalWellCount = Column(Integer)

    def __repr__(self):
        """Return a short text form showing the row id and its state."""
        return "id={}, name={}".format(self.id, self.state)
    
class State(Base):
    """Declarative model for the `abbreviations` table.

    Columns: integer primary key `id`, plus `abbreviation` and `state` as text.
    """

    __tablename__ = 'abbreviations'

    id = Column(Integer, primary_key=True)
    abbreviation = Column(Text)
    state = Column(Text)

    def __repr__(self):
        """Return a short text form showing the row id and its state."""
        return "id={}, name={}".format(self.id, self.state)
# More on __repr__: https://stackoverflow.com/questions/1984162/purpose-of-pythons-repr    

In [41]:
# Use `create_all` to create the `wells` and `abbreviations` tables in the database
Base.metadata.create_all(engine)

In [20]:
# Load the cleaned csv file into a pandas dataframe
# (no dtype is forced here, so pandas infers a type for each column)
new_df = pd.read_csv(new_csv)

In [21]:
# Use orient='records' to create a list of data to write:
# one dict per row, keyed by column name
# to_dict() cleans out DataFrame metadata as well
# http://pandas-docs.github.io/pandas-docs-travis/io.html#orient-options
data = new_df.to_dict(orient='records')

In [43]:
# Same conversion for the `states` DataFrame: one dict per row, keyed by column name
statedata = states.to_dict(orient='records')

In [22]:
# Data is just a list of dictionaries that represent each row of data
# Print the first five entries as a quick check
print(data[:5])

[{'state': 'AK', 'year': 2016, 'productionRateBracketBOEperday': 'A_  0 - 1', 'classNumber': 1.0, 'oilWells': 10, 'percentageOilWells': 0.48, 'oilWellsAnnualOilProductionMMbbl': 0.001, 'oilWellsPercentageofOilProduction': 0.0, 'oilWellsOilRatePerWellBBLperDay': 0.267, 'oilWellsAnnualGasProdBCF': 0.0, 'oilWellsGasRatePerWellMCFperDay': 0.0, 'numberofGasWells': 17, 'gasWellsPercentageofGasWells': 5.01, 'gasWellsAnnualGasProdBCF': 0.01, 'gasWellsPercentageofGasProduction': 0.0, 'gasWellsGasRatePerWellMCFperDay': 2.175, 'gasWellsAnnualOilProdMMbbl': 0.0, 'gasWellsOilRatePerWellBBLperDay': 0.0, 'totalNumberofWells': 27, 'totalWellsAnnualOilProdMMbbl': 0.001, 'totalWellsAnnualProduction': 0.01, 'horizontalWellCount': 0}, {'state': 'AK', 'year': 2016, 'productionRateBracketBOEperday': 'B_  1 - 2', 'classNumber': 2.0, 'oilWells': 6, 'percentageOilWells': 0.29, 'oilWellsAnnualOilProductionMMbbl': 0.002, 'oilWellsPercentageofOilProduction': 0.0, 'oilWellsOilRatePerWellBBLperDay': 1.215, 'oilWell

In [23]:
# Use MetaData from SQLAlchemy to reflect the tables
# Binding the MetaData to `engine` lets `reflect()` be called without arguments
metadata = MetaData(bind=engine)
metadata.reflect()

In [24]:
# Save the reference to the `wells` table as a variable called `table`
table = sqlalchemy.Table('wells', metadata, autoload=True)

In [42]:
# Save the reference to the `abbreviations` table as a variable called `stateTable`
stateTable = sqlalchemy.Table('abbreviations', metadata, autoload=True)

In [25]:
# Use `table.delete()` to remove any pre-existing rows from the `wells` table.
# Note that this is a convenience step so that the notebook can be re-run multiple times
# without inserting the same rows again.
# You would not likely do this step in production.
conn.execute(table.delete())

<sqlalchemy.engine.result.ResultProxy at 0x22576bcbfd0>

In [26]:
# Use `table.insert()` to insert the data into the `wells` table
# `data` is the list of row dicts, so all rows are sent in this one execute call
# The SQL table is populated during this step
conn.execute(table.insert(), data)

<sqlalchemy.engine.result.ResultProxy at 0x22576f77128>

In [44]:
# Insert the state abbreviation rows into the `abbreviations` table.
# `statedata` (built from the `states` DataFrame) is the row list for this table;
# `data` holds the wells rows and belongs only in the `wells` insert.
# NOTE(review): assumes the CSV's column names match the table's columns
# (`abbreviation`, `state`) — confirm against data/stateabbreviations.csv.
conn.execute(stateTable.insert(), statedata)

<sqlalchemy.engine.result.ResultProxy at 0x22577c58d30>

In [27]:
# Test that the insert works by fetching the first 5 rows of `wells`
conn.execute("select * from wells limit 5").fetchall()

[(1, 'AK', 2016, 'A_  0 - 1', 1.0, 10, 0.48, 0.001, 0.0, 0.267, 0.0, 0.0, 17, 5.01, 0.01, 0.0, 2.175, 0.0, 0.0, 27, 0.001, 0.01, 0),
 (2, 'AK', 2016, 'B_  1 - 2', 2.0, 6, 0.29, 0.002, 0.0, 1.215, 0.003, 2.285, 3, 0.88, 0.008, 0.0, 10.949000000000002, 0.0, 0.0, 9, 0.002, 0.011000000000000001, 0),
 (3, 'AK', 2016, 'C_  2 - 4', 3.0, 10, 0.48, 0.006, 0.0, 2.785, 0.006, 2.491, 9, 2.65, 0.043, 0.01, 17.449, 0.001, 0.359, 19, 0.006999999999999999, 0.049, 0),
 (4, 'AK', 2016, 'D_  4 - 6', 4.0, 11, 0.53, 0.013000000000000001, 0.01, 3.998, 0.018000000000000002, 5.86, 7, 2.06, 0.061, 0.02, 27.434, 0.001, 0.28300000000000003, 18, 0.013000000000000001, 0.08, 1),
 (5, 'AK', 2016, 'E_  6 - 8', 5.0, 12, 0.57, 0.02, 0.01, 5.756, 0.023, 6.586, 3, 0.88, 0.044000000000000004, 0.01, 39.673, 0.001, 0.47, 15, 0.021, 0.067, 0)]

In [28]:
# Show the column definitions (name, type, nullable, primary key, ...) of the `wells` table
inspector.get_columns('wells')

[{'autoincrement': 'auto',
  'default': None,
  'name': 'id',
  'nullable': False,
  'primary_key': 1,
  'type': INTEGER()},
 {'autoincrement': 'auto',
  'default': None,
  'name': 'state',
  'nullable': True,
  'primary_key': 0,
  'type': TEXT()},
 {'autoincrement': 'auto',
  'default': None,
  'name': 'year',
  'nullable': True,
  'primary_key': 0,
  'type': INTEGER()},
 {'autoincrement': 'auto',
  'default': None,
  'name': 'productionRateBracketBOEperday',
  'nullable': True,
  'primary_key': 0,
  'type': TEXT()},
 {'autoincrement': 'auto',
  'default': None,
  'name': 'classNumber',
  'nullable': True,
  'primary_key': 0,
  'type': FLOAT()},
 {'autoincrement': 'auto',
  'default': None,
  'name': 'oilWells',
  'nullable': True,
  'primary_key': 0,
  'type': INTEGER()},
 {'autoincrement': 'auto',
  'default': None,
  'name': 'percentageOilWells',
  'nullable': True,
  'primary_key': 0,
  'type': FLOAT()},
 {'autoincrement': 'auto',
  'default': None,
  'name': 'oilWellsAnnualOilPro

In [45]:
# Show the column definitions (name, type, nullable, primary key, ...) of the `abbreviations` table
inspector.get_columns('abbreviations')

[{'autoincrement': 'auto',
  'default': None,
  'name': 'id',
  'nullable': False,
  'primary_key': 1,
  'type': INTEGER()},
 {'autoincrement': 'auto',
  'default': None,
  'name': 'abbreviation',
  'nullable': True,
  'primary_key': 0,
  'type': TEXT()},
 {'autoincrement': 'auto',
  'default': None,
  'name': 'state',
  'nullable': True,
  'primary_key': 0,
  'type': TEXT()}]

In [51]:
# Query the `wells` rows for class 23 in 2016 and preview the result
# (note: this rebinds `df` to the query result)
sql = (
    "select state, year, classNumber, horizontalWellCount "
    "from wells where classNumber = 23 and year = 2016"
)
df = pd.read_sql(sql, conn)
df.head()

Unnamed: 0,state,year,classNumber,horizontalWellCount
0,AK,2016,23.0,14
1,AL,2016,23.0,23
2,AR,2016,23.0,5585
3,AZ,2016,23.0,0
4,CA,2016,23.0,1697
