# Prepare

## imports

In [1]:
%load_ext autoreload
%autoreload 2

import warnings

import numpy as np
import pandas as pd
import os
import sys

# Mute the warning categories that would otherwise clutter cell output.
for _category in (DeprecationWarning, UserWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_category)


def _five_decimals(value):
    """Render a float with exactly five digits after the decimal point."""
    return '%.5f' % value


# Show floats in DataFrame/Series reprs with a fixed five-decimal format.
pd.set_option('display.float_format', _five_decimals)

# Fix the legacy global NumPy RNG so stochastic steps repeat across runs.
np.random.seed(0)

In [2]:
# Resolve the sibling `src` directory to an absolute path and make sure it is
# on the import search path exactly once.
module_path = os.path.abspath('../src')
print("Adding modules", module_path)
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)

Adding modules /Users/christopherlomeli/Source/courses/datascience/data_science_capstone02/nfl_capstone/src


In [3]:
from src.features.wrangling.database_loader import DatabaseLoader
from src.features.wrangling.get_metrics import GetMetrics, get_versioned_name, conform_column_names

# setup

In [4]:
# Data directories, relative to the notebook's working directory.
RAW_DATA_PATH = '../data/raw'
INTERIM_DATA_PATH = '../data/interim'

# inputs
INPUT_DATA = os.path.join(RAW_DATA_PATH, "nflplaybyplay2009to2016/NFL Play by Play 2009-2017 (v4).csv")

# outputs
OUTPUT_DATA = os.path.join(INTERIM_DATA_PATH, "nflplaybyplay2009to2016_reviewed_01.parquet")
BACKUP_METRICS_FILE = os.path.join(INTERIM_DATA_PATH, "metrics_backup.parquet")
DIMENSIONS_DATA = os.path.join(INTERIM_DATA_PATH, "dimensions.parquet")

# database
METRICS_TABLE_NAME = "nfl_metrics"
CATEGORIES_TABLE_NAME = "nfl_categories"

# verify: echo each setting next to its own name.
# (Previously DIMENSIONS_DATA was printed with an empty label and
# METRICS_TABLE_NAME was printed under the "DIMENSIONS_DATA" label.)
print("INPUT_DATA", INPUT_DATA)
print("OUTPUT_DATA", OUTPUT_DATA)
print("BACKUP_METRICS_FILE", BACKUP_METRICS_FILE)
print("DIMENSIONS_DATA", DIMENSIONS_DATA)
print("METRICS_TABLE_NAME", METRICS_TABLE_NAME)
print("CATEGORIES_TABLE_NAME", CATEGORIES_TABLE_NAME)


INPUT_DATA ../data/raw/nflplaybyplay2009to2016/NFL Play by Play 2009-2017 (v4).csv
OUTPUT_DATA ../data/interim/nflplaybyplay2009to2016_reviewed_01.parquet
BACKUP_METRICS_FILE ../data/interim/metrics_backup.parquet
 ../data/interim/dimensions.parquet
DIMENSIONS_DATA nfl_metrics
CATEGORIES_TABLE_NAME nfl_categories


In [5]:
# Create the project's database loader. The keyword passes the string
# "DB_CONNECTION_URL" — presumably the name of the environment variable that
# holds the connection URL (confirm against DatabaseLoader).
db = DatabaseLoader(
    connection_string_env_url="DB_CONNECTION_URL",
)

In [6]:
# Disabled one-off export: read the "nfl_dim" table and save it to DIMENSIONS_DATA as parquet.
# rdf = db.read_table("nfl_dim")
# rdf
# rdf.to_parquet(DIMENSIONS_DATA, engine='fastparquet',  compression='snappy')

In [7]:
# Labels for the five demo rows.
idx = ['Name 1', 'Name 2', 'Name 3', 'Name 4', 'Name 5']

# Build the demo Series of first names, labelled by `idx` at construction
# time rather than by assigning the index afterwards.
sr = pd.Series(
    ['Mike', 'Alessa', 'Nick', 'Kim', 'Britney'],
    index=idx,
)

# Display the Series as the cell output.
sr




Name 1       Mike
Name 2     Alessa
Name 3       Nick
Name 4        Kim
Name 5    Britney
dtype: object

In [8]:
# Capture, per name, the first substring made of an uppercase letter,
# a literal 'i', and one further character. Names with no such substring
# come back as missing values.
result = sr.str.extract(r'([A-Z]i.)')

# Display the one-column frame of captures as the cell output.
result

Unnamed: 0,0
Name 1,Mik
Name 2,
Name 3,Nic
Name 4,Kim
Name 5,


In [9]:
# pycodestyle is the PEP 8 style-checking library used for the checker below.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# imports cell at the top so a fresh-kernel run shows all dependencies up front.
import pycodestyle

# Build a StyleGuide with no arguments, i.e. the library's default options.
# NOTE(review): style_checker is not used in the visible part of this notebook.
style_checker = pycodestyle.StyleGuide()

In [11]:
import boto3

# Name of the S3 bucket that holds the project data.
AWS_S3_BUCKET = "cjl-project-data"

# Use the high-level resource API to get a handle on the bucket.
s3 = boto3.resource('s3')
my_bucket = s3.Bucket(AWS_S3_BUCKET)

# Print one ObjectSummary per object stored in the bucket.
bucket_objects = my_bucket.objects.all()
for my_bucket_object in bucket_objects:
    print(my_bucket_object)

s3.ObjectSummary(bucket_name='cjl-project-data', key='nfl_capstone/')
s3.ObjectSummary(bucket_name='cjl-project-data', key='nfl_capstone/data/.DS_Store')
s3.ObjectSummary(bucket_name='cjl-project-data', key='nfl_capstone/data/external/.gitkeep')
s3.ObjectSummary(bucket_name='cjl-project-data', key='nfl_capstone/data/interim/.gitkeep')
s3.ObjectSummary(bucket_name='cjl-project-data', key='nfl_capstone/data/interim/README.02-cjl-clean.txt')
s3.ObjectSummary(bucket_name='cjl-project-data', key='nfl_capstone/data/interim/README.03-cjl-clean.txt')
s3.ObjectSummary(bucket_name='cjl-project-data', key='nfl_capstone/data/interim/READ_ME.01-cjl-review.txt')
s3.ObjectSummary(bucket_name='cjl-project-data', key='nfl_capstone/data/interim/admin_events_cleaned_01.parquet')
s3.ObjectSummary(bucket_name='cjl-project-data', key='nfl_capstone/data/interim/analytic_events_cleaned_01.parquet')
s3.ObjectSummary(bucket_name='cjl-project-data', key='nfl_capstone/data/interim/gameplay_dimensions_cleaned_01.p

In [666]:
# Create the low-level S3 client in this cell. No earlier cell defines
# `s3_client` (its only assignment is commented out), so without this line the
# cell fails with NameError on a fresh kernel.
s3_client = boto3.client("s3")

# Request the CSV object from the project bucket.
# NOTE(review): the key "files/books.csv" does not appear in the (truncated)
# bucket listing shown earlier in this notebook — confirm it exists.
response = s3_client.get_object(Bucket=AWS_S3_BUCKET, Key="files/books.csv")

# Pull the HTTP status out of the response metadata; None if it is absent.
status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

if status == 200:
    print(f"Successful S3 get_object response. Status - {status}")
    # The response body is a file-like stream that pandas can read directly.
    books_df = pd.read_csv(response.get("Body"))
    print(books_df)
else:
    print(f"Unsuccessful S3 get_object response. Status - {status}")
