In [1]:
import os
import os.path

from urllib.parse import urlparse
from urllib.parse import parse_qs
from datetime import datetime

import numpy as np
import pandas as pd

# Data Challenge

Files are stored in an S3 bucket. The purpose here is to fully analyze the data and make some predictions.

This workbook was exported to a Python script and the resulting code was checked for PEP8 problems. Any problems were corrected.

## Bucket Contents and File Download

In [2]:
s3 = boto3.resource('s3')
bucket_name = "postie-testing-assets"

In [3]:
test = s3.Bucket(bucket_name)

In [4]:
s3.meta.client.head_bucket(Bucket=bucket_name)

{'ResponseMetadata': {'HTTPHeaders': {'content-type': 'application/xml',
   'date': 'Mon, 16 Oct 2017 18:06:53 GMT',
   'server': 'AmazonS3',
   'transfer-encoding': 'chunked',
   'x-amz-bucket-region': 'us-east-1',
   'x-amz-id-2': 'YhUEo61GDGSwz1qOpFGJl+C9Sxal34XKRYzOI0TF49PsSSGsbGg2Y6xwbf07z+KHIKusPIYkjxE=',
   'x-amz-request-id': 'DDD0C4B61BDF320E'},
  'HTTPStatusCode': 200,
  'HostId': 'YhUEo61GDGSwz1qOpFGJl+C9Sxal34XKRYzOI0TF49PsSSGsbGg2Y6xwbf07z+KHIKusPIYkjxE=',
  'RequestId': 'DDD0C4B61BDF320E',
  'RetryAttempts': 0}}

In [72]:
for key in test.objects.all():
    print(key.key)

2017-07-01.csv
2017-07-02.csv
2017-07-03.csv


An alternative is to download the data using

```aws s3 cp --recursive s3://my_bucket_name local_folder```


## Data Issues

1. Column names have spaces, so we need to remove them. A more sophisticated method would do this on import. However, processing the first row in this way may slow down the import process, particularly if the files are much larger. There are ways to read chunks via ```read_csv```, which can be used in a class to get the first line of the file, process it as a header, then continue reading the rest of the file in chunks. This is probably the best way to read many large files.
2. Placeholder is blank (NaN) for two files. But is this needed?
3. The file labeled "2017-07-01" has transactions for 7/1/2017 and 7/2/2017.
4. The file labeled "2017-07-02" has transactions for 7/2/2017 and 7/3/2017.
5. The file labeled "2017-07-03" has transactions only for 7/3/2017.
6. There are two website id's, but one website id has two separate domain names: store.example.com, and www.example.com. This affects counts and also reporting if using the domain name. **Handling the domain names is very dependent on this dataset - no effort was made to write a more general solution**.

## Observations

1. Analyst is correct - July 3rd *file* has sales of $164,065. There are fewer sales and customers on the 3rd than the other two days, so the sales amount is lower. However, there are problems with the data files, so there are sales recorded on the 3rd found in files labeled for the 2nd and 3rd. There are sales recorded on the 2nd found in the files labeled for the 1st and 2nd. To get accurate sales data we will have to group by transaction date - the sales date parsed from the *timestamp* field.
2. The *placeholder* field is empty for two files, so it's probably not meaningful. Regardless, it will not add anything to prediction.
3. *app_version* may or may not be meaningful. There is only one app_version for the files labeled "2017-07-01" and "2017-07-02". The remaining file has two app_versions - 1.1 and 1.2. This field is probably not meaningful for prediction, but sales on the 3rd should be examined for the different app_versions.

## Plans
- Get the items. But should we make new columns in the original dataset?
- Items are paired with item counts.
- We would like to know the sales by domain and by item.
- We would like to know the corrected sales by date.
- Average sales is not meaningful - explain why.
- Sales prediction at the aggregate level is not meaningful - there are only two values per domain.
- So do we predict sales at the item level? If so, how?
- What do we do with the *app_version*?
- What are the values in *placeholder*?
- Explain why average sales for a day is not meaningful.

In [120]:
fr = pd.DataFrame()
col_names = ["timestamp", "website_id", "customer_id", "app_version", "placeholder", "checkout_amount", "url"]
data_report = []
for fname in os.listdir("data"):
    ffr = pd.read_csv(os.path.join("data", fname),
                      header=0, names=col_names,
                      infer_datetime_format=True, parse_dates=[0])
    file_date = fname.split(".")[0]
    ffr["file_date"] = file_date
    transaction_date = ffr.timestamp.apply(lambda x: x.strftime('%Y-%m-%d'))  # reformat transaction timestamp
    ffr["transaction_date"] = transaction_date
    url_items = ffr.url.apply(lambda x: urlparse(x))
    
    domain_items = ffr.url.str.split("//")  # assumption is we always have a "valid" url
    domain_name = domain_items.apply(lambda x: x[1].split("/")[0])
    ffr["domain_name"] = domain_name.apply(lambda x: x if not "example.com" in x else ".".join(x.split(".")[1:]))
    item_names = domain_items.apply(lambda x: x[1].split("?")[1])  # see comments above
    
    print("date {} has {} sales for rows {} and unique dates {}".format(fname, ffr.checkout_amount.sum(),
                                                                        ffr.shape[0],
                                                                        transaction_date.unique().shape[0]))
    data_report.append({"file_date": file_date, "sales": ffr.checkout_amount.sum(),
                        "n_placeholder_nan": sum(ffr.placeholder.isnull()),
                        "n_rows": ffr.shape[0],
                        "n_websites": ffr.website_id.unique().shape[0],
                        "n_customers": ffr.customer_id.unique().shape[0],
                        "n_app_versions": ffr.app_version.unique().shape[0],
                        "n_dates": transaction_date.unique().shape[0]})
    fr = fr.append(ffr)
fr.reset_index(drop=True, inplace=True)
fr.shape

date 2017-07-02.csv has 183294.0 sales for rows 11573 and unique dates 2
date 2017-07-01.csv has 241491.0 sales for rows 11634 and unique dates 2
date 2017-07-03.csv has 164065.0 sales for rows 9981 and unique dates 1


(33188, 10)

In [3]:
pd.DataFrame(data_report)

Unnamed: 0,file_date,n_app_versions,n_customers,n_dates,n_placeholder_nan,n_rows,n_websites,sales
0,2017-07-02,1,8198,2,11573,11573,2,183294.0
1,2017-07-01,1,8226,2,11634,11634,2,241491.0
2,2017-07-03,2,7412,1,1792,9981,2,164065.0


In [119]:
fr.head()

Unnamed: 0,timestamp,website_id,customer_id,app_version,placeholder,checkout_amount,url,file_date,transaction_date,domain_name
0,2017-07-02 07:00:35,123,9418,1.1,,6.0,http://www.example.com/store/?Ume=1,2017-07-02,2017-07-02,example.com
1,2017-07-02 07:00:37,124,3872,1.1,,3.0,http://xyz.com/checkout?Prairie+Potato=1,2017-07-02,2017-07-02,xyz.com
2,2017-07-02 07:00:47,123,3090,1.1,,4.0,http://www.example.com/store/?Hazelnut=1,2017-07-02,2017-07-02,example.com
3,2017-07-02 00:00:51,124,9556,1.1,,6.0,http://xyz.com/checkout?Ume=1,2017-07-02,2017-07-02,xyz.com
4,2017-07-02 00:01:02,124,8845,1.1,,6.0,http://xyz.com/checkout?Bignay=1,2017-07-02,2017-07-02,xyz.com


In [5]:
pd.pivot_table(fr, values="checkout_amount", index="transaction_date", columns="domain_name",
               aggfunc=[np.sum], margins=True)

Unnamed: 0_level_0,sum,sum,sum
domain_name,example.com,xyz.com,All
transaction_date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
2017-07-01,172874.0,50641.0,223515.0
2017-07-02,127786.0,55966.0,183752.0
2017-07-03,64599.0,116984.0,181583.0
All,365259.0,223591.0,588850.0


In [6]:
pd.pivot_table(fr, values="checkout_amount", index="file_date", columns="domain_name",
               aggfunc=[np.sum], margins=True)

Unnamed: 0_level_0,sum,sum,sum
domain_name,example.com,xyz.com,All
file_date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
2017-07-01,182100.0,59391.0,241491.0
2017-07-02,127466.0,55828.0,183294.0
2017-07-03,55693.0,108372.0,164065.0
All,365259.0,223591.0,588850.0


In [113]:
def convert_list(x):
    def handle_error(z, col):
        if "error" in col:
            return 1 if "True" in z else 0
        else:
            return z
    dd = parse_qs(x)
    return {k: int(handle_error(dd[k][0], k)) for k in dd}

In [114]:
url_results = fr.url.apply(lambda x: urlparse(x))
domain_names = url_results.apply(lambda x: x[1])
item_query = url_results.apply(lambda x: x[4])
qq = item_query.apply(lambda x: convert_list(x)).apply(pd.Series).fillna(value=0)

In [78]:
dd = parse_qs(item_query[20])
ddd = {k: int(dd[k][0]) for k in dd}
ddd
# for k in dd:
#     dd[k] = int(dd[k][0])

{'Round Kumquat': 1, 'Ume': 1}

In [115]:
qq

Unnamed: 0,Bignay,Black/White Pepper,European Grape,Hazelnut,Mabolo,Natal Orange,Prairie Potato,Round Kumquat,Ume,Ylang-ylang,error
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [95]:
for it in item_query:
    if "True" in it:
        print(it)

Bignay=1&error=True


In [11]:
url_results.shape

(33188,)

In [116]:
url_results = pd.concat([fr.timestamp, fr.customer_id, qq], axis=1)

In [117]:
url_results.head(10)

Unnamed: 0,timestamp,customer_id,Bignay,Black/White Pepper,European Grape,Hazelnut,Mabolo,Natal Orange,Prairie Potato,Round Kumquat,Ume,Ylang-ylang,error
0,2017-07-02 07:00:35,9418,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2017-07-02 07:00:37,3872,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,2017-07-02 07:00:47,3090,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2017-07-02 00:00:51,9556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,2017-07-02 00:01:02,8845,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2017-07-02 00:01:14,6008,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2017-07-02 00:01:18,5312,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,2017-07-02 07:01:33,9178,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,2017-07-02 07:01:55,1680,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9,2017-07-02 07:01:55,275,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [27]:
int(url_results.url[0]["Ume"][0])

1

In [30]:
url_results["url"].apply(pd.Series) #.apply(lambda x: int(0 if np.isnan(x[0]) else x[0])) #.fillna(value=0) #.apply(lambda x: int(x[0]))

Unnamed: 0,Bignay,Black/White Pepper,European Grape,Hazelnut,Mabolo,Natal Orange,Prairie Potato,Round Kumquat,Ume,Ylang-ylang,error
0,,,,,,,,,[1],,
1,,,,,,,[1],,,,
2,,,,[1],,,,,,,
3,,,,,,,,,[1],,
4,[1],,,,,,,,,,
5,,[1],,,,,,,,,
6,,,,,,[1],,,,,
7,[1],,,,,,,,,,
8,,,,,,,,,[1],,
9,,,,,,,[1],,,,


In [34]:
new_cols = url_results["url"].apply(pd.Series).fillna(value=0)

In [56]:
new_cols.head()

Unnamed: 0,Bignay,Black/White Pepper,European Grape,Hazelnut,Mabolo,Natal Orange,Prairie Potato,Round Kumquat,Ume,Ylang-ylang,error
0,0,0,0,0,0,0,0,0,[1],0,0
1,0,0,0,0,0,0,[1],0,0,0,0
2,0,0,0,[1],0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,[1],0,0
4,[1],0,0,0,0,0,0,0,0,0,0


In [49]:
new_cols.shape

(33188, 11)

In [57]:
for col in new_cols.columns:
    print(new_cols[col].unique())

TypeError: unhashable type: 'list'

In [44]:
int(new_cols["Bignay"][4][0])

1

In [53]:
def convert_to_num(x):
    if isinstance(x, list):
        try:
            xx = int(x[0])
        except Exception as ex:
            print(x[0])
        return int(x[0])
    else:
        return x

In [54]:
frames = [new_cols[col].apply(convert_to_num) for col in new_cols.columns]

True


ValueError: invalid literal for int() with base 10: 'True'