In [1]:
import pandas as pd
from datetime import datetime

In [2]:
# Integer label for every website capture file; models predict this label
# instead of the file name
siteMappings = {
    'github.csv': 0,
    'google.csv': 1,
    'reuters.csv': 2,
    'wikipedia.csv': 3,
    'youtube.csv': 4,
}
# Raw files to merge into the single CSV data set, taken from the mapping so the
# two can never drift apart (dicts iterate in insertion order)
files = list(siteMappings)
# Columns to pull out of each raw file
features = ['Length']

In [3]:
# Process and combine all the raw data files into a single sanitized data set
# holding a `Website` target column followed by the desired feature columns.

# Per-file frames are collected here and concatenated once at the end. Growing
# finalDataSet with pd.concat on every iteration re-copies all rows gathered so
# far, and concatenating onto an empty DataFrame may emit a FutureWarning in
# recent pandas versions.
sanitizedFrames = []

# For every file that we have in the raw_data subdirectory
for file in files:
    # Obtain the raw data for that particular data set
    raw = pd.read_csv('raw_data/{}'.format(file))

    # Keep only the desired feature columns (a missing column raises KeyError);
    # copy so the insert below does not touch `raw`
    tempDataSet = raw[features].copy()
    # Label every observation with the file it came from, as the first column
    tempDataSet.insert(0, 'Website', file)

    sanitizedFrames.append(tempDataSet)

# ignore_index=True renumbers the combined rows 0..n-1.
# pd.concat rejects an empty list, so fall back to an empty frame when `files` is empty.
finalDataSet = pd.concat(sanitizedFrames, ignore_index=True) if sanitizedFrames else pd.DataFrame()

In [4]:
# Perform an integrity check to see if all observations were carried over
# from the raw files into finalDataSet, in `files` order.

rowCountPassed = False
# Stays False until the values have actually been verified, so a skipped
# check is never reported as a pass
rowValuesPassed = False

# Read every raw file once and reuse it for both checks
rawFrames = [pd.read_csv('raw_data/{}'.format(file)) for file in files]

#=================
# Row count
#=================
rowCount = sum(len(rawFrame) for rawFrame in rawFrames)

if len(finalDataSet) == rowCount:
    rowCountPassed = True

#=================
# Row values
#=================
# Only meaningful when the row counts agree: the per-file slices below assume
# finalDataSet holds each file's rows back to back, in `files` order
if rowCountPassed:
    rowValuesPassed = True
    index = 0
    for file, raw in zip(files, rawFrames):
        # A file without observations contributes no rows to compare
        if len(raw) == 0:
            continue

        # Rows of finalDataSet that should have come from this file
        fileRows = finalDataSet.iloc[index:index + len(raw)]

        # Check website name
        if not fileRows['Website'].eq(file).all():
            rowValuesPassed = False

        # Check features
        for feature in features:
            # Align both sides on a fresh 0..n-1 index before comparing
            actual = fileRows[feature].reset_index(drop=True)
            expected = raw[feature].reset_index(drop=True)
            # Missing values on both sides count as a match; a plain != would
            # flag them because NaN never compares equal to itself
            matches = actual.eq(expected) | (actual.isna() & expected.isna())
            if not matches.all():
                rowValuesPassed = False

        index += len(raw)

# Final reporting of integrity check status
if rowCountPassed:
    print("GOOD: All observations accounted for")
else:
    print("BAD: Missing some observations")
if rowValuesPassed:
    print("GOOD: All observation values correct")
elif rowCountPassed:
    print("BAD: Some observation values incorrect")
else:
    # The value comparison is skipped when the counts differ; say so instead
    # of claiming the values are correct
    print("BAD: Observation values not checked because the row counts differ")

GOOD: All observations accounted for
GOOD: All observation values correct


In [5]:
# Save the sanitized data set twice: once with the raw website file names and
# once with the names replaced by their integer labels from siteMappings.

# Validate before writing anything: Series.map turns every value that is not a
# key of siteMappings into NaN without complaint. This also catches re-running
# this cell, when `Website` already holds integer labels.
unknownSites = finalDataSet.loc[~finalDataSet['Website'].isin(list(siteMappings)), 'Website'].unique()
if len(unknownSites) > 0:
    raise ValueError(
        "Website column contains values that are not keys of siteMappings: {} "
        "(was this cell already run on this data set?)".format(list(unknownSites))
    )

# One timestamp shared by both files, so the unmapped/mapped pair written by a
# single run can be matched by name
timestamp = str(datetime.now()).replace(" ", "_").replace(".", "-").replace(":", "-")

# Save the raw sanitized data set
finalDataSet.to_csv("./sanitized_data/sanitized_data_{}_unmapped.csv".format(timestamp))

# Perform a mapping on the `Website` target column for ease of use in predictive models
# (this overwrites the column in finalDataSet itself)
finalDataSet['Website'] = finalDataSet['Website'].map(siteMappings)
# Save this as usable data set for models
finalDataSet.to_csv("./sanitized_data/sanitized_data_{}_mapped.csv".format(timestamp))