In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys

In [2]:
module_path = os.path.abspath(os.path.join('..', 'eval_utils'))
print(module_path)
if module_path not in sys.path:
    sys.path.append(module_path)

from eval_utils import DataAnalysis

d:\CS\summer_project\comp47360-group8\ml_pipeline\data_evaluation\eval_utils


In [3]:
df = pd.read_csv('osm_places_amenity.csv')

threshold = int(0.03* df.shape[0])
df = df.dropna(axis=1, thresh=threshold)

  df = pd.read_csv('osm_places_amenity.csv')


In [4]:
da = DataAnalysis("osm_places_amenity", df)

Number of rows: 15055
Number of cols: 46



Column types:
id: int64
lat: float64
lon: float64
addr:city: object
addr:housenumber: object
addr:postcode: object
addr:state: object
addr:street: object
amenity: object
branch: object
brand: object
brand:wikidata: object
cuisine: object
drive_through: object
name: object
official_name: object
opening_hours: object
phone: object
takeaway: object
website: object
internet_access: object
outdoor_seating: object
cocktails: object
drink:beer: object
drink:liquor: object
drink:wine: object
wheelchair: object
check_date: object
payment:cash: object
payment:credit_cards: object
email: object
alt_name: object
ref: object
level: object
diet:vegan: object
diet:vegetarian: object
toilets: object
drink:coffee: object
drink:tea: object
smoking: object
bar: object
contact:instagram: object
delivery: object
drink:espresso: object
reservation: object
indoor_seating: object


 Convert column types to desired ones before continuing


In [5]:
da.analyse(plots=False)

Number of duplicate rows: 0
id 0
lat 0
lon 0
addr:city 10064
addr:housenumber 4921
addr:postcode 7185
addr:state 10422
addr:street 4798
amenity 0
branch 14310
brand 12919
brand:wikidata 12972
cuisine 5035
drive_through 14150
name 430
official_name 14572
opening_hours 8055
phone 7383
takeaway 10899
website 8199
internet_access 14236
outdoor_seating 12535
cocktails 14051
drink:beer 13626
drink:liquor 14218
drink:wine 13746
wheelchair 13172
check_date 14139
payment:cash 14397
payment:credit_cards 14481
email 14455
alt_name 14275
ref 14469
level 14076
diet:vegan 14571
diet:vegetarian 14438
toilets 14321
drink:coffee 14081
drink:tea 14577
smoking 14534
bar 14562
contact:instagram 14594
delivery 12911
drink:espresso 14536
reservation 14433
indoor_seating 14409


Value proportions:
addr:city
NaN               0.668482
New York          0.165593
Brooklyn          0.071936
Bronx             0.011757
Jamaica           0.010628
                    ...   
Richmond Hills    0.000066
Laurelton      

In [6]:
from docx import Document
import humanize

In [7]:
class DataQualityReport:
    critical_params = ['title', 'description', 'source', 'source_link', 'detailed_desc']
    def __init__(self, da: DataAnalysis, params: dict):
        self.da = da
        self.params = params
        self.formatter = ticker.EngFormatter()
        # make sure all critical parameters are present
        for param in self.critical_params:
            if param not in params:
                raise ValueError(f"The required value {param} is missing from parameters")
        
        # Get the columns that contain null values
        self.null_val_cols = []
        for col in da.num_null_vals:
            if da.num_null_vals[col] > 0:
                self.null_val_cols.append(col)

    def write_missing_vals_overview(self):

        if len(self.null_val_cols) == 0:
            return "The dataset contains no missing values. "
        
        if len(self.null_val_cols) == 1:
            return f"The feature {self.null_val_cols[0]} contains missing values. "

        if len(self.null_val_cols)>5:
            return f"{len(self.null_val_cols)} features contain missing values. "
        
        output = "The features "
        for col in self.null_val_cols:
            output += f"{col}, "
        
        output = output[:-2] + " contain missing values."

    def write_df_shape(self):
        return f"The dataset has {self.da.num_cols} features and {self.da.num_rows} rows. "
    
    def write_missing_values_summary(self):
        
        dropped_features = []
        for col in self.null_val_cols:
            if da.df_category_perc_missing.loc[col, '%missing']>50:
                dropped_features.append(col)
            
        if len(dropped_features) == 0:
            return "No features will have to be dropped due to missing values. "
        
        if len(dropped_features) < 5:
            output = ""
            for feature in dropped_features:
                output += f"{feature}, "
            output = output[:-2] + " will have to be dropped due to missing values. "
            return output
        
        return f"{len(dropped_features)} features will have to be dropped due to missing values. "

    def write_duplicate_rows(self):
        if(self.da.duplicate_count == 0):
            return "There are no duplicate rows. "

        return f"There are {self.da.duplicate_count} duplicate rows. "
    
    def numeric_col_description(self, col):
        output = f"This feature has a mean of {da.df_table_numeric.loc[col, 'mean']}, a min value of {da.df_table_numeric.loc[col, 'min']} and a max value of {da.df_table_numeric.loc[col, 'max']}. "
        if col in self.null_val_cols:
            output += f"There are {da.num_null_vals[col]} missing values. "
        
        return output

    def write_document(self):
        document = Document()

        document.add_heading(f'{self.params['title']} Data Quality Report', 1)
        document.add_heading('Overview', 2)

        p = document.add_paragraph(f'This report will outline the initial data quality findings on {self.params['description']} data obtained from {self.params['source']} which can be found at {self.params['source_link']}. ')
        p.add_run(f'This report will include an overview of the dataset, and a review of the continuous and categorical features, including histograms and bar charts. On initial review, this dataset contains a lot of missing data for most features. The data that is present appears to be reasonable and logical, however a number of columns will need to be dropped. ')
        mv = self.write_missing_vals_overview()
        p.add_run(mv)

        document.add_heading('Summary', 2)
        p = document.add_paragraph(f'This dataset consists of {self.params['detailed_desc']}. ')
        p.add_run(self.write_df_shape())
        p.add_run(self.write_missing_values_summary())
        p.add_run(self.write_duplicate_rows())
        p.add_run("Distribution of the data is consistent with expectations.")

        document.add_heading('Review Logical Integrity', 2)
        document.add_paragraph('Test 1: No date in x is before Feb 2022 or after the upload date (22nd May).')
        document.add_paragraph('0 instances.', style='List Bullet')

        if len(da.numeric_columns) > 0:
            document.add_heading("Review Continuous Features", 2)
            document.add_paragraph(f"There are {len(da.numeric_columns)} continuous features in this dataset:")
            for col in da.numeric_columns:
                document.add_paragraph(col, style="List Bullet")
                desc = self.numeric_col_description(col)
                document.add_paragraph(desc, style="List Bullet 2")
        

        document.save(f'{self.params['title']} Data Quality Report.docx')


In [8]:
import humanize

In [12]:
x = 0.00000001
print(humanize.intword(x))


0


In [10]:
params = {
    "title": "OSM Amenities1",
    "description": "amenities",
    "source": "Overpass Turbo API",
    "detailed_desc": "information on different Amenities in New York City, including bars, restaurants, and cafes",
    "source_link": "https://overpass-turbo.eu/"
}

dqr = DataQualityReport(da, params)
dqr.write_document()

PermissionError: [Errno 13] Permission denied: 'OSM Amenities1 Data Quality Report.docx'