In [1]:
import os
import pandas as pd
import numpy as np
import time 
import pwd
import re
import io
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from scipy import stats
from scipy.stats import shapiro
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import sys
import string
import plotly.graph_objects as go
import plotly.express as px
import seaborn as sns
import math
import plotly.graph_objects as go
import plotly.express as px
import warnings
import boto3
import s3fs
from tqdm import tqdm_notebook as tqdm
from IPython.display import Markdown, display
import ipywidgets as widgets
from ipywidgets import FileUpload
from ipywidgets import TwoByTwoLayout
from ipywidgets import interact, interactive, fixed, interact_manual
from ipywidgets import VBox, HBox, Label, Box
from ipywidgets import Button, Layout

#Render a Markdown-formatted string as rich notebook output
def printmd(string):
    """Display ``string`` in the cell output, rendered as Markdown."""
    rendered = Markdown(string)
    display(rendered)
#Never summarise printed numpy arrays. A single call is enough: an earlier
#threshold of 100000 was overridden by this setting before anything was printed.
np.set_printoptions(threshold=sys.maxsize)
#Hide every warning so library notices do not clutter the cell outputs
warnings.filterwarnings('ignore')

#Set the stop words
stop_words = set(stopwords.words("english"))

#Initialise the words corpus
words_corpora = set(nltk.corpus.words.words())

#Initialise the date words (column names containing any of them are treated as time columns)
stop_words_time = ['date', 'time', 'year', 'hour', 'o\'clock']

# Administrative metadata

##### Administrative metadata is information to help manage a resource, like resource type, permissions, and when and how it was created.

In [2]:
def administrative_metadata(req_file_path, file_name):
    """Collect administrative metadata (access rights and timestamp) for one file.

    Parameters
    ----------
    req_file_path : str
        Path of the file to inspect.
    file_name : str
        Name stored in the ``file_name`` column of the result.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with the columns ``file_name``,
        ``file_read_access``, ``file_write_access``, ``file_execution_access``,
        ``file_existance`` and ``file_creation_date``. The date is ``None``
        when the file cannot be inspected (for example because it is missing).
    """
    #Inspect the path that was passed in rather than notebook-level variables,
    #so the result always describes the requested file
    try:
        #st_ctime is the creation time on Windows but the time of the last
        #metadata change on Unix
        creation_date = time.ctime(os.stat(req_file_path).st_ctime)
    except OSError:
        #Missing or unreadable path: the access flags below already report it
        creation_date = None

    record = {
        'file_name': file_name,
        'file_read_access': os.access(req_file_path, os.R_OK),
        'file_write_access': os.access(req_file_path, os.W_OK),
        'file_execution_access': os.access(req_file_path, os.X_OK),
        'file_existance': os.access(req_file_path, os.F_OK),
        'file_creation_date': creation_date,
    }

    #dtype=object keeps the flags as plain Python booleans in generic columns
    return pd.DataFrame([record], columns=list(record), dtype=object)
    

# Descriptive metadata

##### Descriptive metadata is descriptive information about a resource. It is used for discovery and identification. It includes elements such as title, abstract, author, and keywords.

In [3]:
#Filter function to filter out time columns and columns that provide little information
def filter_columns(req_string, req_data=None):
    """Tell whether column ``req_string`` is a text column that is not about time.

    Parameters
    ----------
    req_string : str
        Column name to test.
    req_data : pandas.DataFrame, optional
        Frame that owns the column. When omitted, the notebook-level ``data``
        frame is used, which keeps one-argument callers such as
        ``filter(filter_columns, columns)`` working.

    Returns
    -------
    bool
        ``True`` when the name contains none of the ``stop_words_time`` terms
        (case-insensitive) and the column dtype compares equal to ``'O'`` or ``'S'``.
    """
    frame = data if req_data is None else req_data
    pattern = re.compile("|".join(stop_words_time), re.IGNORECASE)
    #Time-related names are rejected before the frame is touched
    if pattern.search(req_string):
        return False
    column_dtype = frame[req_string].dtypes
    return column_dtype == 'O' or column_dtype == 'S'

#Check if a column is about time/dates
def is_time_column(req_string):
    """Return True when ``req_string`` contains one of the ``stop_words_time`` terms, ignoring case."""
    time_terms = "|".join(stop_words_time)
    found = re.search(time_terms, req_string, flags=re.IGNORECASE)
    return found is not None

#Check if the words in the columns appear in the English corpus 
def is_meaningful_word(req_string):
    """Tell whether at least one word of the given values is in ``words_corpora``.

    Parameters
    ----------
    req_string : iterable
        Values to inspect (for example a DataFrame column). Each value is
        converted with ``str`` and split on whitespace.

    Returns
    -------
    bool
        ``True`` as soon as one token other than the literal ``'nan'``, once
        lower-cased and stripped of punctuation, is found in ``words_corpora``;
        ``False`` otherwise, including for empty input.
    """
    #Build the punctuation-stripping table once instead of once per token
    strip_punctuation = str.maketrans('', '', string.punctuation)
    for value in req_string:
        for token in str(value).split():
            #'nan' is how missing values look after str()
            if token == 'nan':
                continue
            if token.lower().translate(strip_punctuation) in words_corpora:
                #One hit is enough, so stop scanning here
                return True
    return False

def data_keywords(req_data):
    """Extract keyword candidates from the text columns of ``req_data``.

    For every column whose name is not time related and whose dtype compares
    equal to ``'O'`` or ``'S'``, the cells are lower-cased and vectorised with
    TF-IDF (unigrams and bigrams, English stop words removed). The mean TF-IDF
    score of each term is computed. When there are at least three terms, a
    Shapiro-Wilk test on those means gives ``p <= 0.05`` and the column holds
    at least one word known to ``is_meaningful_word``, the terms scoring above
    ``max - std`` are kept.

    Parameters
    ----------
    req_data : pandas.DataFrame
        Data set to analyse.

    Returns
    -------
    list of list of str
        One list of terms per qualifying column, followed by
        ``["time analysis"]`` when any column name is time related.
    """
    column_names = req_data.columns
    keywords = list()
    alpha = 0.05

    for column in column_names:
        #Inspect req_data itself (not a notebook-level frame) so the function
        #works for whichever frame it is given
        if is_time_column(column):
            continue
        column_dtype = req_data[column].dtypes
        if not (column_dtype == 'O' or column_dtype == 'S'):
            continue

        #Lower-case each cell; the vectoriser does the tokenising. Passing the
        #plain text avoids stray tokens from escape sequences in a list repr.
        documents = [str(value).lower() for value in req_data[column]]
        vectorizer = TfidfVectorizer(stop_words="english", lowercase = False, ngram_range = (1,2))
        try:
            #Create the tfidf matrix 
            tfidf_matrix = vectorizer.fit_transform(documents)

            #get_feature_names() was removed in scikit-learn 1.2; prefer the
            #replacement and fall back for older releases
            get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names

            #Mean tfidf per term, taken on the sparse matrix so the full
            #documents x terms table is never densified
            mean_tfidf = pd.Series(np.asarray(tfidf_matrix.mean(axis=0)).ravel(), index=get_names())

            #Sort the values in descending order
            mean_tfidf = mean_tfidf.sort_values(ascending=False)
            if len(mean_tfidf) >= 3:
                #Test the mean scores for normality (Shapiro-Wilk); keywords are
                #only extracted when normality is rejected
                stat, p = shapiro(mean_tfidf)
                if p <= alpha and is_meaningful_word(req_data[column]):
                    cutoff = mean_tfidf.max() - mean_tfidf.std()
                    keywords.append(mean_tfidf[mean_tfidf > cutoff].index.tolist())
        except ValueError:
            #Best effort: a column that cannot be vectorised (e.g. it yields an
            #empty vocabulary) simply contributes no keywords
            continue

    #A single marker is added when at least one column is about time/dates
    if any(is_time_column(column) for column in column_names):
        keywords.append(["time analysis"])
    return keywords

In [4]:
def descriptive_metadata(req_file_path, req_data, req_file_name):
    """Collect descriptive metadata (author and keywords) for one file.

    Parameters
    ----------
    req_file_path : str
        Path of the file; its owner is reported as the author.
    req_data : pandas.DataFrame
        Contents of the file, passed to ``data_keywords``.
    req_file_name : str
        Name stored in the ``file_name`` column of the result.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with the columns ``file_name``, ``file_author``
        and ``keywords`` (one keyword list per line).

    Raises
    ------
    OSError
        If ``req_file_path`` cannot be inspected with ``os.stat``.
    """
    owner_uid = os.stat(req_file_path).st_uid
    try:
        owner = pwd.getpwuid(owner_uid)
        #Prefer the full-name (GECOS) field; use the login name when it is empty
        author_name = owner.pw_gecos or owner.pw_name
    except KeyError:
        #The owning uid has no entry in the password database (e.g. the file
        #was copied from another machine); report the raw uid instead of failing
        author_name = str(owner_uid)

    keywords = data_keywords(req_data)
    record = {
        'file_name': req_file_name,
        'file_author': author_name,
        'keywords': "\n".join(map(str, keywords)),
    }
    return pd.DataFrame([record], columns=list(record), dtype=object)

# Structural

###### Structural metadata is metadata about containers of data and indicates how compound objects are put together, for example, how pages are ordered to form chapters. It describes the types, versions, relationships and other characteristics of digital materials.

In [8]:
def structural_metadata(req_file_name, req_data):
    """Collect structural metadata (value types and dimensions) for a data set.

    Parameters
    ----------
    req_file_name : str
        Name stored in the ``file_name`` column of the result.
    req_data : pandas.DataFrame
        Data set to describe.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with the columns ``column_types`` (the distinct
        Python/numpy types of the first value of each column, comma separated,
        in order of first appearance), ``file_name``, ``column_numbers`` and
        ``entries``. ``column_types`` is empty when the frame has no rows.
    """
    col_type = list()
    for _, series in req_data.items():
        #Positional access: the first row is not necessarily labelled 0
        if len(series) > 0:
            col_type.append(type(series.iloc[0]))
    #De-duplicate while keeping first-appearance order, so the output is stable
    col_type = list(dict.fromkeys(col_type))

    #Describe the frame that was passed in, not a notebook-level variable
    record = {
        'column_types': ",".join(map(str, col_type)),
        'file_name': req_file_name,
        'column_numbers': req_data.shape[1],
        'entries': req_data.shape[0],
    }
    return pd.DataFrame([record], columns=list(record))

# Choose the file to get the metadata

# Local metadata file

Try to load an existing metadata file; if none is found, start from an empty metadata table.

In [5]:
try:
    #The metadata file is saved with its row index as the first column, so
    #read that column back as the index rather than as an extra
    #"Unnamed: 0" data column that would pile up on every save/load cycle
    meta_data = pd.read_csv('../data_meta.csv', index_col=0)
except OSError:
    #No metadata file yet: start from an empty table with the expected columns
    meta_data = pd.DataFrame(columns=['file_name', 'file_read_access', 'file_write_access',
       'file_execution_access', 'file_existance', 'file_creation_date',
       'file_author', 'keywords', 'column_types', 'column_numbers', 'entries'])
#Position of the last existing row (-1 when the table is empty); defined on both paths
index_file = meta_data.shape[0] - 1

# Single file - local 

## Input the file location 

In [None]:
#Name and location of the CSV file whose metadata will be generated
file_name = "Animal rescue incidents attended by LFB   London Datastore.csv"
file_path =  "../Data/London/Animal rescue incidents attended by LFB   London Datastore/" + file_name
#`st`, `file_path` and `data` are notebook-level variables: besides being passed
#as arguments later on, they are read directly by the metadata functions defined
#above, so this cell has to run before those functions are called
st = os.stat(file_path)
data = pd.read_csv(file_path)

## Select the file using File explorer

In [6]:
#Upload widget restricted to .csv files; the selection is read from `upload.value` further down
upload = FileUpload(accept='.csv')
#Last expression of the cell: renders the widget
upload

FileUpload(value={}, accept='.csv', description='Upload')

In [None]:
#`upload.value` is a dict keyed by file name in ipywidgets 7 and a tuple of
#per-file dicts (each with a 'name' entry) in ipywidgets 8; handle both layouts
uploaded = upload.value
if not uploaded:
    raise ValueError("No file has been uploaded yet - select a CSV file with the upload widget first")
if isinstance(uploaded, dict):
    file_name, uploaded_file = next(iter(uploaded.items()))
else:
    uploaded_file = uploaded[0]
    file_name = uploaded_file['name']
content = uploaded_file['content']
#The file is expected to also exist locally, in a folder named after it (file name without extension)
file_path = "../Data/London/" + file_name[:file_name.rfind(".")] + "/" + file_name
data = pd.read_csv(io.BytesIO(content), header=0, escapechar='\\', encoding= 'unicode_escape')
st = os.stat(file_path)

# Get the metadata for the selected file

In [None]:
#Collect the three kinds of metadata for the selected file
administrative_data = administrative_metadata(file_path, file_name)
descriptive_data = descriptive_metadata(file_path, data, file_name)
structural_data = structural_metadata(file_name ,data)
#Each frame holds one row; merging joins them on the column they share ('file_name')
mt_data = pd.merge(administrative_data, descriptive_data)
mt_data = pd.merge(mt_data, structural_data)
#DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
meta_data = pd.concat([meta_data, mt_data])

# Multiple files - one single local metadata file


Generate the metadata for every CSV file in a folder tree and collect it in a single metadata table.

In [9]:
#Root folder that contains one sub-folder per data set
path_data = "../Data_Proto/London"
folders = os.listdir(path_data)
if ".DS_Store" in folders:
    folders.remove(".DS_Store")

#Gather the per-file rows first and concatenate once at the end, instead of
#growing the metadata table inside the loop
collected_metadata = list()
for current_folder in tqdm(folders, desc = "Folders", leave = True):
    current_files = os.listdir(path_data + "/" + current_folder)
    for current_file_read in current_files:
        if not current_file_read.endswith('.csv'):
            continue
        #`file_path`, `st` and `data` stay notebook-level variables because the
        #metadata functions may read them directly
        file_path = path_data + "/" + current_folder + "/" + current_file_read
        st = os.stat(file_path)
        try:
            data = pd.read_csv(file_path)
        except UnicodeDecodeError:
            #Not valid UTF-8: retry with a single-byte encoding
            data = pd.read_csv(file_path, encoding ='latin-1')
        #Keep only the rows in which at least half of the columns are filled in
        data = data.dropna(axis = 0, thresh = data.shape[1] / 2)
        data = data.reset_index(drop=True)
        if data.shape[0] == 0:
            #Nothing left to describe. Skip the file rather than merging the
            #descriptive/structural metadata left over from the previous file
            continue
        administrative_data = administrative_metadata(file_path, current_file_read)
        descriptive_data = descriptive_metadata(file_path, data, current_file_read)
        structural_data = structural_metadata(current_file_read ,data)
        #One row per frame; the merges join them on the shared 'file_name' column
        mt_data = pd.merge(descriptive_data, structural_data)
        mt_data = pd.merge(mt_data, administrative_data)
        collected_metadata.append(mt_data)

if collected_metadata:
    #DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
    meta_data = pd.concat([meta_data] + collected_metadata)

printmd("**Finished generating metadata.**")

HBox(children=(IntProgress(value=0, description='Folders', max=22, style=ProgressStyle(description_width='init…





**Finished generating metadata.**

# Write the metadata to the local file

In [10]:
#Renumber the rows 0..n-1 and discard the previous index in one step. This also
#works when the index carries a name or a column called "index" already exists.
meta_data = meta_data.reset_index(drop=True)
#The fresh row index is written as the first (unnamed) column of the CSV
meta_data.to_csv('../data_meta.csv', index=True)

# Write the metadata file to S3

In [11]:
#boto3 S3 handles used by the cells below: `s3` (resource interface) for
#listing buckets and objects, `s3_client` (client interface) for the upload
s3 = boto3.resource('s3')
s3_client = boto3.client('s3')

## List all buckets available

In [12]:
def list_all_buckets():
    """Return the names of all buckets reachable through the notebook-level ``s3`` resource.

    NOTE: a later cell defines another function with this same name (listing
    the contents of one bucket); once that cell has run, it replaces this one.

    Returns
    -------
    list of str or None
        The bucket names, or ``None`` when the listing failed (the error is printed).
    """
    try:
        return [bucket.name for bucket in s3.buckets.all()]
    except Exception as e:
        #Best effort: report the problem instead of stopping the notebook.
        #The error has to be interpolated with %, otherwise "%s" is printed literally.
        print("Unexpected error: %s" % e)

#Show the bucket names; prints "None" when the listing failed, because list_all_buckets() then returns None
print('Available buckets: ' + str(list_all_buckets()))

Available buckets: ['gabrielroscabucket', 'testbucketgabe', 'uom.bioinformatics']


## List all files in a bucket

In [13]:
def list_all_buckets(req_bucket_name = "", req_bucket = None): 
    """List the files and folders stored in one bucket.

    NOTE: despite its name this function lists the contents of a single
    bucket, and it replaces the earlier ``list_all_buckets()`` defined above.
    The name is kept so existing calls continue to work.

    Parameters
    ----------
    req_bucket_name : str, optional
        Name of the bucket to open through the notebook-level ``s3`` resource.
        Takes precedence over ``req_bucket`` when not empty.
    req_bucket : bucket object, optional
        An already opened bucket exposing ``objects.all()``.

    Returns
    -------
    tuple of (list of str, list of str) or None
        ``(file_names, folder_keys)``: file names are the part of the key after
        the last ``/``; folder keys are returned in full. ``None`` is returned
        when no argument was given or when an error occurred (a message is
        printed in both cases).
    """
    try:
        if req_bucket_name != "":
            my_bucket = s3.Bucket(req_bucket_name)
        elif req_bucket is not None:
            my_bucket = req_bucket
        else:
            print("Please provide an argument")
            return None

        objects_list = list()
        folders_bucket = list()
        for bucket_object in my_bucket.objects.all():
            key = bucket_object.key
            #Folder placeholders are the keys that end with "/". Testing for a
            #dot instead mislabels folders such as "v1.0/" and files without
            #an extension.
            if key.endswith("/"):
                folders_bucket.append(key)
            else:
                objects_list.append(key[key.rfind("/") + 1:])
        return objects_list, folders_bucket
    except Exception as e:
        #Best effort: report the problem instead of stopping the notebook
        print("Unexpected error: %s" % e)
        
#Open the bucket and split its keys into file names and folder keys
mybucket = s3.Bucket("uom.bioinformatics")
bucket_files, bucket_folders = list_all_buckets(req_bucket=mybucket)

printmd("**Files in bucket " + mybucket.name + ":** \n")
for value in bucket_files:
    #One file name per line, each followed by an empty line
    print(value, end="\n\n")

**Files in bucket uom.bioinformatics:** 


OA2_PM25_2013.csv

PM2.5%20summary%20FINAL_v2.pdf

Food_Establishment_London.csv

Archives.csv

Artists_workspaces.csv

Arts_centres.csv

Cinemas.csv

Commercial_galleries.csv

Community_centres.csv

Creative_Enterprise_Zones.gpkg

Creative_coworking_desk_space.csv

Creative_workspaces.csv

CulturalInfrastructureMap.gpkg

Dance_performance_venues.csv

Dance_rehearsal_studios.csv

Fashion_and_design.csv

Jewellery_design.csv

LGBT_night_time_venues.csv

Large_media_production_studios.csv

Legal_street_art_walls.csv

Libraries.csv

Live_in_artists_workspace.csv

Makerspaces.csv

Making_and_manufacturing.csv

Museums_and_public_galleries.csv

Music_office_based_businesses.csv

Music_recording_studios.csv

Music_rehearsal_studios.csv

Music_venues_all.csv

Music_venues_grassroots.csv

Outdoor_spaces_for_cultural_use.csv

Prop_and_costume_making.csv

Pubs.csv

Set_and_exhibition_building.csv

Skate_Parks.csv

Textile_design.csv

Theatre_rehearsal_studio.csv

Theatres.csv

site_by_borough.zi

## List all folders in a bucket

In [14]:
#Folder keys collected by the file-listing cell above
printmd("**Folders in bucket " + mybucket.name + ":**\n")
for value in bucket_folders:
    #One folder key per line, each followed by an empty line
    print(value, end="\n\n")

**Folders in bucket uom.bioinformatics:**


Demo/

Download_Test/

LondonOpenData/

LondonOpenData/Health/

LondonOpenData/Health/Animal rescue incidents attended by LFB – London Datastore/

LondonOpenData/Health/Bariatric incidents attended by LFB – London Datastore/

LondonOpenData/Health/Breathe London AQMesh pods – London Datastore/

OpenPrescribing/

Transport/

UrbanObservatory/



## Write single metadata file to a bucket folder

In [None]:
#Upload the local metadata CSV to the "uom.bioinformatics" bucket under the key "data_meta.csv"
s3_client.upload_file("../data_meta.csv", "uom.bioinformatics", "data_meta.csv")