# HDFS Usage Analysis

In [None]:
from __future__ import print_function
import json
import pprint
import pandas as pd
import numpy as np
from datetime import datetime, date, time
from os.path import basename, dirname, join as pjoin
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
import re 
from collections import defaultdict
import humanize
import xml.etree.cElementTree as ET
%matplotlib inline
#print(plt.style.available)
plt.style.use('seaborn-dark')

In [None]:
# Default HDFS block size in bytes (128 MiB). Files smaller than this are
# treated as "small files" in the analysis below.
BLOCK_SIZE = 128 * 1024 * 1024

## Simple Analysis

Quick overview of some properties of the files stored in HDFS.

### Data acquisition

This is based on a simple snapshot of the HDFS filesystem, obtained by recursively listing all the files:

```
hdfs dfs -ls -R / > hdfs.txt
```

Each line of the file has the following format:
```
drwxr-xr-x   - user group            0 2015-09-04 15:56 /some/path/in/hdfs
```

In [None]:
def parse_file(filename):
    """Build a list of records, one per entry of an ``hdfs dfs -ls -R`` listing.

    Each line of the listing is expected to look like::

        drwxr-xr-x   - user group            0 2015-09-04 15:56 /some/path/in/hdfs

    Lines that do not match this layout (blank lines, ``Found N items``
    headers, ...) are skipped.

    Each record is a dict with the keys:

    - ``user``, ``group``, ``permissions``, ``path``: strings from the listing
    - ``size``: size column as an int
    - ``date``: modification date/time as a ``datetime``
    - ``is_dir``: True when the permission string starts with ``d``
    - ``is_userdir``: True for entries located directly under ``/user``
    - ``dir_size``: for directories, the cumulated size of all the files
      found below it in the listing; 0 for files

    Files are returned first (in listing order), followed by the directories.
    """
    record_re = re.compile(
        r'(?P<permissions>\S+)\s+'
        r'(?:\S+)\s+'  # second column of the listing, not used
        r'(?P<user>\S+)\s+'
        r'(?P<group>\S+)\s+'
        r'(?P<size>\d+)\s+'
        r'(?P<date>\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})\s+'
        # the path is the rest of the line so that names with spaces survive
        r'(?P<path>.*\S)'
    )

    metadata = []

    # to keep track of the total size of directories
    directory_usage = defaultdict(int)
    directories = []

    with open(filename) as f:
        for line in f:
            m = record_re.match(line)
            if m is None:
                # not a listing entry
                continue
            o = m.groupdict()
            path_parts = o['path'].split('/')
            is_dir = o['permissions'][0] == 'd'
            data = {
                'user': o['user'],
                'group': o['group'],
                'size': int(o['size']),
                'path': o['path'],
                'date': datetime.strptime(o['date'], '%Y-%m-%d %H:%M'),
                'permissions': o['permissions'],
                'is_dir': is_dir,
                # exactly '/user/<name>': compare whole components so that
                # '/users/x' or '/userdata/x' are not mistaken for user dirs
                'is_userdir': len(path_parts) == 3 and path_parts[:2] == ['', 'user'],
                'dir_size': 0,
            }

            if is_dir:
                # stash the directory for later once we'll have visited all the children
                directories.append(data)
                continue

            # add the size of the file to all upper directories; walking up
            # with dirname() keeps the keys in the same form (absolute or
            # relative) as the directory paths of the listing
            parent = dirname(data['path'])
            while True:
                directory_usage[parent] += data['size']
                upper = dirname(parent)
                if upper == parent:
                    # reached the top ('/' or '')
                    break
                parent = upper

            metadata.append(data)

    # add the directories and update their size
    for d in directories:
        d['dir_size'] = directory_usage[d['path']]
        metadata.append(d)

    return metadata

In [None]:
# Parse the listing into one record per entry, then turn the list of
# records into a DataFrame (one column per record key)
raw_data = parse_file('data/ch.hdfs')
data = json_normalize(raw_data)

In [None]:
# Peek at the first two rows to check the parsed columns
data.head(2)

In [None]:
# Summary statistics of the listing, using the pandas describe() defaults
data.describe()

### General statistics

In [None]:
# `data` holds one row per listing entry, directories included: restrict the
# file count and the sizes to real files so that directory rows neither
# inflate the count nor drag the average file size down
files_only = data[~data['is_dir']]

print('Statistics')
print('- %d files' % len(files_only))
print('- %s used' % humanize.naturalsize(files_only['size'].sum()))
# owners of directories are users too, so count them over every entry
print('- %d users' % data['user'].nunique())
print('- average file size:', humanize.naturalsize(files_only['size'].mean()))

### Usage by user (ownership of files)

In [None]:
# Bytes owned by each user, expressed in whole GB; keep the ten biggest owners
ax1 = (
    data.groupby('user')['size']
    .sum()
    .apply(lambda nbytes: round(nbytes / 1024. / 1024 / 1024))
    .sort_values(ascending=False)
    .head(10)
    .plot.barh(title='Top HDFS usage by user ownership')
)
ax1.set_ylabel('')
ax1.set_xlabel('Size in GB')
# biggest owner at the top of the chart
ax1.invert_yaxis()

### Usage by path

In [None]:
# Cumulated size of every directory, in whole GB; keep the ten largest
ax2 = (
    data[data['is_dir']]
    .set_index('path')['dir_size']
    .apply(lambda nbytes: round(nbytes / 1024. / 1024. / 1024.))
    .sort_values(ascending=False)
    .head(10)
    .plot.barh(title='Top HDFS usage by path')
)
ax2.set_ylabel('')
ax2.set_xlabel('Size in GB')
# largest directory at the top of the chart
ax2.invert_yaxis()

### Top usage for /user

In [None]:
# Same ranking as above, restricted to the entries flagged as user directories
ax = (
    data[data['is_userdir']]
    .set_index('path')['dir_size']
    .apply(lambda nbytes: round(nbytes / 1024. / 1024. / 1024.))
    .sort_values(ascending=False)
    .head(10)
    .plot.barh(title='Top HDFS usage by path for /user')
)
# largest directory at the top of the chart
ax.invert_yaxis()
ax.set_ylabel('')
ax.set_xlabel('Size in GB')

### Top users with small files

In [None]:
# Files (not directories) smaller than one HDFS block, counted per owner
small_file_rows = data[(data['is_dir'] == False) & (data['size'] < BLOCK_SIZE)]
ax = (
    small_file_rows
    .groupby('user')
    .size()
    .sort_values(ascending=False)
    .head(10)
    .plot.barh()
)
ax.set_ylabel('')
ax.set_xlabel('Number of files < HDFS block size')
# owner with the most small files at the top of the chart
ax.invert_yaxis()

### Average file size per user

In [None]:
# Mean size of the files (directories excluded) of each user, shown in MB
file_rows = data[(data['is_dir'] == False)]
ax = (
    file_rows.groupby('user')['size']
    .mean()
    .sort_values(ascending=False)
    .apply(lambda nbytes: nbytes / 1024. / 1024.)
    .plot.barh()
)
ax.set_ylabel('')
ax.set_xlabel('Average file size per user (MB)')
# user with the largest average at the top of the chart
ax.invert_yaxis()

### Top directories with the largest number of small files


In [None]:
# Parent directory of every file smaller than one HDFS block ...
temp = data[(data['is_dir'] == False) & (data['size'] < BLOCK_SIZE)]['path'].apply(dirname)
# ... then count how many such files each directory holds; keep the top ten
ax = (
    temp.groupby(temp)
    .size()
    .sort_values(ascending=False)
    .head(10)
    .plot.barh()
)
ax.set_xlabel('Number of small file (< HDFS Block Size)')
ax.set_ylabel('')
# directory with the most small files at the top of the chart
ax.invert_yaxis()

## Detailed analysis

### Data acquisition

This relies on the raw fsimage from HDFS:

```
hdfs dfsadmin -fetchimage fsImage
hdfs oiv -i fsImage -o fsImage.xml -p XML
```

This produces an XML-formatted version of the HDFS metadata.

### Analysis

In [None]:
# Load the whole XML dump of the fsimage in memory as an ElementTree
a = ET.parse('data/hdfs.xml')

In [None]:
root = a.getroot()
# Show the top-level sections of the dump. Element.getchildren() was removed
# in Python 3.9; list(element) is the portable way to get the children.
list(root)

In [None]:
def parse_xml(filename):
    """Build a list of records, one per inode of an fsimage XML dump.

    ``filename`` is the path of the XML file to read. Every ``inode`` element
    found under ``INodeSection`` yields a dict with:

    - ``inode_id``, ``type``, ``name``: always present
    - ``replication``, ``perferredBlockSize``: only for inodes of type ``FILE``
    - ``file_size``, ``num_block``: only when the inode lists at least one
      block; sum of the ``numBytes`` of its blocks and number of blocks
    """
    # parse the file we were asked to parse rather than relying on a
    # module-level tree that may come from another file
    root = ET.parse(filename).getroot()

    metadata = []
    for inode in root.findall('INodeSection/inode'):
        o = {
            'inode_id': int(inode.find('id').text),
            'type': inode.find('type').text,
            'name': inode.find('name').text,
        }
        if o['type'] == 'FILE':
            o['replication'] = int(inode.find('replication').text)
            # NOTE(review): the tag is read with the spelling
            # 'perferredBlockSize'; dumps produced by other Hadoop versions
            # presumably use 'preferredBlockSize' -- confirm. Both are
            # accepted, the record key keeps the historical spelling.
            block_size = inode.find('perferredBlockSize')
            if block_size is None:
                block_size = inode.find('preferredBlockSize')
            o['perferredBlockSize'] = int(block_size.text)

        blocks = inode.find('blocks')
        # explicit test: the truth value of an Element is deprecated, it
        # used to mean "has at least one child"
        if blocks is not None and len(blocks) > 0:
            sizes = [int(block.find('numBytes').text) for block in blocks]
            o['file_size'] = sum(sizes)
            o['num_block'] = len(sizes)
        metadata.append(o)
    return metadata

In [None]:
# Rebinds `data`: from here on it holds the inode records of the XML dump
data = json_normalize(parse_xml('data/hdfs.xml'))

In [None]:
# Peek at the first two inode records
data.head(2)

In [None]:
# Summary statistics of the inode records, using the pandas describe() defaults
data.describe()

## Difference between 2 clusters

This naively relies on one extract per cluster, obtained with:

```
(cluster1) hdfs dfs -ls -R / > hdfs-1.txt
(cluster2) hdfs dfs -ls -R / > hdfs-2.txt
```

In [2]:
def analyze_diff(path=None, source=None, target_index=None):
    """
    tool to analyze differences between the 2 clusters

    If path is not None, the analysis is limited to the path given, i.e. to
    the entry with exactly that path and to the entries located below it
    (``/user/foo`` does not select ``/user/foobar``).

    source is the list of records of the first cluster; it defaults to the
    global ``cluster_1``.
    target_index maps a path to the record of the second cluster; it
    defaults to the global ``index_2``.

    Returns a tuple ``(missing_files, different_files)``:

    - missing_files: paths of the source records absent from target_index
    - different_files: one dict per record present on both sides with at
      least one differing property, with the keys ``source`` (source
      record), ``target`` (target record) and ``diff`` (names of the
      properties that differ)
    """
    if source is None:
        source = cluster_1
    if target_index is None:
        target_index = index_2

    missing_files = []
    different_files = []
    properties = ['user', 'size', 'group', 'permissions']

    # compare on whole path components rather than on a raw string prefix
    prefix = path.rstrip('/') if path else None

    for f in source:
        # we skip the file if it is not in the path we are interested in
        if path and f['path'] != prefix and not f['path'].startswith(prefix + '/'):
            continue

        if f['path'] not in target_index:
            missing_files.append(f['path'])
            continue

        # compare
        f2 = target_index[f['path']]
        diff = []
        for p in properties:
            try:
                if f[p] != f2[p]:
                    diff.append(p)
            except KeyError:
                # a record without one of the compared properties: show
                # both sides and carry on with the other properties
                print(f)
                print(f2)
        if diff:
            different_files.append({
                'source': f,
                'target': f2,
                'diff': diff
            })
    return (missing_files, different_files)

Load data for each cluster and build a quick index

In [None]:
cluster_1 = parse_file('data/hdfs-1.txt')
cluster_2 = parse_file('data/hdfs-2.txt')

# lookup tables: path -> record, one per cluster
index_1 = {entry['path']: entry for entry in cluster_1}
index_2 = {entry['path']: entry for entry in cluster_2}

Run the analysis

In [None]:
# Compare the two clusters, restricted to the entries under /user/glinmac
missing_files, different_files = analyze_diff('/user/glinmac')

Display some output for missing files in cluster 2

In [None]:
print('Missing files', len(missing_files))
# missing_dirs keeps the parent directories in order of first appearance;
# the set makes the "already printed?" check constant-time instead of
# scanning the list for every file
missing_dirs = []
seen_dirs = set()
for f in missing_files:
    d = dirname(f)
    if d not in seen_dirs:
        seen_dirs.add(d)
        missing_dirs.append(d)
        # print each parent directory once, before its first missing file
        print('\t%s' % d)
    print(f)

Display files that have different metadata properties

In [None]:
print('\nFiles with different properties', len(different_files))
for f in different_files:
    # both sides of the comparison for this path
    source, target = f['source'], f['target']
    print('\t%s' % source['path'])
    # one line per property whose value differs between the clusters
    for p in f['diff']:
        print('\t\t%s: source=%s target=%s' % (p, source[p], target[p]))