In [1]:
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('../../data/raw/possession.csv',
               encoding='latin-1')
df

Unnamed: 0,match_id,id,subtype,event_incident_typefk,elapsed,elapsed_plus,del,homepos,awaypos,injury_time,goal_type,card_type
0,489042,379029,possession,352,25,,,56.0,44.0,,,
1,489042,379251,possession,352,45,1.0,,54.0,46.0,,,
2,489042,379443,possession,352,70,,,54.0,46.0,,,
3,489042,379575,possession,352,90,5.0,,55.0,45.0,,,
4,489043,375608,possession,352,27,,,65.0,35.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
34810,2030170,4947481,possession,352,85,,,45.0,55.0,,,
34811,2030171,4940420,possession,352,22,,,59.0,41.0,,,
34812,2030171,4940639,possession,352,45,,,61.0,39.0,,,
34813,2030171,4940941,possession,352,68,,,60.0,40.0,,,


In [3]:
df.dtypes

match_id                   int64
id                         int64
subtype                   object
event_incident_typefk      int64
elapsed                    int64
elapsed_plus             float64
del                      float64
homepos                  float64
awaypos                  float64
injury_time              float64
goal_type                 object
card_type                 object
dtype: object

In [4]:
pd.set_option('display.float_format',lambda x: '%.2f' % x)

In [5]:
df.describe()

Unnamed: 0,match_id,id,event_incident_typefk,elapsed,elapsed_plus,del,homepos,awaypos,injury_time
count,34815.0,34815.0,34815.0,34815.0,6320.0,17.0,34796.0,34796.0,2.0
mean,1383210.04,3084967.57,352.0,55.39,2.03,1.0,51.81,48.19,47.0
std,548690.26,1757282.14,0.0,24.92,1.43,0.0,10.16,10.16,5.66
min,489042.0,375608.0,352.0,4.0,1.0,1.0,2.0,6.0,43.0
25%,875496.0,1374959.5,352.0,30.0,1.0,1.0,45.0,42.0,45.0
50%,1475009.0,3320984.0,352.0,45.0,1.0,1.0,52.0,48.0,47.0
75%,1983548.0,4800273.5,352.0,75.0,3.0,1.0,58.0,55.0,49.0
max,2118418.0,5623586.0,352.0,90.0,16.0,1.0,94.0,98.0,51.0


In [6]:
simple_profile = df.describe(include='all')
simple_profile

Unnamed: 0,match_id,id,subtype,event_incident_typefk,elapsed,elapsed_plus,del,homepos,awaypos,injury_time,goal_type,card_type
count,34815.0,34815.0,34815,34815.0,34815.0,6320.0,17.0,34796.0,34796.0,2.0,1,1
unique,,,1,,,,,,,,1,1
top,,,possession,,,,,,,,dg,y
freq,,,34815,,,,,,,,1,1
mean,1383210.04,3084967.57,,352.0,55.39,2.03,1.0,51.81,48.19,47.0,,
std,548690.26,1757282.14,,0.0,24.92,1.43,0.0,10.16,10.16,5.66,,
min,489042.0,375608.0,,352.0,4.0,1.0,1.0,2.0,6.0,43.0,,
25%,875496.0,1374959.5,,352.0,30.0,1.0,1.0,45.0,42.0,45.0,,
50%,1475009.0,3320984.0,,352.0,45.0,1.0,1.0,52.0,48.0,47.0,,
75%,1983548.0,4800273.5,,352.0,75.0,3.0,1.0,58.0,55.0,49.0,,


In [7]:
profile = ProfileReport(df, title="Export Data Profiling Report")

In [8]:
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [9]:
def get_basic_profile(dataframe):
    quantile_range = 0.5
    results = {}
    
    for column in dataframe.columns:
        count = df[column].count()
        nans = df[column].isna().sum()
        min = df[column].min()
        max = df[column].max()
        median = df[column].median()
        std = df[column].std()
        kurt = df[column].kurt()
        skew = df[column].skew()
        quant = df[column].quantile(q=quantile_range)
        
        results[column] = {'count': count,
                            'count_na': nans,
                            'min':min,
                            'max':max,
                            'median':median,
                            'std':std,
                            'kurt':kurt,
                            'skew':skew,
                            'quant {quantile_range}':quant}
 
    return results

In [10]:
num_columns = df.select_dtypes(include=['int64','float64'])
basic_profile = get_basic_profile(num_columns)
basic_profile

{'match_id': {'count': 34815,
  'count_na': 0,
  'min': 489042,
  'max': 2118418,
  'median': 1475009.0,
  'std': 548690.2568460202,
  'kurt': -1.527520055577049,
  'skew': -0.21873765241064536,
  'quant {quantile_range}': 1475009.0},
 'id': {'count': 34815,
  'count_na': 0,
  'min': 375608,
  'max': 5623586,
  'median': 3320984.0,
  'std': 1757282.1364704191,
  'kurt': -1.5508960974212835,
  'skew': -0.04082735291821031,
  'quant {quantile_range}': 3320984.0},
 'event_incident_typefk': {'count': 34815,
  'count_na': 0,
  'min': 352,
  'max': 352,
  'median': 352.0,
  'std': 0.0,
  'kurt': 0,
  'skew': 0,
  'quant {quantile_range}': 352.0},
 'elapsed': {'count': 34815,
  'count_na': 0,
  'min': 4,
  'max': 90,
  'median': 45.0,
  'std': 24.923775056060077,
  'kurt': -1.3430747331121242,
  'skew': 0.01754971855744891,
  'quant {quantile_range}': 45.0},
 'elapsed_plus': {'count': 6320,
  'count_na': 28495,
  'min': 1.0,
  'max': 16.0,
  'median': 1.0,
  'std': 1.429001596853891,
  'kurt'

In [11]:
# square root choice
bins = int(np.ceil(np.sqrt(len(num_columns['height']))))
 
fig, ax = plt.subplots(1, 1, figsize=(14,8))
ax.hist(num_columns['height'], bins)
 
ax.ticklabel_format(useOffset=False, style='plain')
ax.set_xlabel('Height')
ax.set_ylabel('Count')
ax.set_title(r'Height count')
# plt.savefig("mygraph.jpg")

KeyError: 'height'