In [1]:
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('../../data/raw/team.csv',
               encoding='latin-1')
df

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
0,1,9987,673.0,KRC Genk,GEN
1,2,9993,675.0,Beerschot AC,BAC
2,3,10000,15005.0,SV Zulte-Waregem,ZUL
3,4,9994,2007.0,Sporting Lokeren,LOK
4,5,9984,1750.0,KSV Cercle Brugge,CEB
...,...,...,...,...,...
294,49479,10190,898.0,FC St. Gallen,GAL
295,49837,10191,1715.0,FC Thun,THU
296,50201,9777,324.0,Servette FC,SER
297,50204,7730,1862.0,FC Lausanne-Sports,LAU


In [3]:
df.dtypes

id                    int64
team_api_id           int64
team_fifa_api_id    float64
team_long_name       object
team_short_name      object
dtype: object

In [4]:
df.describe()

Unnamed: 0,id,team_api_id,team_fifa_api_id
count,299.0,299.0,288.0
mean,23735.301003,12340.521739,21534.305556
std,15167.914719,25940.411135,42456.439408
min,1.0,1601.0,1.0
25%,9552.5,8349.0,178.75
50%,22805.0,8655.0,673.5
75%,36250.5,9886.5,1910.75
max,51606.0,274581.0,112513.0


In [5]:
simple_profile = df.describe(include='all')
simple_profile

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
count,299.0,299.0,288.0,299,299
unique,,,,296,259
top,,,,Widzew ÅÃ³dÅº,GEN
freq,,,,2,3
mean,23735.301003,12340.521739,21534.305556,,
std,15167.914719,25940.411135,42456.439408,,
min,1.0,1601.0,1.0,,
25%,9552.5,8349.0,178.75,,
50%,22805.0,8655.0,673.5,,
75%,36250.5,9886.5,1910.75,,


In [6]:
profile = ProfileReport(df, title="Export Data Profiling Report")

In [7]:
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [8]:
def get_basic_profile(dataframe):
    quantile_range = 0.5
    results = {}
    
    for column in dataframe.columns:
        count = df[column].count()
        nans = df[column].isna().sum()
        min = df[column].min()
        max = df[column].max()
        median = df[column].median()
        std = df[column].std()
        kurt = df[column].kurt()
        skew = df[column].skew()
        quant = df[column].quantile(q=quantile_range)
        
        results[column] = {'count': count,
                            'count_na': nans,
                            'min':min,
                            'max':max,
                            'median':median,
                            'std':std,
                            'kurt':kurt,
                            'skew':skew,
                            'quant {quantile_range}':quant}
 
    return results

In [9]:
num_columns = df.select_dtypes(include=['int64','float64'])
basic_profile = get_basic_profile(num_columns)
basic_profile

{'id': {'count': 299,
  'count_na': 0,
  'min': 1,
  'max': 51606,
  'median': 22805.0,
  'std': 15167.914719431232,
  'kurt': -1.2318159128376946,
  'skew': 0.028280637936977767,
  'quant {quantile_range}': 22805.0},
 'team_api_id': {'count': 299,
  'count_na': 0,
  'min': 1601,
  'max': 274581,
  'median': 8655.0,
  'std': 25940.411134771337,
  'kurt': 60.930003779617856,
  'skew': 7.651354132707231,
  'quant {quantile_range}': 8655.0},
 'team_fifa_api_id': {'count': 288,
  'count_na': 11,
  'min': 1.0,
  'max': 112513.0,
  'median': 673.5,
  'std': 42456.43940796823,
  'kurt': 0.5321796859619514,
  'skew': 1.583905668841215,
  'quant {quantile_range}': 673.5}}

In [10]:
# square root choice
bins = int(np.ceil(np.sqrt(len(num_columns['stage']))))
 
fig, ax = plt.subplots(1, 1, figsize=(14,8))
ax.hist(num_columns['stage'], bins)
 
ax.ticklabel_format(useOffset=False, style='plain')
ax.set_xlabel('stage')
ax.set_ylabel('Count')
ax.set_title(r'Matches per stage')
# plt.savefig("mygraph.jpg")

KeyError: 'stage'