# Welcome to Jupyter!

This repo contains an introduction to [Jupyter](https://jupyter.org) and [IPython](https://ipython.org).

Outline of some basics:

* [Notebook Basics](../examples/Notebook/Notebook%20Basics.ipynb)
* [IPython - beyond plain python](../examples/IPython%20Kernel/Beyond%20Plain%20Python.ipynb)
* [Markdown Cells](../examples/Notebook/Working%20With%20Markdown%20Cells.ipynb)
* [Rich Display System](../examples/IPython%20Kernel/Rich%20Output.ipynb)
* [Custom Display logic](../examples/IPython%20Kernel/Custom%20Display%20Logic.ipynb)
* [Running a Secure Public Notebook Server](../examples/Notebook/Running%20the%20Notebook%20Server.ipynb#Securing-the-notebook-server)
* [How Jupyter works](../examples/Notebook/Multiple%20Languages%2C%20Frontends.ipynb) to run code in different languages.

In [8]:
import pandas as pd

In [9]:
data = pd.read_csv("https://www.stats.govt.nz/assets/Uploads/Annual-enterprise-survey/Annual-enterprise-survey-2018-financial-year-provisional/Download-data/annual-enterprise-survey-2018-financial-year-provisional-csv.csv")

In [10]:
data.head()

Unnamed: 0,Year,Industry_aggregation_NZSIOC,Industry_code_NZSIOC,Industry_name_NZSIOC,Units,Variable_code,Variable_name,Variable_category,Value,Industry_code_ANZSIC06
0,2018,Level 1,99999,All industries,Dollars (millions),H01,Total income,Financial performance,691859,ANZSIC06 divisions A-S (excluding classes K633...
1,2018,Level 1,99999,All industries,Dollars (millions),H04,"Sales, government funding, grants and subsidies",Financial performance,605766,ANZSIC06 divisions A-S (excluding classes K633...
2,2018,Level 1,99999,All industries,Dollars (millions),H05,"Interest, dividends and donations",Financial performance,63509,ANZSIC06 divisions A-S (excluding classes K633...
3,2018,Level 1,99999,All industries,Dollars (millions),H07,Non-operating income,Financial performance,22583,ANZSIC06 divisions A-S (excluding classes K633...
4,2018,Level 1,99999,All industries,Dollars (millions),H08,Total expenditure,Financial performance,597623,ANZSIC06 divisions A-S (excluding classes K633...


In [11]:
data.shape

(27810, 10)

In [12]:
data.dtypes

Year                            int64
Industry_aggregation_NZSIOC    object
Industry_code_NZSIOC           object
Industry_name_NZSIOC           object
Units                          object
Variable_code                  object
Variable_name                  object
Variable_category              object
Value                          object
Industry_code_ANZSIC06         object
dtype: object

In [13]:
data[['Industry_aggregation_NZSIOC', 'Industry_name_NZSIOC']].head()

Unnamed: 0,Industry_aggregation_NZSIOC,Industry_name_NZSIOC
0,Level 1,All industries
1,Level 1,All industries
2,Level 1,All industries
3,Level 1,All industries
4,Level 1,All industries


In [14]:
data['Year'].value_counts()

2015    4635
2014    4635
2013    4635
2018    4635
2017    4635
2016    4635
Name: Year, dtype: int64

In [15]:
data['Year2'] = data['Year'] + 1

In [16]:
data['Year2'].value_counts()

2015    4635
2014    4635
2019    4635
2018    4635
2017    4635
2016    4635
Name: Year2, dtype: int64

In [17]:
data.head()

Unnamed: 0,Year,Industry_aggregation_NZSIOC,Industry_code_NZSIOC,Industry_name_NZSIOC,Units,Variable_code,Variable_name,Variable_category,Value,Industry_code_ANZSIC06,Year2
0,2018,Level 1,99999,All industries,Dollars (millions),H01,Total income,Financial performance,691859,ANZSIC06 divisions A-S (excluding classes K633...,2019
1,2018,Level 1,99999,All industries,Dollars (millions),H04,"Sales, government funding, grants and subsidies",Financial performance,605766,ANZSIC06 divisions A-S (excluding classes K633...,2019
2,2018,Level 1,99999,All industries,Dollars (millions),H05,"Interest, dividends and donations",Financial performance,63509,ANZSIC06 divisions A-S (excluding classes K633...,2019
3,2018,Level 1,99999,All industries,Dollars (millions),H07,Non-operating income,Financial performance,22583,ANZSIC06 divisions A-S (excluding classes K633...,2019
4,2018,Level 1,99999,All industries,Dollars (millions),H08,Total expenditure,Financial performance,597623,ANZSIC06 divisions A-S (excluding classes K633...,2019


In [18]:
# filter
# transform
# aggregate
# descriptive
# plot

## <i>This is header</i>

In [19]:
# select Variable_code = H06

In [20]:
H06 = data.loc[data.Variable_code =="H06"]

In [21]:
H06.head()

Unnamed: 0,Year,Industry_aggregation_NZSIOC,Industry_code_NZSIOC,Industry_name_NZSIOC,Units,Variable_code,Variable_name,Variable_category,Value,Industry_code_ANZSIC06,Year2
34,2018,Level 1,AA,"Agriculture, Forestry and Fishing",Dollars (millions),H06,"Government funding, grants and subsidies",Financial performance,12,ANZSIC06 division A,2019
305,2018,Level 3,AA21,Forestry and Logging,Dollars (millions),H06,"Government funding, grants and subsidies",Financial performance,0,ANZSIC06 group A030,2019
336,2018,Level 4,AA211,Forestry and Logging,Dollars (millions),H06,"Government funding, grants and subsidies",Financial performance,0,ANZSIC06 group A030,2019
397,2018,Level 3,AA32,"Agriculture, Forestry and Fishing Support Serv...",Dollars (millions),H06,"Government funding, grants and subsidies",Financial performance,11,"ANZSIC06 groups A042, A051, and A052",2019
428,2018,Level 1,BB,Mining,Dollars (millions),H06,"Government funding, grants and subsidies",Financial performance,2,ANZSIC06 division B,2019


In [22]:
H06.shape

(720, 11)

In [23]:
h06_2 = data[data['Variable_code']=='H06']

In [24]:
h06_3 = data[(data.Variable_code=='H06') | (data.Variable_code=='H07')]

In [25]:
h06_3.shape

(1554, 11)

In [26]:
# select Variable_code = or (H06, H01, H07, H05)

In [27]:
H_group = data[~data['Variable_code'].isin(['H06', 'H01', 'H07', 'H05'])]

In [28]:
H_group.head()

Unnamed: 0,Year,Industry_aggregation_NZSIOC,Industry_code_NZSIOC,Industry_name_NZSIOC,Units,Variable_code,Variable_name,Variable_category,Value,Industry_code_ANZSIC06,Year2
1,2018,Level 1,99999,All industries,Dollars (millions),H04,"Sales, government funding, grants and subsidies",Financial performance,605766,ANZSIC06 divisions A-S (excluding classes K633...,2019
4,2018,Level 1,99999,All industries,Dollars (millions),H08,Total expenditure,Financial performance,597623,ANZSIC06 divisions A-S (excluding classes K633...,2019
5,2018,Level 1,99999,All industries,Dollars (millions),H09,Interest and donations,Financial performance,34223,ANZSIC06 divisions A-S (excluding classes K633...,2019
6,2018,Level 1,99999,All industries,Dollars (millions),H10,Indirect taxes,Financial performance,7124,ANZSIC06 divisions A-S (excluding classes K633...,2019
7,2018,Level 1,99999,All industries,Dollars (millions),H11,Depreciation,Financial performance,19863,ANZSIC06 divisions A-S (excluding classes K633...,2019


In [29]:
data.Industry_aggregation_NZSIOC.value_counts()

Level 4    15222
Level 3     9234
Level 1     3354
Name: Industry_aggregation_NZSIOC, dtype: int64

In [30]:
# Level 1 -> 1, Level 3 -> 2, Level 4 -> 3

In [31]:
# Variable_code2, False -> 0, True -> 1

In [32]:
def transform(row):
    if '1' in row:
        return 1
    elif (row=='Level 3'):
        return 2
    elif (row=='Level 4'):
        return 3
    else:
        return -1

In [33]:
data['H4'] = data.Industry_aggregation_NZSIOC.apply(lambda x: transform(x))

In [34]:
data.head()

Unnamed: 0,Year,Industry_aggregation_NZSIOC,Industry_code_NZSIOC,Industry_name_NZSIOC,Units,Variable_code,Variable_name,Variable_category,Value,Industry_code_ANZSIC06,Year2,H4
0,2018,Level 1,99999,All industries,Dollars (millions),H01,Total income,Financial performance,691859,ANZSIC06 divisions A-S (excluding classes K633...,2019,1
1,2018,Level 1,99999,All industries,Dollars (millions),H04,"Sales, government funding, grants and subsidies",Financial performance,605766,ANZSIC06 divisions A-S (excluding classes K633...,2019,1
2,2018,Level 1,99999,All industries,Dollars (millions),H05,"Interest, dividends and donations",Financial performance,63509,ANZSIC06 divisions A-S (excluding classes K633...,2019,1
3,2018,Level 1,99999,All industries,Dollars (millions),H07,Non-operating income,Financial performance,22583,ANZSIC06 divisions A-S (excluding classes K633...,2019,1
4,2018,Level 1,99999,All industries,Dollars (millions),H08,Total expenditure,Financial performance,597623,ANZSIC06 divisions A-S (excluding classes K633...,2019,1


In [35]:
# filter
# transform
# aggregate
# descriptive
# plot

In [36]:
# tinh tong H4 tren tung level

In [37]:
data.groupby('Industry_aggregation_NZSIOC')['H4'].sum()

Industry_aggregation_NZSIOC
Level 1     3354
Level 3    18468
Level 4    45666
Name: H4, dtype: int64

In [38]:
data.groupby('Industry_aggregation_NZSIOC')['H4'].count()

Industry_aggregation_NZSIOC
Level 1     3354
Level 3     9234
Level 4    15222
Name: H4, dtype: int64

In [39]:
data.groupby('Industry_aggregation_NZSIOC')['H4'].mean()

Industry_aggregation_NZSIOC
Level 1    1
Level 3    2
Level 4    3
Name: H4, dtype: int64

In [40]:
import numpy as np

In [55]:
agg_data = data.groupby('Industry_aggregation_NZSIOC').agg({'H4': [pd.Series.sum, pd.Series.count, pd.Series.nunique],
                                                 'Industry_code_NZSIOC': pd.Series.nunique}).head()

Unnamed: 0_level_0,H4,H4,H4,Industry_code_NZSIOC
Unnamed: 0_level_1,sum,count,nunique,nunique
Industry_aggregation_NZSIOC,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Level 1,3354,3354,1,17
Level 3,18468,9234,1,46
Level 4,45666,15222,1,76


In [42]:
# descriptive
# plot

In [43]:
# Thống kê mô tả. 
# Dữ liệu dạng số 
data.describe(percentiles=[.1,0.25,0.5,0.75,0.9,0.95,0.99,1])

Unnamed: 0,Year,Year2,H4
count,27810.0,27810.0,27810.0
mean,2015.5,2016.5,2.426753
std,1.707856,1.707856,0.697037
min,2013.0,2014.0,1.0
10%,2013.0,2014.0,1.0
25%,2014.0,2015.0,2.0
50%,2015.5,2016.5,3.0
75%,2017.0,2018.0,3.0
90%,2018.0,2019.0,3.0
95%,2018.0,2019.0,3.0


In [44]:
# Dữ liệu dạng object 
data.describe(include='O')

Unnamed: 0,Industry_aggregation_NZSIOC,Industry_code_NZSIOC,Industry_name_NZSIOC,Units,Variable_code,Variable_name,Variable_category,Value,Industry_code_ANZSIC06
count,27810,27810,27810,27810,27810,27810,27810,27810,27810
unique,3,139,119,3,39,41,3,7492,121
top,Level 4,CC211,"Public Order, Safety and Regulatory Services",Dollars (millions),H01,Shareholders funds or owners equity,Financial performance,C,ANZSIC06 groups C151 and C152
freq,15222,216,496,21864,834,834,13902,1361,432


In [45]:
data.shape

(27810, 12)

In [46]:
agg_data.head()

Unnamed: 0_level_0,H4,H4,H4,Industry_code_NZSIOC
Unnamed: 0_level_1,sum,count,nunique,nunique
Industry_aggregation_NZSIOC,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Level 1,3354,3354,1,17
Level 3,18468,9234,1,46
Level 4,45666,15222,1,76


In [47]:
agg_data.reset_index().head()

Unnamed: 0_level_0,Industry_aggregation_NZSIOC,H4,H4,H4,Industry_code_NZSIOC
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,count,nunique,nunique
0,Level 1,3354,3354,1,17
1,Level 3,18468,9234,1,46
2,Level 4,45666,15222,1,76


In [48]:
%matplotlib inline

In [49]:
data.plot(x='Year', y='H4', kind='bar')

<matplotlib.axes._subplots.AxesSubplot at 0x11ce1a6a0>

Error in callback <function flush_figures at 0x11ccf0ea0> (for post_execute):


KeyboardInterrupt: 

Unnamed: 0_level_0,H4,H4,H4,Industry_code_NZSIOC
Unnamed: 0_level_1,sum,count,nunique,nunique
Industry_aggregation_NZSIOC,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Level 1,3354,3354,1,17
Level 3,18468,9234,1,46
Level 4,45666,15222,1,76
