# 2D Data

- NumPy and SciPy provide a comprehensive means to work with 2D data.
- Pandas has the class DataFrame specifically to handle 2D labeled data.

In [3]:
import math
import statistics
import numpy as np
import scipy.stats
import pandas as pd

In [4]:
# Build the 5x3 integer sample array: five rows by three columns.
a = np.array(
    [
        [1, 1, 1],
        [2, 3, 1],
        [4, 9, 2],
        [8, 27, 4],
        [16, 1, 1],
    ]
)

In [5]:

# Mean over every element of the array (no axis given, so axis=None).
np.mean(a)

# Same overall mean, via the ndarray method.
a.mean()

# Median over every element of the array.
np.median(a)

# Sample variance over every element; ddof=1 divides by n - 1.
a.var(ddof=1)



53.40000000000001

In [7]:
# The functions and methods used so far take one optional parameter called axis:
#    axis=None calculates the statistic across all data in the array. The examples above work like this. This behavior is often the default in NumPy.
#    axis=0 calculates the statistic down the rows, that is, one result for each column of the array. This behavior is often the default for SciPy statistical functions.
#    axis=1 calculates the statistic across the columns, that is, one result for each row of the array.

# axis = 0 : results for each column

np.mean(a, axis=0)

a.mean(axis=0)

# axis = 1 : results for each row

np.mean(a, axis=1)

a.mean(axis=1)

# axis works the same way with other NumPy functions and methods

np.median(a, axis=0)

np.median(a, axis=1)

# ddof=1 gives the sample variance (divides by n - 1)
a.var(axis=0, ddof=1)

a.var(axis=1, ddof=1)

array([ 1.,  2.,  5., 13.,  6.])

In [None]:
# SciPy statistics functions.
# For SciPy the default value for axis is 0:

# Geometric mean of each column.
scipy.stats.gmean(a)  # Default: axis=0

scipy.stats.gmean(a, axis=0)

# Geometric mean of each row.
scipy.stats.gmean(a, axis=1)

# Geometric mean of all items in the array.
scipy.stats.gmean(a, axis=None)

# scipy.stats.describe() on 2D data:
#  It works similarly to the 1D case; axis selects the direction.
#  axis=None : summary across all data.

scipy.stats.describe(a, axis=None, ddof=1, bias=False)

scipy.stats.describe(a, ddof=1, bias=False)  # Default: axis=0

scipy.stats.describe(a, axis=1, ddof=1, bias=False)

# Keep the summary object so its individual fields can be read by name.
result = scipy.stats.describe(a, axis=1, ddof=1, bias=False)
result.mean


## DataFrames

- The `DataFrame` class is one of the fundamental pandas data types.
- It’s very convenient to work with because it has labels for both rows and columns.

In [8]:
# Wrap the array `a` in a DataFrame with labels for its five rows and three columns.
row_names = ['first', 'second', 'third', 'fourth', 'fifth']
col_names = ['A', 'B', 'C']
df = pd.DataFrame(a, index=row_names, columns=col_names)

df

# With no axis argument, DataFrame statistics are computed for each column.
df.mean()

df.var()

# axis=0 : result for each column (the default)
# axis=1 : result for each row

df.mean(axis=1)

df.var(axis=1)

# The labels 'first', 'second', and so on refer to the different rows.
# You can isolate each column of a DataFrame like this:
df['A']

df['A'].mean()

df['A'].var()

# Convert DataFrames to NumPy:
# df.values and df.to_numpy() give you a
# NumPy array with all items from the DataFrame
# without row and column labels
df.values

df.to_numpy()

#  DataFrame objects have the method .describe()
#  The summary contains the following results for each column:
#
#    count: the number of items in each column
#    mean: the mean of each column
#    std: the standard deviation
#    min and max: the minimum and maximum values
#    25%, 50%, and 75%: the percentiles

df.describe()

# Access a single item of the summary by row label (statistic) and column label:
df.describe().at['mean', 'A']

df.describe().at['50%', 'B']


3.0