In [None]:
import numpy as np

In [None]:
# Draw 50 samples from N(0, 1). The arguments are the mean (loc), the
# standard deviation (scale) -- not the variance -- and the sample size.
x = np.random.normal(0, 1, size=50)

In [None]:
# Sample mean of x (function form of x.mean()).
np.mean(x)

In [None]:
# y is x plus freshly drawn N(50, 1) noise, element by element.
y = np.random.normal(loc=50, scale=1, size=50) + x

In [None]:
# 2 x 2 correlation matrix of x and y; the off-diagonal entry is corr(x, y).
np.corrcoef(x, y)

In [None]:
x.ndim # attribute (accessed without parentheses): the number of array dimensions

In [None]:
# Two generators created from the same seed yield the same draws, which is
# what makes results reproducible.
rng = np.random.default_rng(1303)
rng2 = np.random.default_rng(1303)
print(rng.normal(scale=5, size=2))
print(rng2.normal(scale=5, size=2))

In [None]:
# Re-seed the generator and draw 10 standard-normal values.
rng = np.random.default_rng(3)
y = rng.standard_normal(size=10)
# The function form and the method form of the mean agree.
np.mean(y), y.mean()

In [None]:
# Three equivalent ways of getting the (divide-by-n) variance of y:
# the numpy function, the array method, and the definition written out.
(
    np.var(y),
    y.var(),
    np.mean((y - y.mean()) ** 2),
)

In [None]:
# ddof argument -> by default (ddof=0) np.var divides by n, not n-1; pass ddof=1 to divide by n-1 (sample variance)

In [None]:
# np.var?

In [None]:
# The standard deviation is the square root of the variance.
np.sqrt(y.var()), y.std()

In [None]:
# 10 x 3 matrix of N(0,1) random variables

In [None]:
# 10 rows x 3 columns of independent N(0, 1) draws from the current generator.
X = rng.standard_normal(size=(10, 3))
X

In [None]:
# axis=0 is the row axis: summing over it collapses the rows and leaves one
# total per column (3 values for the 10 x 3 matrix X).
np.sum(X, axis=0)

In [None]:
# Column means: average over the row axis (axis=0).
X.mean(axis=0)

In [None]:
# Note that in matplotlib, the word axes is not the plural of axis: a
# plot’s axes contains much more information than just the x-axis and the
# y-axis

In [None]:
from matplotlib.pyplot import subplots

# Two fresh standard-normal samples of length 100 (x and y are overwritten).
x = rng.standard_normal(100)
y = rng.standard_normal(100)

fig, ax = subplots(figsize=(8, 8))
# The trailing `;` stops the notebook from printing the text repr of the
# value returned by plot(); it is kept as the very last thing in the cell.
ax.plot(x, y);

In [None]:
# Scatterplot drawn with plot(): circle markers and no connecting line.
fig, ax = subplots(figsize=(8, 8))
ax.plot(x, y, marker='o', linestyle='none');

In [None]:
# Alternative: a dedicated scatter() call, with axis labels and a title
# set in one go through Axes.set().
fig, ax = subplots(figsize=(8, 8))
ax.scatter(x, y, marker='o')
ax.set(xlabel="this is the x-axis",
       ylabel="this is the y-axis",
       title="Plot of X vs Y");

In [None]:
# Resize the existing figure to 12 x 3 inches and display it again.
fig.set_size_inches(12, 3)
fig

In [None]:
# 2 x 3 grid of panels; axes is a 2-D array indexed as axes[row, col].
fig, axes = subplots(2, 3, figsize=(15, 5))
# If graphs share x-axis -> sharex=True

In [None]:
# Top-middle panel: circles via plot(); bottom-right panel: '+' via scatter().
axes[0, 1].plot(x, y, 'o')
axes[1, 2].scatter(x, y, marker='+')
# Ending the cell with the figure re-renders it with the new content.
fig

In [None]:
# dpi (dots per inch) sets the resolution, and hence the pixel dimensions, of the saved image
# fig.savefig("Figure.png", dpi=400)
# fig.savefig("Figure.pdf", dpi=200);

In [None]:
# Restrict the x-range of the top-middle panel to [-1, 1].
axes[0, 1].set_xlim([-1, 1])
# The figure was already rendered by an earlier cell, so end the cell with
# `fig` to display it again with the new limits (as the previous cells do).
fig

In [None]:
# contour plot in order to represent three-dimensional data

In [None]:
fig, ax = subplots(figsize=(8, 8))
# 50 evenly spaced grid points from -pi to pi (both endpoints included).
x = np.linspace(-np.pi, np.pi, 50)
y = x

# f[i, j] = cos(y[i]) * 1 / (1 + x[j]**2): the z value (third dimension)
# attached to each (x, y) pair of grid coordinates.
f = np.outer(np.cos(y), 1 / (1 + x**2))

ax.contour(x, y, f, levels=45);

In [None]:
# Heatmap: a color-coded image of the matrix f, where each cell's color
# depends on its z value. The comment sits above the call so that `;` is the
# last thing in the cell.
fig, ax = subplots(figsize=(8, 8))
ax.imshow(f);

In [None]:
# Integers 0, 1, ..., 9 -- the stop value 10 is excluded.
seq2 = np.arange(10)

In [None]:
# 4 x 4 matrix holding 0..15, filled row by row.
A = np.arange(16).reshape(4, 4)

In [None]:
# A single element (row index 2, column index 2) and the 2nd and 4th rows
# (row indices 1 and 3). A notebook cell only displays its final expression,
# so both results are bundled into one tuple to make each of them visible.
A[2, 2], A[[1, 3]]

In [None]:
# ":" keeps every row; the list picks the 1st and 3rd columns.
A[:, [0, 2]]

In [None]:
# Unlike R, two index lists are paired element-wise: this returns the two
# entries A[1,0] and A[3,2], not the 2 x 2 submatrix.
A[[1, 3], [0, 2]]

In [None]:
# Extracting a submatrix in two steps: first take rows 1 and 3, then take
# columns 0 and 2 of that intermediate result.
A[[1, 3]][:, [0, 2]]

In [None]:
# Better alternative: np.ix_ builds an index mesh that selects the full
# cross-product of the given rows (1, 3) and columns (0, 2, 3) in one step.
idx = np.ix_([1, 3], [0, 2, 3])
A[idx]

In [None]:
# Slices (start:stop:step) also subset a matrix: rows 1 and 3, columns 0 and 2.
A[1:4:2, 0:3:2]

In [None]:
# Boolean vector of False values with one entry per row of A
# (A.shape[0] is the size of the first, i.e. row, dimension).
keep_rows = np.zeros(A.shape[0], dtype=bool)
keep_rows

In [None]:
# Flag rows 1 and 3 as the ones to keep.
keep_rows[[1, 3]] = True
keep_rows

In [None]:
# Element-wise, the Boolean mask compares equal to the integers 0/1, and
# .all() confirms that EVERY entry matches. (np.any / .any() would instead
# check whether ANY entry of an array is True.)
(keep_rows == np.array([0, 1, 0, 1])).all()

In [None]:
# The integer array [0,1,0,1] and the Boolean mask keep_rows compare equal
# element-wise, yet they behave differently as indices: the integer array
# picks rows 0, 1, 0, 1 (with repeats), whereas the Boolean mask keeps only
# the rows flagged True.
A[np.array([0,1,0,1])], A[keep_rows]

In [None]:
# Boolean column mask (one entry per column of A) keeping columns 0, 2 and 3.
keep_cols = np.zeros(A.shape[1], dtype=bool)
keep_cols[[0, 2, 3]] = True
# np.ix_ also accepts Boolean masks for the rows and the columns.
idx_bool = np.ix_(keep_rows, keep_cols)
A[idx_bool]

In [None]:
# Integer row indices and a Boolean column mask can be mixed in np.ix_.
idx_mixed = np.ix_([1, 3], keep_cols)
A[idx_mixed]

In [None]:
import pandas as pd

In [None]:
# Read the comma-separated version of the data from the working directory.
# NOTE(review): `df` is not referenced again in the visible cells -- confirm
# whether this cell is still needed.
df = pd.read_csv('Auto.csv')

In [None]:
# Whitespace-delimited version of the data. sep=r'\s+' is the documented
# equivalent of the deprecated delim_whitespace=True flag. '?' marks a
# missing value in the file, so it is mapped to NaN on import.
Auto = pd.read_csv('Auto.data',
                   sep=r'\s+',
                   na_values=['?'])

In [None]:
# Sorted distinct horsepower values. The raw file writes missing values as
# "?", which would make the column load as strings; the import statement
# passes na_values=['?'] so they are read as NaN instead.
np.unique(Auto['horsepower'])

In [None]:
Auto['horsepower'].sum() # NaN entries are skipped (pandas' default skipna=True)

In [None]:
# Copy of the data without any row that contains a missing value;
# Auto itself is left unchanged.
auto_new = Auto.dropna(axis=0, how='any')

In [None]:
# First three rows of the data frame.
Auto.head(3)

In [None]:
# Boolean Series (one entry per row) flagging cars with year greater than 80;
# .loc uses it to subset the rows and the list to subset the columns.
idx_80 = Auto['year'].gt(80)
Auto.loc[idx_80, ['weight', 'origin']]

In [None]:
# Same selection, with the row filter given as a function of the data frame.
Auto.loc[lambda frame: frame['year'] > 80, ['weight', 'origin']]

In [None]:
# Rows must satisfy both conditions: year above 80 AND mpg above 30.
Auto.loc[
    lambda frame: (frame['year'] > 80) & (frame['mpg'] > 30),
    ['weight', 'origin'],
]

In [None]:
# Cars with displacement below 300 whose name contains 'ford' or 'datsun'.
# At this point the car names are still an ordinary column (the index is only
# set to 'name' in a later cell), so the filter reads df['name'] rather than
# df.index. na=False treats a missing name as "no match".
Auto.loc[lambda df: (df['displacement'] < 300)
    & (df['name'].str.contains('ford', na=False)
       | df['name'].str.contains('datsun', na=False)),
    ['weight', 'origin']
    ]

In [None]:
# All rows, restricted to the mpg and horsepower columns.
Auto.loc[:, ['mpg', 'horsepower']]

In [None]:
# Use the car name as the row index; afterwards "name" is no longer in
# Auto.columns. The guard makes re-running this cell a no-op instead of a
# KeyError once the column has already been moved into the index.
if 'name' in Auto.columns:
    Auto = Auto.set_index('name')

In [None]:
# Select rows by their index labels (car names), keeping every column.
rows = ['amc rebel sst', 'ford torino']
Auto.loc[rows, :]

In [None]:
# Positional (integer) selection with iloc: rows 3 and 4; then columns 0, 2
# and 3; then both at once.
# Note: a notebook cell only displays the value of its last expression, so
# only the third selection is shown.
Auto.iloc[[3,4]]
Auto.iloc[:,[0,2,3]]
Auto.iloc[[3,4],[0,2,3]]

In [None]:
# Label-based selection: the row(s) whose index label is 'ford galaxie 500',
# restricted to the mpg and origin columns.
Auto.loc['ford galaxie 500', ['mpg', 'origin']]

In [None]:
# Weighted average accumulated pair by pair: zip() walks the values and
# their weights in lockstep.
total = 0
for value, weight in zip((2, 3, 19), (0.2, 0.3, 0.5)):
    total = total + value * weight
print(f'Weighted average is: {total}')

In [None]:
# Create a 127 x 5 data frame in which each entry is independently missing
# with probability 0.2 (so about 20% of the values are missing).

rng = np.random.default_rng(1)
A = rng.standard_normal(size=(127, 5))
# M holds 0 where the value is kept and NaN where it is removed;
# adding NaN to an entry turns that entry into NaN.
M = rng.choice([0, np.nan], size=A.shape, p=[0.8, 0.2])
A = A + M
D = pd.DataFrame(A, columns=['food', 'bar', 'pickle', 'snack', 'popcorn'])
D.head(3)

In [None]:
# "{1:.2%}" renders the second argument (the fraction of missing entries in
# the column) as a percentage with two decimal digits.
template = 'Column "{0}" has {1:.2%} missing values'
for col in D.columns:
    print(template.format(col, D[col].isna().mean()))

In [None]:
# pandas returns the matplotlib Axes it drew on; the owning figure is reached
# through ax.figure and written to a PNG file in the working directory.
ax = Auto.plot.scatter(x='horsepower', y='mpg')
ax.set_title('Horsepower vs. MPG')
fig = ax.figure
fig.savefig('horsepower_mpg.png');

In [None]:
# One row of three panels; the scatterplot is drawn into the middle one.
fig, axes = subplots(1, 3, figsize=(15, 5))
Auto.plot.scatter(x='horsepower', y='mpg', ax=axes[1]);

In [None]:
# Storage type of the cylinders column before converting it to a category.
Auto['cylinders'].dtype

In [None]:
# Transform cylinders into a categorical variable.
Auto['cylinders'] = Auto['cylinders'].astype('category')

In [None]:
# One box of mpg values per cylinders group, drawn on the given Axes.
fig, ax = subplots(figsize=(8, 8))
Auto.boxplot(column='mpg', by='cylinders', ax=ax);

In [None]:
# Histogram of mpg: 30 bins, red bars, drawn on the given Axes.
fig, ax = subplots(figsize=(8, 8))
Auto.hist(column='mpg', bins=30, color='red', ax=ax);

In [None]:
# scatter_matrix draws a grid of scatterplots showing all of the pairwise
# relationships between the columns of a data frame.
pd.plotting.scatter_matrix(frame=Auto);

In [None]:
# Same plot restricted to three columns of interest.
pd.plotting.scatter_matrix(Auto[['mpg', 'displacement', 'weight']]);

In [None]:
# Summary statistics for the mpg and weight columns.
Auto.loc[:, ['mpg', 'weight']].describe()

In [None]:
# Load the Credit data from a CSV file in the working directory.
credit = pd.read_csv('Credit.csv')

In [None]:
# Clear collinearity between Limit and Rating can be spotted in this plot;
# it is still better to compute the VIF to quantify it.
pd.plotting.scatter_matrix(
    credit[['Income', 'Limit', 'Rating', 'Cards', 'Age']]
);