In [1]:
print('Hello Notebook')

Hello Notebook


## numpy

In [2]:
# loading numpy library
import numpy as np

In [3]:
# generate a 3x3 array of samples drawn from the standard normal distribution
# (no seed is set, so the values differ on every fresh run)
data = np.random.randn(3, 3)

In [4]:
data

array([[-0.39357902,  0.21154499,  0.8333766 ],
       [ 0.70055429,  1.00029865,  0.46875792],
       [ 1.35126932,  0.41636211, -0.86373404]])

In [5]:
# shape is a tuple of integers giving the size
# of the array along each dimension;
# for a matrix with n rows and m columns, the shape
# is (n, m)
data.shape

(3, 3)

In [6]:
# element-wise add
# a new array is created
data + data

array([[-0.78715803,  0.42308998,  1.66675321],
       [ 1.40110858,  2.0005973 ,  0.93751584],
       [ 2.70253863,  0.83272422, -1.72746808]])

In [7]:
# element-wise product
# a new array is created
data * data

array([[0.15490444, 0.04475128, 0.69451656],
       [0.49077631, 1.00059739, 0.21973399],
       [1.82592876, 0.17335741, 0.74603649]])

In [8]:
# matrix product, written with the @ operator
# the result is a new array
# compare the values with the element-wise product above
data @ data

array([[ 1.42921943,  0.47533494, -0.94865189],
       [ 1.05845823,  1.34396918,  0.6478413 ],
       [-1.40728429,  0.34271459,  2.06732576]])

In [9]:
# element-wise divide
# a new array is created
1 / data

array([[-2.54078585,  4.72712682,  1.1999377 ],
       [ 1.42744112,  0.99970144,  2.1332973 ],
       [ 0.74004493,  2.40175554, -1.1577638 ]])

In [10]:
# a convenient notation for matrix transpose
# the result is a view of the same data; no values are copied
data.T

array([[-0.39357902,  0.70055429,  1.35126932],
       [ 0.21154499,  1.00029865,  0.41636211],
       [ 0.8333766 ,  0.46875792, -0.86373404]])

In [11]:
# verify if original data is changed after transpose
data

array([[-0.39357902,  0.21154499,  0.8333766 ],
       [ 0.70055429,  1.00029865,  0.46875792],
       [ 1.35126932,  0.41636211, -0.86373404]])

In [12]:
# another notation for matrix transpose
# the result is the same as calling data.T
np.transpose(data)

array([[-0.39357902,  0.70055429,  1.35126932],
       [ 0.21154499,  1.00029865,  0.41636211],
       [ 0.8333766 ,  0.46875792, -0.86373404]])

In [13]:
# more linear algebra
# inverse of a square matrix
# a new array is created
np.linalg.inv(data)

array([[ 5.17919415, -2.59019625,  3.5914345 ],
       [-6.05617405,  3.84427346, -3.75698999],
       [ 5.18322138, -2.19910624,  2.64980522]])

In [14]:
# math functions, such as sin, cos, and exp
# these functions operate in element-wise style
# a new array is created
np.exp(data)

array([[0.67463801, 1.23558555, 2.30107546],
       [2.01486922, 2.71909377, 1.5980081 ],
       [3.86232493, 1.51643488, 0.42158493]])

In [15]:
# basic indexing
# index starts from 0
# element at second row and third column
# giving one integer per axis returns a single scalar, not an array
data[1, 2]

0.46875791805470846

In [16]:
# slicing
# obtain a subset of the array; the syntax is similar to
# slicing a Python list
# rows from index 1 (included) to index 3 (excluded),
# columns from index 1 (included) to index 3 (excluded)
# unlike list slicing, the result is a view that shares memory
# with data, so writing to it also changes data
data[1:3, 1:3]

array([[ 1.00029865,  0.46875792],
       [ 0.41636211, -0.86373404]])

In [17]:
# advanced indexing
# indexed by integer arrays
idx = np.array([0, 1])         

# select first and second rows of the array
data[idx]

array([[-0.39357902,  0.21154499,  0.8333766 ],
       [ 0.70055429,  1.00029865,  0.46875792]])

In [18]:
# select first and second columns of the array
data[:, idx]

array([[-0.39357902,  0.21154499],
       [ 0.70055429,  1.00029865],
       [ 1.35126932,  0.41636211]])

In [19]:
# advanced indexing
# indexed by boolean arrays
# set all the negative values in data to 0
data[data < 0] = 0

In [20]:
# check if data is changed:
# all values are greater or equal to 0
data

array([[0.        , 0.21154499, 0.8333766 ],
       [0.70055429, 1.00029865, 0.46875792],
       [1.35126932, 0.41636211, 0.        ]])

In [21]:
# functions for descriptive statistics,
# such as sum, mean, median, std, max, min
# for mean, default is to compute the mean of 
# the flattened array
np.mean(data)

0.5535737642404467

In [22]:
# axis along which the means are computed
# for a 2-dimensional array, there are two axes:
# axis 0: running vertically downwards across rows,
# axis 1: running horizontally across columns
# axis=1 therefore yields one mean per row
np.mean(data, axis=1)

array([0.3483072 , 0.72320362, 0.58921047])

In [23]:
# reading text data

# StringIO behaves like a file object
from io import StringIO
c = StringIO("0 1\n2 3")

# np.loadtxt(fname,...)
# fname can be a file object or filename
np.loadtxt(c)

array([[0., 1.],
       [2., 3.]])

## pandas

In [24]:
# loading pandas library
import pandas as pd

In [25]:
# create a Series object from a Python list
obj1 = pd.Series([4, 7, -5, 3])

In [26]:
obj1

0    4
1    7
2   -5
3    3
dtype: int64

In [27]:
obj1.values

array([ 4,  7, -5,  3])

In [28]:
# an index object is immutable and is responsible for
# holding axis labels and other metadata
obj1.index

RangeIndex(start=0, stop=4, step=1)

In [29]:
# an index with labels
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])

In [30]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [31]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [32]:
# indexing
obj2['a']

-5

In [33]:
# selecting several entries by passing a list of labels
# the result follows the order of the list, not the order of the index
obj2[['a', 'b', 'c']]

a   -5
b    7
c    3
dtype: int64

In [34]:
# selection
obj2[obj2 < 5]

d    4
a   -5
c    3
dtype: int64

In [35]:
# build a DataFrame from a dictionary that maps
# each column name to the list of values in that column
data = dict(
    city=['Beijing'] * 3 + ['Shanghai'] * 3,
    year=[1990, 2000, 2010] * 2,
    population=[10.8, 13.6, 19.6, 13.3, 16.4, 23.0],
)
frame = pd.DataFrame(data=data)

In [36]:
# show the first 5 rows
frame.head()

Unnamed: 0,city,year,population
0,Beijing,1990,10.8
1,Beijing,2000,13.6
2,Beijing,2010,19.6
3,Shanghai,1990,13.3
4,Shanghai,2000,16.4


In [37]:
frame['population']

0    10.8
1    13.6
2    19.6
3    13.3
4    16.4
5    23.0
Name: population, dtype: float64

In [38]:
frame.year

0    1990
1    2000
2    2010
3    1990
4    2000
5    2010
Name: year, dtype: int64

In [39]:
frame.columns

Index(['city', 'year', 'population'], dtype='object')

In [40]:
frame['country'] = 'China' 

In [41]:
frame

Unnamed: 0,city,year,population,country
0,Beijing,1990,10.8,China
1,Beijing,2000,13.6,China
2,Beijing,2010,19.6,China
3,Shanghai,1990,13.3,China
4,Shanghai,2000,16.4,China
5,Shanghai,2010,23.0,China


In [42]:
frame.values

array([['Beijing', 1990, 10.8, 'China'],
       ['Beijing', 2000, 13.6, 'China'],
       ['Beijing', 2010, 19.6, 'China'],
       ['Shanghai', 1990, 13.3, 'China'],
       ['Shanghai', 2000, 16.4, 'China'],
       ['Shanghai', 2010, 23.0, 'China']], dtype=object)

In [43]:
# sort values by a column
# return sorted DataFrame
frame.sort_values(by='population')

Unnamed: 0,city,year,population,country
0,Beijing,1990,10.8,China
3,Shanghai,1990,13.3,China
1,Beijing,2000,13.6,China
4,Shanghai,2000,16.4,China
2,Beijing,2010,19.6,China
5,Shanghai,2010,23.0,China


In [44]:
frame['population'].describe()

count     6.000000
mean     16.116667
std       4.519476
min      10.800000
25%      13.375000
50%      15.000000
75%      18.800000
max      23.000000
Name: population, dtype: float64

In [45]:
frame['population'].sum()

96.69999999999999

In [46]:
frame['population'].mean()

16.116666666666664

In [47]:
frame['population'].std()

4.519476370849467

In [48]:
obj3 = pd.Series([4, np.nan, -5, 3])

In [49]:
obj3

0    4.0
1    NaN
2   -5.0
3    3.0
dtype: float64

In [50]:
# fill NAs with the mean of the non-missing values
# fillna returns a new Series,
# so assign it back to obj3
obj3 = obj3.fillna(obj3.mean())

In [51]:
obj3

0    4.000000
1    0.666667
2   -5.000000
3    3.000000
dtype: float64

In [52]:
obj4 = pd.Series([1, 2, np.nan, 4])

In [53]:
# fill NAs by linear interpolation between the neighbouring values
# interpolate already returns a new Series with the gaps filled,
# so wrapping it in fillna is unnecessary
# assign the result back to obj4
obj4 = obj4.interpolate(method='linear')

In [54]:
obj4

0    1.0
1    2.0
2    3.0
3    4.0
dtype: float64

In [55]:
# keep only the first row seen for each distinct city
# (keeping the first occurrence is the default)
# the result is a new DataFrame; frame itself is left as it was
frame.drop_duplicates(subset='city')

Unnamed: 0,city,year,population,country
0,Beijing,1990,10.8,China
3,Shanghai,1990,13.3,China


In [56]:
frame['log_population'] = np.log(frame['population'])

In [57]:
frame

Unnamed: 0,city,year,population,country,log_population
0,Beijing,1990,10.8,China,2.379546
1,Beijing,2000,13.6,China,2.61007
2,Beijing,2010,19.6,China,2.97553
3,Shanghai,1990,13.3,China,2.587764
4,Shanghai,2000,16.4,China,2.797281
5,Shanghai,2010,23.0,China,3.135494


In [58]:
# str.lower() returns a new, lower-cased column
# assign it back to the original DataFrame
frame['city'] = frame['city'].str.lower()

In [59]:
frame

Unnamed: 0,city,year,population,country,log_population
0,beijing,1990,10.8,China,2.379546
1,beijing,2000,13.6,China,2.61007
2,beijing,2010,19.6,China,2.97553
3,shanghai,1990,13.3,China,2.587764
4,shanghai,2000,16.4,China,2.797281
5,shanghai,2010,23.0,China,3.135494


In [60]:
# write data to a text file
# sep is the delimiter for the output file;
# in this case, a space separates the values
# index is a boolean: whether to write the row labels (index)
# columns lists the columns to write
frame.to_csv('frame.txt', sep=' ', index=False,
             columns=['city', 'year', 'population'])

In [61]:
# read data from a text file
frame_new = pd.read_csv('frame.txt', sep=' ')
frame_new

Unnamed: 0,city,year,population
0,beijing,1990,10.8
1,beijing,2000,13.6
2,beijing,2010,19.6
3,shanghai,1990,13.3
4,shanghai,2000,16.4
5,shanghai,2010,23.0


In [62]:
# this enables interactive visualization
# e.g., you can zoom in/out on the plot
%matplotlib notebook

In [63]:
s = pd.Series(np.random.randn(10).cumsum(),
              index=np.arange(0, 100, 10))

In [64]:
s.plot()

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x11a18f748>

In [65]:
# loading matplotlib library
import matplotlib.pyplot as plt

In [66]:
# lay out a 2x2 grid of axes on a single figure
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)

# top-left: histogram of 100 standard normal samples
ax1.hist(np.random.randn(100), bins=20, color='k', alpha=0.3)

# top-right: scatter of a noisy linear trend
ax2.scatter(np.arange(30), np.arange(30) + 3 * np.random.randn(30))

# bottom row: random walks (the second one passed through sin)
ax3.plot(np.random.randn(50).cumsum(), 'k-.')
ax4.plot(np.sin(np.random.randn(100).cumsum()), 'k--')

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x11a2f8ba8>]

## statsmodels

In [67]:
# ignore some warning messages before importing
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# code adapted from "Python for Data Analysis"
# by Wes McKinney
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [68]:
def dnorm(mean, variance, size=1):
    """Draw samples from a normal distribution with the given mean and variance.

    Parameters
    ----------
    mean : float
        Value added to every scaled sample.
    variance : float
        Variance of the distribution; its square root scales the
        standard normal samples.
    size : int or sequence of ints, optional
        Output shape. A scalar (Python int or NumPy integer) is treated
        as the length of a 1-D result. Defaults to 1.

    Returns
    -------
    numpy.ndarray
        Array of shape ``size`` holding the samples.
    """
    # np.ndim is 0 for every scalar, so NumPy integers (e.g. np.int64) are
    # wrapped as well; isinstance(size, int) missed them and the * unpacking
    # below then raised TypeError
    if np.ndim(size) == 0:
        size = (size,)
    return mean + np.sqrt(variance) * np.random.randn(*size)

In [69]:
# simulate a small regression data set

# sample size
N = 100

# predictors: three zero-mean normal columns, one per variance listed,
# drawn in that order and placed side by side
X = np.c_[tuple(dnorm(0, var, size=N) for var in (0.4, 0.6, 0.2))]

# noise term added to the response
eps = dnorm(0, 0.1, size=N)

# coefficients applied to the three predictors
beta = [0.1, 0.3, 0.5]

# response: linear combination of the predictors plus the noise
y = X.dot(beta) + eps

In [70]:
# collect the three predictor columns and the response in one DataFrame
data = pd.DataFrame({
    'x1': X[:, 0],
    'x2': X[:, 1],
    'x3': X[:, 2],
    'y': y,
})

In [71]:
data.head()

Unnamed: 0,x1,x2,x3,y
0,0.113386,0.328887,0.176896,0.281413
1,1.5317,0.744785,0.030383,0.574844
2,-0.343157,1.560617,-0.09971,0.964428
3,1.166322,-0.420031,0.495947,-0.386127
4,0.639385,0.019169,0.264958,0.21334


In [72]:
# fit a regression model to the data
# the formula 'y ~ x1 + x2 + x3' regresses y on x1, x2 and x3;
# the fitted parameters also include an Intercept term
# estimation method: OLS (ordinary least squares)
model = smf.ols('y ~ x1 + x2 + x3', data=data)
results = model.fit()

In [73]:
# estimated regression coefficients
# Intercept is beta_0
results.params

Intercept   -0.018570
x1           0.134576
x2           0.318066
x3           0.389104
dtype: float64

In [74]:
# visualization
regression_fig = sm.graphics.plot_regress_exog(results, "x1")

<IPython.core.display.Javascript object>