# Notebook Numpy & Pandas
## Introducción a la Ciencia de Datos

### Juan Francisco Regalado Heras
Maestría en Ciencias de la Computación, CICESE

### Numpy
The NumPy library is the core library for scientific computing in Python. It provides a high-performance multidimensional array object, and tools for working with these arrays.

In [None]:
pip install numpy

In [None]:
import numpy as np

## -> Create Arrays

In [None]:
# Build a 1-D, a 2-D and a 3-D array from nested sequences.
a = np.array([1, 2, 3])
b = np.array([[1.5, 2, 3], [4, 5, 6]], dtype=np.float64)
c = np.array(
    [
        [[1.5, 2, 3], [4, 5, 6]],
        [[3, 2, 1], [4, 5, 6]],
    ],
    dtype=np.float64,
)

### Initial Placeholders

In [None]:
np.zeros(shape=(3, 4))  # 3x4 array filled with zeros
np.ones(shape=(2, 3, 4), dtype=np.int16)  # 2x3x4 array of 16-bit integer ones
d = np.arange(10, 25, 5)  # Evenly spaced values from 10 up to (not including) 25, step 5
np.linspace(start=0, stop=2, num=9)  # 9 evenly spaced samples between 0 and 2
e = np.full(shape=(2, 2), fill_value=7)  # 2x2 constant array filled with 7
f = np.eye(N=2)  # 2x2 identity matrix
np.random.random(size=(2, 2))  # 2x2 array of random values
np.empty(shape=(3, 2))  # 3x2 array whose entries are left uninitialized

## -> I / O
### Saving & Loading On Disk

In [None]:
# Persist arrays in NumPy's binary formats, then read one back.
np.save(file='my_array', arr=a)  # Single array; loaded back below as 'my_array.npy'
np.savez('array.npz', a, b)  # Several arrays stored together in one .npz file
np.load(file='my_array.npy')  # Read the array saved by np.save above

### Saving & Loading Text Files

In [None]:
# Write the text files first so that the loads below read files this cell
# created itself; previously the cell tried to load files it never wrote
# and failed on a clean working directory.
np.savetxt("myfile.txt", a, delimiter=" ")  # Space-delimited text file
np.savetxt("my_file.csv", b, delimiter=",")  # Comma-delimited text file
np.loadtxt("myfile.txt")  # Load a whitespace-delimited text file
np.genfromtxt("my_file.csv", delimiter=',')  # Load a delimited file (tolerates missing values)
np.savetxt("myarray.txt", a, delimiter=" ")  # Save an array as delimited text


## -> Asking For Help

In [None]:
# Print NumPy's built-in help text for the ndarray.dtype attribute.
np.info(object=np.ndarray.dtype)

## -> Inspecting Your Array

In [None]:
np.shape(a)  # Array dimensions, as a tuple
len(a)  # Length of the first axis
np.ndim(b)  # Number of array dimensions (axes)
np.size(e)  # Total number of array elements
b.dtype  # Data type of the array elements
b.dtype.name  # Name of that data type, as a string
b.astype(int)  # Converted copy of the array with integer elements


## -> Data Types

In [None]:
# The bare aliases np.complex, np.bool, np.object, np.string_ and np.unicode_
# have been removed from NumPy (1.24 / 2.0); the names below work on both
# NumPy 1.x and 2.x.
np.int64  # Signed 64-bit integer type
np.float32  # Single-precision (32-bit) floating point; np.float64 is double precision
np.complex128  # Complex numbers represented by two 64-bit floats (128 bits total)
np.bool_  # Boolean type storing True and False values
np.object_  # Python object type
np.bytes_  # Fixed-length byte-string type
np.str_  # Fixed-length unicode string type

## -> Array Mathematics

### Arithmetic Operations

In [None]:
# Element-wise arithmetic. Each operator has an equivalent NumPy function;
# operands of different shapes are broadcast against each other.
g = a - b  # Subtraction
np.subtract(a, b)  # Subtraction (function form)

b + a  # Addition
np.add(b, a)  # Addition (function form)

a / b  # Division
np.divide(a, b)  # Division (function form)

a * b  # Multiplication
np.multiply(a, b)  # Multiplication (function form)

np.exp(b)  # Exponentiation
np.sqrt(b)  # Square root
np.sin(a)  # Element-wise sine
np.cos(b)  # Element-wise cosine
np.log(a)  # Element-wise natural logarithm
e.dot(f)  # Dot product
# Expected result for the arrays defined earlier in this notebook:
# array([[7., 7.],
#        [7., 7.]])
# (This used to be a separate statement, which replaced the real result as the
# cell's displayed output with a hard-coded literal.)


### Comparison

In [None]:
# NOTE: this must be `==`. The previous `a = b` was an assignment that rebound
# `a` to the 2-D array `b`, silently breaking every later cell that uses `a`.
a == b  # Element-wise comparison (a is broadcast against each row of b)
# Expected: array([[False,  True,  True],
#                  [False, False, False]])
a < 2  # Element-wise comparison against a scalar
# Expected: array([ True, False, False])
np.array_equal(a, b)  # Array-wise comparison: True only if shape and all elements match

### Aggregate Functions

In [None]:
np.sum(a)  # Sum over all elements
np.min(a)  # Smallest element
np.max(b, axis=0)  # Maximum along axis 0 (one value per column)
np.cumsum(b, axis=1)  # Running sum along axis 1 (within each row)
np.mean(a)  # Arithmetic mean
np.median(b)  # Median of all elements
np.corrcoef(a)  # Correlation coefficient
b.std()  # Standard deviation of all elements

## -> Copying Arrays

In [None]:
h = a.view()  # New array object that looks at the same data as `a`
np.copy(a, order='K')  # Function form: returns a copy of the array
h = a.copy(order='C')  # Method form: `h` now holds its own copy of the data

## -> Sorting Arrays

In [None]:
a.sort(axis=-1)  # Sort the array in place along its last axis
c.sort(axis=0)  # Sort the array in place along axis 0

## -> Subsetting, Slicing, Indexing

In [None]:
# Subsetting: pick out single elements by index.
a[2]  # Element at index 2
b[1][2]  # Element at row 1, column 2

In [None]:
# Slicing: pick out ranges of elements.
a[:2]  # Items at index 0 and 1
b[:2, 1]  # Items at rows 0 and 1 in column 1
b[0:1]  # All items in row 0 (the result keeps two dimensions)
c[1, :, :]  # Same as c[1, ...]
a[::-1]  # The array `a` in reverse order


In [None]:
# Boolean indexing: keep only the elements where the mask is True.
a[np.less(a, 2)]  # Elements of `a` that are less than 2


In [None]:
# Fancy indexing: index with lists of positions.
b[[1, 0, 1, 0], [0, 1, 2, 0]]  # Elements (1,0), (0,1), (1,2) and (0,0)
b[np.ix_([1, 0, 1, 0], [0, 1, 2, 0])]  # Rows 1,0,1,0 crossed with columns 0,1,2,0

## -> Array Manipulation

In [None]:
# Transposing: swap the axes of an array.
i = b.transpose()  # Permute array dimensions (method form)
np.transpose(i)  # Permute them back (same result as i.T)


In [None]:
# Changing array shape
b.ravel()  # Flatten the array to one dimension
# -1 is the documented placeholder for "infer this dimension from the array
# size"; the previous -2 relied on undocumented handling of other negatives.
g.reshape(3, -1)  # Reshape to 3 rows without changing the data


In [None]:
# Adding and removing elements
h.resize(2, 6)  # Resize `h` in place to shape (2, 6); the method returns None
np.append(arr=h, values=g)  # Append items to an array (returns a new array)
np.insert(arr=a, obj=1, values=5)  # Insert the value 5 before index 1 (returns a new array)
np.delete(arr=a, obj=[1])  # Delete the item at index 1 (returns a new array)

In [None]:
# Combining arrays
np.concatenate([a, d], axis=0)  # Join arrays along axis 0
np.vstack([a, b])  # Stack arrays vertically (row-wise)
np.r_[e, f]  # Stack arrays vertically (row-wise), index-expression form
np.hstack([e, f])  # Stack arrays horizontally (column-wise)
np.column_stack([a, d])  # Stack 1-D arrays as the columns of a 2-D array
np.c_[a, d]  # Stack as columns, index-expression form

In [None]:
# Splitting arrays
np.hsplit(a, indices_or_sections=3)  # Split horizontally into 3 equal parts
np.vsplit(c, indices_or_sections=2)  # Split vertically into 2 equal parts

# Data Wrangling 
### with pandas 

In [None]:
pip install pandas


In [None]:
import pandas as pd

### Creating DataFrames

In [None]:
# Build a DataFrame column by column: each dict key becomes a column name.
df = pd.DataFrame(
    data={
        "a": [4, 5, 6],
        "b": [7, 8, 9],
        "c": [10, 11, 12],
    },
    index=[1, 2, 3],
)

In [None]:
# Build the same DataFrame row by row: each inner list is one row.
df = pd.DataFrame(
    data=[
        [4, 7, 10],
        [5, 8, 11],
        [6, 9, 12],
    ],
    index=[1, 2, 3],
    columns=['a', 'b', 'c'],
)

In [None]:
# Build a DataFrame whose rows are labelled by a two-level index (n, v).
df = pd.DataFrame(
    data={
        "a": [4, 5, 6],
        "b": [7, 8, 9],
        "c": [10, 11, 12],
    },
    index=pd.MultiIndex.from_arrays(
        [['d', 'd', 'e'], [1, 2, 2]],
        names=['n', 'v'],
    ),
)

### Method Chaining 
Most pandas methods return a DataFrame so that another pandas method can be applied to the result. This improves readability of code.

In [None]:
# One chained expression: melt to long format, shorten the column names,
# then keep only the rows whose value is at least 200.
df = (
    df.melt()
    .rename(columns={'variable' : 'var', 'value' : 'val'})
    .query('val >= 200')
)

### Reshaping Data


In [None]:
df.melt()  # Gather columns into rows
pd.concat([df1, df2], axis=0)  # Append the rows of one DataFrame to another
df.pivot(columns='var', values='val')  # Spread rows into columns
pd.concat([df1, df2], axis=1)  # Append the columns of one DataFrame to another
df.sort_values(by='mpg')  # Order rows by the values of a column (low to high)
df.sort_values(by='mpg', ascending=False)  # Order rows by the values of a column (high to low)
df.rename(columns={'y': 'year'})  # Rename columns of a DataFrame
df.reset_index()  # Reset the index to row numbers, moving the old index into columns
df.drop(columns=['Length', 'height'])  # Drop columns from a DataFrame

In [None]:
# Subset observations (rows)
df.loc[df.Length > 7]  # Rows that meet a logical criterion
df.drop_duplicates()  # Remove duplicate rows (compares column values only)
df.sample(frac=0.5)  # Randomly select a fraction of the rows
df.sample(n=10)  # Randomly select 10 rows
df.nlargest(n, columns='value')  # Select and order the top n entries
df.nsmallest(n, columns='value')  # Select and order the bottom n entries
df.head(n)  # First n rows
df.tail(n)  # Last n rows

# Subset variables (columns)
df[['width', 'length', 'species']]  # Several columns, selected by name
df['width']  # A single column by name (df.width also works)
df.filter(regex='regex')  # Columns whose name matches the regular expression

# Using query
# query() filters rows with a Boolean expression written as a string.
df.query('Length > 7')
df.query('Length > 7 and Width < 8')
df.query('Name.str.startswith("abc")', engine="python")

# Subsets of rows and columns
# df.loc[] selects by label and df.iloc[] by integer position; either can pick
# only rows, only columns, or both. df.at[] and df.iat[] access a single value.
# The first index selects rows, the second selects columns.
df.iloc[10:20]  # Rows at positions 10 through 19 (the end position is excluded)
df.iloc[:, [1, 2, 5]]  # Columns at positions 1, 2 and 5 (the first column is 0)
df.loc[:, 'x2':'x4']  # All columns from x2 to x4 (inclusive)
df.loc[df['a'] > 10, ['a', 'c']]  # Rows meeting a condition, restricted to specific columns
df.iat[1, 2]  # Single value by integer position
df.at[4, 'A']  # Single value by label

### Summarize Data

In [None]:
df['w'].value_counts()  # Number of rows for each unique value of the variable
len(df.index)  # Number of rows in the DataFrame
df.shape  # Tuple of (number of rows, number of columns)
df['w'].nunique()  # Number of distinct values in a column
df.describe()  # Basic descriptive statistics for each column (or GroupBy)

pandas provides a large set of summary functions that operate on different kinds of pandas objects (DataFrame columns, Series,
GroupBy, Expanding and Rolling (see below)) and produce single values for each of the groups. When applied to a DataFrame, the
result is returned as a pandas Series for each column. Examples:

In [None]:
# Summary functions are methods of pandas objects, so they are shown here on
# `df` like the other examples in this notebook. (Written as bare names they
# either call an unrelated builtin or raise NameError.)
# `function` is a placeholder for any callable you supply.
df.sum()  # Sum values of each object
df.count()  # Count non-NA/null values of each object
df.median()  # Median value of each object
df.quantile([0.25, 0.75])  # Quantiles of each object
df.apply(function)  # Apply function to each object
df.min()  # Minimum value in each object
df.max()  # Maximum value in each object
df.mean()  # Mean value of each object
df.var()  # Variance of each object
df.std()  # Standard deviation of each object

### Handling Missing Data

In [None]:
df.dropna(axis=0, how='any')  # Drop every row in which any column is NA/null
df.fillna(value=value)  # Replace all NA/null data with `value`

### Make New Columns

In [None]:
df.assign(Area=lambda frame: frame.Length * frame.Height)  # Compute and append one or more new columns
df['Volume'] = df.Length * df.Height * df.Depth  # Add a single column
pd.qcut(df.col, q=n, labels=False)  # Bin a column into n buckets

pandas provides a large set of vector functions that operate on all
columns of a DataFrame or a single selected column (a pandas
Series). These functions produce vectors of values for each of the
columns, or a single Series for the individual Series. Examples:

In [None]:
# Vector functions are methods of pandas objects, so they are shown here on
# `df` like the other examples in this notebook. (A bare `max(axis=1)` calls
# Python's builtin max, not the pandas method.)
df.max(axis=1)  # Maximum across the columns of each row
df.clip(lower=-10, upper=10)  # Trim values at the input thresholds
df.min(axis=1)  # Minimum across the columns of each row
df.abs()  # Absolute value

### Group Data


In [None]:
df.groupby("col")  # GroupBy object, grouped by the values in the column named "col"
df.groupby(level="ind")  # GroupBy object, grouped by the values in the index level named "ind"

All of the summary functions listed above can be applied to a group. Additional GroupBy functions:

In [None]:
# GroupBy-only functions, shown on a GroupBy object built from `df`.
# `function` is a placeholder for the aggregation you supply.
df.groupby(by="col").size()  # Size of each group
df.groupby(by="col").agg(function)  # Aggregate each group using function

The examples below can also be applied to groups. In this case, the function is applied on a per-group basis, and the returned vectors
are of the length of the original DataFrame.

In [None]:
# These are methods of pandas objects, so they are shown here on `df` like the
# other examples in this notebook; as bare names they raise NameError.
df.shift(1)  # Copy with values shifted down by 1 (lag)
df.rank(method='dense')  # Ranks with no gaps
df.rank(method='min')  # Ranks; ties get the minimum rank
df.rank(pct=True)  # Ranks rescaled to the interval [0, 1]
df.rank(method='first')  # Ranks; ties are broken by order of appearance
df.shift(-1)  # Copy with values shifted up by 1 (lead)
df.cumsum()  # Cumulative sum
df.cummax()  # Cumulative max
df.cummin()  # Cumulative min
df.cumprod()  # Cumulative product

### Windows

In [None]:
df.expanding(min_periods=1)  # Expanding object: summary functions are applied cumulatively
df.rolling(window=n)  # Rolling object: summary functions are applied to windows of length n

### Plotting

In [None]:
df.plot(kind='hist')  # Histogram for each column
df.plot(kind='scatter', x='w', y='h')  # Scatter chart using pairs of points

### Combine Data Sets

In [None]:
# Standard joins (method form of pd.merge)
adf.merge(bdf, how='left', on='x1')  # Join matching rows from bdf to adf
adf.merge(bdf, how='right', on='x1')  # Join matching rows from adf to bdf
adf.merge(bdf, how='inner', on='x1')  # Keep only the rows present in both sets
adf.merge(bdf, how='outer', on='x1')  # Keep all values and all rows

# Filtering joins
adf.loc[adf['x1'].isin(bdf['x1'])]  # Rows of adf that have a match in bdf
adf.loc[~adf['x1'].isin(bdf['x1'])]  # Rows of adf that have no match in bdf

# Set-like operations
ydf.merge(zdf)  # Rows that appear in both ydf and zdf (intersection)
ydf.merge(zdf, how='outer')  # Rows that appear in either or both of ydf and zdf (union)
# Rows that appear in ydf but not in zdf (set difference): tag each row with
# its origin, keep the left-only ones, then drop the helper column.
(
    ydf.merge(zdf, how='outer', indicator=True)
    .query('_merge == "left_only"')
    .drop(columns=['_merge'])
)