# 5.1 Introduction to pandas Data Structures

In [1]:
import pandas as pd

### Series

In [2]:
import numpy as np


In [3]:
from pandas import Series, DataFrame

In [4]:
obj = pd.Series([4, 7, -5, 3])

In [5]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

And so on...

In [6]:
obj.array

<NumpyExtensionArray>
[4, 7, -5, 3]
Length: 4, dtype: int64

In [7]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [8]:
obj2 = pd.Series([4, 7, -5, 3], index=["d", "b", "a", "c"])

In [9]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [10]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [11]:
obj2["a"]

-5

In [12]:
obj2["d"] = 6

In [13]:
obj2[obj2 > 0]

d    6
b    7
c    3
dtype: int64

In [14]:
obj2 * 2

d    12
b    14
a   -10
c     6
dtype: int64

In [15]:
import numpy as np

In [16]:
np.exp(obj2)

d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [17]:
"b" in obj2

True

In [18]:
"e" in obj2

False

In [19]:
sdata = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}


In [20]:
obj3 = pd.Series(sdata)

In [21]:
obj3.to_dict()

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [22]:
states = ["California", "Ohio", "Oregon", "Texas"]

In [23]:
obj4 = pd.Series(sdata, index= states)

In [24]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [25]:
pd.isna(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [26]:
pd.notna(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [27]:
obj4.isna()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [28]:
obj4.notna()


California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [29]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [30]:
type(obj3)

pandas.core.series.Series

In [31]:
obj3.index

Index(['Ohio', 'Texas', 'Oregon', 'Utah'], dtype='object')

In [32]:
obj4


California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [33]:
type(obj4)

pandas.core.series.Series

In [34]:
obj4 + obj3

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [35]:
obj4.name = "population"

In [36]:
obj4.index.name = "state"

In [37]:
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [38]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [39]:
# The replacement index must supply exactly one label per element (obj has 4 values).
obj.index = ["Bob", "Steve", "Jeff", "Ryan"]

In [None]:
data = {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
        "year": [2000, 2001, 2002, 2001, 2002, 2003],
        "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

In [None]:
frame = pd.DataFrame(data)

In [None]:
frame

In [None]:
frame.head()

In [None]:
frame.tail()  # call the method; a bare `frame.tail` only displays the bound-method repr

In [None]:
# Column names must match the keys of `data` ("year", not "years"); an unknown name yields an all-NaN column.
pd.DataFrame(data, columns=["year","state","pop"])

In [None]:
frame2 = pd.DataFrame(data, columns=["years","state","pop","debt"])

In [None]:
frame2

In [None]:
frame2.columns

In [None]:
frame["state"]

In [None]:
frame2.years

In [None]:
frame2.state

In [None]:
frame2.loc[2]

In [None]:
frame2.iloc[2]

In [None]:
frame2["debt"] = 16.5

In [None]:
frame2

In [None]:
frame2["debt"] = np.arange(6.)

In [None]:
frame2

In [None]:
val = pd.Series([-1.2, -1.5, -1.7], index=[2,4,5])

In [None]:
frame2["debt"] = val

In [None]:
frame2

In [None]:
frame2["eastern"] = frame2["state"] == "Ohio"

In [None]:
frame2

In [None]:
del frame2["eastern"]

In [None]:
frame2.columns

In [None]:
# Nested dict: each inner dictionary becomes a column, and the inner keys form the row index.
populations = {"Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
               "Nevada": {2001: 2.4, 2002: 2.9}}

In [None]:
frame3  = pd.DataFrame(populations)

In [None]:
frame3

In [None]:
frame3.T

In [None]:
frame3

In [None]:
pd.DataFrame(populations, index=[2001,2002,2003]).T

In [None]:
# Dict of Series: each value becomes a column, aligned on the union of the Series indexes.
pdata = {"Ohio": frame3["Ohio"][:-1],
         "Nevada": frame3["Nevada"][:2]}

In [None]:
pd.DataFrame(pdata)

In [None]:
frame3.to_numpy()

In [None]:
frame2.to_numpy()  # call the method; a bare `frame2.to_numpy` only displays the bound-method repr

In [None]:
obj = pd.Series(np.arange(3), index=["a","b","c"])

In [None]:
index = obj.index

In [None]:
index

In [None]:
index[1:]

In [None]:
labels = pd.Index(np.arange(3))

In [None]:
labels

In [None]:
obj2 = pd.Series([1.5, -2.5,0 ], index=labels)

In [None]:
obj2

In [None]:
obj2.index is labels

In [None]:
frame3

In [None]:
frame3.index.name = "year"

In [None]:
frame3.columns.name = "state"

In [None]:
frame3

In [None]:
frame3.columns

In [None]:
"Ohio" in frame3.columns

In [None]:
2003 in frame3.index

In [None]:
pd.Index(["foo","foo","bar","bar"])


Table 5.1: Possible data inputs to the DataFrame constructor

vvvvvvvvvv

Type

    -notes


vvvvvvvvvv   


2D ndarray	
    
    -A matrix of data, passing optional row and column labels



Dictionary of arrays, lists, or tuples	
    
    -Each sequence becomes a column in the 
    DataFrame; all sequences must be the same length



NumPy structured/record array	

    -Treated as the “dictionary of arrays” case



Dictionary of Series	

    -Each value becomes a column; indexes from each Series are
    unioned together to form the result’s row index if no explicit index is passed



Dictionary of dictionaries

    -Each inner dictionary becomes a column; keys are unioned to form the row index
    as in the “dictionary of Series” case




List of dictionaries or Series
    
    -Each item becomes a row in the DataFrame; unions of dictionary keys or Series
    indexes become the DataFrame’s column labels



List of lists or tuples	
   
    -Treated as the “2D ndarray” case



Another DataFrame	
   
    -The DataFrame’s indexes are used unless different ones are passed



NumPy MaskedArray	
    
    -Like the “2D ndarray” case except masked values are missing in the DataFrame
    result

In [None]:
obj = pd.Series([4.5, 7.2, -5.3,3.6], index=["d","b","a","c"])

In [None]:
obj

In [None]:
obj2 = obj.reindex(["a","b","c","d","e"])

In [None]:
obj2

In [None]:
obj3 = pd.Series(["blue","purple","yellow"], index=[0,2,4])

In [None]:
obj3

In [None]:
obj3.reindex(np.arange(6), method="ffill")

In [None]:
frame = pd.DataFrame(np.arange(9).reshape((3,3)), index=["a","c","d"], columns=["Ohio","Texas","California"])

In [None]:
frame2 = frame.reindex(index=["a","b","c","d"])

In [None]:
frame2

In [None]:
states = ["Texas", "Utah", "California"] 

In [None]:
frame.reindex(columns=states)

In [None]:
frame.loc[["a","d","c"], ["California","Texas"]]

In [None]:
obj = pd.Series(np.arange(5.), index=["a","b","c","d","e"])

In [None]:
obj

In [None]:
new_obj = obj.drop("c")

In [None]:
new_obj

In [None]:
obj.drop(["d","c"])

In [None]:
data = pd.DataFrame(np.arange(16).reshape((4,4)), index=["Ohio","Colorado","Utah","New York"], columns=["one","two","three","four"]) 

In [None]:
data

In [None]:
data.drop(index=["Colorado","Ohio"])

In [None]:
data

In [None]:
data.drop(columns=["two"])

In [None]:
data.drop("two",axis=1)

In [None]:
data.drop(["two","four"], axis="columns")

In [None]:
data

In [None]:
obj = pd.Series(np.arange(4.), index=["a","b","c","d"])

In [None]:
obj

In [None]:
obj["b"]

In [None]:
# obj is labelled "a".."d"; integer keys passed to [] as positions are deprecated, so use iloc for position 1.
obj.iloc[1]

In [None]:
obj[2:4]

In [None]:
obj[["b","a","d"]]

In [None]:
obj.iloc[[1,3]]

In [None]:
obj1 = pd.Series([1,2,3], index=[2,0,1])

In [None]:
obj2 = pd.Series([1,2,3], index=["a","b","c"])

In [None]:
obj1

In [None]:
obj2

In [None]:
obj1[[0,1,2]]

In [None]:
obj2.iloc[[0,1,2]]

In [None]:
# obj2 is labelled "a", "b", "c", so looking up the integer labels 0 and 1 with loc fails.
# Catch the KeyError and show it so the demonstration does not halt Restart & Run All.
try:
    obj2.loc[[0, 1]]
except KeyError as err:
    print(f"KeyError: {err}")

In [None]:
obj1.iloc[[0,1,2]]

In [None]:
obj2.iloc[[0,1,2]]

In [None]:
obj2.loc["b":"c"]

In [None]:
obj2.loc["b":"c"] = 5

In [None]:
obj2

In [None]:
data = pd.DataFrame(np.arange(16).reshape((4,4)), index=["Ohio","Colorado","Utah","New York"], columns=["one","two","three","four"])

In [None]:
data

In [None]:
data["two"]

In [None]:
data[["three","one"]]

In [None]:
data[:2]

In [None]:
data[data["three"] > 7 ]

In [None]:
data < 5 

In [None]:
data[data < 5] = 0

In [None]:
data

In [None]:
data.loc["Colorado"]

In [None]:
data.loc[["Colorado","New York"]]

In [None]:
data.loc[["Colorado"], ["two","three"]]

In [None]:
data.iloc[2]

In [None]:
data.iloc[2,[3,0,1]]

In [None]:
data.iloc[[1,2],[3,0,1]]

In [None]:
data.loc[:"Utah",:"two"]

In [None]:
data.iloc[:,:3][data.three > 5]

We can index with two slices. that's pretty cool!

In [None]:
data.loc[data.three >= 2]


Table 5.4: Indexing options with DataFrame

df[column]	Select single column or sequence of columns from the DataFrame; 
special case conveniences: Boolean array (filter rows), slice (slice rows), or Boolean DataFrame (set values based on some criterion)

df.loc[rows]	Select single row or subset of rows from the DataFrame by label

df.loc[:, cols]	Select single column or subset of columns by label

df.loc[rows, cols]	Select both row(s) and column(s) by label

df.iloc[rows]	Select single row or subset of rows from the DataFrame by integer position

df.iloc[:, cols]	Select single column or subset of columns by integer position

df.iloc[rows, cols]	Select both row(s) and column(s) by integer position

df.at[row, col]	Select a single scalar value by row and column label

df.iat[row, col]	Select a single scalar value by row and column position (integers)

reindex method	Select either rows or columns by labels

In [None]:
ser = pd.Series(np.arange(3.))

In [None]:
ser

In [None]:
# ser has the default integer index, so [] treats -1 as a label (which does not exist), not as "last element".
# Catch the KeyError and show it so the demonstration does not halt Restart & Run All.
try:
    ser[-1]
except KeyError as err:
    print(f"KeyError: {err}")

In [None]:
ser

In [None]:
ser2 = pd.Series(np.arange(3.), index=["a","b","c"])

In [None]:
ser2.iloc[-1]

In [None]:
ser[:2]

In [None]:
data.loc[:,"one"] = 1

In [None]:
data

In [None]:
data.iloc[2]= 5

In [None]:
data

In [None]:
data.loc[data.four > 5] = 3

In [None]:
data

In [None]:
# Deliberate anti-pattern (chained assignment): the first .loc[...] selects rows, then ["three"] = 6
# assigns into that intermediate result. NOTE(review): pandas may treat the intermediate as a copy and
# leave `data` unchanged (SettingWithCopyWarning) — confirm on the installed version.
data.loc[data.three == 5]["three"] = 6

In [None]:
data

In these scenarios, the fix is to rewrite the chained assignment to use a single loc operation:

In [None]:
data.loc[data.three == 5, "three" ]=6

In [None]:
data

In [None]:
s1 = pd.Series([7.3,-2.5,3.4,1.5], index=["a","c","d","e"])

In [None]:
s2 = pd.Series([-2.1,3.6,-1.5, 4, 3.1], index=["a","c","e","f","g"])

In [None]:
s1

In [None]:
s2

In [None]:
s1 + s2

In [None]:
df1 = pd.DataFrame(np.arange(9.).reshape((3,3)), columns=list("bcd"), index=["Ohio","Texas","Colorado"])

In [None]:
df1

In [None]:
df2 = pd.DataFrame(np.arange(12.).reshape((4,3)), columns=list("bde"), index=["Utah","Ohio","Texas","Oregon"])

In [None]:
df2

In [None]:
df1 + df2

Only the rows and columns whose labels appear in both DataFrames get added; every other position in the result is NaN.

In [None]:
df1 = pd.DataFrame({"A":[1,2]})

In [None]:
df2 = pd.DataFrame({"B": [3,4]})

In [None]:
df1

In [None]:
df2

In [None]:
df1 + df2

In [None]:
df1 = pd.DataFrame(np.arange(12.).reshape((3,4)), columns=list("abcd"))

In [None]:
df2 = pd.DataFrame(np.arange(20.).reshape((4,5)), columns=list("abcde"))

In [None]:
df1

In [None]:
df2

In [None]:
df2.loc[1,"b"] = np.nan

In [None]:
df2

In [None]:
df1 + df2

In [None]:
df1.add(df2, fill_value=0)

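Using add() with fill_value=0 substitutes 0 for a value that is missing in one of the two frames before adding, so the result keeps the value from whichever frame has it (a position missing in both frames is still NaN). You can change the substituted value by changing "fill_value".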

In [None]:
1/ df1

In [None]:
df1

In [None]:
df1.rdiv(1)


Table 5.5: Flexible arithmetic methods

Method	Description

add, radd	Methods for addition (+)

sub, rsub	Methods for subtraction (-)

div, rdiv	Methods for division (/)

floordiv, rfloordiv	Methods for floor division (//)

mul, rmul	Methods for multiplication (*)

pow, rpow	Methods for exponentiation (**)


In [None]:
arr = np.arange(12.).reshape((3,4))

In [None]:
arr

In [None]:
arr[0]

In [None]:
arr - arr[0]

In [None]:
arr - arr[:] 

had no idea you could do that! cool!

In [None]:
frame = pd.DataFrame(np.arange(12.).reshape((4,3)), columns=list("bde"), index=["Utah","Ohio","Texas","Oregon"])

In [None]:
series = frame.iloc[0]

In [None]:
frame

In [None]:
series

In [None]:
frame - series

In [None]:
series2 = pd.Series(np.arange(3), index=list("bef"))

In [None]:
series2

In [None]:
frame + series2

In [None]:
series3 = frame["d"]

frame

In [None]:
series3

In [None]:
frame.sub(series3, axis="index")

axis="index" in this example subtracts "d" from each column

Function Application and Mapping

In [None]:
# Seed a generator so the random frame (and every output derived from it) is identical on each re-run.
rng = np.random.default_rng(12345)
frame = pd.DataFrame(rng.standard_normal((4, 3)), columns=list("bde"),
                     index=["Utah", "Ohio", "Texas", "Oregon"])

In [None]:
frame

In [None]:
np.abs(frame)

In [None]:
def f1(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` only needs ``max()`` and ``min()`` methods; in this notebook it is
    the Series that ``DataFrame.apply`` passes in.
    """
    highest = x.max()
    lowest = x.min()
    return highest - lowest

we can apply this to our frame

In [None]:
frame.apply(f1)

The function returns the difference between the max and min of each column (compare with the frame displayed in the cell above the np.abs(frame) cell).

If you pass axis="columns" to apply, the function will be invoked once per row instead.

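Many common statistics (sum, mean, etc.) are already DataFrame methods, so apply is not needed for them. The function passed to apply also does not have to return a scalar value; it can return a Series with multiple values.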

In [None]:
frame

In [None]:
def f2(x):
    """Summarise ``x`` as a two-entry Series labelled ``"min"`` and ``"max"``."""
    labels = ["min", "max"]
    values = [x.min(), x.max()]
    return pd.Series(values, index=labels)

In [None]:
frame.apply(f2)

You can apply a function to every element using DataFrame.map().

In [None]:
def my_format(x):
    """Render ``x`` as a string with exactly two digits after the decimal point."""
    return format(x, ".2f")

In [None]:
frame

In [None]:
frame.map(my_format)

In [None]:
frame["e"].map(my_format)

it works with series too!

In [None]:
obj = pd.Series(np.arange(4), index=["d","b","a","c"])

In [None]:
obj

you can sort a Series using .sort_index()

In [None]:
obj.sort_index()

In [None]:
frame = pd.DataFrame(np.arange(8).reshape((2,4)), index=["three","one"], columns=["d","a","b","c"])

In [None]:
frame

df's can be sorted using either axis.

In [None]:
frame.sort_index()

In [None]:
frame.sort_index(axis="columns")

In [None]:
frame.sort_index(axis="columns", ascending=False)

you can also sort by values.

In [None]:
obj = pd.Series([4, 7, -3, 2])

In [None]:
obj.sort_values()

even with missing values

In [None]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])

In [None]:
obj.sort_values()

you can change the position of the missing values

In [None]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])

In [None]:
obj.sort_values(na_position="first")

when sorting dfs you can do so by column

In [None]:
frame = pd.DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})

In [None]:
frame

In [None]:
frame.sort_values("b")

In [None]:
frame.sort_values("b")

with multiple columns as well

In [None]:
frame.sort_values(["a","b"])

Ranking assigns ranks from one through the number of valid data points in an array, starting from the lowest value. The rank methods for Series and DataFrame are the place to look; by default, rank breaks ties by assigning each group the mean rank

In [None]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])

In [None]:
obj.rank()

Ranks can also be assigned according to the order in which they’re observed in the data:


In [None]:
obj.rank(method="first")

Here, instead of using the average rank 6.5 for the entries 0 and 2, they instead have been set to 6 and 7 because label 0 precedes label 2 in the data.

You can also rank in descending order.

In [None]:
obj.rank(ascending=False)

df can compute ranks over rows or columns

In [None]:
frame = pd.DataFrame({"b": [4.3, 7, -3, 2], "a": [0, 1, 0, 1],"c": [-2, 5, 8, -2.5]})

In [None]:
frame

In [None]:
frame.rank(axis="columns")

"average"	Default: assign the average rank to each entry in the equal group
"min"	Use the minimum rank for the whole group
"max"	Use the maximum rank for the whole group
"first"	Assign ranks in the order the values appear in the data
"dense"	Like method="min", but ranks always increase by 1 between groups rather than the number of equal elements in a group

Axis Indexes with Duplicate Labels

In [None]:
obj = pd.Series(np.arange(5), index=["a", "a", "b", "b", "c"])

In [None]:
obj

You can check whether an index contains duplicate labels with .is_unique (False means there are duplicates).

In [None]:
obj.index.is_unique

If a label is duplicated, selecting it with loc or by name returns all of the rows or columns that carry that label.

In [None]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], index=["a", "b", "c", "d"], columns=["one", "two"])

In [None]:
df

Calling DataFrame’s sum method returns a Series containing column sums:

In [None]:
df.sum()

By default NA values are skipped: when an entire row or column contains all NA values the sum is 0, and otherwise the sum is taken over the non-NA values. This can be disabled with skipna=False, in which case any NA value in a row or column makes the corresponding result NA:

In [None]:
df.sum(axis="index", skipna=False)

In [None]:
df.sum(axis="columns", skipna=False)

Some aggregations, like mean, require at least one non-NA value to yield a value result

In [None]:
df.mean(axis="columns")

reduction methods


axis	Axis to reduce over; "index" for DataFrame’s rows and "columns" for columns


skipna	Exclude missing values; True by default


level	Reduce grouped by level if the axis is hierarchically indexed (MultiIndex)

You can return indirect statistics as well. For instance, idxmin and idxmax return the index label where the min or max is located.

In [None]:
df.idxmax()

Other methods are accumulations

In [None]:
df.cumsum()

describe() produces multiple summary statistics in one shot.

In [None]:
df.describe()

On nonnumeric data, describe produces alternative summary statistics:

In [None]:
obj = pd.Series(["a", "a", "b", "c"] * 4)

In [None]:
obj.describe()

Table 5.8: Descriptive and summary statistics
Method	Description


count	Number of non-NA values


describe	Compute set of summary statistics


min, max	Compute minimum and maximum values
argmin, argmax	Compute index locations (integers) at which minimum or maximum value is obtained, respectively; not available on DataFrame objects


idxmin, idxmax	Compute index labels at which minimum or maximum value is obtained, respectively


quantile	Compute sample quantile ranging from 0 to 1 (default: 0.5)


sum	Sum of values


mean	Mean of values


median	Arithmetic median (50% quantile) of values


mad	Mean absolute deviation from mean value (note: removed in pandas 2.0; compute it as (x - x.mean()).abs().mean() instead)


prod	Product of all values


var	Sample variance of values


std	Sample standard deviation of values


skew	Sample skewness (third moment) of values


kurt	Sample kurtosis (fourth moment) of values


cumsum	Cumulative sum of values


cummin, cummax	Cumulative minimum or maximum of values, respectively


cumprod	Cumulative product of values


diff	Compute first arithmetic difference (useful for time series)


pct_change	Compute percent changes

In [None]:
conda install pandas-datareader

In [None]:
import pandas_datareader.data as web
import datetime as dt

In [None]:
pip install "yfinance[optional]"

In [None]:
import yfinance as yf

In [None]:
tickers = yf.Tickers('msft ibm aapl goog')

In [None]:
tickers.tickers['MSFT'].info

this is so cool

In [None]:
tickers.tickers['AAPL'].history(period="1mo")

In [48]:
price = pd.read_pickle("examples/yahoo_price.pkl")

In [51]:
returns = price.pct_change()

In [53]:
returns.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-10-17,-0.00068,0.001837,0.002072,-0.003483
2016-10-18,-0.000681,0.019616,-0.026168,0.00769
2016-10-19,-0.002979,0.007846,0.003583,-0.002255
2016-10-20,-0.000512,-0.005652,0.001719,-0.004867
2016-10-21,-0.00393,0.003011,-0.012474,0.042096


In [47]:
volume = pd.read_pickle("examples/yahoo_volume.pkl")

In [46]:
volume

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,123432400,3927000,6155300,38409100
2010-01-05,150476200,6031900,6841400,49749600
2010-01-06,138040000,7987100,5605300,58182400
2010-01-07,119282800,12876600,5840600,50559700
2010-01-08,111902700,9483900,4197200,51197400
...,...,...,...,...
2016-10-17,23624900,1089500,5890400,23830000
2016-10-18,24553500,1995600,12770600,19149500
2016-10-19,20034600,116600,4632900,22878400
2016-10-20,24125800,1734200,4023100,49455600


The corr method is used here to compute the correlation of the overlapping, non-NA, index-aligned values in two Series.

In [55]:
returns["MSFT"].corr(returns["IBM"])

0.49976361144151155

The covariance is computed with cov in the same way.

In [56]:
returns["MSFT"].cov(returns["IBM"])

8.870655479703549e-05

the df corr and cov return a full corr or cov matrix as a DataFrame

In [57]:
returns.corr()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.407919,0.386817,0.389695
GOOG,0.407919,1.0,0.405099,0.465919
IBM,0.386817,0.405099,1.0,0.499764
MSFT,0.389695,0.465919,0.499764,1.0


In [58]:
returns.cov()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,0.000277,0.000107,7.8e-05,9.5e-05
GOOG,0.000107,0.000251,7.8e-05,0.000108
IBM,7.8e-05,7.8e-05,0.000146,8.9e-05
MSFT,9.5e-05,0.000108,8.9e-05,0.000215


Pair-wise correlations between a DataFrame’s columns or rows and another Series or DataFrame can be computed with corrwith.

In [59]:
returns.corrwith(returns["IBM"])

AAPL    0.386817
GOOG    0.405099
IBM     1.000000
MSFT    0.499764
dtype: float64

Passing a DataFrame to corrwith computes the correlations of matching column names. Here, the correlations of percent changes with volume are computed:

In [60]:
returns.corrwith(volume)

AAPL   -0.075565
GOOG   -0.007067
IBM    -0.204849
MSFT   -0.092950
dtype: float64

Passing axis="columns" does things row-by-row instead. In all cases, the data points are aligned by label before the correlation is computed.

Unique Values:

In [62]:
obj = pd.Series(["c", "a", "d", "a", "a", "b", "b", "c", "c"])

In [65]:
uniques = obj.unique()

In [66]:
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

The unique values are returned in the order they are first observed, not in sorted order. To get them sorted, sort the result afterwards with uniques.sort().

In [69]:
uniques.sort()

In [73]:
uniques

array(['a', 'b', 'c', 'd'], dtype=object)

value_counts computes a Series containing value frequencies:

In [74]:
obj.value_counts()

c    3
a    3
b    2
d    1
Name: count, dtype: int64

The Series is sorted by value in descending order as a convenience. value_counts is also available as a top-level pandas method that can be used with NumPy arrays or other Python sequences:


In [75]:
# The top-level pd.value_counts is deprecated; wrap the array in a Series and call the method instead.
pd.Series(obj.to_numpy()).value_counts(sort=False)


c    3
a    3
d    1
b    2
Name: count, dtype: int64

In [76]:
pd.Series(obj).value_counts()

c    3
a    3
b    2
d    1
Name: count, dtype: int64

isin performs a vectorized set membership check and can be useful in filtering a dataset down to a subset of values in a Series or column in a DataFrame:

In [77]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [78]:
mask = obj.isin(["b","c"])

In [79]:
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

The mask is a Boolean Series: True wherever the value in the Series is one of the specified values.

In [80]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

returns the values and index

Related to isin is the Index.get_indexer method, which gives you an index array from an array of possibly nondistinct values into another array of distinct values:

In [81]:
to_match = pd.Series(["c", "a", "b", "b", "c", "a"])

In [83]:
unique_vals = pd.Series(["c", "b", "a"])


In [84]:
indecies = pd.Index(unique_vals).get_indexer(to_match)

In [85]:
indecies

array([0, 2, 1, 1, 0, 2])


Table 5.9: Unique, value counts, and set membership methods


isin	Compute a Boolean array indicating whether each Series or DataFrame value is contained in the passed sequence of values


get_indexer	Compute integer indices for each value in an array into another array of distinct values; helpful for data alignment and join-type operations


unique	Compute an array of unique values in a Series, returned in the order observed


value_counts	Return a Series containing unique values as its index and frequencies as its values, ordered count in descending order    

Histogram example for multiple related columns in a DataFrame:

In [87]:
data = pd.DataFrame({"Qu1":[1,3,4,3,4],"Qu2":[2,3,1,2,3],"Qu3":[1,5,2,4,4],})

In [88]:
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


for value counts pertaining to a single column. (below)

In [90]:
data["Qu1"].value_counts().sort_index()

Qu1
1    1
3    2
4    2
Name: count, dtype: int64

to compute this for all columns we can use the apply

In [91]:
# pd.value_counts is deprecated; apply Series.value_counts to each column instead.
# Values absent from a column come back as NaN, which fillna(0) turns into a count of 0.
result = data.apply(pd.Series.value_counts).fillna(0)


In [95]:
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


In [96]:
data = pd.DataFrame({"a": [1,1,1,2,2], "b": [0,0,1,0,0]})

In [97]:
data

Unnamed: 0,a,b
0,1,0
1,1,0
2,1,1
3,2,0
4,2,0


In [99]:
data.value_counts()

a  b
1  0    2
2  0    2
1  1    1
Name: count, dtype: int64

the end.