## Pandas

### Pandas Series

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Index labels and raw values reused by the Series examples in this section.
labels = list("abc")
myList = [value * 10 for value in (1, 2, 3)]

# The same values as a NumPy array; the trailing expression echoes it.
arr = np.asarray(myList)
arr

array([10, 20, 30])

In [3]:
# Label -> value mapping with the same labels and values as labels/myList.
d = dict(a=10, b=20, c=30)

In [4]:
# No index is passed, so the Series gets the default integer index.
pd.Series(data=myList)

0    10
1    20
2    30
dtype: int64

In [5]:
# Values come from the NumPy array, row labels from the explicit list.
pd.Series(data=arr, index=labels)

a    10
b    20
c    30
dtype: int64

In [6]:
# Both the values and the index mix types, and the label 2 appears twice,
# so looking it up returns every matching row as a Series.
ser1 = pd.Series(data=[10, "a", 4.4, 5], index=["a", 2, "3", 2])

# Use .loc so the integer 2 is unambiguously treated as a label, never as a
# position (plain [] with an integer key can mean either on a Series).
ser1.loc[2]

2    a
2    5
dtype: object

In [7]:
# Country-labelled Series; the dict's insertion order becomes the index order.
series1 = pd.Series({"USA": 1, "Germany": 2, "USSR": 3, "Japan": 4})
series1

USA        1
Germany    2
USSR       3
Japan      4
dtype: int64

In [8]:
# Label-based lookup of a single value.
series1.loc["USA"]

1

In [9]:
# Second country-labelled Series; it has "Italy" where series1 has "USSR".
series2 = pd.Series({"USA": 1, "Germany": 4, "Italy": 5, "Japan": 6})
series2

USA        1
Germany    4
Italy      5
Japan      6
dtype: int64

In [10]:
# Label-based lookup of a single value.
series2.loc["Germany"]

4

In [11]:
# Addition aligns on index labels; labels present in only one operand give NaN.
series1.add(series2)

Germany     6.0
Italy       NaN
Japan      10.0
USA         2.0
USSR        NaN
dtype: float64

### Pandas DataFrame

In [12]:
import pandas as pd
import numpy as np

In [13]:
# Seed the legacy global NumPy RNG so the matrix below is reproducible.
np.random.seed(101)

# 5 rows x 4 columns of standard-normal draws.
rand_mat = np.random.standard_normal(size=(5, 4))

In [14]:
# Echo the seeded 5x4 random matrix.
rand_mat

array([[ 2.70684984,  0.62813271,  0.90796945,  0.50382575],
       [ 0.65111795, -0.31931804, -0.84807698,  0.60596535],
       [-2.01816824,  0.74012206,  0.52881349, -0.58900053],
       [ 0.18869531, -0.75887206, -0.93323722,  0.95505651],
       [ 0.19079432,  1.97875732,  2.60596728,  0.68350889]])

In [15]:
# Wrap the random matrix in a DataFrame: rows are labelled A-E, columns F-I.
df = pd.DataFrame(rand_mat, index=list("ABCDE"), columns=list("FGHI"))

df

Unnamed: 0,F,G,H,I
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [16]:
# Selecting a single column yields a Series.
type(df.loc[:, "G"])

pandas.core.series.Series

In [17]:
# A list of column labels returns a DataFrame.
# NOTE(review): the same label is listed twice, so column I appears twice in
# the result - confirm whether two different columns were intended.
df.loc[:, ["I", "I"]]

Unnamed: 0,I,I.1
A,0.503826,0.503826
B,0.605965,0.605965
C,-0.589001,-0.589001
D,0.955057,0.955057
E,0.683509,0.683509


In [18]:
import random


In [19]:
# Seed the stdlib RNG so the same two columns are picked on every fresh run
# (np.random.seed does not affect the `random` module).
random.seed(101)

# Pick two distinct column labels and store their element-wise sum as "New".
li = random.sample(list(df.columns), 2)
print(li)
df["New"] = df[li[0]] + df[li[1]]
df

['G', 'H']


Unnamed: 0,F,G,H,I,New
A,2.70685,0.628133,0.907969,0.503826,1.536102
B,0.651118,-0.319318,-0.848077,0.605965,-1.167395
C,-2.018168,0.740122,0.528813,-0.589001,1.268936
D,0.188695,-0.758872,-0.933237,0.955057,-1.692109
E,0.190794,1.978757,2.605967,0.683509,4.584725


In [20]:
# Remove the helper column again. Rebinding the result (instead of
# inplace=True) together with errors="ignore" keeps this cell safe to re-run
# after "New" is already gone; the in-place form raised KeyError then.
df = df.drop(columns="New", errors="ignore")
df

Unnamed: 0,F,G,H,I
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [21]:
# Preview the frame with two randomly chosen rows removed. drop() returns a
# new frame here, so df itself keeps all of its rows.
df.drop(index=random.sample(list(df.index), 2))

Unnamed: 0,F,G,H,I
A,2.70685,0.628133,0.907969,0.503826
C,-2.018168,0.740122,0.528813,-0.589001
E,0.190794,1.978757,2.605967,0.683509


In [22]:
# A list of row labels selects several rows and returns a DataFrame.
df.loc[["A", "C"], :]

Unnamed: 0,F,G,H,I
A,2.70685,0.628133,0.907969,0.503826
C,-2.018168,0.740122,0.528813,-0.589001


In [23]:
# A single row label returns that row as a Series indexed by column label.
df.loc["B", :]

F    0.651118
G   -0.319318
H   -0.848077
I    0.605965
Name: B, dtype: float64

In [24]:
# Re-display the full frame; the preceding .loc selections only read from it.
df

Unnamed: 0,F,G,H,I
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [25]:
# Select a rows-by-columns sub-frame with a single .loc call.
df.loc[list("AB"), list("FG")]

Unnamed: 0,F,G
A,2.70685,0.628133
B,0.651118,-0.319318


In [26]:
# Chained indexing (.loc[rows][cols]) selects the same cells as one
# .loc[rows, cols] call. Only the last expression of a cell is displayed, so
# keep the result in a name and verify the equivalence instead of dropping it.
chained = df.loc[["A", "B"]][["F", "G"]]
assert chained.equals(df.loc[["A", "B"], ["F", "G"]])
df

Unnamed: 0,F,G,H,I
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [27]:
# Append one row of standard-normal values. The new row label is the current
# row count, i.e. an integer placed next to the existing string labels.
# NOTE(review): re-running this cell appends a further row each time.
df.loc[len(df.index)] = np.random.randn(len(df.columns))
df

Unnamed: 0,F,G,H,I
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509
5,0.302665,1.693723,-1.706086,-1.159119


### Conditional Selection

In [28]:
# Element-wise boolean mask: True wherever the value is strictly positive.
df_bool = df.gt(0)

In [29]:
# Keep values where the mask is True; everything else becomes NaN.
df.where(df_bool)

Unnamed: 0,F,G,H,I
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509
5,0.302665,1.693723,,


In [30]:
# Overwrite the row at position 4 with one fresh random value per column.
df.iloc[4] = np.random.randn(df.shape[1])
df

Unnamed: 0,F,G,H,I
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,-0.134841,0.390528,0.166905,0.184502
5,0.302665,1.693723,-1.706086,-1.159119


In [31]:
# Row filter: keep only the rows whose F value is strictly positive.
df.loc[df["F"].gt(0)]

Unnamed: 0,F,G,H,I
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
5,0.302665,1.693723,-1.706086,-1.159119


In [32]:
# Keep the filtered rows in a name and sanity-check them; as a bare expression
# the filter result was discarded, because only a cell's last expression is shown.
g_positive = df[df["G"] > 0]
assert g_positive["G"].gt(0).all()
df

Unnamed: 0,F,G,H,I
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,-0.134841,0.390528,0.166905,0.184502
5,0.302665,1.693723,-1.706086,-1.159119


In [33]:
# Disabled leftover: would drop a "level_0" column in place. df has no such
# column at this point (its columns are F, G, H, I), so it stays commented out.
# df.drop("level_0",axis=1,inplace = True)
df

Unnamed: 0,F,G,H,I
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,-0.134841,0.390528,0.166905,0.184502
5,0.302665,1.693723,-1.706086,-1.159119


In [34]:
# Candidate index labels (note that "dd" occurs twice).
new_ind = [
    "CA", "NY", "WY", "OR", "CO",
    "aa", "bb", "cc", "dd", "dd", "ee", "ff", "gg",
]
new_ind

['CA', 'NY', 'WY', 'OR', 'CO', 'aa', 'bb', 'cc', 'dd', 'dd', 'ee', 'ff', 'gg']

In [36]:
# df never had a "States" column, so set_index("States") raised KeyError, and
# new_ind holds more labels than df has rows. Build the column from the first
# len(df) labels and index a new frame by it; df itself is left unchanged,
# which matches the state the later cells were recorded against.
state_labels = new_ind[:len(df)]
df_by_state = df.assign(States=state_labels).set_index("States")
df_by_state

In [37]:
# Show the column labels first, then the row labels, as raw NumPy arrays.
for axis_labels in (df.columns, df.index):
    display(axis_labels.values)

array(['F', 'G', 'H', 'I'], dtype=object)

array(['A', 'B', 'C', 'D', 'E', 5], dtype=object)

In [38]:
# First five rows (the default for head()).
df.head(n=5)

Unnamed: 0,F,G,H,I
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,-0.134841,0.390528,0.166905,0.184502


In [39]:
# Print the index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, A to 5
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   F       6 non-null      float64
 1   G       6 non-null      float64
 2   H       6 non-null      float64
 3   I       6 non-null      float64
dtypes: float64(4)
memory usage: 240.0+ bytes


In [40]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,F,G,H,I
count,6.0,6.0,6.0,6.0
mean,0.28272,0.395719,-0.313952,0.083538
std,1.515903,0.860445,1.004132,0.801975
min,-2.018168,-0.758872,-1.706086,-1.159119
25%,-0.053957,-0.141857,-0.911947,-0.395625
50%,0.24568,0.50933,-0.340586,0.344164
75%,0.564005,0.712125,0.438336,0.58043
max,2.70685,1.693723,0.907969,0.955057


In [41]:
# Full frame for reference before the counts on column H below.
df

Unnamed: 0,F,G,H,I
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,-0.134841,0.390528,0.166905,0.184502
5,0.302665,1.693723,-1.706086,-1.159119


In [None]:
# How many rows have H > 0 (True) versus not (False).
df["H"].gt(0).value_counts()

In [None]:
# Summing the boolean mask counts its True entries, i.e. rows with H > 0.
df["H"].gt(0).sum()

In [None]:
# Count the rows where H is positive. len() of the boolean mask itself is just
# the total number of rows, so filter the frame first and measure the result.
len(df[df["H"] > 0])

### GroupBy Operations

In [None]:
import pandas as pd
# Raw sales records, one (company, salesperson, amount) triple per row,
# pivoted into the column-oriented dict that pd.DataFrame consumes.
records = [
    ("GOOG", "Sam", 200),
    ("GOOG", "Charlie", 120),
    ("MSFT", "Amy", 340),
    ("MSFT", "Vanessa", 124),
    ("FB", "Carl", 243),
    ("FB", "Sarah", 350),
]
companies, people, sales = (list(column) for column in zip(*records))
data = {'Company': companies, 'Person': people, 'Sales': sales}

In [None]:
# Build the sales frame from the column-oriented dict and render it.
# NOTE: this rebinds `df`, replacing the random-number frame used earlier.
df = pd.DataFrame.from_dict(data)
display(df)

In [None]:
# Summary statistics per company, transposed so each company is a column.
df.groupby("Company").describe().T

### Pandas Operations

In [None]:
import pandas as pd

# Small demo frame: an integer column, a column with a repeated value (444)
# and a string column.
demo_columns = {
    "Col1": [1, 2, 3, 4],
    "Col2": [444, 555, 666, 444],
    "Col3": ['abc', 'def', 'ghi', 'xyz'],
}
df = pd.DataFrame(demo_columns)
display(df)

In [None]:
# First five rows (the frame only has four, so all of them are shown).
df.head(n=5)

In [None]:
# Distinct values of Col2, in order of first appearance.
df.loc[:, "Col2"].unique()

In [None]:
# Number of distinct values in Col2.
df.loc[:, "Col2"].nunique()

In [None]:
# Occurrence count for each distinct Col2 value.
df.loc[:, "Col2"].value_counts()

In [None]:
# Combine two row conditions with & (element-wise AND) and keep matching rows.
col1_above_two = df["Col1"].gt(2)
col2_is_444 = df["Col2"].eq(444)
newdf = df.loc[col1_above_two & col2_is_444]
display(newdf)

In [None]:
def f(x):
    """Return ``x`` raised to the power 2 (uses the ``**`` protocol of ``x``)."""
    return pow(x, 2)

In [None]:
# Quick check of the helper on a scalar argument.
f(2)

In [None]:
# Apply f to every Col1 value and store the result as a new column.
squared = df["Col1"].apply(f)
df["New1"] = squared
display(df)

In [None]:
# Remove the derived column from df in place; the popped Series is not needed.
df.pop("New1")

df

In [None]:
 df.sort_values(by="Col2", ascending=False)

### Data Input and Output

In [None]:
import pandas as pd

In [None]:
pwd

In [None]:
cd ..

In [None]:
ls 

In [None]:
cd PYTORCH*

In [None]:
cd PYTORCH_NOTEBOOKS

In [None]:
ls

In [None]:
cd 00-Crash-Course-Topics

In [None]:
ls

In [None]:
cd 01-Crash-Course-Pandas

In [None]:
ls

In [None]:
pwd

In [None]:
# Relative path: the file is resolved against the current working directory.
df1 = pd.read_csv(filepath_or_buffer="example.csv")

In [None]:
# Display the frame loaded from example.csv.
df1

In [None]:
# Keep only columns a and b (this rebinds the earlier `newdf`).
newdf = df1.loc[:, ["a", "b"]]

In [None]:
pwd

In [None]:
# index=True also writes the row labels, as the first column of the CSV.
newdf.to_csv(path_or_buf='mynew.csv', index=True)

In [None]:
# Load one named worksheet from the workbook in the working directory.
df = pd.read_excel(io="Excel_Sample.xlsx", sheet_name="Sheet1")

In [None]:
# Display the sheet loaded from Excel_Sample.xlsx.
df

In [None]:
# Inspect the column labels of the loaded sheet.
df.columns

In [None]:
# Preview the sheet without the "Unnamed: 0" column; the result is not
# assigned, so df itself still contains it.
df.drop(columns="Unnamed: 0")

In [42]:
# The old /bank/individual/failed/banklist.html address no longer yields a
# table (read_html raised "ValueError: No tables found"). The URL below is the
# one the pandas IO guide uses for the FDIC failed-bank list.
# NOTE(review): requires network access and an HTML parser (lxml, or
# bs4 + html5lib); confirm the page still serves the table.
# read_html returns a list of DataFrames, one per table found, so `df` is a
# list here and the first table is df[0].
df = pd.read_html("https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list")