<center><u><H1>Pandas - Data Structures and Operations</H1></u></center>

## Pandas Library has the following main data structures:

1. Series

2. DataFrame

## SERIES:

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Series built from 8 values returned by np.random.randn; the labels on the
# left of the output are the default integer index. No seed is set in the
# cells shown here, so the values differ on every run.
pd.Series(np.random.randn(8))

0   -0.041624
1   -0.687722
2    0.774901
3   -0.791857
4   -0.999843
5    1.433236
6    1.623282
7    0.126792
dtype: float64

### The index of a Series can be customized with your own labels (letters or numbers):

In [3]:
# Pass the labels explicitly through `index`; the labels 0..4 used here happen
# to be the same ones pandas would assign by default.
pd.Series(np.random.randn(5), index=[0,1,2,3,4])

0    0.996081
1   -0.363193
2   -0.958665
3   -0.274546
4    1.600161
dtype: float64

### We can also build a Series from a Python dict (the keys become the index):

In [4]:
# Dict keys become the index labels; mixing ints, a str and a float gives dtype object
d = dict(A=6, B=3, C='abc', D=np.exp(2))
pd.Series(d)

A          6
B          3
C        abc
D    7.38906
dtype: object

## DataFrame:

<b>A DataFrame is a 2D data structure whose columns can have different data types; its row labels are called the index. It can be built from the following data structures:</b>
1. Numpy array
2. Lists
3. Dicts
4. Series

In [5]:
# Build a DataFrame from a dict of Series: one Series per column
d = dict(
    column_1=pd.Series([1, 2, 3]),
    column_2=pd.Series(['abc', 10.5, 'xy']),
)
df = pd.DataFrame(d)
df

Unnamed: 0,column_1,column_2
0,1,abc
1,2,10.5
2,3,xy


In [6]:
# Build a DataFrame from a dict of plain lists: one list per column
d = dict(
    column_1=[1, 2, 3],
    column_2=['abc', 10.5, 'xy'],
    column_3=[14, 15, 26],
)
df = pd.DataFrame(d)
df

Unnamed: 0,column_1,column_2,column_3
0,1,abc,14
1,2,10.5,15
2,3,xy,26


In [7]:
# Using a NumPy array: each column of the 2-D array becomes one DataFrame column
array = np.array([[0.8, 5.5], [3.7, 12.4]])
column_names = ['Column1', 'Column2']
# array.T iterates column by column, so zip pairs every name with its column
df = pd.DataFrame(dict(zip(column_names, array.T)), index=['A', 'B'])
print(df)

   Column1  Column2
A      0.8      5.5
B      3.7     12.4


### Selection and Indexing:

In [8]:
# Selecting one column by name; the result is a Series that keeps the row labels
df['Column1']

A    0.8
B    3.7
Name: Column1, dtype: float64

In [9]:
# Selecting more than one column: pass a list of names (note the double brackets)
df[['Column1','Column2']]

Unnamed: 0,Column1,Column2
A,0.8,5.5
B,3.7,12.4


### <u>loc and iloc:</u>
#### - `loc` selects by the labels in the index.
#### - `iloc` selects by the integer positions in the index (so it only takes integers).

In [10]:
# Selecting a row by its index label with .loc
df.loc['A']

Column1    0.8
Column2    5.5
Name: A, dtype: float64

In [11]:
# Selecting a row by integer position with .iloc (position 0 is the row labeled 'A')
df.iloc[0]

Column1    0.8
Column2    5.5
Name: A, dtype: float64

In [12]:
# Check the type of the object returned when a single column is selected
type(df['Column1'])

pandas.core.series.Series

### Inserting new column

In [13]:
# Assigning to a column name that does not exist yet creates the column;
# here it holds the row-by-row sum of the two existing columns.
df['new'] = df['Column1'] + df['Column2']
df

Unnamed: 0,Column1,Column2,new
A,0.8,5.5,6.3
B,3.7,12.4,16.1


### Deleting a column

In [14]:
# axis=1 means "drop a column"; inplace=True modifies df itself instead of
# returning a new DataFrame.
df.drop('new',axis=1,inplace=True)
df

Unnamed: 0,Column1,Column2
A,0.8,5.5
B,3.7,12.4


### Selecting a subset of the dataframe with rows and columns

In [15]:
# .loc[row_label, column_label] returns the single value at that position
df.loc['A','Column1']

0.8

In [16]:
# Reset the index: the old labels ('A', 'B') move into a new column named
# 'index' and the rows get integer labels. inplace=True changes df itself,
# so re-running this cell does not give the same result again.
df.reset_index(inplace=True)
df

Unnamed: 0,index,Column1,Column2
0,A,0.8,5.5
1,B,3.7,12.4


In [17]:
# .loc with a list of row labels and a list of column labels returns a sub-frame
df.loc[[0,1],['index','Column2']]

Unnamed: 0,index,Column2
0,A,5.5
1,B,12.4


### Selection by condition:

In [18]:
# Rows where Column1 > 1, restricted to two columns, in a single .loc call
df.loc[df['Column1'] > 1, ['index', 'Column2']]

Unnamed: 0,index,Column2
1,B,12.4


In [19]:
# AND condition: combine the boolean masks with &.
# Each comparison needs its own parentheses because & binds tighter than > in Python.
df[(df['Column1']>1) & (df['Column2'] > 5)]

Unnamed: 0,index,Column1,Column2
1,B,3.7,12.4


In [20]:
# OR condition: combine the boolean masks with |; a row is kept when either test is True
df[(df['Column1']>1) | (df['Column2'] <= 5.5)]

Unnamed: 0,index,Column1,Column2
0,A,0.8,5.5
1,B,3.7,12.4


In [22]:
# NOT condition: ~ inverts the boolean mask, keeping the rows the OR filter rejected
# (none here, so the result is empty)
df[~((df['Column1']>1) | (df['Column2'] <= 5.5))]

Unnamed: 0,index,Column1,Column2


### Index properties:

In [23]:
# The index labels as a NumPy array
df.index.values

array([0, 1], dtype=int64)

In [24]:
# str.split() with no argument cuts the string on whitespace and returns a list
text = '10 abc'
a = text.split()
print(a)

['10', 'abc']


In [25]:
# Inserting a new column from a list of values (one value per row)
df['Column3'] = a
df

Unnamed: 0,index,Column1,Column2,Column3
0,A,0.8,5.5,10
1,B,3.7,12.4,abc


In [26]:
# Inserting a new row: assign a list (one value per column) to a row label
# that does not exist yet.
df.loc[2]=['C',32,11.8,'xyz']
df

Unnamed: 0,index,Column1,Column2,Column3
0,A,0.8,5.5,10
1,B,3.7,12.4,abc
2,C,32.0,11.8,xyz


## OPERATIONS:

In [27]:
# Sample frame used by the examples in this section
df = pd.DataFrame(dict(
    col1=[10, 20, 30, 40, 50],
    col2=[4, 5, 6, 5, 2],
    col3=['abc', 'def', 'ghi', 'xyz', '123'],
))
df.head()

Unnamed: 0,col1,col2,col3
0,10,4,abc
1,20,5,def
2,30,6,ghi
3,40,5,xyz
4,50,2,123


In [28]:
# Sum of all the values in col2
df['col2'].sum()

22

In [29]:
# Number of rows where col2 equals 5: the comparison gives booleans and
# each True counts as 1 in the sum.
(df['col2']==5).sum()

2

In [30]:
# Number of non-missing values in col2
df['col2'].count()

5

In [31]:
# How many times each distinct value appears in col2
df['col2'].value_counts()

5    2
6    1
4    1
2    1
Name: col2, dtype: int64

In [32]:
# The column's values as a NumPy array
df['col2'].values

array([4, 5, 6, 5, 2], dtype=int64)

In [33]:
# Column names as an array, then an alphabetically sorted list of them
a = df.columns.values
s = list(a)
s.sort()
print(s)

['col1', 'col2', 'col3']


In [34]:
# Rows ordered by the values of col2; this returns a new frame and leaves df unchanged
df.sort_values(by='col2')

Unnamed: 0,col1,col2,col3
4,50,2,123
0,10,4,abc
1,20,5,def
3,40,5,xyz
2,30,6,ghi


In [35]:
# Add a row (label 5) that contains missing values (NaN) in col1 and col3
df.loc[5]=[np.nan,2,np.nan]
df

Unnamed: 0,col1,col2,col3
0,10.0,4.0,abc
1,20.0,5.0,def
2,30.0,6.0,ghi
3,40.0,5.0,xyz
4,50.0,2.0,123
5,,2.0,


In [36]:
# Remove every row that has at least one missing value; inplace=True changes df itself
df.dropna(inplace=True)
df

Unnamed: 0,col1,col2,col3
0,10.0,4.0,abc
1,20.0,5.0,def
2,30.0,6.0,ghi
3,40.0,5.0,xyz
4,50.0,2.0,123


In [37]:
# Add the row with missing values again, then replace every NaN with a
# placeholder string instead of dropping the row.
df.loc[5]=[np.nan,2,np.nan]
df.fillna('sin valor',inplace=True)
df

Unnamed: 0,col1,col2,col3
0,10,4.0,abc
1,20,5.0,def
2,30,6.0,ghi
3,40,5.0,xyz
4,50,2.0,123
5,sin valor,2.0,sin valor


In [38]:
# Add one more row with NaN, then show a boolean frame that is True where a value is missing
df.loc[6]=[np.nan,2,np.nan]
df.isnull()

Unnamed: 0,col1,col2,col3
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
5,False,False,False
6,True,False,True


In [39]:
# Reset index: the current row labels are moved into a new 'index' column
# and the rows are relabeled with integers.
df.reset_index(inplace=True)
df

Unnamed: 0,index,col1,col2,col3
0,0,10,4.0,abc
1,1,20,5.0,def
2,2,30,6.0,ghi
3,3,40,5.0,xyz
4,4,50,2.0,123
5,5,sin valor,2.0,sin valor
6,6,,2.0,


In [40]:
# Sort the rows by their index label, largest label first (ascending=False)
df.sort_index(ascending=False,inplace=True)
df

Unnamed: 0,index,col1,col2,col3
6,6,,2.0,
5,5,sin valor,2.0,sin valor
4,4,50,2.0,123
3,3,40,5.0,xyz
2,2,30,6.0,ghi
1,1,20,5.0,def
0,0,10,4.0,abc


In [41]:
# Setting the index: the col3 column becomes the row labels and stops being a regular column
df.set_index('col3',inplace=True)
df

Unnamed: 0_level_0,index,col1,col2
col3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,6,,2.0
sin valor,5,sin valor,2.0
123,4,50,2.0
xyz,3,40,5.0
ghi,2,30,6.0
def,1,20,5.0
abc,0,10,4.0


In [42]:
# Undo the previous step: col3 goes back to being a regular column and the
# rows get integer labels again.
df.reset_index(inplace=True)
df

Unnamed: 0,col3,index,col1,col2
0,,6,,2.0
1,sin valor,5,sin valor,2.0
2,123,4,50,2.0
3,xyz,3,40,5.0
4,ghi,2,30,6.0
5,def,1,20,5.0
6,abc,0,10,4.0


### STRING OPERATIONS:

In [43]:
# Extract the first run of word characters from every string; with
# expand=False the result is a Series, and missing values stay NaN.
# The pattern is a raw string: '\w' inside a normal string literal is an
# invalid escape sequence, which Python reports with a warning.
df['col3'].str.extract(r'(\w+)',expand=False)

0    NaN
1    sin
2    123
3    xyz
4    ghi
5    def
6    abc
Name: col3, dtype: object

### Filtering:

In [44]:
# Keep only the rows whose col3 value is exactly 'abc'
df[df['col3']=='abc']

Unnamed: 0,col3,index,col1,col2
6,abc,0,10,4.0


### Uppercase/Lowercase:

In [45]:
# Upper-case every string in the column; use .str.lower() for lower-case
df['col3'].str.upper()

0          NaN
1    SIN VALOR
2          123
3          XYZ
4          GHI
5          DEF
6          ABC
Name: col3, dtype: object

### Length of the string:

In [46]:
# Number of characters in each string; missing values stay NaN
df['col3'].str.len()

0    NaN
1    9.0
2    3.0
3    3.0
4    3.0
5    3.0
6    3.0
Name: col3, dtype: float64

### Split:

In [47]:
# Split every string on the letter 'c'; each element becomes a list of pieces
df['col3'].str.split('c')

0            NaN
1    [sin valor]
2          [123]
3          [xyz]
4          [ghi]
5          [def]
6         [ab, ]
Name: col3, dtype: object

### Replace:

In [48]:
# Replace every space with a double underscore.
# regex=False states explicitly that ' ' is a literal string, not a regular
# expression; the default for `regex` is not the same in every pandas version.
df['col3'].str.replace(' ','__',regex=False)

0           NaN
1    sin__valor
2           123
3           xyz
4           ghi
5           def
6           abc
Name: col3, dtype: object

### Contains:

In [49]:
# Test whether each string contains the letter 'a'.
# Missing values give NaN rather than False in the result.
c=df['col3'].str.contains('a')
c

0      NaN
1     True
2    False
3    False
4    False
5    False
6     True
Name: col3, dtype: object

### ONE-HOT ENCODING:

In [50]:
# Small frame with two categorical (text) columns to encode
genders = ['male', 'female', 'male', 'female', 'male']
age_ranges = ['young', 'adult', 'senior', 'young', 'adult']
dfhot = pd.DataFrame({'gender': genders, 'age_range': age_ranges})
dfhot

Unnamed: 0,gender,age_range
0,male,young
1,female,adult
2,male,senior
3,female,young
4,male,adult


In [51]:
# One-hot encode the text columns: each category gets its own indicator
# column named <column>_<category>.
# dtype=int keeps the indicators as 0/1; without it, recent pandas versions
# return True/False instead.
data_dummies = pd.get_dummies(dfhot, dtype=int)
data_dummies

Unnamed: 0,gender_female,gender_male,age_range_adult,age_range_senior,age_range_young
0,0,1,0,0,1
1,1,0,1,0,0
2,0,1,0,1,0
3,1,0,0,0,1
4,0,1,1,0,0


## Reference:

https://pandas.pydata.org/pandas-docs/stable/dsintro.html

https://pandas.pydata.org/pandas-docs/stable/basics.html