In [7]:
%load_ext autoreload
%autoreload 2
import pandas as pd


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### pandas - quickstart tutorial - https://pandas.pydata.org/pandas-docs/version/0.15.2/tutorials.html


<p>A DataFrame is composed of three different components: the <span style="color:blue">index, columns, and the data</span>. The data is also known as the values.</p>

<img src="img/pandas_df.png" width="600px">

<p>All the values in the index are in bold font. Each individual value of the index is called a label. Sometimes the index is referred to as the row labels.</p>
<p>Each row and each column have a specific label (or integer position) that can be used to reference them.</p>
<p>The documentation uses the term indexing frequently. This term is essentially just a one-word phrase to say ‘subset selection’.</p>



<h5>DF Manipulation</h5>
<p>df.loc[row_labels_selection, column_labels_selection] -- e.g. df.loc[['Dean', 'Cornelia'], ['age', 'state', 'score']] or df.loc[:'Dean', 'height':] (label slices include both endpoints)</p>
<p>df.iloc[row_position_selection, column_position_selection]</p>

In [None]:
# Load the animals dataset; the file is comma-separated.
df = pd.read_csv("teste.csv", sep=",")

In [None]:
# Label-based selection: rows labelled 3 through 5 (both ends included), 'animal' column only.
df["animal"].loc[3:5]

In [6]:
# Boolean-mask filter: keep rows that are lions AND have water_need above 430.
is_lion = df["animal"] == "lion"
above_430 = df["water_need"] > 430
df.loc[is_lion & above_430]

Unnamed: 0,animal,uniq_id,water_need
16,lion,1017,600
17,lion,1018,500


In [7]:
# Use (uniq_id, animal) as a two-level row index and preview the first three rows.
# set_index returns a new frame here; df itself is not modified.
df.set_index(["uniq_id", "animal"]).head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,water_need
uniq_id,animal,Unnamed: 2_level_1
1001,elephant,500
1002,elephant,600
1003,elephant,550


In [8]:
# Keep only the rows where water_need is present, then show the first three.
has_water_need = df["water_need"].notna()
df.loc[has_water_need].head(3)

Unnamed: 0,animal,uniq_id,water_need
0,elephant,1001,500
1,elephant,1002,600
2,elephant,1003,550


In [9]:
df['critic'] = 'everyone' # add a new column; the scalar is broadcast to every row (modifies df in place)

In [10]:
# First two rows, to confirm the new 'critic' column is present.
df.head(2)

Unnamed: 0,animal,uniq_id,water_need,critic
0,elephant,1001,500,everyone
1,elephant,1002,600,everyone


In [11]:
# Membership filter: rows whose animal is one of the listed values (first five shown).
wanted_animals = ["lion", "elephant"]
df.loc[df["animal"].isin(wanted_animals)].head(5)

Unnamed: 0,animal,uniq_id,water_need,critic
0,elephant,1001,500,everyone
1,elephant,1002,600,everyone
2,elephant,1003,550,everyone
15,lion,1016,420,everyone
16,lion,1017,600,everyone


In [12]:
# The boolean mask itself: True where 'animal' is not missing (first three entries).
df["animal"].notna().head(3)

0    True
1    True
2    True
Name: animal, dtype: bool

In [13]:
# Apply that mask through .loc to keep only rows with a non-missing animal.
animal_present = df["animal"].notna()
df.loc[animal_present].head(3)

Unnamed: 0,animal,uniq_id,water_need,critic
0,elephant,1001,500,everyone
1,elephant,1002,600,everyone
2,elephant,1003,550,everyone


##### KAGGLE  - Summary functions and maps workbook


In [14]:
# Distinct values of the 'animal' column, in order of first appearance.
df["animal"].unique()


array(['elephant', 'tiger', 'zebra', 'lion', 'kangaroo'], dtype=object)

In [15]:
# Median of the water_need column.
df["water_need"].median()

325.0

In [16]:
# Number of rows per animal, most frequent first.
df["animal"].value_counts()

zebra       7
tiger       5
lion        4
kangaroo    3
elephant    3
Name: animal, dtype: int64

In [17]:
# Centre water_need around zero by subtracting the column mean from every value.
centered_water_need = df["water_need"].sub(df["water_need"].mean())

In [18]:
# First four centred values.
print(centered_water_need.head(4))

0    152.272727
1    252.272727
2    202.272727
3    -47.727273
Name: water_need, dtype: float64


In [19]:
# idxmax returns the index label of the (first) row holding the largest value.
# NOTE(review): the `bargain_*` names appear to be carried over from the Kaggle
# wine exercise; here they hold the row label and the value of the maximum
# water_need. Names kept so any later cell that uses them still works.
bargain_idx = df["water_need"].idxmax()
bargain_wine = df.loc[bargain_idx, "water_need"]
print(bargain_idx, bargain_wine)

1 600


In [20]:
# TODO: do a similar membership count on the numeric water_need column; it
# needs to be cast first before a membership test can be applied to it.

# Count the rows whose animal name contains a given substring.
# The vectorised .str.contains replaces a per-element lambda (`"lion" in desc`),
# which raises TypeError when a value is missing (NaN). na=False treats missing
# values as non-matches, and regex=False keeps this a plain substring test.
lions = df["animal"].str.contains("lion", regex=False, na=False).sum()
zebras = df["animal"].str.contains("zebra", regex=False, na=False).sum()
descriptor_counts = pd.Series([lions, zebras], index=["lions", "zebras"])
print(descriptor_counts)

lions     4
zebras    7
dtype: int64
