In [91]:
import os
import pandas as pd
import numpy as np

# Record the library versions this notebook was executed with.
print(
    f'pandas version: {pd.__version__}',
    f'numpy version: {np.__version__}',
    sep='\n',
)

pandas version: 2.2.2
numpy version: 2.0.0


In [92]:
# Show where the notebook is running and which files sit next to it.
print(os.getcwd())
# os.listdir() returns entries in arbitrary order; sort them so the listing
# is the same on every run.
print(sorted(os.listdir()))

c:\Users\ccino\Documents\pandas\pandas_exercises\own notebooks
['df_lists.csv', 'lessons.ipynb']


In [93]:
# Chipotle order data, a tab-separated file in the justmarkham/DAT8 GitHub repository.
url = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv"
df = pd.read_csv(url, sep="\t")  # a tab separator is what read_table uses by default
df.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


In [94]:
# DataFrame creation examples.
# By default a notebook cell renders only its last expression, so the
# intermediate .head() calls below produce no visible output.

# 1) From a dict that maps each column name to a list of values.
dict_example = {
    "A": [1, 2, 3],
    "B": [4, 5, 6],
    "C": [7, 8, 9],
    "D": [10, 11, 12],
}
df_dict = pd.DataFrame(data=dict_example)
df_dict.head()

# 2) From a list of rows, with the column labels passed separately.
df_lists = pd.DataFrame(
    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    columns=list("ABC"),
)
df_lists.head()

# 3) From a dict that mixes a plain list, NumPy arrays and a pandas Series.
# The generator is seeded so the random columns are reproducible; the draws
# happen in the order Age, Gender, Number, Condition.
np_generator = np.random.default_rng(seed=12545)
df_series = pd.DataFrame(
    {
        "Names": ["Ana", "Clara", "Julia", "Pedro", "Caio"],
        "Age": np_generator.integers(low=10, high=25, size=5),  # integers in [10, 25)
        "Gender": pd.Series(
            np_generator.choice(["m", "f", pd.NA], size=5, replace=True),
            dtype="string",  # nullable string dtype, so pd.NA stays a missing value
        ),
        "Number": np_generator.uniform(low=10, high=20, size=5).round(2),
        "Condition": np_generator.choice([True, False], size=5),
    }
)

df_series.head()

Unnamed: 0,Names,Age,Gender,Number,Condition
0,Ana,17,f,18.87,False
1,Clara,20,m,11.55,True
2,Julia,15,f,18.61,False
3,Pedro,18,,12.48,True
4,Caio,17,f,10.73,False


In [95]:
# Selecting rows and columns from df_series.
# Every line is a standalone example; by default only the last one is rendered
# as the cell output.

# --- by position (.iloc) ---
df_series.iloc[[1]]  # second row, kept as a one-row DataFrame
df_series.iloc[:3]  # first three rows
df_series.iloc[:, [1, 2]]  # every row, second and third columns only
df_series.iloc[-1]  # last row, returned as a Series

# --- by label (.loc) ---
df_series.index = list('abcde')  # letter labels make the label-based examples easier to follow
df_series.loc['b':'c']  # rows b through c (label slices include both endpoints)
df_series.loc[:, ['Age', 'Gender']]  # every row, only the Age and Gender columns
# first three rows of Age and Gender: translate the labels to positions for .iloc
df_series.iloc[:3, [df_series.columns.get_loc(label) for label in ('Age', 'Gender')]]

# --- by boolean mask ---
df_series.loc[df_series['Condition'].eq(True)]  # rows where Condition is True
df_series.loc[df_series['Age'].gt(15)]  # rows where Age is above 15
df_series.loc[df_series['Age'].gt(15) & df_series['Condition'].ne(False)]  # Age above 15 and Condition not False
df_series.loc[df_series['Gender'].isna()]  # rows where Gender is missing
df_series.loc[df_series['Condition'].eq(False) | df_series['Gender'].eq("m")]  # Condition is False or Gender is "m"

Unnamed: 0,Names,Age,Gender,Number,Condition
a,Ana,17,f,18.87,False
b,Clara,20,m,11.55,True
c,Julia,15,f,18.61,False
e,Caio,17,f,10.73,False


In [102]:
#adding columns: each new column is derived from columns that already exist
df_series['Minor'] = df_series['Age'].lt(18)
df_series['Age plus one'] = df_series['Age'].add(1)
df_series['Age times number'] = df_series['Age'].mul(df_series['Number'])

#editing values: select the rows with a boolean mask, then assign to the column
df_series.loc[df_series['Names'].eq('Clara'), ['Gender']] = 'f'
df_series.loc[df_series['Names'].isin(['Caio', 'Pedro']), ['Gender']] = 'm'

#deleting columns, two ways
#drop(..., inplace=True) modifies df_series itself; without inplace, drop
#returns a new DataFrame that would have to be assigned back
df_series.drop(['Age plus one', 'Minor'], axis='columns', inplace=True)
del df_series['Age times number']
df_series.head()

#flag each person who is female and above 16, using vectorised comparisons:
#build the boolean mask, turn it into text, then relabel the two values
df_series['Check'] = (
    (df_series['Age'].gt(16) & df_series['Gender'].eq('f'))
    .astype('string')
    .replace({"True" : "OK", "False" : "N"})
)
del df_series['Check'] #undo, so the column can be rebuilt another way

#lets do the same thing again, this time with the apply method and a function
def checker(age, gender):
    """Return "OK" for a female above 16, otherwise "N".

    Parameters
    ----------
    age : number
        The person's age; may be a missing value.
    gender : str
        The person's gender code ('f' marks a female); may be a missing value.

    Returns
    -------
    str
        "OK" when age is above 16 and gender equals 'f', otherwise "N".
        A missing age or gender also gives "N", because the check cannot be
        confirmed.
    """
    # Comparing pd.NA yields pd.NA, and using that in `and` / `if` raises
    # "boolean value of NA is ambiguous", so handle missing values first.
    if pd.isna(age) or pd.isna(gender):
        return "N"
    return "OK" if (age > 16 and gender == 'f') else "N"

#axis=1 feeds the function one row at a time; axis=0 would feed it one column at a time
df_series['Check'] = df_series.apply(
    lambda person: checker(person['Age'], person['Gender']),
    axis=1,
)
df_series.head()

Unnamed: 0,Names,Age,Gender,Number,Condition,Check
a,Ana,17,f,18.87,False,OK
b,Clara,20,f,11.55,True,OK
c,Julia,15,f,18.61,False,N
d,Pedro,18,m,12.48,True,N
e,Caio,17,m,10.73,False,N
