# Pandas

In [7]:
import pandas as pd
import numpy as np
# Print the installed pandas version alongside the outputs recorded below.
print('Pandas version:', pd.__version__)

Pandas version: 2.3.3


## 1. Series: one-dimensional data

### 1.1 Creating series

In [3]:
# Build a Series from a plain list; with no index given, the rows are
# labelled with the default integer positions 0..4.
temp = pd.Series(data=[22, 25, 23, 28, 24])
temp

0    22
1    25
2    23
3    28
4    24
dtype: int64

In [4]:
# Confirm the object is a pandas Series.
type(temp)

pandas.core.series.Series

In [3]:
# Rebuild the Series with weekday labels instead of the default integer index.
day = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri']
temp = pd.Series(data=[22, 25, 23, 28, 24], index=day)
temp

Mon      22
Tues     25
Wed      23
Thurs    28
Fri      24
dtype: int64

In [4]:
# Positional lookup: .iloc[2] selects the element at integer position 2 (the third one).
temp.iloc[2]

np.int64(23)

In [5]:
# Label lookup: the same element, addressed by its index label.
temp['Wed']

np.int64(23)

In [6]:
# Average of the five readings.
temp.mean()

np.float64(24.4)

In [7]:
# Largest reading in the Series.
temp.max()

np.int64(28)

## 2. DataFrame: two-dimensional data

In [3]:
# Plain dict mapping each field name to a list of four values (one per person).
data = dict(
    name=['Alice', 'Bob', 'Charlie', 'Diana'],
    age=[25, 20, 35, 28],
    city=['New York', 'Paris', 'London', 'Tokyo'],
    salary=[50000, 60000, 55000, 58000],
)

type(data)

dict

In [4]:
# Echo the dictionary to inspect the raw lists before building a DataFrame from it.
data

{'name': ['Alice', 'Bob', 'Charlie', 'Diana'],
 'age': [25, 20, 35, 28],
 'city': ['New York', 'Paris', 'London', 'Tokyo'],
 'salary': [50000, 60000, 55000, 58000]}

In [5]:
# Turn the dict into a DataFrame: keys become column names, lists become column values.
df = pd.DataFrame(data=data)
df

Unnamed: 0,name,age,city,salary
0,Alice,25,New York,50000
1,Bob,20,Paris,60000
2,Charlie,35,London,55000
3,Diana,28,Tokyo,58000


In [7]:
# Replace the default integer index with descriptive row labels.
# The result is assigned back instead of using inplace=True, so the cell has
# no hidden in-place mutation and matches how the index is set again later
# in this notebook.
index = ['first', 'second', 'third', 'fourth']
df = df.set_index(pd.Index(index))

df

Unnamed: 0,name,age,city,salary
first,Alice,25,New York,50000
second,Bob,20,Paris,60000
third,Charlie,35,London,55000
fourth,Diana,28,Tokyo,58000


In [9]:
# Go back to the default 0..n-1 integer index. drop=True discards the old
# labels instead of inserting them as a new column. The result is assigned
# back rather than mutated with inplace=True.
df = df.reset_index(drop=True)

df

Unnamed: 0,name,age,city,salary
0,Alice,25,New York,50000
1,Bob,20,Paris,60000
2,Charlie,35,London,55000
3,Diana,28,Tokyo,58000


In [11]:
# Show the frame without the salary column. The result is not assigned,
# so df itself still contains salary afterwards.
df.drop(columns=['salary'])

Unnamed: 0,name,age,city
0,Alice,25,New York
1,Bob,20,Paris
2,Charlie,35,London
3,Diana,28,Tokyo


In [14]:
# Preview the first two rows.
df.head(2)

Unnamed: 0,name,age,city,salary
0,Alice,25,New York,50000
1,Bob,20,Paris,60000


In [16]:
# Preview the last two rows.
df.tail(2)

Unnamed: 0,name,age,city,salary
2,Charlie,35,London,55000
3,Diana,28,Tokyo,58000


In [12]:
# Summarize the index range, each column's dtype and non-null count, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    4 non-null      object
 1   age     4 non-null      int64 
 2   city    4 non-null      object
 3   salary  4 non-null      int64 
dtypes: int64(2), object(2)
memory usage: 260.0+ bytes


In [13]:
# Descriptive statistics for the numeric columns (here: age and salary).
df.describe()

Unnamed: 0,age,salary
count,4.0,4.0
mean,27.0,55750.0
std,6.271629,4349.32945
min,20.0,50000.0
25%,23.75,53750.0
50%,26.5,56500.0
75%,29.75,58500.0
max,35.0,60000.0


In [17]:
# Selecting a single column by name returns a Series.
df['name']

0      Alice
1        Bob
2    Charlie
3      Diana
Name: name, dtype: object

In [18]:
# Passing a list of column names returns a DataFrame with just those columns.
df[['name', 'city']]

Unnamed: 0,name,city
0,Alice,New York
1,Bob,Paris
2,Charlie,London
3,Diana,Tokyo


In [19]:
# Re-apply the descriptive row labels so the .loc examples that follow can select rows by name.
index = ['first', 'second', 'third', 'fourth']
df = df.set_index(pd.Index(index))

df

Unnamed: 0,name,age,city,salary
first,Alice,25,New York,50000
second,Bob,20,Paris,60000
third,Charlie,35,London,55000
fourth,Diana,28,Tokyo,58000


In [20]:
# .loc with a single row label returns that row as a Series.
df.loc['first']

name         Alice
age             25
city      New York
salary       50000
Name: first, dtype: object

In [21]:
# .loc with a list of row labels returns the matching rows as a DataFrame.
df.loc[['first', 'second']]

Unnamed: 0,name,age,city,salary
first,Alice,25,New York,50000
second,Bob,20,Paris,60000


In [22]:
# Row labels first, column names second: select a labelled sub-table.
df.loc[['first', 'second'], ['name', 'salary']]

Unnamed: 0,name,salary
first,Alice,50000
second,Bob,60000


In [23]:
# Boolean filtering: keep only the rows whose salary is above 50000.
df[df['salary'] > 50000]

Unnamed: 0,name,age,city,salary
second,Bob,20,Paris,60000
third,Charlie,35,London,55000
fourth,Diana,28,Tokyo,58000


In [24]:
# Keep only the rows whose city equals 'Tokyo'.
df[df['city'] == 'Tokyo']

Unnamed: 0,name,age,city,salary
fourth,Diana,28,Tokyo,58000


In [25]:
# The comparison on its own yields a boolean Series, which is the mask used for filtering.
df['salary'] > 50000

first     False
second     True
third      True
fourth     True
Name: salary, dtype: bool

In [29]:
# Combine two conditions with & (element-wise AND); each comparison is wrapped
# in parentheses because & binds more tightly than > and ==.
df[(df['salary'] > 50000) & (df['city'] == 'Tokyo')]

Unnamed: 0,name,age,city,salary
fourth,Diana,28,Tokyo,58000


In [30]:
# List the column labels.
df.columns

Index(['name', 'age', 'city', 'salary'], dtype='object')

In [32]:
# Rename the 'city' column to 'location'; the renamed frame is assigned back to df.
df = df.rename(columns = {'city': 'location'})

df

Unnamed: 0,name,age,location,salary
first,Alice,25,New York,50000
second,Bob,20,Paris,60000
third,Charlie,35,London,55000
fourth,Diana,28,Tokyo,58000


In [13]:
# Same people as in the earlier example, but with one age and one salary
# replaced by NaN to demonstrate missing-value handling.
missing_data = dict(
    name=['Alice', 'Bob', 'Charlie', 'Diana'],
    age=[25, np.nan, 35, 28],
    city=['New York', 'Paris', 'London', 'Tokyo'],
    salary=[50000, 60000, np.nan, 58000],
)

data_with_missing = pd.DataFrame(data=missing_data)
data_with_missing

Unnamed: 0,name,age,city,salary
0,Alice,25.0,New York,50000.0
1,Bob,,Paris,60000.0
2,Charlie,35.0,London,
3,Diana,28.0,Tokyo,58000.0


In [9]:
# The non-null counts expose the gaps (3 of 4 for age and salary), and both
# columns are reported as float64 because they contain NaN.
data_with_missing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    4 non-null      object 
 1   age     3 non-null      float64
 2   city    4 non-null      object 
 3   salary  3 non-null      float64
dtypes: float64(2), object(2)
memory usage: 260.0+ bytes


In [10]:
# Element-wise mask: True wherever a value is missing.
data_with_missing.isnull()

Unnamed: 0,name,age,city,salary
0,False,False,False,False
1,False,True,False,False
2,False,False,False,True
3,False,False,False,False


In [11]:
# Summing the mask counts the missing values in each column.
data_with_missing.isnull().sum()

name      0
age       1
city      0
salary    1
dtype: int64

In [12]:
# Fill the missing age with the mean of the known ages. The filled Series is
# only displayed, not assigned back, so data_with_missing keeps its NaN.
data_with_missing['age'].fillna(data_with_missing['age'].mean())

0    25.000000
1    29.333333
2    35.000000
3    28.000000
Name: age, dtype: float64

## 4. Working with data types

In [17]:
# Example frame whose columns mostly hold values stored as strings.
# NOTE(review): despite its name, 'numbers_as_text' is built from numeric
# literals (ints mixed with floats), so info() reports it as float64 rather
# than object. Confirm whether string values were intended.
mixed_data = pd.DataFrame(
    data=dict(
        numbers_as_text=[1, 2, 3, 4.0, 5.0],
        prices=['10.60', '20.75', '15.25', '30.00', '25.50'],
        categories=['A', 'B', 'A', 'C', 'B'],
        is_active=['True', 'False', 'True', 'True', 'False'],
        dates_as_text=['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05'],
    )
)

mixed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   numbers_as_text  5 non-null      float64
 1   prices           5 non-null      object 
 2   categories       5 non-null      object 
 3   is_active        5 non-null      object 
 4   dates_as_text    5 non-null      object 
dtypes: float64(1), object(4)
memory usage: 332.0+ bytes


In [20]:
# Parse the price strings into floats; info() now reports prices as float64.
mixed_data['prices'] = mixed_data['prices'].astype(float)
mixed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   numbers_as_text  5 non-null      float64
 1   prices           5 non-null      float64
 2   categories       5 non-null      object 
 3   is_active        5 non-null      object 
 4   dates_as_text    5 non-null      object 
dtypes: float64(2), object(3)
memory usage: 332.0+ bytes


In [21]:
# Downcast prices to 32-bit floats; info() reports float32 and a smaller
# memory footprint (312 bytes versus 332 before).
mixed_data['prices'] = mixed_data['prices'].astype(np.float32)
mixed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   numbers_as_text  5 non-null      float64
 1   prices           5 non-null      float32
 2   categories       5 non-null      object 
 3   is_active        5 non-null      object 
 4   dates_as_text    5 non-null      object 
dtypes: float32(1), float64(1), object(3)
memory usage: 312.0+ bytes
