pandas series

In [1]:
import pandas as pd

In [2]:
# creating a series from list
# observe that s list of values is provided 
myseries = pd.Series([10, 20, 30])

print(myseries)

0    10
1    20
2    30
dtype: int64


In [3]:
# Creating a custom-index Series
myseries = pd.Series(
     [10,20,30], 
     index = ["a","b","c"]
)

print(myseries)
print(myseries['a'])

a    10
b    20
c    30
dtype: int64
10


In [4]:
# Accessing an item in a Series
myseries = pd.Series(
   ["Jane","John","Emily","Matt"]
)

# Print the first item
print(myseries[0])

Jane


In [5]:
# Checking if all the values are unique  ==> basically know whether there is some duplicate in series
# is_unique method on the series
myseries = pd.Series([1,2,3])
print(myseries.is_unique)

myseries = pd.Series([1,1,3])
print(myseries.is_unique)

True
False


pandas dataframe

In [6]:
df = pd.DataFrame({
    "Name": ["Jane", "John", "Matt", "Ashley"],
    "Age": [24, 21, 26, 32]
})

print(df)

     Name  Age
0    Jane   24
1    John   21
2    Matt   26
3  Ashley   32


In [7]:
# Dictionary keys become the column names and the values become the data stored in the DataFrame. 
# We now have a DataFrame with two columns and four rows. 
# We can check the dimensions of a DataFrame using the shape method that returns a tuple containing the number of rows and columns.
print(df.shape)

(4, 2)


Creating a dafarame from different files has methods like read_sql, read_csv, read_json, read_parquet, read_excel

In [8]:
sales_df = pd.read_csv('sales.csv')
# now the data might be very large and I want to see only top 5 rows I can use head function
# also notice that head() is the method 
# whereas shape is a property that means pandas class must have a attribute of shape
print(sales_df.head())
print(sales_df.shape)

   product_code product_group  stock_qty    cost    price  last_week_sales  \
0          4187           PG2        498  420.76   569.91               13   
1          4195           PG2        473  545.64   712.41               16   
2          4204           PG2        968  640.42   854.91               22   
3          4219           PG2        241  869.69  1034.55               14   
4          4718           PG2       1401   12.54    26.59               50   

   last_month_sales  
0                58  
1                58  
2                88  
3                45  
4               285  
(1000, 7)


In [9]:
# the file might have a lot of columns but i just want to read a few of them what to do???
# usecols argument
sales_col_df = pd.read_csv("sales.csv", usecols=["product_code","product_group","stock_qty"])
print(sales_col_df.head())
print(sales_col_df.shape)

   product_code product_group  stock_qty
0          4187           PG2        498
1          4195           PG2        473
2          4204           PG2        968
3          4219           PG2        241
4          4718           PG2       1401
(1000, 3)


In [10]:
# now you see i restricted the columns I want to load in df, what about rows
# can i somehow retrict the number of rows I am reading??
# nrows
sales_rows_cols_df = pd.read_csv("sales.csv", usecols=["product_code","product_group","stock_qty"], nrows=10)
print(sales_rows_cols_df.head())
print(sales_rows_cols_df.shape)

   product_code product_group  stock_qty
0          4187           PG2        498
1          4195           PG2        473
2          4204           PG2        968
3          4219           PG2        241
4          4718           PG2       1401
(10, 3)


Create a dataframe python dictionary or 2-d array 

In [11]:
# pd.DataFrame construcotr to create using dictionary
df = pd.DataFrame({
  "Names": ["Jane", "John", "Matt", "Ashley"],
  "Ages": [26, 24, 28, 25],
  "Score": [91.2, 94.1, 89.5, 92.3]
})

print(df)

    Names  Ages  Score
0    Jane    26   91.2
1    John    24   94.1
2    Matt    28   89.5
3  Ashley    25   92.3


In [12]:
import numpy as np
arr = np.random.randint(10, 20, size=(3,5))
df = pd.DataFrame(arr, columns=["A","B","C","D","E"])
print(arr)
print(df)


[[17 13 12 14 13]
 [10 17 17 13 19]
 [10 12 11 10 18]]
    A   B   C   D   E
0  17  13  12  14  13
1  10  17  17  13  19
2  10  12  11  10  18


##  Exploring a dataframe

#### 1. size of a dataframe (The size, shape, and len methods) # see all these are attributes

In [13]:
sales = pd.read_csv("sales.csv")
# shape gives a tuple telling rows and columns
print("shape:",sales.shape)
# size gives the number of columns * number of roes
print("size:",sales.size)
# len gives number of rows
print("len:",len(sales))

shape: (1000, 7)
size: 7000
len: 1000


#### 2. Data Types of Columns

In [14]:
# Data structures of different data types take up a different amount of memory space. 
# Having proper data types saves us from wasting memory.

# Some methods and functions can also be used with certain data types. 
# For instance, we need to store data that contains date and time in the data structures of datetime data type to use it.
### dtypes property to see the data types of the df
sales = pd.read_csv("sales.csv")

print(sales.dtypes)

product_code          int64
product_group        object
stock_qty             int64
cost                float64
price               float64
last_week_sales       int64
last_month_sales      int64
dtype: object


In [15]:
 # Column property returns the columns as an index, 
 # but we can convert it to a list with the help of the list function.
 
sales = pd.read_csv("sales.csv")

print("As index:")
print(sales.columns)

print("As list:")
print(list(sales.columns))

As index:
Index(['product_code', 'product_group', 'stock_qty', 'cost', 'price',
       'last_week_sales', 'last_month_sales'],
      dtype='object')
As list:
['product_code', 'product_group', 'stock_qty', 'cost', 'price', 'last_week_sales', 'last_month_sales']


In [16]:
# The data type of the stock quantity column is an integer. 
# Suppose we have some products whose stock amount can be a decimal point number. 
# For instance, we might have 125.2 kg of rice.

# We can use the astpye function to change the data types of columns.
### astype mthod
sales = pd.read_csv("sales.csv")

sales["stock_qty"] = sales["stock_qty"].astype("float")

print(sales.dtypes)

product_code          int64
product_group        object
stock_qty           float64
cost                float64
price               float64
last_week_sales       int64
last_month_sales      int64
dtype: object


In [17]:
# The astype function also accepts a dictionary, so we can change the data type of multiple columns in a single operation. 
# The dictionary keys indicate that the column name and values are the new data types.

sales = pd.read_csv("sales.csv")

sales = sales.astype({
  "stock_qty": "float",
  "last_week_sales": "float"
})

print(sales.dtypes)

product_code          int64
product_group        object
stock_qty           float64
cost                float64
price               float64
last_week_sales     float64
last_month_sales      int64
dtype: object


In [18]:
# Using the unique and nunique functions
# The nunique function returns the number of distinct values in a column 
# and the unique function actually shows the unique values.
sales = pd.read_csv("sales.csv")
print(sales["product_group"])
print(sales["product_group"].nunique())
print(sales["product_group"].unique())

0      PG2
1      PG2
2      PG2
3      PG2
4      PG2
      ... 
995    PG4
996    PG4
997    PG2
998    PG2
999    PG5
Name: product_group, Length: 1000, dtype: object
6
['PG2' 'PG4' 'PG6' 'PG5' 'PG3' 'PG1']


In [19]:
# unique is giving me unique values and nunique gives me count of unique. 
# I need to get which unique comes how many times. ==> use value_counts
print(sales["product_group"].value_counts())

PG4    349
PG5    255
PG6    243
PG2     75
PG3     39
PG1     39
Name: product_group, dtype: int64


##MEAN, MODE, MEDIAN (CENTRAL TENDENCY)

In [20]:
# median on series
myseries = pd.Series([1, 2, 5, 7, 11, 36])
print(myseries.median())

6.0


In [21]:
# mode on a series (mode is returning series of number which are the mode in series eg. 6 and 11 here)
myseries = pd.Series([1, 4, 6, 6, 6, 11, 11,11, 24])
print(f"The mode of my series is {myseries.mode()[1]}")

The mode of my series is 11


In [22]:
# min, max and mean, mode and median
print("mean: ")
print(sales["price"].mean())

print("median: ")
print(sales["price"].median())

print("mode: ")
print(sales["price"].mode()[0])

print("minimum: ")
print(sales["price"].min())

print("maximum: ")
print(sales["price"].max())

mean: 
67.06351000000001
median: 
23.74
mode: 
10.44
minimum: 
0.66
maximum: 
1500.05


In [23]:
# variance and standard deviation 
print("variance: ")
print(sales["price"].var())

print("standard deviation: ")
print(sales["price"].std())

variance: 
20766.243824604506
standard deviation: 
144.10497501684148


In [24]:
sales["product_code"].value_counts().index[:3].to_list()

[5694, 2591, 2645]

In [25]:
x = sales["product_code"].value_counts().index.tolist()
x[:3]

[5694, 2591, 2645]

## Filtering the dataframe

In [26]:
# The main difference between them is the way they access rows and columns:

# loc uses row and column labels.
# iloc uses row and column indexes.

sales = pd.read_csv("sales.csv")
sales.head()

Unnamed: 0,product_code,product_group,stock_qty,cost,price,last_week_sales,last_month_sales
0,4187,PG2,498,420.76,569.91,13,58
1,4195,PG2,473,545.64,712.41,16,58
2,4204,PG2,968,640.42,854.91,22,88
3,4219,PG2,241,869.69,1034.55,14,45
4,4718,PG2,1401,12.54,26.59,50,285


In [27]:
sales[["product_group","product_code","stock_qty"]]

Unnamed: 0,product_group,product_code,stock_qty
0,PG2,4187,498
1,PG2,4195,473
2,PG2,4204,968
3,PG2,4219,241
4,PG2,4718,1401
...,...,...,...
995,PG4,8048,415
996,PG4,8050,-10
997,PG2,952,5388
998,PG2,1307,44996


In [28]:
sales[:5]

Unnamed: 0,product_code,product_group,stock_qty,cost,price,last_week_sales,last_month_sales
0,4187,PG2,498,420.76,569.91,13,58
1,4195,PG2,473,545.64,712.41,16,58
2,4204,PG2,968,640.42,854.91,22,88
3,4219,PG2,241,869.69,1034.55,14,45
4,4718,PG2,1401,12.54,26.59,50,285


In [29]:
sales.loc[:4]

Unnamed: 0,product_code,product_group,stock_qty,cost,price,last_week_sales,last_month_sales
0,4187,PG2,498,420.76,569.91,13,58
1,4195,PG2,473,545.64,712.41,16,58
2,4204,PG2,968,640.42,854.91,22,88
3,4219,PG2,241,869.69,1034.55,14,45
4,4718,PG2,1401,12.54,26.59,50,285


In [30]:
# loc is looking at the column name 
sales.loc[:4,["product_group","product_code","stock_qty"]]

Unnamed: 0,product_group,product_code,stock_qty
0,PG2,4187,498
1,PG2,4195,473
2,PG2,4204,968
3,PG2,4219,241
4,PG2,4718,1401


In [31]:
# above thing with iloc only used the indexes
sales.iloc[:4,[1,0,2]]

Unnamed: 0,product_group,product_code,stock_qty
0,PG2,4187,498
1,PG2,4195,473
2,PG2,4204,968
3,PG2,4219,241


In [32]:
# Get rows from row. no 6 to 9 and initial 2 columns
print(sales.iloc[[5,6,7,8], [0,1]])

   product_code product_group
5          5630           PG4
6          5631           PG4
7          5634           PG4
8          2650           PG4


In [33]:
print(sales.iloc[5:9, :2])

   product_code product_group
5          5630           PG4
6          5631           PG4
7          5634           PG4
8          2650           PG4


Pandas assigns integer labels to rows by default. Unless we specify otherwise, row indexes and labels will be the same.

In [34]:
df = pd.DataFrame(
  np.random.randint(10, size=(4,4)),
  index = ["a","b","c","d"],
  columns = ["col_a","col_b","col_c","col_d"]
  )

print(df)

print("\nSelect two rows and two columns using loc:")
print(df.loc[["b","d"], ["col_a","col_c"]])

   col_a  col_b  col_c  col_d
a      2      9      6      8
b      4      8      2      1
c      2      9      6      0
d      9      6      3      3

Select two rows and two columns using loc:
   col_a  col_c
b      4      2
d      9      3


In [35]:
print("\nSelect two rows and two columns using loc:")
print(df.iloc[[1,3], [0,2]])


Select two rows and two columns using loc:
   col_a  col_c
b      4      2
d      9      3


# Selecting a subset of columns

In [36]:
sales = pd.read_csv("sales.csv")

selected_columns = ["product_code","price"]

print(sales[selected_columns].head())
# or
print(sales[["product_code","price"]].head())

   product_code    price
0          4187   569.91
1          4195   712.41
2          4204   854.91
3          4219  1034.55
4          4718    26.59
   product_code    price
0          4187   569.91
1          4195   712.41
2          4204   854.91
3          4219  1034.55
4          4718    26.59


In [37]:
#pandas expect list otherwise error, see here
print(sales["product_code","price"].head())

KeyError: ('product_code', 'price')

In [None]:
# Even if we want to select only one column, we need to put it in a list.
# Otherwise, Pandas will return a Series instead of a DataFrame with one column.
print(sales["product_code"][:2],type(sales["product_code"]))
print(sales[["product_code"]][:2],type(sales[["product_code"]]))

0    4187
1    4195
Name: product_code, dtype: int64 <class 'pandas.core.series.Series'>
   product_code
0          4187
1          4195 <class 'pandas.core.frame.DataFrame'>


##Filtering by Condition

In [38]:
# The following line of code selects the products that belong to product group PG2.
sales_filtered = sales[sales.product_group == "PG2"]
sales_filtered.head()

Unnamed: 0,product_code,product_group,stock_qty,cost,price,last_week_sales,last_month_sales
0,4187,PG2,498,420.76,569.91,13,58
1,4195,PG2,473,545.64,712.41,16,58
2,4204,PG2,968,640.42,854.91,22,88
3,4219,PG2,241,869.69,1034.55,14,45
4,4718,PG2,1401,12.54,26.59,50,285


In [39]:
sales_PG1_filtered = sales[sales["product_group"] == "PG2"]
sales_PG1_filtered.head()

Unnamed: 0,product_code,product_group,stock_qty,cost,price,last_week_sales,last_month_sales
0,4187,PG2,498,420.76,569.91,13,58
1,4195,PG2,473,545.64,712.41,16,58
2,4204,PG2,968,640.42,854.91,22,88
3,4219,PG2,241,869.69,1034.55,14,45
4,4718,PG2,1401,12.54,26.59,50,285


In [40]:
# We can use any of the options above, unless there’s a space in the column name.
# In such cases, the first option won’t work.

In [41]:
sales_numeric_filtered = sales[sales["price"] > 100]
sales_numeric_filtered.head()
# The operators we can use to create conditions are:

# ==: equal
# !=: not equal
# >: greater than
# >=: greater than or equal to
# <: less than
# <=: less than or equal to

Unnamed: 0,product_code,product_group,stock_qty,cost,price,last_week_sales,last_month_sales
0,4187,PG2,498,420.76,569.91,13,58
1,4195,PG2,473,545.64,712.41,16,58
2,4204,PG2,968,640.42,854.91,22,88
3,4219,PG2,241,869.69,1034.55,14,45
8,2650,PG4,239,59.4,111.06,15,38


In [42]:
# Multiple conditions
# filter the sales data frame # logical and here
sales_filtered = sales[(sales["price"] > 100) & (sales["stock_qty"] < 400)]

print(sales_filtered[["price","stock_qty"]].head())

       price  stock_qty
3    1034.55        241
8     111.06        239
165   208.91        244
186   427.41        369
199   104.49        144


When combining multiple conditions, make sure to put each filter inside parentheses. 
Otherwise, a value error will be generated.

In [43]:
# The | operator is used to combine multiple conditions with OR logic.
sales_filtered = sales[(sales["product_group"] == "PG1") | (sales["product_group"] == "PG2")]
sales_filtered.head()

Unnamed: 0,product_code,product_group,stock_qty,cost,price,last_week_sales,last_month_sales
0,4187,PG2,498,420.76,569.91,13,58
1,4195,PG2,473,545.64,712.41,16,58
2,4204,PG2,968,640.42,854.91,22,88
3,4219,PG2,241,869.69,1034.55,14,45
4,4718,PG2,1401,12.54,26.59,50,285


In [44]:
# The isin method
# There’s a more practical option, which is the isin method. It accepts a list of values used for filtering.
sales_filtered = sales[sales["product_group"].isin(["PG1","PG2","PG3"])]
print(sales_filtered[["product_code","product_group"]].head())

   product_code product_group
0          4187           PG2
1          4195           PG2
2          4204           PG2
3          4219           PG2
4          4718           PG2


In [45]:
# Finally, we have the not operator (~). It’s used before the name of the DataFrame inside the square brackets.
# We can select the products that aren’t in product groups PG1, PG2, or PG3 as follows:

sales_filtered = sales[~sales["product_group"].isin(["PG1","PG2","PG3"])]
print(sales_filtered[["product_group"]].head())
print(sales_filtered["product_group"].value_counts())

  product_group
5           PG4
6           PG4
7           PG4
8           PG4
9           PG4
PG4    349
PG5    255
PG6    243
Name: product_group, dtype: int64


#The query function

In [46]:
# It differs from previous functions because it can write the conditions as text. 
# It’s quite useful and is more practical in various cases.

sales_filtered = sales.query("price > 100")
sales_filtered.head()

Unnamed: 0,product_code,product_group,stock_qty,cost,price,last_week_sales,last_month_sales
0,4187,PG2,498,420.76,569.91,13,58
1,4195,PG2,473,545.64,712.41,16,58
2,4204,PG2,968,640.42,854.91,22,88
3,4219,PG2,241,869.69,1034.55,14,45
8,2650,PG4,239,59.4,111.06,15,38


In [47]:
# multiple conditions with query
sales_filtered = sales.query("price > 100 and stock_qty < 400")

print(sales_filtered[["product_code","price","stock_qty"]].head())

     product_code    price  stock_qty
3            4219  1034.55        241
8            2650   111.06        239
165          1657   208.91        244
186          7269   427.41        369
199          3530   104.49        144


In [48]:
# be extra careful if writing another string
sales_filtered = sales.query("product_group == 'PG2'")
print(sales_filtered.head())

   product_code product_group  stock_qty    cost    price  last_week_sales  \
0          4187           PG2        498  420.76   569.91               13   
1          4195           PG2        473  545.64   712.41               16   
2          4204           PG2        968  640.42   854.91               22   
3          4219           PG2        241  869.69  1034.55               14   
4          4718           PG2       1401   12.54    26.59               50   

   last_month_sales  
0                58  
1                58  
2                88  
3                45  
4               285  


Slicing and Indexing on Strings

In [67]:
staff = pd.read_csv("staff.csv")

In [52]:
print(staff.head())
print(staff.shape)
print(staff.columns)

               name             city date_of_birth  start_date   salary  \
0          John Doe      Houston, TX    1998-11-04  2018-08-11  $65,000   
1          Jane Doe     San Jose, CA    1995-08-05  2017-08-24  $70,000   
2        Matt smith       Dallas, TX    1996-11-25  2020-04-16  $58,500   
3     Ashley Harris        Miami, FL    1995-01-08  2021-02-11  $49,500   
4  Jonathan targett  Santa Clara, CA    1998-08-14  2020-09-01  $62,000   

        department  
0       Accounting  
1    Field Quality  
2  human resources  
3       accounting  
4    field quality  
(6, 6)
Index(['name', 'city', 'date_of_birth', 'start_date', 'salary', 'department'], dtype='object')


Selecting the first character of strings

In [53]:
# A string is a sequence of characters, so each character has an associated index.
# The indexes of characters can be used to select an individual character or a slice from a string.
# For instance, we can get the first letter of the strings in the name column as below:
print(staff["name"].str[0])

0    J
1    J
2    M
3    A
4    J
5    H
Name: name, dtype: object


In [58]:
# slicing the initial 3 letters (notice here that 3 is out)
print(staff["name"].str[0:3])
# or
print(staff["name"].str[:3])

0    Joh
1    Jan
2    Mat
3    Ash
4    Jon
5    Hal
Name: name, dtype: object
0    Joh
1    Jan
2    Mat
3    Ash
4    Jon
5    Hal
Name: name, dtype: object


In [59]:
print(staff["name"].str[-2:])

0    oe
1    oe
2    th
3    is
4    tt
5    le
Name: name, dtype: object


In [61]:
# For instance, we can create a slice that involves every other character, starting from the second-to-last index.
# str[start : end : step size]
print(staff["name"].str[1::2])

0        onDe
1        aeDe
2       atsih
3      slyHri
4    oahntret
5        aeCl
Name: name, dtype: object


Splitting and Combining Strings

In [62]:
print(staff)

               name             city date_of_birth  start_date   salary  \
0          John Doe      Houston, TX    1998-11-04  2018-08-11  $65,000   
1          Jane Doe     San Jose, CA    1995-08-05  2017-08-24  $70,000   
2        Matt smith       Dallas, TX    1996-11-25  2020-04-16  $58,500   
3     Ashley Harris        Miami, FL    1995-01-08  2021-02-11  $49,500   
4  Jonathan targett  Santa Clara, CA    1998-08-14  2020-09-01  $62,000   
5         Hale Cole      Atlanta, GA    2000-10-24  2021-10-20  $54,500   

        department  
0       Accounting  
1    Field Quality  
2  human resources  
3       accounting  
4    field quality  
5      engineering  


In [63]:
# The Pandas split function is available under the str accessor. 
# It splits a string at the position of the given character and then returns a list of all parts.
print(staff["name"].str.split(" "))

0            [John, Doe]
1            [Jane, Doe]
2          [Matt, smith]
3       [Ashley, Harris]
4    [Jonathan, targett]
5           [Hale, Cole]
Name: name, dtype: object


In [84]:
# It’s not enough to merely split a string. We also need to extract the part we need.
# The expand parameter of the split function can be used to create separate columns after splitting. 
# We can then select the column we need.
from copy import deepcopy
staff_exp = deepcopy(staff)
staff_exp["last_name"] = staff_exp["name"].str.split(" ", expand=True)[1]
print(staff_exp)

               name             city date_of_birth  start_date   salary  \
0          John Doe      Houston, TX    1998-11-04  2018-08-11  $65,000   
1          Jane Doe     San Jose, CA    1995-08-05  2017-08-24  $70,000   
2        Matt smith       Dallas, TX    1996-11-25  2020-04-16  $58,500   
3     Ashley Harris        Miami, FL    1995-01-08  2021-02-11  $49,500   
4  Jonathan targett  Santa Clara, CA    1998-08-14  2020-09-01  $62,000   
5         Hale Cole      Atlanta, GA    2000-10-24  2021-10-20  $54,500   

        department        name_lower last_name  
0       Accounting          john doe       Doe  
1    Field Quality          jane doe       Doe  
2  human resources        matt smith     smith  
3       accounting     ashley harris    Harris  
4    field quality  jonathan targett   targett  
5      engineering         hale cole      Cole  


In [85]:
# Combining
print(staff["name"] + " - " + staff["department"])

0               John Doe - Accounting
1            Jane Doe - Field Quality
2        Matt smith - human resources
3          Ashley Harris - accounting
4    Jonathan targett - field quality
5             Hale Cole - engineering
dtype: object


Converting Strings to Upper and Lower Case

In [86]:
print("--b4 applying lower--")
print(staff_exp[["name"]])
staff_exp["name_lower"] = staff_exp["name"].str.lower()
staff_exp["name_upper"] = staff_exp["name"].str.upper()
staff_exp["dept_capitalize"] = staff_exp["department"].str.capitalize()
print("--after applying lower--")
print(staff_exp[["name","name_lower","name_upper","dept_capitalize"]])

--b4 applying lower--
               name
0          John Doe
1          Jane Doe
2        Matt smith
3     Ashley Harris
4  Jonathan targett
5         Hale Cole
--after applying lower--
               name        name_lower        name_upper  dept_capitalize
0          John Doe          john doe          JOHN DOE       Accounting
1          Jane Doe          jane doe          JANE DOE    Field quality
2        Matt smith        matt smith        MATT SMITH  Human resources
3     Ashley Harris     ashley harris     ASHLEY HARRIS       Accounting
4  Jonathan targett  jonathan targett  JONATHAN TARGETT    Field quality
5         Hale Cole         hale cole         HALE COLE      Engineering


In [87]:
# applying lower or upper on single string
print(staff_exp["department"][0].upper())

ACCOUNTING


When we work on tabular data (data in tables), 
it’s much more efficient to use the string methods under the str accessor. 
They allow us to perform operations on the entire column. 
Make sure to write str` before the name of the method.

In [88]:
# Replacing Characters in a String
staff = pd.read_csv("staff.csv")
print(staff["city"])
print(staff["city"].str.replace(",", "-"))
print(staff["city"])

0        Houston, TX
1       San Jose, CA
2         Dallas, TX
3          Miami, FL
4    Santa Clara, CA
5        Atlanta, GA
Name: city, dtype: object
0        Houston- TX
1       San Jose- CA
2         Dallas- TX
3          Miami- FL
4    Santa Clara- CA
5        Atlanta- GA
Name: city, dtype: object
0        Houston, TX
1       San Jose, CA
2         Dallas, TX
3          Miami, FL
4    Santa Clara, CA
5        Atlanta, GA
Name: city, dtype: object


In [90]:
# previously we replaced one column values, what if I want to replace it in entire df
# answer is direct replace method
staff = pd.read_csv("staff.csv")

# Create a state colum
staff["state"] = staff["city"].str[-2:]
print(staff["state"])
# Replace state abbreviations with actual state names
staff["state"].replace(
    {"TX": "Texas", "CA": "California", "FL": "Florida", "GA": "Georgia"},
    inplace = True
)

print(staff["state"])

0    TX
1    CA
2    TX
3    FL
4    CA
5    GA
Name: state, dtype: object
0         Texas
1    California
2         Texas
3       Florida
4    California
5       Georgia
Name: state, dtype: object


The inplace parameter is set to True to save changes in the DataFrame.

It’s important to emphasize the difference between str.replace and DataFrame.replace:

str.replace can be used to replace a part of a string. We can replace one character, multiple characters, or the entire string.
DataFrame.replace can be used to replace the entire value. We can also use this function to replace values with other data types such as integer and boolean.

In [91]:
# Combining multiple operations
# we can extract the state part from the city column and convert it to lowercase letters in a single line of code.
print(staff["city"].str.split(",", expand=True)[1].str.lower())
# Consider a case where we need to change the name of the “field quality” department to “quality.”
# In the department column of the staff, there are both lower and upper case letters.
# We first need to convert them to either lower or upper case and then do the replacement.
print(staff["department"].str.lower().replace("field quality","quality"))

0     tx
1     ca
2     tx
3     fl
4     ca
5     ga
Name: 1, dtype: object
0         accounting
1            quality
2    human resources
3         accounting
4            quality
5        engineering
Name: department, dtype: object


In [92]:
# filtering operation with the query function, extracts the year from the start_date column,
# and changes its data type to integer.
print(staff.query("name > 'John Doe'").start_date.str[:4].astype("int"))

2    2020
4    2020
Name: start_date, dtype: int32


In [98]:
staff["salary_cleaned"] = staff["salary"].str[1:].str.replace(",","")
staff["salary_cleaned"] = staff["salary_cleaned"].astype("int")
# return list(staff["salary_cleaned"])
staff["salary_cleaned"]

0    65000
1    70000
2    58500
3    49500
4    62000
5    54500
Name: salary_cleaned, dtype: int32