pandas series

In [2]:
import pandas as pd

In [3]:
# Build a Series from a plain Python list of values.
# No index is supplied, so pandas labels the values 0, 1, 2.
values = [10, 20, 30]
myseries = pd.Series(values)

print(myseries)

0    10
1    20
2    30
dtype: int64


In [4]:
# A Series can carry custom labels instead of the default integer ones.
labels = ["a", "b", "c"]
myseries = pd.Series([10, 20, 30], index=labels)

print(myseries)
# Look up a single value by its label.
print(myseries["a"])

a    10
b    20
c    30
dtype: int64
10


In [5]:
# Accessing an item in a Series of strings
names = ["Jane", "John", "Emily", "Matt"]
myseries = pd.Series(names)

# With the default integer labels, label 0 is the first item.
first_item = myseries[0]
print(first_item)

Jane


In [6]:
# Checking whether a Series contains duplicates:
# the is_unique property is True only when no value repeats.
for values in ([1, 2, 3], [1, 1, 3]):
    myseries = pd.Series(values)
    print(myseries.is_unique)

True
False


pandas dataframe

In [7]:
# Each dictionary key becomes a column; each list holds that column's values.
people = {
    "Name": ["Jane", "John", "Matt", "Ashley"],
    "Age": [24, 21, 26, 32],
}
df = pd.DataFrame(people)

print(df)

     Name  Age
0    Jane   24
1    John   21
2    Matt   26
3  Ashley   32


In [8]:
# The dictionary keys became the column names and the values became the data,
# so the DataFrame has two columns and four rows.
# The shape attribute (not a method call) is a (rows, columns) tuple.
n_rows, n_cols = df.shape
print((n_rows, n_cols))

(4, 2)


To create a DataFrame from a file or database, pandas provides reader functions such as read_csv, read_json, read_excel, read_parquet, and read_sql.

In [9]:
sales_df = pd.read_csv("sales.csv")
# The data may be large, so head() is used to look at only the first 5 rows.
# head() is a method (called with parentheses), whereas shape is an
# attribute of the DataFrame (read without parentheses).
first_rows = sales_df.head()
print(first_rows, sales_df.shape, sep="\n")

   product_code product_group  stock_qty    cost    price  last_week_sales  \
0          4187           PG2        498  420.76   569.91               13   
1          4195           PG2        473  545.64   712.41               16   
2          4204           PG2        968  640.42   854.91               22   
3          4219           PG2        241  869.69  1034.55               14   
4          4718           PG2       1401   12.54    26.59               50   

   last_month_sales  
0                58  
1                58  
2                88  
3                45  
4               285  
(1000, 7)


In [10]:
# A file may have many columns when only a few are needed:
# the usecols argument takes the list of column names to load.
wanted_columns = ["product_code", "product_group", "stock_qty"]
sales_col_df = pd.read_csv("sales.csv", usecols=wanted_columns)
print(sales_col_df.head(), sales_col_df.shape, sep="\n")

   product_code product_group  stock_qty
0          4187           PG2        498
1          4195           PG2        473
2          4204           PG2        968
3          4219           PG2        241
4          4718           PG2       1401
(1000, 3)


In [21]:
# usecols restricts which columns are loaded; nrows restricts
# how many rows are read (here only 10).
sales_rows_cols_df = pd.read_csv(
    "sales.csv",
    usecols=["product_code", "product_group", "stock_qty"],
    nrows=10,
)
print(sales_rows_cols_df.head())
print(sales_rows_cols_df.shape)

   product_code product_group  stock_qty
0          4187           PG2        498
1          4195           PG2        473
2          4204           PG2        968
3          4219           PG2        241
4          4718           PG2       1401
(10, 3)


Create a DataFrame from a Python dictionary or a 2-D array

In [11]:
# The pd.DataFrame constructor accepts a dictionary of column name -> values.
records = {
    "Names": ["Jane", "John", "Matt", "Ashley"],
    "Ages": [26, 24, 28, 25],
    "Score": [91.2, 94.1, 89.5, 92.3],
}
df = pd.DataFrame(records)

print(df)

    Names  Ages  Score
0    Jane    26   91.2
1    John    24   94.1
2    Matt    28   89.5
3  Ashley    25   92.3


In [12]:
import numpy as np

# 3x5 array of random integers from 10 (inclusive) to 20 (exclusive).
# No seed is set, so the values differ on every run.
arr = np.random.randint(10, 20, size=(3, 5))
column_names = ["A", "B", "C", "D", "E"]
df = pd.DataFrame(arr, columns=column_names)
print(arr)
print(df)


[[18 17 12 17 18]
 [18 19 19 13 11]
 [16 14 18 14 14]]
    A   B   C   D   E
0  18  17  12  17  18
1  18  19  19  13  11
2  16  14  18  14  14


##  Exploring a dataframe

#### 1. Size of a DataFrame (the `shape` and `size` attributes and the built-in `len` function)

In [13]:
sales = pd.read_csv("sales.csv")
# shape: a (rows, columns) tuple
print(f"shape: {sales.shape}")
# size: number of rows * number of columns
print(f"size: {sales.size}")
# len(): number of rows
print(f"len: {len(sales)}")

shape: (1000, 7)
size: 7000
len: 1000


#### 2. Data Types of Columns

In [14]:
# Different data types take up different amounts of memory,
# so choosing proper data types avoids wasting memory.
#
# Some methods and functions only work with certain data types. For instance,
# date and time data must be stored with a datetime data type to use them.
#
# The dtypes property shows the data type of every column.
sales = pd.read_csv("sales.csv")
column_types = sales.dtypes

print(column_types)

product_code          int64
product_group        object
stock_qty             int64
cost                float64
price               float64
last_week_sales       int64
last_month_sales      int64
dtype: object


In [15]:
 # The columns property returns the column labels as an Index object;
 # wrapping it in list() converts it to a plain Python list.

sales = pd.read_csv("sales.csv")
column_index = sales.columns

print("As index:", column_index, sep="\n")
print("As list:", list(column_index), sep="\n")

As index:
Index(['product_code', 'product_group', 'stock_qty', 'cost', 'price',
       'last_week_sales', 'last_month_sales'],
      dtype='object')
As list:
['product_code', 'product_group', 'stock_qty', 'cost', 'price', 'last_week_sales', 'last_month_sales']


In [16]:
# stock_qty is loaded as an integer column, but some products could be
# stocked in fractional amounts (for instance, 125.2 kg of rice).
#
# The astype method converts a column to another data type.
sales = pd.read_csv("sales.csv")

stock_as_float = sales["stock_qty"].astype("float")
sales["stock_qty"] = stock_as_float

print(sales.dtypes)

product_code          int64
product_group        object
stock_qty           float64
cost                float64
price               float64
last_week_sales       int64
last_month_sales      int64
dtype: object


In [17]:
# astype also accepts a dictionary, so several columns can be converted at once:
# the keys are the column names and the values are the new data types.

sales = pd.read_csv("sales.csv")

float_columns = ["stock_qty", "last_week_sales"]
new_types = dict.fromkeys(float_columns, "float")
sales = sales.astype(new_types)

print(sales.dtypes)

product_code          int64
product_group        object
stock_qty           float64
cost                float64
price               float64
last_week_sales     float64
last_month_sales      int64
dtype: object


In [18]:
# unique and nunique:
# nunique() returns the number of distinct values in a column,
# unique() returns the distinct values themselves.
sales = pd.read_csv("sales.csv")
product_groups = sales["product_group"]
print(product_groups)
print(product_groups.nunique())
print(product_groups.unique())

0      PG2
1      PG2
2      PG2
3      PG2
4      PG2
      ... 
995    PG4
996    PG4
997    PG2
998    PG2
999    PG5
Name: product_group, Length: 1000, dtype: object
6
['PG2' 'PG4' 'PG6' 'PG5' 'PG3' 'PG1']


In [20]:
# unique() lists the distinct values and nunique() counts them.
# value_counts() shows how many times each distinct value occurs.
group_counts = sales["product_group"].value_counts()
print(group_counts)

PG4    349
PG5    255
PG6    243
PG2     75
PG3     39
PG1     39
Name: product_group, dtype: int64


## Mean, Mode, Median (Central Tendency)

In [22]:
# Median of a Series: with an even number of values it is the
# average of the two middle ones (5 and 7 here).
values = [1, 2, 5, 7, 11, 36]
myseries = pd.Series(values)
print(myseries.median())

6.0


In [30]:
# mode() returns a Series, because several values can tie for most frequent.
# Here 6 and 11 both occur three times, so the result holds both of them.
myseries = pd.Series([1, 4, 6, 6, 6, 11, 11, 11, 24])
modes = myseries.mode()
# Label 1 is the second mode (11); label 0 would be 6.
print(f"The mode of my series is {modes[1]}")

The mode of my series is 11


In [31]:
# mean, median, mode, minimum and maximum of the price column
price = sales["price"]
summary = {
    "mean": price.mean(),
    "median": price.median(),
    # mode() returns a Series; take its first entry
    "mode": price.mode()[0],
    "minimum": price.min(),
    "maximum": price.max(),
}

for label, value in summary.items():
    print(f"{label}: ")
    print(value)

mean: 
67.06351000000001
median: 
23.74
mode: 
10.44
minimum: 
0.66
maximum: 
1500.05


In [None]:
# variance and standard deviation of the price column
price = sales["price"]
spread = (("variance", price.var()), ("standard deviation", price.std()))

for label, value in spread:
    print(f"{label}: ")
    print(value)

In [44]:
# value_counts() returns a Series indexed by the distinct product codes.
# Slice the first three index entries and convert them to a list.
sales["product_code"].value_counts().index[:3].to_list()

[5694, 2591, 2645]

In [45]:
# Convert the whole value_counts() index to a list first, then slice the list.
code_counts = sales["product_code"].value_counts()
x = code_counts.index.tolist()
x[:3]

[5694, 2591, 2645]

## Filtering the dataframe

In [46]:
# loc and iloc both select rows and columns; they differ in how they address them:

# loc uses row and column labels.
# iloc uses row and column integer positions.

sales = pd.read_csv("sales.csv")
sales.head()

Unnamed: 0,product_code,product_group,stock_qty,cost,price,last_week_sales,last_month_sales
0,4187,PG2,498,420.76,569.91,13,58
1,4195,PG2,473,545.64,712.41,16,58
2,4204,PG2,968,640.42,854.91,22,88
3,4219,PG2,241,869.69,1034.55,14,45
4,4718,PG2,1401,12.54,26.59,50,285


In [57]:
# A list of column names inside the brackets selects those columns, in the order listed.
sales[["product_group","product_code","stock_qty"]]

Unnamed: 0,product_group,product_code,stock_qty
0,PG2,4187,498
1,PG2,4195,473
2,PG2,4204,968
3,PG2,4219,241
4,PG2,4718,1401
...,...,...,...
995,PG4,8048,415
996,PG4,8050,-10
997,PG2,952,5388
998,PG2,1307,44996


In [59]:
# A plain slice on the DataFrame selects rows: here rows 0-4 (the stop value 5 is excluded).
sales[:5]

Unnamed: 0,product_code,product_group,stock_qty,cost,price,last_week_sales,last_month_sales
0,4187,PG2,498,420.76,569.91,13,58
1,4195,PG2,473,545.64,712.41,16,58
2,4204,PG2,968,640.42,854.91,22,88
3,4219,PG2,241,869.69,1034.55,14,45
4,4718,PG2,1401,12.54,26.59,50,285


In [54]:
# loc slices by label and includes the end label, so :4 returns rows labelled 0 through 4.
sales.loc[:4]

Unnamed: 0,product_code,product_group,stock_qty,cost,price,last_week_sales,last_month_sales
0,4187,PG2,498,420.76,569.91,13,58
1,4195,PG2,473,545.64,712.41,16,58
2,4204,PG2,968,640.42,854.91,22,88
3,4219,PG2,241,869.69,1034.55,14,45
4,4718,PG2,1401,12.54,26.59,50,285


In [49]:
# loc selects by label: row labels up to and including 4, and columns by name.
sales.loc[:4,["product_group","product_code","stock_qty"]]

Unnamed: 0,product_group,product_code,stock_qty
0,PG2,4187,498
1,PG2,4195,473
2,PG2,4204,968
3,PG2,4219,241
4,PG2,4718,1401


In [51]:
# The same column selection with iloc, which uses integer positions only.
# Columns 1, 0, 2 are product_group, product_code, stock_qty.
# Unlike loc, the iloc slice excludes its stop value, so :4 returns
# 4 rows (positions 0-3), not the 5 rows that loc[:4] returns.
sales.iloc[:4,[1,0,2]]

Unnamed: 0,product_group,product_code,stock_qty
0,PG2,4187,498
1,PG2,4195,473
2,PG2,4204,968
3,PG2,4219,241


In [52]:
# Rows number 6 to 9 (integer positions 5-8) and the first two columns,
# chosen with explicit lists of positions.
row_positions = [5, 6, 7, 8]
column_positions = [0, 1]
print(sales.iloc[row_positions, column_positions])

   product_code product_group
5          5630           PG4
6          5631           PG4
7          5634           PG4
8          2650           PG4


In [53]:
# Same selection using slices: positions 5-8 (stop 9 excluded) and the first two columns.
print(sales.iloc[5:9, :2])

   product_code product_group
5          5630           PG4
6          5631           PG4
7          5634           PG4
8          2650           PG4


Pandas assigns integer labels to rows by default. Unless we specify otherwise, row indexes and labels will be the same.

In [60]:
row_labels = ["a", "b", "c", "d"]
column_labels = ["col_a", "col_b", "col_c", "col_d"]
# 4x4 grid of random integers from 0 to 9; no seed is set, so values vary per run.
random_values = np.random.randint(10, size=(4, 4))
df = pd.DataFrame(random_values, index=row_labels, columns=column_labels)

print(df)

# loc picks rows and columns by their labels.
print("\nSelect two rows and two columns using loc:")
print(df.loc[["b", "d"], ["col_a", "col_c"]])

   col_a  col_b  col_c  col_d
a      3      9      2      3
b      0      5      2      4
c      2      0      3      8
d      7      1      2      0

Select two rows and two columns using loc:
   col_a  col_c
b      0      2
d      7      2


In [62]:
# Select by integer position with iloc: rows at positions 1 and 3,
# columns at positions 0 and 2.
# The printed label names iloc, the accessor actually used below.
print("\nSelect two rows and two columns using iloc:")
print(df.iloc[[1, 3], [0, 2]])


Select two rows and two columns using loc:
   col_a  col_c
b      0      2
d      7      2


# Selecting a subset of columns

In [64]:
sales = pd.read_csv("sales.csv")

# A list of column names selects those columns as a new DataFrame.
selected_columns = ["product_code", "price"]
subset_from_variable = sales[selected_columns]
print(subset_from_variable.head())

# or: write the list inline (note the double brackets)
subset_inline = sales[["product_code", "price"]]
print(subset_inline.head())

   product_code    price
0          4187   569.91
1          4195   712.41
2          4204   854.91
3          4219  1034.55
4          4718    26.59
   product_code    price
0          4187   569.91
1          4195   712.41
2          4204   854.91
3          4219  1034.55
4          4718    26.59


In [65]:
# pandas expects a *list* of column names inside the brackets.
# Without the inner list, the two names form a tuple that is looked up
# as a single column label, which raises a KeyError.
# The error is caught and printed so the notebook still runs top to bottom.
try:
    print(sales["product_code", "price"].head())
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: ('product_code', 'price')

In [68]:
# Selecting one column with a bare name returns a Series;
# putting the name in a list returns a DataFrame with one column.
code_series = sales["product_code"]
code_frame = sales[["product_code"]]
print(code_series[:2], type(code_series))
print(code_frame[:2], type(code_frame))

0    4187
1    4195
Name: product_code, dtype: int64 <class 'pandas.core.series.Series'>
   product_code
0          4187
1          4195 <class 'pandas.core.frame.DataFrame'>


## Filtering by Condition

In [70]:
# Select the products that belong to product group PG2.
# The column is reached with attribute (dot) access here.
is_pg2 = sales.product_group == "PG2"
sales_filtered = sales[is_pg2]
sales_filtered.head()

Unnamed: 0,product_code,product_group,stock_qty,cost,price,last_week_sales,last_month_sales
0,4187,PG2,498,420.76,569.91,13,58
1,4195,PG2,473,545.64,712.41,16,58
2,4204,PG2,968,640.42,854.91,22,88
3,4219,PG2,241,869.69,1034.55,14,45
4,4718,PG2,1401,12.54,26.59,50,285


In [71]:
# Same PG2 filter, reaching the column with bracket access instead of dot access.
# NOTE(review): the variable name says PG1 but the filter selects PG2 - confirm and rename.
is_pg2 = sales["product_group"] == "PG2"
sales_PG1_filtered = sales[is_pg2]
sales_PG1_filtered.head()

Unnamed: 0,product_code,product_group,stock_qty,cost,price,last_week_sales,last_month_sales
0,4187,PG2,498,420.76,569.91,13,58
1,4195,PG2,473,545.64,712.41,16,58
2,4204,PG2,968,640.42,854.91,22,88
3,4219,PG2,241,869.69,1034.55,14,45
4,4718,PG2,1401,12.54,26.59,50,285


In [72]:
# Both options above work - attribute access (sales.product_group) and
# bracket access (sales["product_group"]) - unless there's a space in the column name.
# In that case attribute access won't work, so use the bracket form.

In [74]:
# The operators we can use to create conditions are:
#   ==  equal                      !=  not equal
#   >   greater than               >=  greater than or equal to
#   <   less than                  <=  less than or equal to
is_expensive = sales["price"] > 100
sales_numeric_filtered = sales[is_expensive]
sales_numeric_filtered.head()

Unnamed: 0,product_code,product_group,stock_qty,cost,price,last_week_sales,last_month_sales
0,4187,PG2,498,420.76,569.91,13,58
1,4195,PG2,473,545.64,712.41,16,58
2,4204,PG2,968,640.42,854.91,22,88
3,4219,PG2,241,869.69,1034.55,14,45
8,2650,PG4,239,59.4,111.06,15,38


In [75]:
# Multiple conditions combined with logical AND (&):
# keep rows where price is above 100 and stock quantity is below 400.
is_expensive = sales["price"] > 100
is_low_stock = sales["stock_qty"] < 400
sales_filtered = sales[is_expensive & is_low_stock]

print(sales_filtered[["price", "stock_qty"]].head())

       price  stock_qty
3    1034.55        241
8     111.06        239
165   208.91        244
186   427.41        369
199   104.49        144


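When combining multiple conditions, make sure to put each condition inside parentheses. 
Otherwise `&` and `|` are evaluated before the comparisons (they bind more tightly in Python), and pandas raises an error (typically a ValueError).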

In [76]:
# The | operator combines conditions with OR logic:
# keep rows whose product group is PG1 or PG2.
is_pg1 = sales["product_group"] == "PG1"
is_pg2 = sales["product_group"] == "PG2"
sales_filtered = sales[is_pg1 | is_pg2]
sales_filtered.head()

Unnamed: 0,product_code,product_group,stock_qty,cost,price,last_week_sales,last_month_sales
0,4187,PG2,498,420.76,569.91,13,58
1,4195,PG2,473,545.64,712.41,16,58
2,4204,PG2,968,640.42,854.91,22,88
3,4219,PG2,241,869.69,1034.55,14,45
4,4718,PG2,1401,12.54,26.59,50,285


In [79]:
# The isin method
# A more practical option than chaining | conditions: isin accepts a list
# of values and keeps the rows whose value is in that list.
wanted_groups = ["PG1", "PG2", "PG3"]
in_wanted_groups = sales["product_group"].isin(wanted_groups)
sales_filtered = sales[in_wanted_groups]
print(sales_filtered[["product_code", "product_group"]].head())

   product_code product_group
0          4187           PG2
1          4195           PG2
2          4204           PG2
3          4219           PG2
4          4718           PG2


In [82]:
# Finally, the not operator (~) negates a boolean condition; it is written
# in front of the condition inside the square brackets.
# Here it selects the products that aren't in product groups PG1, PG2, or PG3.

excluded_groups = ["PG1", "PG2", "PG3"]
in_excluded_groups = sales["product_group"].isin(excluded_groups)
sales_filtered = sales[~in_excluded_groups]
print(sales_filtered[["product_group"]].head())
print(sales_filtered["product_group"].value_counts())

  product_group
5           PG4
6           PG4
7           PG4
8           PG4
9           PG4
PG4    349
PG5    255
PG6    243
Name: product_group, dtype: int64


# The query function

In [85]:
# query differs from the previous approaches because the condition
# is written as text (a string), which is more practical in many cases.

price_condition = "price > 100"
sales_filtered = sales.query(price_condition)
sales_filtered.head()

Unnamed: 0,product_code,product_group,stock_qty,cost,price,last_week_sales,last_month_sales
0,4187,PG2,498,420.76,569.91,13,58
1,4195,PG2,473,545.64,712.41,16,58
2,4204,PG2,968,640.42,854.91,22,88
3,4219,PG2,241,869.69,1034.55,14,45
8,2650,PG4,239,59.4,111.06,15,38


In [86]:
# Multiple conditions with query: they are joined with "and" inside the string.
combined_condition = "price > 100 and stock_qty < 400"
sales_filtered = sales.query(combined_condition)

shown_columns = ["product_code", "price", "stock_qty"]
print(sales_filtered[shown_columns].head())

     product_code    price  stock_qty
3            4219  1034.55        241
8            2650   111.06        239
165          1657   208.91        244
186          7269   427.41        369
199          3530   104.49        144


In [87]:
# Be careful when the condition contains a string value: it needs its own
# quotes, different from the ones that delimit the query text itself.
group_condition = "product_group == 'PG2'"
sales_filtered = sales.query(group_condition)
print(sales_filtered.head())

   product_code product_group  stock_qty    cost    price  last_week_sales  \
0          4187           PG2        498  420.76   569.91               13   
1          4195           PG2        473  545.64   712.41               16   
2          4204           PG2        968  640.42   854.91               22   
3          4219           PG2        241  869.69  1034.55               14   
4          4718           PG2       1401   12.54    26.59               50   

   last_month_sales  
0                58  
1                58  
2                88  
3                45  
4               285  


201