In [2]:
import pandas as pd

In [15]:
import os

# Directory and file name are kept as separate pieces so either one can be
# changed without touching the other.
file_dir = 'datasets'
file_name = 'data.csv'
full_path = os.path.join(file_dir, file_name)

# Read the CSV into a DataFrame and preview the first five rows.
df = pd.read_csv(full_path)
df.head()

Unnamed: 0,Date,Category,Value,Product,Sales,Region
0,2023-01-01,A,28.0,Product1,754.0,East
1,2023-01-02,B,39.0,Product3,110.0,North
2,2023-01-03,C,32.0,Product2,398.0,East
3,2023-01-04,B,8.0,Product1,522.0,East
4,2023-01-05,B,26.0,Product3,869.0,North


In [5]:
# Show the dtype pandas inferred for each column.
# Date comes back as plain `object` here because read_csv was called without
# any date parsing.
df.dtypes

Date         object
Category     object
Value       float64
Product      object
Sales       float64
Region       object
dtype: object

In [9]:
# Missing values have to be handled before analysis, so first check which
# columns contain at least one of them (True = column has a missing value).
df.isna().any()

Date        False
Category    False
Value        True
Product     False
Sales        True
Region      False
dtype: bool

In [7]:
# Count the missing values in each column.
df.isna().sum()

Date        0
Category    0
Value       3
Product     0
Sales       4
Region      0
dtype: int64

In [10]:
# Simplest strategy: replace every missing value with 0 in a copy, leaving
# `df` itself untouched.
df_filled = df.fillna(value=0)

# Confirm that no column has a missing value any more.
df_filled.isna().any()

Date        False
Category    False
Value       False
Product     False
Sales       False
Region      False
dtype: bool

In [16]:
# Alternative strategy: fill the gaps in Sales with the column mean.
# The result is stored in a new column on `df`, so the raw Sales values
# (including the NaNs) stay available next to it.
df['Sales_fillNA'] = (
    df['Sales']
    .fillna(df['Sales'].mean())
)
df.head()

Unnamed: 0,Date,Category,Value,Product,Sales,Region,Sales_fillNA
0,2023-01-01,A,28.0,Product1,754.0,East,754.0
1,2023-01-02,B,39.0,Product3,110.0,North,110.0
2,2023-01-03,C,32.0,Product2,398.0,East,398.0
3,2023-01-04,B,8.0,Product1,522.0,East,522.0
4,2023-01-05,B,26.0,Product3,869.0,North,869.0


In [17]:
# Swap the column names so the mean-filled series becomes the canonical
# 'Sales' and the raw series (still containing NaN) is kept as 'Sales_NA'.
# Both labels are renamed in a single call, so no temporary name is needed.
#
# Guard: once 'Sales_NA' exists the swap has already been applied. Renaming
# again would map the filled 'Sales' onto 'Sales_NA' as well and leave two
# columns with the same label, so the cell is a no-op on re-execution.
if 'Sales_NA' not in df.columns:
    df = df.rename(columns={'Sales_fillNA': 'Sales', 'Sales': 'Sales_NA'})
df.head(5)

Unnamed: 0,Date,Category,Value,Product,Sales_NA,Region,Sales
0,2023-01-01,A,28.0,Product1,754.0,East,754.0
1,2023-01-02,B,39.0,Product3,110.0,North,110.0
2,2023-01-03,C,32.0,Product2,398.0,East,398.0
3,2023-01-04,B,8.0,Product1,522.0,East,522.0
4,2023-01-05,B,26.0,Product3,869.0,North,869.0


In [18]:
# Changing a column's dtype: fill the gaps in Value with the column mean,
# then cast the result to int. The cast drops any fractional part, which
# matters for the rows that received the mean.
df['Value_new'] = (
    df['Value']
    .fillna(df['Value'].mean())
    .astype(int)
)
df.head()

Unnamed: 0,Date,Category,Value,Product,Sales_NA,Region,Sales,Value_new
0,2023-01-01,A,28.0,Product1,754.0,East,754.0,28
1,2023-01-02,B,39.0,Product3,110.0,North,110.0,39
2,2023-01-03,C,32.0,Product2,398.0,East,398.0,32
3,2023-01-04,B,8.0,Product1,522.0,East,522.0,8
4,2023-01-05,B,26.0,Product3,869.0,North,869.0,26


In [21]:
# New columns can be computed from existing ones. Arithmetic operators work
# on a whole column at once, so no per-row apply/lambda is needed.
df['value_squared'] = df['Value_new'] ** 2
df.head(5)
# This cell can be re-executed: if only the formula changes, the existing
# column is simply overwritten with the new values.

Unnamed: 0,Date,Category,Value,Product,Sales_NA,Region,Sales,Value_new,value_squared
0,2023-01-01,A,28.0,Product1,754.0,East,754.0,28,784
1,2023-01-02,B,39.0,Product3,110.0,North,110.0,39,1521
2,2023-01-03,C,32.0,Product2,398.0,East,398.0,32,1024
3,2023-01-04,B,8.0,Product1,522.0,East,522.0,8,64
4,2023-01-05,B,26.0,Product3,869.0,North,869.0,26,676


In [22]:
# DataFrames can be grouped for better insight: total Value_new per Category.
# Note: the variable is called grouped_mean, but the aggregation is a sum.
grouped_mean = (
    df.groupby('Category')['Value_new']
    .sum()
)
grouped_mean

Category
A     677
B     669
C    1239
Name: Value_new, dtype: int64

In [25]:
# Grouping also works with several key columns at once: here the mean of
# Value_new is computed for every (Category, Region) pair, giving a Series
# with a two-level index.
grouped_reg_cat = (
    df.groupby(by=['Category', 'Region'])['Value_new']
    .mean()
)
grouped_reg_cat

Category  Region
A         East      47.857143
          South     70.000000
          West      45.333333
B         East      11.500000
          North     37.000000
          South     34.500000
          West      65.333333
C         East      48.000000
          North     42.666667
          South     71.400000
          West      72.333333
Name: Value_new, dtype: float64

In [26]:
# Several aggregations in one pass: agg() with a list of function names
# returns a DataFrame with one column per aggregation.
# Note: this rebinds grouped_mean, which held a per-Category sum before.
grouped_mean = df.groupby('Category')['Value_new'].agg(
    ['sum', 'count', 'mean']
)
grouped_mean

Unnamed: 0_level_0,sum,count,mean
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,677,14,48.357143
B,669,15,44.6
C,1239,21,59.0


In [27]:
# Another important pandas feature is joining/merging DataFrames.
# Two small frames that share the 'Key' column: keys A and B appear in both,
# C only in df1 and D only in df2.
df1 = pd.DataFrame(
    {
        'Key': ['A', 'B', 'C'],
        'Value1': [1, 2, 3],
    }
)
df2 = pd.DataFrame(
    {
        'Key': ['A', 'B', 'D'],
        'Value2': [4, 5, 6],
    }
)

In [30]:
# Inner join on 'Key': only keys present in both frames are kept.
# The other classical joins are available through `how` as well:
# 'outer', 'left' and 'right'.
df1.merge(df2, on='Key', how='inner')

Unnamed: 0,Key,Value1,Value2
0,A,1,4
1,B,2,5
