#### Data Manipulation and Analysis with Pandas

Data manipulation and analysis are key tasks in any data science or data analysis project.

Pandas provides a wide range of functions for data manipulation and analysis, making it easier to clean, transform, and extract insights from data.

In [34]:
import pandas as pd

In [35]:
## Load the energy production dataset from the CSV file into a DataFrame
df = pd.read_csv('Energy_Production_Dataset.csv')

## Preview the first 5 rows of the data
df.head(5)

Unnamed: 0,Date,Start_Hour,End_Hour,Source,Day_of_Year,Day_Name,Month_Name,Season,Production
0,11/30/2025,21.0,22,Wind,334,Sunday,November,Fall,
1,11/30/2025,18.0,19,Wind,334,Sunday,November,Fall,3824.0
2,11/30/2025,16.0,17,Wind,334,Sunday,November,Fall,3824.0
3,11/30/2025,23.0,0,,334,Sunday,November,Fall,6120.0
4,11/30/2025,6.0,7,Wind,334,,November,Fall,4387.0


In [36]:
## Describe the data: summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Start_Hour,End_Hour,Day_of_Year,Production
count,51863.0,51864.0,51864.0,51862.0
mean,11.500106,11.5,180.798415,6214.907022
std,6.922278,6.922253,104.291387,3978.226203
min,0.0,0.0,1.0,58.0
25%,5.5,5.75,91.0,3111.0
50%,12.0,11.5,181.0,5372.0
75%,17.5,17.25,271.0,8500.75
max,23.0,23.0,366.0,23446.0


In [37]:
## To get the data type of each column

df.dtypes

Date            object
Start_Hour     float64
End_Hour         int64
Source          object
Day_of_Year      int64
Day_Name        object
Month_Name      object
Season          object
Production     float64
dtype: object

In [45]:
## Handling missing values
## isnull() returns a True/False flag for every cell of the DataFrame

df.isnull()

## True = value is missing

## False = value is present

Unnamed: 0,Date,Start_Hour,End_Hour,Source,Day_of_Year,Day_Name,Month_Name,Season,Production
0,False,False,False,False,False,False,False,False,True
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,True,False,False,False,False,False
4,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...
51859,False,False,False,False,False,False,False,False,False
51860,False,False,False,False,False,False,False,False,False
51861,False,False,False,False,False,False,False,False,False
51862,False,False,False,False,False,False,False,False,False


In [47]:

df.isnull().any()

## By default axis=0, so any() reduces along the row index and returns one value per column
## i.e., the result for a column is True if at least one of its rows is missing, otherwise False

Date           False
Start_Hour      True
End_Hour       False
Source          True
Day_of_Year    False
Day_Name        True
Month_Name      True
Season         False
Production      True
dtype: bool

In [48]:
## With axis=1, any() reduces along the columns and returns one value per row
## i.e., the result for a row is True if at least one of its column values is missing, otherwise False

df.isnull().any(axis=1)

0         True
1        False
2        False
3         True
4         True
         ...  
51859    False
51860    False
51861    False
51862    False
51863    False
Length: 51864, dtype: bool

In [46]:
## To count the missing values in each column

df.isnull().sum()

Date           0
Start_Hour     1
End_Hour       0
Source         1
Day_of_Year    0
Day_Name       1
Month_Name     1
Season         0
Production     2
dtype: int64

In [59]:
## Look at the first 5 rows again; rows 0, 3 and 4 each contain a missing value
df.head(5)

Unnamed: 0,Date,Start_Hour,End_Hour,Source,Day_of_Year,Day_Name,Month_Name,Season,Production
0,11/30/2025,21.0,22,Wind,334,Sunday,November,Fall,
1,11/30/2025,18.0,19,Wind,334,Sunday,November,Fall,3824.0
2,11/30/2025,16.0,17,Wind,334,Sunday,November,Fall,3824.0
3,11/30/2025,23.0,0,,334,Sunday,November,Fall,6120.0
4,11/30/2025,6.0,7,Wind,334,,November,Fall,4387.0


In [60]:
## To fill all the missing values (NaN) with 0
## In practice, numeric columns are more commonly filled with a statistic such as the column mean
## NOTE: fillna(0) also writes 0 into the text columns (see Source and Day_Name in this cell's output)
## fillna() returns a new DataFrame; the result is only displayed here and df itself is not modified

df.fillna(0)

Unnamed: 0,Date,Start_Hour,End_Hour,Source,Day_of_Year,Day_Name,Month_Name,Season,Production
0,11/30/2025,21.0,22,Wind,334,Sunday,November,Fall,0.0
1,11/30/2025,18.0,19,Wind,334,Sunday,November,Fall,3824.0
2,11/30/2025,16.0,17,Wind,334,Sunday,November,Fall,3824.0
3,11/30/2025,23.0,0,0,334,Sunday,November,Fall,6120.0
4,11/30/2025,6.0,7,Wind,334,0,November,Fall,4387.0
...,...,...,...,...,...,...,...,...,...
51859,1/1/2020,4.0,5,Wind,1,Wednesday,January,Winter,2708.0
51860,1/1/2020,18.0,19,Wind,1,Wednesday,January,Winter,1077.0
51861,1/1/2020,7.0,8,Wind,1,Wednesday,January,Winter,2077.0
51862,1/1/2020,14.0,15,Solar,1,Wednesday,January,Winter,1783.0


In [62]:
## Keep the filled result in a separate variable
## fillna() returns a new DataFrame, so df itself still contains the missing values

new_df = df.fillna(0)
new_df

Unnamed: 0,Date,Start_Hour,End_Hour,Source,Day_of_Year,Day_Name,Month_Name,Season,Production
0,11/30/2025,21.0,22,Wind,334,Sunday,November,Fall,0.0
1,11/30/2025,18.0,19,Wind,334,Sunday,November,Fall,3824.0
2,11/30/2025,16.0,17,Wind,334,Sunday,November,Fall,3824.0
3,11/30/2025,23.0,0,0,334,Sunday,November,Fall,6120.0
4,11/30/2025,6.0,7,Wind,334,0,November,Fall,4387.0
...,...,...,...,...,...,...,...,...,...
51859,1/1/2020,4.0,5,Wind,1,Wednesday,January,Winter,2708.0
51860,1/1/2020,18.0,19,Wind,1,Wednesday,January,Winter,1077.0
51861,1/1/2020,7.0,8,Wind,1,Wednesday,January,Winter,2077.0
51862,1/1/2020,14.0,15,Solar,1,Wednesday,January,Winter,1783.0


In [64]:
## The statistics of df are the same as before the fillna(0) calls, which shows that df was not modified
df.describe()

Unnamed: 0,Start_Hour,End_Hour,Day_of_Year,Production
count,51863.0,51864.0,51864.0,51862.0
mean,11.500106,11.5,180.798415,6214.907022
std,6.922278,6.922253,104.291387,3978.226203
min,0.0,0.0,1.0,58.0
25%,5.5,5.75,91.0,3111.0
50%,12.0,11.5,181.0,5372.0
75%,17.5,17.25,271.0,8500.75
max,23.0,23.0,366.0,23446.0


In [66]:
## To fill the missing values with the mean of the respective column

df['New_Production'] = df['Production'].fillna(df['Production'].mean())
df.head(5)
## Here we can see that the missing Production value in row 0 has been replaced with the mean of Production and stored in the New_Production column

Unnamed: 0,Date,Start_Hour,End_Hour,Source,Day_of_Year,Day_Name,Month_Name,Season,Production,New_Production
0,11/30/2025,21.0,22,Wind,334,Sunday,November,Fall,,6214.907022
1,11/30/2025,18.0,19,Wind,334,Sunday,November,Fall,3824.0,3824.0
2,11/30/2025,16.0,17,Wind,334,Sunday,November,Fall,3824.0,3824.0
3,11/30/2025,23.0,0,,334,Sunday,November,Fall,6120.0,6120.0
4,11/30/2025,6.0,7,Wind,334,,November,Fall,4387.0,4387.0


In [68]:
df.isnull().any()

## Here, New_Production is False because its missing values were replaced with the mean of the Production column

Date              False
Start_Hour         True
End_Hour          False
Source             True
Day_of_Year       False
Day_Name           True
Month_Name         True
Season            False
Production         True
New_Production    False
dtype: bool

#### To Rename the columns and perform Data Type conversions

In [None]:
## Check the current data types before renaming and converting columns
df.dtypes

Date               object
Start_Hour        float64
End_Hour            int64
Source             object
Day_of_Year         int64
Day_Name           object
Month_Name         object
Season             object
Production        float64
New_Production    float64
dtype: object

In [71]:
## Renaming a column: New_Production becomes Updated_Production
## rename() returns a new DataFrame, so the result is assigned back to df

df = df.rename(columns={'New_Production':'Updated_Production'})
df.head(5)

Unnamed: 0,Date,Start_Hour,End_Hour,Source,Day_of_Year,Day_Name,Month_Name,Season,Production,Updated_Production
0,11/30/2025,21.0,22,Wind,334,Sunday,November,Fall,,6214.907022
1,11/30/2025,18.0,19,Wind,334,Sunday,November,Fall,3824.0,3824.0
2,11/30/2025,16.0,17,Wind,334,Sunday,November,Fall,3824.0,3824.0
3,11/30/2025,23.0,0,,334,Sunday,November,Fall,6120.0,6120.0
4,11/30/2025,6.0,7,Wind,334,,November,Fall,4387.0,4387.0


In [73]:
## To change the data type of the Start_Hour column
## Start_Hour contains a missing value (NaN), and NaN cannot be stored in a plain int column,
## so this conversion fails. The error is caught and printed so that the notebook can still be
## run from top to bottom (Restart Kernel -> Run All) instead of stopping at this cell.
## df.head(5) is omitted because the failed cast adds no column to show.

try:
    df['New_Start_Hour'] = df['Start_Hour'].astype(int)
except ValueError as err:
    ## pandas raises IntCastingNaNError here, which is a subclass of ValueError
    print(f"{type(err).__name__}: {err}")

## The error is caused by the missing value in the Start_Hour column

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [76]:
df.isnull().any()

## True for Start_Hour indicates that the column has missing values
## NOTE: New_Start_Hour appears in the saved output because this cell was executed (In [76]) after the fill-and-convert cell (In [74])

Date                  False
Start_Hour             True
End_Hour              False
Source                 True
Day_of_Year           False
Day_Name               True
Month_Name             True
Season                False
Production             True
Updated_Production    False
New_Start_Hour        False
dtype: bool

In [74]:
## To change the data type after replacing the missing value with the mean of that column
## NOTE: astype(int) drops the fractional part, so the mean fill value (about 11.5 per the describe() output) is stored as 11

df['New_Start_Hour'] = df['Start_Hour'].fillna(df['Start_Hour'].mean()).astype(int)
df.head(5)

Unnamed: 0,Date,Start_Hour,End_Hour,Source,Day_of_Year,Day_Name,Month_Name,Season,Production,Updated_Production,New_Start_Hour
0,11/30/2025,21.0,22,Wind,334,Sunday,November,Fall,,6214.907022,21
1,11/30/2025,18.0,19,Wind,334,Sunday,November,Fall,3824.0,3824.0,18
2,11/30/2025,16.0,17,Wind,334,Sunday,November,Fall,3824.0,3824.0,16
3,11/30/2025,23.0,0,,334,Sunday,November,Fall,6120.0,6120.0,23
4,11/30/2025,6.0,7,Wind,334,,November,Fall,4387.0,4387.0,6


In [80]:
df.isnull().any()

## Here, Start_Hour = True indicates missing values
## New_Start_Hour = False indicates no missing values because we replaced them with the mean value

Date                  False
Start_Hour             True
End_Hour              False
Source                 True
Day_of_Year           False
Day_Name               True
Month_Name             True
Season                False
Production             True
Updated_Production    False
New_Start_Hour        False
dtype: bool

In [82]:
df.dtypes

## Here, Start_Hour is of float64 type
## whereas New_Start_Hour is of int64 type

Date                   object
Start_Hour            float64
End_Hour                int64
Source                 object
Day_of_Year             int64
Day_Name               object
Month_Name             object
Season                 object
Production            float64
Updated_Production    float64
New_Start_Hour          int64
dtype: object

In [None]:
## To apply a function to every value of a particular column
## For example, to double the production (2X)

df['2X_Production'] = df['Updated_Production'].apply(lambda x:x*2)
df.head(5)

## Here, we used a lambda function, i.e., an anonymous function defined inline.
## Likewise, any other function can be passed to apply() as required

Unnamed: 0,Date,Start_Hour,End_Hour,Source,Day_of_Year,Day_Name,Month_Name,Season,Production,Updated_Production,New_Start_Hour,2X_Production
0,11/30/2025,21.0,22,Wind,334,Sunday,November,Fall,,6214.907022,21,12429.814045
1,11/30/2025,18.0,19,Wind,334,Sunday,November,Fall,3824.0,3824.0,18,7648.0
2,11/30/2025,16.0,17,Wind,334,Sunday,November,Fall,3824.0,3824.0,16,7648.0
3,11/30/2025,23.0,0,,334,Sunday,November,Fall,6120.0,6120.0,23,12240.0
4,11/30/2025,6.0,7,Wind,334,,November,Fall,4387.0,4387.0,6,8774.0


In [85]:
## Named helper that triples a single production value

def Production3(val):
    """Return ``val`` multiplied by 3."""
    tripled = val * 3
    return tripled

In [94]:
## To increase production to 3X by passing the named function Production3 to apply()
## NOTE: this uses the raw Production column (not Updated_Production), so a row with a missing Production value stays NaN in 3X_Production (see row 0 in the output)

df['3X_Production'] = df['Production'].apply(Production3)
df.head(5)

Unnamed: 0,Date,Start_Hour,End_Hour,Source,Day_of_Year,Day_Name,Month_Name,Season,Production,Updated_Production,New_Start_Hour,2X_Production,3X_Production
0,11/30/2025,21.0,22,Wind,334,Sunday,November,Fall,,6214.907022,21,12429.814045,
1,11/30/2025,18.0,19,Wind,334,Sunday,November,Fall,3824.0,3824.0,18,7648.0,11472.0
2,11/30/2025,16.0,17,Wind,334,Sunday,November,Fall,3824.0,3824.0,16,7648.0,11472.0
3,11/30/2025,23.0,0,,334,Sunday,November,Fall,6120.0,6120.0,23,12240.0,18360.0
4,11/30/2025,6.0,7,Wind,334,,November,Fall,4387.0,4387.0,6,8774.0,13161.0


#### Data Aggregating and Grouping

In [106]:
## To calculate the total Production for each month name (all years in the data combined)
## The result is indexed by Month_Name, in alphabetical rather than calendar order

grouped_sum = df.groupby('Month_Name')['Production'].sum()
grouped_sum

Month_Name
April        27277879.0
August       21897618.0
December     27395733.0
February     31792273.0
January      30851764.0
July         23352391.0
June         19813461.0
March        31893888.0
May          25914666.0
November     29782797.0
October      29499405.0
September    22838624.0
Name: Production, dtype: float64

In [116]:
## To calculate the total production for each (month, day of the week) combination

weekday_sum = df.groupby(['Month_Name', 'Day_Name'])['Production'].sum()
weekday_sum


Month_Name  Day_Name 
April       Friday       3714010.0
            Monday       3830308.0
            Saturday     4185430.0
            Sunday       3505450.0
            Thursday     3756332.0
                           ...    
September   Saturday     2797551.0
            Sunday       2843334.0
            Thursday     3376758.0
            Tuesday      3332104.0
            Wednesday    3489874.0
Name: Production, Length: 84, dtype: float64

In [117]:
## To calculate the mean production for each (month, day of the week) combination

weekday_mean = df.groupby(['Month_Name', 'Day_Name'])['Production'].mean()
weekday_mean


Month_Name  Day_Name 
April       Friday       5951.939103
            Monday       6383.846667
            Saturday     6707.419872
            Sunday       5842.416667
            Thursday     6019.762821
                            ...     
September   Saturday     4662.585000
            Sunday       4738.890000
            Thursday     5411.471154
            Tuesday      5339.910256
            Wednesday    5592.746795
Name: Production, Length: 84, dtype: float64

In [123]:
## Apply several aggregation functions (sum, mean and count) per month in a single call

grouped_agg = (
    df.groupby('Month_Name')['Production']
      .agg(['sum', 'mean', 'count'])
)
grouped_agg

Unnamed: 0_level_0,sum,mean,count
Month_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
April,27277879.0,6314.323843,4320
August,21897618.0,4905.380376,4464
December,27395733.0,7364.444355,3720
February,31792273.0,7792.223775,4080
January,30851764.0,6911.237455,4464
July,23352391.0,5231.270385,4464
June,19813461.0,4586.449306,4320
March,31893888.0,7154.304172,4458
May,25914666.0,5805.25672,4464
November,29782797.0,6898.956915,4317


#### Merging and Joining DataFrames

In [124]:
## Build two small sample DataFrames that share a 'Key' column
## Keys A and B appear in both frames; C exists only in df1 and D only in df2
df1 = pd.DataFrame([('A', 1), ('B', 2), ('C', 3)], columns=['Key', 'Value1'])
df2 = pd.DataFrame([('A', 1), ('B', 2), ('D', 4)], columns=['Key', 'Value2'])

In [126]:
## Display the first sample DataFrame
df1

Unnamed: 0,Key,Value1
0,A,1
1,B,2
2,C,3


In [127]:
## Display the second sample DataFrame
df2

Unnamed: 0,Key,Value2
0,A,1
1,B,2
2,D,4


In [132]:
## Inner join on 'Key': keeps only the keys that are present in both df1 and df2
## (comparable to the intersection of the two key sets)

df1.merge(df2, on='Key', how='inner')

Unnamed: 0,Key,Value1,Value2
0,A,1,1
1,B,2,2


In [134]:
## Outer join on 'Key': keeps the keys from either frame; values with no match are left missing
## (comparable to the union of the two key sets)
df1.merge(df2, on='Key', how='outer')

Unnamed: 0,Key,Value1,Value2
0,A,1.0,1.0
1,B,2.0,2.0
2,C,3.0,
3,D,,4.0


In [137]:
## Left outer join on 'Key': all rows from df1 (the left frame) and only the matching rows from df2
df1.merge(df2, on='Key', how='left')

Unnamed: 0,Key,Value1,Value2
0,A,1,1.0
1,B,2,2.0
2,C,3,


In [138]:
## Right outer join on 'Key': all rows from df2 (the right frame) and only the matching rows from df1
df1.merge(df2, on='Key', how='right')

Unnamed: 0,Key,Value1,Value2
0,A,1.0,1
1,B,2.0,2
2,D,,4
