# Functions

In [1]:
def greet():
    print("Hello World!!!")

greet()

Hello World!!!


In [2]:
def hello():
    """This function greets you"""
    print("Hello")
    print("Glad to meet you")

hello()    

Hello
Glad to meet you


## Function with Arguments

In [3]:
# function definition
def find_square(num):
    result = num * num
    return result

# function call
square = find_square(5)
print("Square: ", square)

Square:  25


## Lambda Function

In [4]:
x = lambda a : a + 10
print(x(5))

15


# Lab Activity

## Defining Function

In [5]:
def calculate_carbon_footprint(energy_consumption, emission_factor):
    """
    This function calculates the carbon footprint based on energy consumption (in Kilowatt-hours) and emission factor (in Kg CO2 per KWh).
    """
    return energy_consumption * emission_factor

## Implementation of the Carbon Footprint Calculation Function

In [6]:
energy_consumption = 1000 # in KWh
emission_factor = 0.475 # in Kg CO2 per KWh

carbon_footprint = calculate_carbon_footprint(energy_consumption, emission_factor)
print(f"Carbon Footprint: {carbon_footprint} Kg CO2")

Carbon Footprint: 475.0 Kg CO2


## Lambda Function to Filter Cities Based on Carbon Footprint

In [7]:
# List of cities with their carbon footprints in tons CO2 per month
cities = [
    {"name": "City A", "carbon_footprint": 500},
    {"name": "City B", "carbon_footprint": 350},
    {"name": "City C", "carbon_footprint": 600},
    {"name": "City D", "carbon_footprint": 200},
]

# Lambda function to filter cities below the threshold (400 ton CO2)
sustainability_threshold = 400
sustainable_cities = list(filter(lambda city: city["carbon_footprint"] < sustainability_threshold, cities))

# Printing the filtered list of sustainable cities
print("Sustainable Cities: ")
for city in sustainable_cities:
    print(city["name"])

Sustainable Cities: 
City B
City D


# Lab Activity

## Introduction to PANDAS

In [8]:
!pip install pandas




[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
import pandas as pd

# Sample renewable energy sources data
renewable_energy_sources = ["Solar", "Wind", "Hydropower", "Geothermal", "Biomass"]

# Sample green technologies project data (for DataFrame)
data = {
    "Project": ["Solar Farm A", "Wind Turbines X", "Hydropower Y", "Solar Roof Z", "Geothermal Plant P"],
    "Technology": ["Solar", "Wind", "Hydropower", "Solar", "Geothermal"],
    "Capacity (MW)": [150, 300, 200, 50, 100], # Megawatts
    "Cost (Million $)": [200, 400, 350, 100, 250], # Project Cost
    "Location": ["California", "Texas", "Washington", "Nevada", "Idaho"],
    "Completion Year": [2023, 2024, 2022, 2025, 2023]
}

## Create a Series for Renewable Energy Resources

In [10]:
# Create a Pandas Series for renewavle energy sources
renewable_series = pd.Series(renewable_energy_sources)

# Print the Series
print("Renewable Energy Sources: ")
print(renewable_series)

Renewable Energy Sources: 
0         Solar
1          Wind
2    Hydropower
3    Geothermal
4       Biomass
dtype: object


## Create a DataFrame

In [11]:
# Create a DataFrame for green technologies projects
projects_df = pd.DataFrame(data)

# Print the DataFrame
print("\nGreen Technologies Projects DataFrame: ")
print(projects_df)


Green Technologies Projects DataFrame: 
              Project  Technology  Capacity (MW)  Cost (Million $)  \
0        Solar Farm A       Solar            150               200   
1     Wind Turbines X        Wind            300               400   
2        Hydropower Y  Hydropower            200               350   
3        Solar Roof Z       Solar             50               100   
4  Geothermal Plant P  Geothermal            100               250   

     Location  Completion Year  
0  California             2023  
1       Texas             2024  
2  Washington             2022  
3      Nevada             2025  
4       Idaho             2023  


In [12]:
projects_df.head()

Unnamed: 0,Project,Technology,Capacity (MW),Cost (Million $),Location,Completion Year
0,Solar Farm A,Solar,150,200,California,2023
1,Wind Turbines X,Wind,300,400,Texas,2024
2,Hydropower Y,Hydropower,200,350,Washington,2022
3,Solar Roof Z,Solar,50,100,Nevada,2025
4,Geothermal Plant P,Geothermal,100,250,Idaho,2023


In [13]:
projects_df.tail()

Unnamed: 0,Project,Technology,Capacity (MW),Cost (Million $),Location,Completion Year
0,Solar Farm A,Solar,150,200,California,2023
1,Wind Turbines X,Wind,300,400,Texas,2024
2,Hydropower Y,Hydropower,200,350,Washington,2022
3,Solar Roof Z,Solar,50,100,Nevada,2025
4,Geothermal Plant P,Geothermal,100,250,Idaho,2023


## Basic Pandas Operations

In [14]:
# Access the 'Project' column
print("\nList of Projects: ")
print(projects_df["Project"])


List of Projects: 
0          Solar Farm A
1       Wind Turbines X
2          Hydropower Y
3          Solar Roof Z
4    Geothermal Plant P
Name: Project, dtype: object


In [15]:
# Access the 'Technology' column
print("\nList of Technologies: ")
print(projects_df["Technology"])


List of Technologies: 
0         Solar
1          Wind
2    Hydropower
3         Solar
4    Geothermal
Name: Technology, dtype: object


In [16]:
# Filter projects with capacity greater than 100 MW
high_capacity_projects = projects_df[projects_df["Capacity (MW)"] > 100]

print("\nProjects with Capacity Greater than 100 MW: ")
print(high_capacity_projects)


Projects with Capacity Greater than 100 MW: 
           Project  Technology  Capacity (MW)  Cost (Million $)    Location  \
0     Solar Farm A       Solar            150               200  California   
1  Wind Turbines X        Wind            300               400       Texas   
2     Hydropower Y  Hydropower            200               350  Washington   

   Completion Year  
0             2023  
1             2024  
2             2022  


## Add new Column

In [17]:
# Addd a new column for cost per MW
projects_df["Cost per MW"] = projects_df["Cost (Million $)"] / projects_df["Capacity (MW)"]

print("\nDataFrame with Cost per MW: ")
print(projects_df)


DataFrame with Cost per MW: 
              Project  Technology  Capacity (MW)  Cost (Million $)  \
0        Solar Farm A       Solar            150               200   
1     Wind Turbines X        Wind            300               400   
2        Hydropower Y  Hydropower            200               350   
3        Solar Roof Z       Solar             50               100   
4  Geothermal Plant P  Geothermal            100               250   

     Location  Completion Year  Cost per MW  
0  California             2023     1.333333  
1       Texas             2024     1.333333  
2  Washington             2022     1.750000  
3      Nevada             2025     2.000000  
4       Idaho             2023     2.500000  


## Aggregation

In [18]:
# Aggregate the total capacity and cost
total_capacity = projects_df["Capacity (MW)"].sum()
total_cost = projects_df["Cost (Million $)"].sum()

print(f"\nTotal Capacity of all projects: {total_capacity} MW")
print(f"Total Cost of all projects: ${total_cost} Million")


Total Capacity of all projects: 800 MW
Total Cost of all projects: $1300 Million


## Grouping the Data

In [19]:
# Group by 'Technology' and calculate the total capacity for each type
grouped_data = projects_df.groupby("Technology")["Capacity (MW)"].sum()

print("\nTotal Capacity by Technology: ")
print(grouped_data)


Total Capacity by Technology: 
Technology
Geothermal    100
Hydropower    200
Solar         200
Wind          300
Name: Capacity (MW), dtype: int64


# Lab Activity

## Introduction to NumPy

In [20]:
import numpy as np
a = np.array([0, 1, 2, 3])
print(a)
print(type(a))
print(a.ndim) # dimension of array
print(a.shape) # shape(row, cloumn) of array
print(len(a))

[0 1 2 3]
<class 'numpy.ndarray'>
1
(4,)
4


### Create 2-D array using numpy from list

In [21]:
mylist = [[1, 2, 3],[4, 5, 6],[7, 8, 9]]
np.array(mylist)

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

### Create 1-D and 2-D array using random values

In [22]:
np.random.rand(5) # uniform distribution
np.random.rand(4, 4) # create 2-D array of range
np.random.randn(4) # standard normal distribution

array([2.0069417 , 1.580927  , 2.20572978, 1.57393634])

## Import NumPy and Create a Dataset

In [23]:
import numpy as np

# Energy consumption in MWh for different renewable sources: Solar, Wind, Hydropower, Geothermal, Biomass
energy_consumption = np.array([1200, 3400, 2900, 1800, 2500])

# Print the array
print("Energy Consumption (in MWh) for Different Renewable Sources: ")
print(energy_consumption)

Energy Consumption (in MWh) for Different Renewable Sources: 
[1200 3400 2900 1800 2500]


## Calculate Total Energy Consumption

In [24]:
# Calculate the total energy consumption
total_energy_consumption = np.sum(energy_consumption)

print(f"\nTotal Energy Consumption: {total_energy_consumption} MWh")


Total Energy Consumption: 11800 MWh


## Calculate the Mean (Average) Energy Consumption

In [25]:
# Calculate the mean energy consumption
mean_consumption = np.mean(energy_consumption)

print(f"\nMean Energy Consumption: {mean_consumption:.2f} MWh")


Mean Energy Consumption: 2360.00 MWh


## Calculate the Standard Deviation of Energy Consumption

In [26]:
# Calculate the standard deviation of energy consumption
std_deviation = np.std(energy_consumption)

print(f"\nStandard Deviation of Energy Consumption: {std_deviation:.2f} MWh")


Standard Deviation of Energy Consumption: 781.28 MWh


## Reshaping the Array

In [27]:
# Reshape the array (to 5 rows and 1 column)
reshaped_array = energy_consumption.reshape(5, 1)

print("\nReshaped Energy Consumption Array(5X1): ")
print(reshaped_array)


Reshaped Energy Consumption Array(5X1): 
[[1200]
 [3400]
 [2900]
 [1800]
 [2500]]


# Lab

## Import Pandas and Create a Dataset with Missing Values

In [28]:
import pandas as pd
import numpy as np

# Sample data with missing values
data = {
    "Energy Source": ["Solar", "Wind", "Hydropower", "Geothermal", "Biomass", "Nuclear"],
    "Energy Consumption (MWh)": [1200, np.nan, 2900, np.nan, 2500, 3200],
    "Cost (Million $)": [200, 400, np.nan, 150, 250, np.nan]
}

energy_df = pd.DataFrame(data)

print("Original Energy Data with Missing Values: ")
print(energy_df)

Original Energy Data with Missing Values: 
  Energy Source  Energy Consumption (MWh)  Cost (Million $)
0         Solar                    1200.0             200.0
1          Wind                       NaN             400.0
2    Hydropower                    2900.0               NaN
3    Geothermal                       NaN             150.0
4       Biomass                    2500.0             250.0
5       Nuclear                    3200.0               NaN


In [29]:
energy_df.isnull()

Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,False,False,False
1,False,True,False
2,False,False,True
3,False,True,False
4,False,False,False
5,False,False,True


In [30]:
energy_df.isnull().sum()

Energy Source               0
Energy Consumption (MWh)    2
Cost (Million $)            2
dtype: int64

In [31]:
energy_df.isnull().sum().sum()

np.int64(4)

## Remove Rows with Missing Values

In [32]:
# Remove rows with any missing values
cleaned_df = energy_df.dropna()

print("\nDataFrame after Removing Rows with Missing Values: ")
print(cleaned_df)


DataFrame after Removing Rows with Missing Values: 
  Energy Source  Energy Consumption (MWh)  Cost (Million $)
0         Solar                    1200.0             200.0
4       Biomass                    2500.0             250.0


## Impute Missing Values with the Mean

In [33]:
# Impute missing values in 'Energy Consumption (MWh)' with the mean
energy_df["Energy Consumption (MWh)"].fillna(energy_df["Energy Consumption (MWh)"].mean(), inplace=True)

# Impute missing values in 'Cost (Million $)' with the mean
energy_df["Cost (Million $)"].fillna(energy_df["Cost (Million $)"].mean(), inplace=True)

print("\nData after Imputing Missing Values with Mean: ")
print(energy_df)


Data after Imputing Missing Values with Mean: 
  Energy Source  Energy Consumption (MWh)  Cost (Million $)
0         Solar                    1200.0             200.0
1          Wind                    2450.0             400.0
2    Hydropower                    2900.0             250.0
3    Geothermal                    2450.0             150.0
4       Biomass                    2500.0             250.0
5       Nuclear                    3200.0             250.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  energy_df["Energy Consumption (MWh)"].fillna(energy_df["Energy Consumption (MWh)"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  energy_df["Cost (Million $)"].fillna(energy_df["Cost (Million $)"].mean(), inplace=True)


## Forward/Backward Filling

In [34]:
# Forward fill missing values
forward_filled_df = energy_df.fillna(method="ffill")    

print("\nData After Forward Filling Missing Values:")
print(forward_filled_df)


Data After Forward Filling Missing Values:
  Energy Source  Energy Consumption (MWh)  Cost (Million $)
0         Solar                    1200.0             200.0
1          Wind                    2450.0             400.0
2    Hydropower                    2900.0             250.0
3    Geothermal                    2450.0             150.0
4       Biomass                    2500.0             250.0
5       Nuclear                    3200.0             250.0


  forward_filled_df = energy_df.fillna(method="ffill")


## Flag Missing Values

In [35]:
# Create a flag column indicating missing values in 'Energy Consumption (MWh)'
energy_df["Missing Consumption"] = energy_df["Energy Consumption (MWh)"].isna().astype(int)

print("\nData with Missing Values Flagged: ")
print(energy_df)


Data with Missing Values Flagged: 
  Energy Source  Energy Consumption (MWh)  Cost (Million $)  \
0         Solar                    1200.0             200.0   
1          Wind                    2450.0             400.0   
2    Hydropower                    2900.0             250.0   
3    Geothermal                    2450.0             150.0   
4       Biomass                    2500.0             250.0   
5       Nuclear                    3200.0             250.0   

   Missing Consumption  
0                    0  
1                    0  
2                    0  
3                    0  
4                    0  
5                    0  


## Normalization (Min-Max Scaling)

In [36]:
!pip install scikit-learn


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip




In [37]:
from sklearn.preprocessing import MinMaxScaler

# Normalize the 'Energy Consumption (MWh)' and 'Cost (Million $)' columns
scaler = MinMaxScaler()
energy_df[["Energy Consumption (MWh)", "Cost (Million $)"]] = scaler.fit_transform(energy_df[["Energy Consumption (MWh)", "Cost (Million $)"]])

print("\nData After Normalization (Min-Max Scaling): ")
print(energy_df)


Data After Normalization (Min-Max Scaling): 
  Energy Source  Energy Consumption (MWh)  Cost (Million $)  \
0         Solar                     0.000               0.2   
1          Wind                     0.625               1.0   
2    Hydropower                     0.850               0.4   
3    Geothermal                     0.625               0.0   
4       Biomass                     0.650               0.4   
5       Nuclear                     1.000               0.4   

   Missing Consumption  
0                    0  
1                    0  
2                    0  
3                    0  
4                    0  
5                    0  


## Standardization (Z-score Scaling)

In [38]:
from sklearn.preprocessing import StandardScaler

# Standardize the 'Energy Consumption (MWh)' and 'Cost (Million $)' columns
scaler = StandardScaler()
energy_df[["Energy Consumption (MWh)", "Cost (Million $)"]] = scaler.fit_transform(energy_df[["Energy Consumption (MWh)", "Cost (Million $)"]])

print("\nData After Standardization (Z-score Scaling): ")
print(energy_df)


Data After Standardization (Z-score Scaling): 
  Energy Source  Energy Consumption (MWh)  Cost (Million $)  \
0         Solar             -2.005893e+00     -6.546537e-01   
1          Wind              3.563181e-16      1.963961e+00   
2    Hydropower              7.221213e-01      1.817029e-16   
3    Geothermal              3.563181e-16     -1.309307e+00   
4       Biomass              8.023570e-02      1.817029e-16   
5       Nuclear              1.203536e+00      1.817029e-16   

   Missing Consumption  
0                    0  
1                    0  
2                    0  
3                    0  
4                    0  
5                    0  


## Encoding Categorical Values

In [39]:
# One-hot encode the 'Energy Source' column
energy_encoded_df = pd.get_dummies(energy_df, columns=["Energy Source"])

print("\nData After One-Hot Encoding Categorical Values: ")
print(energy_encoded_df)


Data After One-Hot Encoding Categorical Values: 
   Energy Consumption (MWh)  Cost (Million $)  Missing Consumption  \
0             -2.005893e+00     -6.546537e-01                    0   
1              3.563181e-16      1.963961e+00                    0   
2              7.221213e-01      1.817029e-16                    0   
3              3.563181e-16     -1.309307e+00                    0   
4              8.023570e-02      1.817029e-16                    0   
5              1.203536e+00      1.817029e-16                    0   

   Energy Source_Biomass  Energy Source_Geothermal  Energy Source_Hydropower  \
0                  False                     False                     False   
1                  False                     False                     False   
2                  False                     False                      True   
3                  False                      True                     False   
4                   True                     False         

## Feature Engineering

In [40]:
# Create a new feature: Energy Consumption per Million $
energy_encoded_df["Consumption per $Million"] = energy_encoded_df["Energy Consumption (MWh)"] / energy_encoded_df["Cost (Million $)"]

print("\nData with New Feature (Consumption per $Million): ")
print(energy_encoded_df)


Data with New Feature (Consumption per $Million): 
   Energy Consumption (MWh)  Cost (Million $)  Missing Consumption  \
0             -2.005893e+00     -6.546537e-01                    0   
1              3.563181e-16      1.963961e+00                    0   
2              7.221213e-01      1.817029e-16                    0   
3              3.563181e-16     -1.309307e+00                    0   
4              8.023570e-02      1.817029e-16                    0   
5              1.203536e+00      1.817029e-16                    0   

   Energy Source_Biomass  Energy Source_Geothermal  Energy Source_Hydropower  \
0                  False                     False                     False   
1                  False                     False                     False   
2                  False                     False                      True   
3                  False                      True                     False   
4                   True                     False       