In [2]:
import pandas as pd

In [3]:
# Creating a Series
city_series = pd.Series(["Tokyo", "Los Angeles", "London"], name="City")
print(f"Pandas Series:\n{city_series}\n")

Pandas Series:
0          Tokyo
1    Los Angeles
2         London
Name: City, dtype: object



In [5]:
# Creating a DataFrame
data = {
    "City": ["Tokyo", "Los Angeles", "London"],
    "Latitude": [35.6895, 34.0522, 51.5074],
    "Longitude": [139.6917, -118.2437, -0.1278],
}
df = pd.DataFrame(data)
print(f"Pandas DataFrame:\n{df}")
df

Pandas DataFrame:
          City  Latitude  Longitude
0        Tokyo   35.6895   139.6917
1  Los Angeles   34.0522  -118.2437
2       London   51.5074    -0.1278


Unnamed: 0,City,Latitude,Longitude
0,Tokyo,35.6895,139.6917
1,Los Angeles,34.0522,-118.2437
2,London,51.5074,-0.1278


In [11]:
#selection one column from data frame

lat = df["Latitude"]
lat

0    35.6895
1    34.0522
2    51.5074
Name: Latitude, dtype: float64

In [15]:
df_fil = df[df["Latitude"] > 50] # this will return a dataframe whereas df["Latitude"] > 50 will return Boolean
df_fil

Unnamed: 0,City,Latitude,Longitude
2,London,51.5074,-0.1278


In [18]:
#adding a new column
import numpy as np
df["Radian_Lat"] = np.radians(df["Latitude"])
df

Unnamed: 0,City,Latitude,Longitude,Radian_Lat
0,Tokyo,35.6895,139.6917,0.622899
1,Los Angeles,34.0522,-118.2437,0.594323
2,London,51.5074,-0.1278,0.898974


In [19]:
# Creating a DataFrame
data = {
    "City": ["Tokyo", "Los Angeles", "London", "Paris", "Chicago"],
    "Country": ["Japan", "USA", "UK", "France", "USA"],
    "Population": [37400068, 3970000, 9126366, 2140526, 2665000],
}
df = pd.DataFrame(data)
df

Unnamed: 0,City,Country,Population
0,Tokyo,Japan,37400068
1,Los Angeles,USA,3970000
2,London,UK,9126366
3,Paris,France,2140526
4,Chicago,USA,2665000


In [21]:
groupbydf = df.groupby("Country")["Population"].sum()
groupbydf

Country
France     2140526
Japan     37400068
UK         9126366
USA        6635000
Name: Population, dtype: int64

In [26]:
# Creating two DataFrames to merge
df1 = pd.DataFrame(
    {"City": ["Tokyo", "Los Angeles", "London"], "Country": ["Japan", "USA", "UK"]}
)
df2 = pd.DataFrame(
    {
        "City": ["Tokyo", "Los Angeles", "London"],
        "Population": [37400068, 3970000, 9126366],
    }
)

In [24]:
df1

Unnamed: 0,City,Country
0,Tokyo,Japan
1,Los Angeles,USA
2,London,UK


In [25]:
df2

Unnamed: 0,City,Population
0,Tokyo,37400068
1,Los Angeles,3970000
2,London,9126366


In [29]:
#Panda Merge
dfmerge = pd.merge(df1,df2, on = "City")
dfmerge

Unnamed: 0,City,Country,Population
0,Tokyo,Japan,37400068
1,Los Angeles,USA,3970000
2,London,UK,9126366


### Handling Missing values in Pandas

In [30]:
# Creating a DataFrame with missing values
data_with_nan = {
    "City": ["Tokyo", "Los Angeles", "London", "Paris"],
    "Population": [37400068, 3970000, None, 2140526],
}
df_nan = pd.DataFrame(data_with_nan)
df_nan

Unnamed: 0,City,Population
0,Tokyo,37400068.0
1,Los Angeles,3970000.0
2,London,
3,Paris,2140526.0


In [31]:
# Fill missing values with the mean population
df_filled = df_nan.fillna(df_nan["Population"].mean())
df_filled

Unnamed: 0,City,Population
0,Tokyo,37400070.0
1,Los Angeles,3970000.0
2,London,14503530.0
3,Paris,2140526.0


In [34]:
# Define the Haversine formula using NumPy
def haversine_np(lat1, lon1, lat2, lon2):
    R = 6371.0  # Earth radius in kilometers
    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)
    a = (
        np.sin(dlat / 2) ** 2
        + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(dlon / 2) ** 2
    )
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = R * c
    return distance


# Create a new DataFrame with city pairs
city_pairs = pd.DataFrame(
    {
        "City1": ["Tokyo", "Tokyo", "Los Angeles"],
        "City2": ["Los Angeles", "London", "London"],
        "Lat1": [35.6895, 35.6895, 34.0522],
        "Lon1": [139.6917, 139.6917, -118.2437],
        "Lat2": [34.0522, 51.5074, 51.5074],
        "Lon2": [-118.2437, -0.1278, -0.1278],
    }
)
city_pairs

Unnamed: 0,City1,City2,Lat1,Lon1,Lat2,Lon2
0,Tokyo,Los Angeles,35.6895,139.6917,34.0522,-118.2437
1,Tokyo,London,35.6895,139.6917,51.5074,-0.1278
2,Los Angeles,London,34.0522,-118.2437,51.5074,-0.1278


In [35]:
# Define a function to calculate distances from a city to all other cities
def calculate_average_distance(df):
    lat1 = df["Latitude"].values
    lon1 = df["Longitude"].values
    lat2, lon2 = np.meshgrid(lat1, lon1)
    distances = haversine_np(lat1, lon1, lat2, lon2)
    avg_distances = np.mean(distances, axis=1)
    return avg_distances


# Creating a DataFrame
data = {
    "City": ["Tokyo", "Los Angeles", "London"],
    "Latitude": [35.6895, 34.0522, 51.5074],
    "Longitude": [139.6917, -118.2437, -0.1278],
}
df = pd.DataFrame(data)
#df
# Apply the function to calculate average distances
df["Avg_Distance_km"] = calculate_average_distance(df)
df

Unnamed: 0,City,Latitude,Longitude,Avg_Distance_km
0,Tokyo,35.6895,139.6917,5624.60139
1,Los Angeles,34.0522,-118.2437,5294.682354
2,London,51.5074,-0.1278,7041.924003
