# IMPORTS and GENERAL FUNCS

In [3]:
import geopandas  as gpd
import numpy      as np
import pandas     as pd
import shapely    as shape
import os
from decorators import time_it

# Report the version of every core library used in this notebook.
for lib_label, lib_module in (
  ("GeoPandas", gpd),
  ("Numpy", np),
  ("Pandas", pd),
  ("Shapely", shape),
):
  print(f"{lib_label} Version: ", lib_module.__version__)

def sep() -> None:
  """Print a horizontal rule of dashes to visually separate sections of output.

  Returns:
    None. The rule is written to stdout as a side effect; nothing is returned.
  """
  print('----------------------------------------------------------------------------------------------------------------')

GeoPandas Version:  1.0.1
Numpy Version:  1.25.1
Pandas Version:  2.2.2
Shapely Version:  2.0.6


# CREATING DATAFRAMES

#### From a Shapefile

In [None]:
@time_it
def ShpToDF(shp_file):
  """Read a shapefile with GeoPandas and return its table without the 'geometry' column.

  Args:
    shp_file: Path to the .shp file to read.

  Returns:
    The table read from the file with the 'geometry' column dropped.
  """
  return gpd.read_file(shp_file).drop(columns='geometry')

# The shapefile is expected at ./shapefiles/geo_data.shp under the current working directory.
cwd = os.getcwd()
shp_file_path = os.path.join(cwd, *('shapefiles', 'geo_data.shp'))

df = ShpToDF(shp_file_path)

# For a shorter look, print(df) or print(type(df)) can be used instead.
shp_table_text = df.to_string()
print("\n", shp_table_text)


#### From a CSV File

In [None]:
@time_it
def CsvToDF(csv_file):
  """Read a CSV file into a pandas DataFrame.

  Args:
    csv_file: Path to the CSV file to read.

  Returns:
    The DataFrame produced by pd.read_csv.
  """
  return pd.read_csv(csv_file)

# The CSV file is expected directly inside the current working directory.
cwd = os.getcwd()
csv_filepath = os.path.join(cwd, 'geo_data.csv')

df = CsvToDF(csv_filepath)

# For a shorter look, print(df.head(n=10)) or print(type(df)) can be used instead.
csv_table_text = df.to_string()
print("\n", csv_table_text)

#### From a JSON File

In [None]:
@time_it
def JsonToDF(json_file):
  """Read a JSON file into a pandas DataFrame.

  Args:
    json_file: Path to the JSON file to read.

  Returns:
    The DataFrame produced by pd.read_json.
  """
  return pd.read_json(json_file)

# The JSON file is expected directly inside the current working directory.
cwd = os.getcwd()
json_filepath = os.path.join(cwd, 'geo_data.json')

df = JsonToDF(json_filepath)

# For a shorter look, print(df.head(n=10)) or print(type(df)) can be used instead.
json_table_text = df.to_string()
print("\n", json_table_text)

#### From a GeoJSON File

In [None]:
# Using the geopandas module
@time_it
def GeoJsonToDF(geojson_file):
  """Read a GeoJSON file and return its attributes with coordinates as plain columns.

  The geometry column is split into 'Latitude' (y) and 'Longitude' (x) columns and
  then dropped. The result contains exactly the columns listed in `column_order`,
  so the coordinates sit between 'Name' and 'LandUse'.

  Args:
    geojson_file: Path to the GeoJSON file to read.

  Returns:
    A table with the columns ID, Name, Latitude, Longitude, LandUse, Area_sq_km,
    Population and Elevation_m, in that order.

  Raises:
    KeyError: If any of the expected columns is missing from the file.

  NOTE(review): the .x / .y accessors assume every geometry is a Point - confirm
  against the input data.
  """
  # read_file already returns a GeoDataFrame for a file with geometry, so it is
  # used directly instead of being wrapped in gpd.GeoDataFrame a second time.
  gdf = gpd.read_file(geojson_file)

  # Separate the 'geometry' column into individual 'Latitude' and 'Longitude' columns
  gdf['Latitude'] = gdf.geometry.y
  gdf['Longitude'] = gdf.geometry.x
  df = gdf.drop(columns='geometry')

  # Move 'Latitude' and 'Longitude' between the 'Name' and 'LandUse' columns
  column_order = [
    'ID', 'Name', 'Latitude', 'Longitude',
    'LandUse', 'Area_sq_km', 'Population', 'Elevation_m',
  ]
  return df[column_order]
  

# The GeoJSON file is expected directly inside the current working directory.
cwd = os.getcwd()
geojson_filepath = os.path.join(cwd, 'geo_data.geojson')

df = GeoJsonToDF(geojson_filepath)

# For a shorter look, print(df.head(n=10)) or print(type(df)) can be used instead.
geojson_table_text = df.to_string()
print("\n", geojson_table_text)

#### From a Dictionary

In [None]:
# Each dictionary key becomes a column; the index labels the three rows 1..3.
df_1 = pd.DataFrame(
  dict(
    a=[4, 5, 6],
    b=[7, 8, 9],
    c=[10, 11, 12],
  ),
  index=[1, 2, 3],
)

print(df_1)

# Draw one chart per plot kind, in the same order as a manual call list would.
for plot_kind in ("area", "bar", "barh", "box", "density", "kde", "line"):
  getattr(df_1.plot, plot_kind)()

# Pie charts need one subplot per column because a pie shows a single series.
df_1.plot.pie(subplots=True)

In [None]:
# Three sample passenger records; the second passenger has no recorded age.
df_4_data: dict = dict(
  Name=[
    "Braund, Mr. Owen Harris",
    "Allen, Mr. William Henry",
    "Bonnell, Miss. Elizabeth",
  ],
  Age=[22, None, 58],
  Sex=["male", "male", "female"],
)

df = pd.DataFrame(data=df_4_data)
print(df)
print("\n")

# Selecting a single column by its label gives back that column on its own.
df_age = df["Age"]
print(df_age)

#### From a List of Lists

In [None]:
# Each inner list is one row; column names and row labels are supplied separately.
df_2_rows = [
  [4, 7, 10],
  [5, 8, 11],
  [6, 9, 12],
]
df_2 = pd.DataFrame(df_2_rows, index=[1, 2, 3], columns=list("abc"))

print(df_2)

#### Multi-Indexing

In [4]:
# MultiIndexing: rows are labelled by an (n, v) pair instead of a single value
df_3_data: dict = dict(
  a=[4, 5, 6],
  b=[7, 8, 9],
  c=[10, 11, 12],
)

# Pair each outer label ("n") with its inner label ("v") to form the row keys.
outer_labels = ["d", "d", "e"]
inner_labels = [1, 2, 2]
df_3_index = pd.MultiIndex.from_tuples(
  list(zip(outer_labels, inner_labels)),
  names=["n", "v"],
)

table = pd.DataFrame(df_3_data, index=df_3_index)

print(table)

     a  b   c
n v          
d 1  4  7  10
  2  5  8  11
e 2  6  9  12


#### From dict of Series or dicts

In [None]:
# Two Series of different lengths: "one" has no value for label "d".
d_4_data = {
  "one": pd.Series([1.0, 2.0, 3.0], index=list("abc")),
  "two": pd.Series([1.0, 2.0, 3.0, 4.0], index=list("abcd")),
}

# Without an explicit index the result covers every label found in the Series.
df_a = pd.DataFrame(data=d_4_data)
print(df_a)
print("\n")

# Explicit index/columns select and order the labels; "three" is not in the data.
df_b = pd.DataFrame(
  data=d_4_data,
  index=["d", "b", "a"],
  columns=["two", "three"],
)
print(df_b)

# CREATE A SERIES FROM SCRATCH

In [None]:
# A named Series; the name is shown alongside the values when printed.
age_values = [22, 35, 58]
ages = pd.Series(data=age_values, name="Age")
print(ages)

In [None]:
# Five random values, labelled "a" through "e" instead of by position.
s1_values = np.random.randn(5)
s1 = pd.Series(s1_values, index=list("abcde"))
print(s1)

In [None]:
# The index holds the row labels of the Series.
s1_labels = s1.index
print(s1_labels)

In [None]:
# With no index supplied, the five random values are labelled 0..4 by default.
s2_values = np.random.randn(5)
s2 = pd.Series(data=s2_values)
print(s2)

In [None]:
# A Series built from a dict: keys become the labels, in insertion order.
d = dict(b=1, a=0, c=2)
s3 = pd.Series(data=d)
print(s3)

In [None]:
# When an index is given, values are looked up in the dict by those labels,
# in the order the labels are listed.
s4_labels = ["b", "c", "d", "a"]
s4 = pd.Series(data=d, index=s4_labels)
print(s4)

# DO SOMETHING WITH THE DATAFRAME

#### Accessing Data from the DataFrame

In [None]:
# SCENARIO: I am interested in some basic statistics of the numerical data of my
# data table and in accessing individual values from them.
# NOTE: use the following examples with the csv dataframe generated in <CREATING DATAFRAMES: FROM CSV FILE> section above
print("DataFrame:\n", df.to_string())
sep()

# describe() builds a summary table: one row per statistic, one column per summarised field
overview_data = df.describe()
print("Summary Statistics:\n", overview_data)
sep()

# Two ways to get mean/avg population:
#   1) read the "mean" row of the summary table
#   2) compute it directly from the column
print("Avg Population:\t\t", overview_data.loc["mean", "Population"])
mean_population = df["Population"].mean()
print("Mean Population:\t", mean_population)

print("Avg Area: ", overview_data.loc["mean", "Area_sq_km"])

# value_counts() only lists labels that actually occur, so use .get with a default
# of 0: a plain ["Parks"] lookup raises KeyError when no row is labelled "Parks"
# (for example after the labels have been renamed to "ParkRec").
count_parks = df["LandUse"].value_counts()
print("Parks Count: ", count_parks.get("Parks", 0))

#### Conditional Updating

In [None]:
# Show the DataFrame as it is before any change
print("Working DataFrame: \n", df.to_string())
sep()

# Boolean mask of the rows labelled "Parks"; only those rows get the new label
is_park = df["LandUse"] == "Parks"
df.loc[is_park, "LandUse"] = "ParkRec"
print("Updated LandUse Labels:\n", df.to_string())
sep()

# Add a size category column: "Large" at or above 1.49 sq km, otherwise "Small".
# np.where fills the whole column in one step; the two-statement alternative is
#   df.loc[df.Area_sq_km >= 1.49, "SizeCat"] = "Large"
#   df.loc[df.Area_sq_km < 1.49, "SizeCat"]  = "Small"
is_large = df["Area_sq_km"] >= 1.49
df["SizeCat"] = np.where(is_large, "Large", "Small")
print("Updated w/Size Category:\n", df.to_string())
sep()

# Pick out the Residential records and display them
is_residential = df["LandUse"] == "Residential"
residential = df.loc[is_residential]
print("Residential Assets:\n", residential.to_string())
sep()

# df itself now carries the relabelled LandUse values and the SizeCat column
print("Mutated DataFrame:\n", df.to_string())

# EXPORT DATAFRAMES

#### Export to CSV

In [None]:
# Columns to write, in the order they should appear in the file.
# NOTE: if reordering columns, use the columns parameter as shown below.
export_columns = [
  'ID', 'Name', 'Latitude', 'Longitude', 'LandUse',
  'Area_sq_km', 'SizeCat', 'Population', 'Elevation_m',
]

cwd = os.getcwd()
csv_output = os.path.join(cwd, 'update_geo_data.csv')

# index=False leaves the row labels out of the written file.
df.to_csv(csv_output, columns=export_columns, index=False)