In [1]:
import importlib

def check_and_install(module_name):
    try:
        importlib.import_module(module_name)
        print(f"{module_name} is already installed.")
    except ImportError:
        print(f"{module_name} is not installed. Installing...")
        import subprocess
        subprocess.check_call(["pip", "install", module_name])

# List of required modules
required_modules = [
    "pandas",
    "numpy",
    "sqlalchemy",
    "geoalchemy2",
    "shapely"
]

# Check and install each required module
for module in required_modules:
    check_and_install(module)

pandas is already installed.
numpy is already installed.
sqlalchemy is already installed.
geoalchemy2 is already installed.
shapely is already installed.


In [2]:
import pandas as pd
import numpy as np
from sqlalchemy import Column, String, Text, Integer, select, Date, create_engine, inspect
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
import logging



In [4]:
user_name = 'l01-4'
#use the file way of storing password or set passw to your password
with open("l01-4_password.txt") as f:
    passw = f.read()

# Calgary Census Demographic Data
## Source: City of Calgary's Open Data Portal
Changelog for Data Export to SQL Table

*From Civic Census by Community, Age and Gender*
- Include YEAR	COMM_CODE	AGE_RANGE	MALES	FEMALES	OTHER
-  check all measure columns and ensure they are integer, remove ',' and convert to INTEGER- rename YEAR to CENSUS_YEAR
- set the CENSUS_YEAR, COMM_CODE, AGE_RANGE as primary keys
- this data seem to have multiple entries per CENSUS_YEAR, COMM_CODE and AGE_RANGE, so before inserting into SQL table, group by primary key and sum the MALES, FEMALES, OTHER counts

    
Link: https://data.calgary.ca/Demographics/Civic-Census-by-Community-Age-and-Gender/vsk6-ghca/about_data


In [19]:
df_census= pd.read_csv("Civic_Census_by_Community__Age_and_Gender_20240318.csv")
#make the column name for Year more meaningful and avoid keyword
df_census.rename(columns={'YEAR': 'CENSUS_YEAR'}, inplace=True)
# Rename columns to uppercase
df_census.columns = df_census.columns.str.upper()# Rename 'year' column to 'CENSUS_YEAR'

df_census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22872 entries, 0 to 22871
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   CENSUS_YEAR  22872 non-null  int64  
 1   COMM_CODE    22872 non-null  object 
 2   AGE_RANGE    22872 non-null  object 
 3   MALES        22872 non-null  int64  
 4   FEMALES      22872 non-null  int64  
 5   OTHER        2592 non-null   float64
dtypes: float64(1), int64(3), object(2)
memory usage: 1.0+ MB


In [20]:
df_census

Unnamed: 0,CENSUS_YEAR,COMM_CODE,AGE_RANGE,MALES,FEMALES,OTHER
0,2019,YKV,75+,0,0,0.0
1,2019,YKV,65-74,0,0,0.0
2,2019,YKV,55-64,0,0,0.0
3,2019,YKV,45-54,0,0,0.0
4,2019,YKV,35-44,0,0,0.0
...,...,...,...,...,...,...
22867,1996,ABB,25-34,2842,0,
22868,1996,ABB,20-24,347,0,
22869,1996,ABB,15-19,462,0,
22870,1996,ABB,5-14,1438,0,


In [7]:
# Define the database connection string
database_url = f"mysql+mysqlconnector://{user_name}:{passw}@datasciencedb.ucalgary.ca/{user_name}"
# Set the global logging level to WARNING
logging.getLogger().setLevel(logging.WARNING)
# Set the logging level for SQLAlchemy to WARNING, from now on no more INFO, CATEGORY
logging.getLogger('sqlalchemy').setLevel(logging.WARNING)
# Create the SQLAlchemy engine
engine = create_engine(database_url, echo=False)  # Set echo to True for debugging

# Test the connection
with engine.connect() as connection:
    result = connection.execute("SELECT 1")
    print(result.scalar())



1


In [32]:
# Define the declarative base
Base = declarative_base()

class CensusDemographics(Base):
    __tablename__ = 'census_demographics'
    
    CENSUS_YEAR = Column(Integer, primary_key=True)
    COMM_CODE = Column(String(255), primary_key=True)
    AGE_RANGE = Column(String(100), primary_key=True)
    MALES = Column(Integer)
    FEMALES = Column(Integer)
    OTHER = Column(Integer)


# Create all tables by issuing CREATE TABLE commands to the database.
Base.metadata.create_all(engine)


In [21]:
# Check for NaN values in the entire DataFrame
nan_check = df_census.isna().any()

# Print columns with NaN values
print("Columns with NaN values:")
print(nan_check[nan_check])
df_census.replace({np.nan: None}, inplace=True)
def convert_to_int(column):
    # Check if the column is already of an integer dtype
    if not pd.api.types.is_numeric_dtype(column):
        # If it's not numeric, assume it's a string that needs processing.
        # Convert to string first to ensure .str methods work, then replace commas and convert to numeric.
        column = pd.to_numeric(column.astype(str).str.replace(',', ''), errors='coerce').fillna(0)
    # Ensure the result is returned as integer
    return column.astype(int)

# Apply this function to each count column in df_census
count_columns = ['MALES','FEMALES','OTHER']

for col in count_columns:
    df_census[col] = convert_to_int(df_census[col])
df_census.info()

Columns with NaN values:
OTHER    True
dtype: bool
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22872 entries, 0 to 22871
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   CENSUS_YEAR  22872 non-null  int64 
 1   COMM_CODE    22872 non-null  object
 2   AGE_RANGE    22872 non-null  object
 3   MALES        22872 non-null  int32 
 4   FEMALES      22872 non-null  int32 
 5   OTHER        22872 non-null  int32 
dtypes: int32(3), int64(1), object(2)
memory usage: 804.2+ KB


In [22]:
Session = sessionmaker(bind=engine)
session = Session()

In [42]:
# Group the DataFrame by 'CENSUS_YEAR', 'COMM_CODE', 'AGE_RANGE' and sum the other columns
grouped_df = df_census.groupby(['CENSUS_YEAR', 'COMM_CODE', 'AGE_RANGE']).sum().reset_index()

# Assuming df_census is your pandas DataFrame with the census data
for index, row in grouped_df.iterrows():
    try:
        census_demographic = CensusDemographics(
            CENSUS_YEAR=row['CENSUS_YEAR'],
            COMM_CODE=row['COMM_CODE'],
            AGE_RANGE=row['AGE_RANGE'],
            MALES=row['MALES'],
            FEMALES=row['FEMALES'],
            OTHER=row['OTHER']
        )
        session.add(census_demographic)
        
        if (index + 1) % 1000 == 0:
            # Print progress at every 1000 index
            progress = (index + 1) / len(df_census) * 100
            print(f"Progress: {progress:.2f}%")
            
    except Exception as e:
        print(f"Error in row {index}: {e}")
        print(row)  # Print the entire row for reference
        # You could choose to rollback if you want to skip only the erroneous row
        # session.rollback()  # Rollback the transaction to continue with the next row

# After finishing the loop, commit the transaction to save all changes
session.commit()

Progress: 4.37%
Progress: 8.74%
Progress: 13.12%
Progress: 17.49%
Progress: 21.86%
Progress: 26.23%
Progress: 30.61%
Progress: 34.98%
Progress: 39.35%
Progress: 43.72%
Progress: 48.09%
Progress: 52.47%
Progress: 56.84%
Progress: 61.21%
Progress: 65.58%
Progress: 69.95%
Progress: 74.33%
Progress: 78.70%
Progress: 83.07%
Progress: 87.44%
Progress: 91.82%
Progress: 96.19%


In [43]:
session.close()

In [41]:
session.rollback()

  session.rollback()
